pax_global_header00006660000000000000000000000064150334676620014526gustar00rootroot0000000000000052 comment=ee077035f13f917b3b74f44850f5768c5d13d892 3-3.11.1/000077500000000000000000000000001503346766200117535ustar00rootroot000000000000003-3.11.1/.github/000077500000000000000000000000001503346766200133135ustar00rootroot000000000000003-3.11.1/.github/ISSUE_TEMPLATE/000077500000000000000000000000001503346766200154765ustar00rootroot000000000000003-3.11.1/.github/ISSUE_TEMPLATE/1_bug_report.yml000066400000000000000000000010461503346766200206120ustar00rootroot00000000000000name: Bug report or feature request description: For issues concerning the mumax³ source code. body: - type: textarea id: body attributes: label: "Issue details" description: "Describe your issue below." value: | 3-3.11.1/.github/ISSUE_TEMPLATE/config.yml000066400000000000000000000005211503346766200174640ustar00rootroot00000000000000blank_issues_enabled: false contact_links: - name: "Ask general usage question on mumax³ mailing list" url: https://groups.google.com/g/mumax2 about: Please ask general usage questions on the mumax³ mailing list, where the whole mumax³ community can help you by answering your questions and/or learn from the given answers. 
3-3.11.1/.gitignore000066400000000000000000000002211503346766200137360ustar00rootroot00000000000000*.swp *.swo *.5 *.6 *.8 *.o *.a *.log *.dump *.table *.gplot *.pprof mx3 *.tar.gz mumax3.*linux*cuda* *.*~ tmp/ *.out .idea/ .vscode/ doc/doc.exe3-3.11.1/.travis.yml000066400000000000000000000006141503346766200140650ustar00rootroot00000000000000language: go dist: xenial sudo: required install: true env: global: - GOARCH=amd64 before_install: - wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_9.2.88-1_amd64.deb - sudo dpkg -i cuda-repo-ubuntu1604_9.2.88-1_amd64.deb - sudo apt-get -qq update - sudo apt-get install cuda -y --allow-unauthenticated script: - go build ./...3-3.11.1/LICENSE000066400000000000000000000030001503346766200127510ustar00rootroot00000000000000Mumax3 GPU-accelerated micromagnetic simulator Copyright (C) 2012-2014 Arne Vansteenkiste. Contributions by Ahmad Syukri, Colin Jermain, Jonathan Leliaert, Mykola Dvornik. Mumax3 is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Additional permission under GNU GPL version 3 section 7 If you modify this Program, or any covered work, by linking or combining it with NVIDIA Corporation's CUDA libraries from the NVIDIA CUDA Toolkit (or a modified version of those libraries), containing parts covered by the terms of NVIDIA CUDA Toolkit EULA, the licensors of this Program grant you additional permission to convey the resulting work. 
Mumax3 uses svgo (http://github.com/ajstarks/svgo), copyright Anthony Starks, licensed under the Creative Commons Attribution 3.0 license as described in http://creativecommons.org/licenses/by/3.0/us/ . Mumax3 uses freetype-go (http://code.google.com/p/freetype-go/), copyright Google Inc., Jeff R. Allen, Rémy Oudompheng, Roger Peppe, licensed under the FreeType License or the GNU General Public License (GPL), version 2 or later. Mumax3 uses CUDA libraries, copyright NVIDIA. 3-3.11.1/Makefile000066400000000000000000000020521503346766200134120ustar00rootroot00000000000000 # Use the default go compiler GO_BUILDFLAGS=-compiler gc # Or uncomment the line below to use the gccgo compiler, which may # or may not be faster than gc and which may or may not compile... # GO_BUILDFLAGS=-compiler gccgo -gccgoflags '-static-libgcc -O4 -Ofast -march=native' CGO_CFLAGS_ALLOW='(-fno-schedule-insns|-malign-double|-ffast-math)' .PHONY: all cudakernels clean realclean checktests runtests hooks all: cudakernels hooks go install -v $(GO_BUILDFLAGS) github.com/mumax/3/... cd cmd/mumax3/ && $(MAKE) cudakernels: cd cuda && $(MAKE) NVCC_CCBIN=$(NVCC_CCBIN) doc: cd doc && $(MAKE) test: all go test -vet=off -i github.com/mumax/3/... go test -vet=off $(PKGS) github.com/mumax/3/... cd test && ./run.bash hooks: .git/hooks/post-commit .git/hooks/pre-commit .git/hooks/post-commit: post-commit ln -sf $(CURDIR)/$< $@ .git/hooks/pre-commit: pre-commit ln -sf $(CURDIR)/$< $@ clean: rm -frv $(GOPATH)/pkg/*/github.com/mumax/3/* rm -frv $(GOPATH)/bin/mumax3* cd cuda && $(MAKE) clean realclean: clean cd cuda && ${MAKE} realclean3-3.11.1/README.md000066400000000000000000000361211503346766200132350ustar00rootroot00000000000000 # mumax³ **GPU-accelerated micromagnetism.** Paper on the design and verification of MuMax3: ## Downloads and documentation 👉 Pre-compiled binaries, examples, and documentation are available on the [mumax³ homepage](https://mumax.github.io). 
Documentation of several tools, like `mumax3-convert`, is available [here](https://godoc.org/github.com/mumax/3/cmd). ## Contributing Contributions are gratefully accepted. To contribute code, fork our GitHub repo and send a pull request. ## Building from source Consider downloading a [pre-compiled mumax³ binary](https://mumax.github.io/download.html). If you want to compile nevertheless, 4 essential components will be required to build mumax³: an ***NVIDIA driver***, ***Go***, ***CUDA*** and ***C***. * *If they are not yet present on your system*: install them as detailed below. * *If they are already installed*: check if they work correctly by running the *check* for each component written below. Click on the arrows below to expand the installation instructions:
These instructions were made for Windows 10 and Ubuntu 22.04 (but should be applicable to all Debian systems). Your mileage may vary.
Install an NVIDIA driver * **Windows**: Find a suitable driver [here](https://www.nvidia.com/en-us/drivers/). * **Linux**: [Install the NVIDIA proprietary driver](https://www.nvidia.com/en-us/drivers/unix/).
Troubleshooting Linux →click here← If the following error occurs, proceed as follows: ```batch nvidia-smi has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running ``` 1) Check for existing NVIDIA drivers. * Run `dpkg -l | grep nvidia` to see if any NVIDIA drivers are installed. * If it shows some drivers, you might want to uninstall them before proceeding with the clean installation: `sudo apt-get --purge remove '*nvidia*'` 2) Update system packages. Make sure your system is up to date with `sudo apt update` and `sudo apt upgrade`. 3) (Optional but recommended:) Add the official NVIDIA PPA to ensure you have access to the latest NVIDIA drivers with `sudo add-apt-repository ppa:graphics-drivers/ppa` and `sudo apt update`. 4) Install the recommended driver. Ubuntu can automatically detect and recommend the right NVIDIA driver for your system with the command `ubuntu-drivers devices`. This will list the available drivers for your GPU and mark the recommended one.
To install the recommended NVIDIA driver, use `sudo apt install nvidia-driver-` (replace `` with the number of the recommended driver e.g., nvidia-driver-535) 5) Reboot your system with `sudo reboot` to apply the changes. 6) Verify the installation with `nvidia-smi`. This returns something like this, which shows you the driver version in the top center: ```bash +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 552.22 Driver Version: 552.22 CUDA Version: 12.4 | |-----------------------------------------+------------------------+----------------------+ | GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA GeForce RTX 3080 ... WDDM | 00000000:01:00.0 Off | N/A | | N/A 53C P8 9W / 115W | 257MiB / 8192MiB | 0% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ +-----------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=========================================================================================| | 0 N/A N/A 28420 C+G ...Programs\Microsoft VS Code\Code.exe N/A | | 0 N/A N/A 31888 C+G ...les\Microsoft OneDrive\OneDrive.exe N/A | +-----------------------------------------------------------------------------------------+ ```
* **WSL**: Follow the instructions and troubleshooting for Linux above. If you encounter issues/errors during that process, see the troubleshooting section below:
Troubleshooting WSL →click here← When using Windows Subsystem for Linux, your graphics card might not be recognized. If an error occurs after running the command: 1) If `ubuntu-drivers devices` throws the error * `Command 'ubuntu-drivers' not found`: run the command `sudo apt install ubuntu-drivers-common`. * `ERROR:root:aplay command not found`: run the command `sudo apt install alsa-utils`. 2) If `sudo apt install nvidia-driver-` throws the error `E: Unable to locate package nvidia-driver-`: run the commands ```bash sudo apt install software-properties-gtk sudo add-apt-repository universe sudo add-apt-repository multiverse sudo apt update sudo apt install nvidia-driver- ``` 3) If `nvidia-smi` throws the error `nvidia: command not found`: the controller is probably not using the correct interface (`sudo lshw -c display` should show NVIDIA). To solve this, follow [these steps](https://learn.microsoft.com/en-us/windows/wsl/tutorials/gpu-compute). If a `docker: permission denied` error occurs: close and re-open WSL.
👉 *Check NVIDIA driver installation with: `nvidia-smi`*
Install CUDA - ⚠️Install in a directory without spaces⚠️ * **Windows**: Download an installer from [the CUDA website](https://developer.nvidia.com/cuda-downloads). * ⚠️ **To avoid common issues, the installation directory should not contain spaces. If possible, install in `C:\cuda`.** Spaces should not cause issues when running `deploy_windows.ps1`, but this is not guaranteed. * **Linux**: Use `sudo apt-get install nvidia-cuda-toolkit`, or [download an installer](https://developer.nvidia.com/cuda-downloads). * Pick the default installation path. **If this is not `usr/local/cuda/`, create a symlink to that path.** * Match the version shown in your driver (see top right in `nvidia-smi` output). * When prompted what to install: do not install the driver again, only the CUDA toolkit. * Add the CUDA `bin` and `lib64` paths to your `PATH` and `LD_LIBRARY_PATH` by adding the following lines at the end of your shell profile file (usually `.bashrc` for Bash): ```bash export PATH=/usr/local/cuda/bin:$PATH export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH ``` Apply the changes with `source ~/.bashrc`. 👉 *Check CUDA installation with: `nvcc --version`*
Install Go * Download and install from [the Go website](https://go.dev/doc/install). * The `GOPATH` environment variable should have been set automatically (note: the folder it points to probably doesn't exist yet).
*Check with `go env GOPATH`.*
Click here to set `GOPATH` manually if it does not exist. * On **Windows:** `%USERPROFILE%/go` is often used, e.g. `C:/Users//go`. See [this guide](https://www.wikihow.com/Change-the-PATH-Environment-Variable-on-Windows) if you are unfamiliar with environment variables. * On **Linux:** `~/go` is often used. Open or create the `~/.bashrc` file and add the following lines. ```bash export GOPATH=$HOME/go export PATH=$PATH:$GOPATH/bin ``` After editing the file, apply the changes by running `source ~/.bashrc`.
👉 *Check Go installation with: `go version`*
Install a C compiler * **Linux:** `sudo apt-get install gcc` * ⚠️ each CUDA version has a maximum supported `gcc` version. [This StackOverflow answer](https://stackoverflow.com/a/46380601) lists the maximum supported `gcc` version for each CUDA version. If necessary, use `sudo apt-get install gcc-` instead, with the appropriate ``. * **Windows:** * CUDA does not support the `gcc` compiler on Windows, so download and install [Visual Studio](https://visualstudio.microsoft.com/downloads/) with the C/C++ extension pack. After installing, check if the path to `cl.exe` was added to your `PATH` environment variable. If not, add it manually, e.g. `C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.29.30133\bin\HostX64\x64`. * To compile Go, on the other hand, `gcc` is needed. Usually this is included in the Go installation, but if not it can be downloaded and installed from [w64devkit](https://github.com/skeeto/w64devkit/releases). 👉 *Check C installation with: `gcc --version` on Linux and `where.exe cl.exe` on Windows.*
(Optional: install git to contribute to mumax³) If you don't have a GitHub profile yet, make one [here](https://github.com/join). * **Windows:** [Download](https://git-scm.com/downloads) and install. * **Linux:** `sudo apt install git` * [Set up your username in Git](https://docs.github.com/en/get-started/getting-started-with-git/setting-your-username-in-git) and [setup an SSH key for your GitHub account](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/adding-a-new-ssh-key-to-your-github-account). 👉 *Check Git installation with: `git –version`*
(Optional: install gnuplot for pretty graphs) * **Windows:** [Download]((http://www.gnuplot.info/download.html)) and install. * **Linux:** `sudo apt-get install gnuplot` 👉 *Check gnuplot installation with: `gnuplot -V`*
With these tools installed, you can build mumax³ yourself. * Within your `GOPATH` folder, create the subfolders `src/github.com/mumax`. * Clone the GitHub repository by running `git clone https://github.com/mumax/3.git` in that newly created `mumax` folder. * If you don't have git, you can manually fetch the source [here](https://github.com/mumax/3/releases) and unzip it into `$GOPATH/src/github.com/mumax/3`. * Initialize a Go module by moving to the newly created folder with `cd 3/` and running `go mod init github.com/mumax/3`, followed by `go mod tidy`. * Query the compute capability of your GPU using the command `nvidia-smi --query-gpu=compute_cap --format=csv`. Based on this, set the environment variable `CUDA_CC`: if your compute capability is e.g., 8.9, then set the value `CUDA_CC=89`. * You can now compile mumax³ ... * ... **on Linux:** ```bash make realclean make ``` Your binary is now at `$GOPATH/bin/mumax3`. Note: each CUDA version has a maximum supported GCC version. If your default GCC compiler is too recent, you can use a different GCC compiler by instead running `make NVCC_CCBIN=` where `` is a less recent GCC. [Check the version compatibility here](https://stackoverflow.com/a/46380601). Alternatively, setting the `NVCC_CCBIN` environment variable achieves the same thing, allowing you to run `make` as usual. * ... **on Windows:** The `Makefile`s may experience issues with whitespaces. Instead, we recommend to use the `deploy/deploy_windows.ps1` script: this generates the Windows executables for the [mumax³ download page](https://mumax.github.io/download.html), but can also be used to build a single mumax³ executable for yourself by making the following adjustments: 1) Change the `$VS2022` variable to point to your Visual Studio executable. If you wish to compile for CUDA versions below v11.6, also set `$VS2017`. Example: if `where.exe cl.exe` returns `foo\bar\cl.exe`, then set `$VS2022 = "foo\bar"`. 
2) (Not strictly necessary, but check this anyway) Throughout the file there are several `switch ( $CUDA_VERSION )` blocks. If these do not address your installed CUDA version, add your version. Consult nearby comments when in doubt. Now you can compile mumax³ by opening Powershell in the `/deploy` directory and running ```bat ./deploy_windows.ps1 -CUDA_VERSIONS -CUDA_CC ``` where e.g. `` is `12.6` and `` is `86`, if you have installed CUDA v12.6 and your GPU's compute capability is 8.6. Your executable will be created in the `deploy/build` directory. * *Check installation with: `which mumax3` on **Linux** or `where.exe mumax3.exe` on **Windows**, followed by `mumax3 -test`.*
Troubleshooting: `cuda.h` or `curand.h` not found: →click here← This usually means that the `CGO_CFLAGS` and `CGO_LDFLAGS` environment variables are not found or point to the wrong path. To fix this, either define them in the script you are using to build mumax³, or define them in the terminal before running the script. * On **Windows:** say your CUDA is installed in `%CUDA_PATH%` (e.g. `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1`), then run these two lines in Powershell before running `deploy_windows.ps1`: ```powershell $env:CGO_CFLAGS = '-I "%CUDA_PATH%\include"' $env:CGO_LDFLAGS = '-L "%CUDA_PATH%\lib\x64"' ```
Troubleshooting: `mumax3.exe` is not generated: →click here← If, during the build process of mumax³, everything runs smoothly until you get the error that the `mumax3.exe` executable can not be found, try setting the `CGO_ENABLED` environment variable to `1` in your build script.
Troubleshooting: `vcvars64.bat` not found or could not initialise VC environment: →click here← CUDA requires Visual Studio to compile, which tries to set various environment variables. If Visual Studio fails to do so automatically, you can open a new shell, manually run the `vcvars64.bat` file there (the error message should contain the path to this Batch file), and then compile mumax using that shell.
3-3.11.1/bench/000077500000000000000000000000001503346766200130325ustar00rootroot000000000000003-3.11.1/bench/bench.mx3000066400000000000000000000007471503346766200145520ustar00rootroot00000000000000msat = 800e3 aex = 13e-12 alpha = 0.01 c := 4e-9 setcellsize(c, c, c) setsolver(2) for e:=5; e<14; e++{ n := pow(2, e) setgridsize(n, n, 1) print(n, "x", n) steps(1) // warm-up kernel b_ext = vector(0, 0.01, 0) m=uniform(1, 0, 0) // warm-up dt steps(3) m=uniform(1, 0, 0) // start! t = 0 start := now() neval0 := Neval.get() steps(100) wall := since(start).Seconds() nevl := Neval.get() - neval0 N2 := n*n fprintln("benchmark.txt", N2, N2*nevl/wall, t/nevl) } 3-3.11.1/bench/gpus.gplot000077500000000000000000000011641503346766200150640ustar00rootroot00000000000000#! /usr/bin/gnuplot # Width for decent spacing: 100 + 10*NUM_GPU_BENCHMARKS set term svg size 650, 470 font "DejaVu Sans,11" set output "gpus.svg" set encoding utf8 set boxwidth 0.6 set style fill solid noborder set key off set ylabel "throughput (M cells/s)" set xtics rotate by -90 set xtics scale 0 #set xtics out offset 0,-1.2 #set title "Mumax3 GPU performance for 2D simulations containing 4 million cells" font ",12" textcolor "#888888" #set logscale y 10 set yrange[1:2000] plot "gpus.txt" u ($0+1):($2/1e6):xtic(4) w boxes title " ", "oommf4M.txt" u (0):(4*$1**2 * $2 /$3/1e6):xtic("OOMMF (CPU)") w boxes set output 3-3.11.1/bench/gpus.txt000066400000000000000000000077501503346766200145620ustar00rootroot000000000000004.194304e+06 2.6580353657595888e+07 1.6945198787132785e-14 "940MX" 4.194304e+06 3.4870614414186165e+07 1.694520362670856e-14 "MX250" 4.194304e+06 3.497176955517716e+07 1.694520362670856e-14 "MX150" 4.194304e+06 5.527840160261479e+07 1.6945213680693907e-14 "GTX 860M" 4.194304e+06 7.260693991786541e+07 1.694520362670856e-14 "GTX 1050 (mobile)" 4.194304e+06 8.191301445972674e+07 1.694520362670856e-14 "GTX 1050Ti (mobile)" 4.194304e+06 8.490111356520656e+07 1.6945203523231987e-14 "RTX 2050 (mobile)" 
4.194304e+06 9.754137844792007e+07 1.6945208085986004e-14 "GTX 1650 (mobile)" 4.194304e+06 1.205878382259874e+08 1.6945213680693907e-14 "GTX 970" 4.194304e+06 1.2801637602062301e+08 1.6945217319241e-14 "GTX 1060" 4.194304e+06 1.282698958240901e+08 1.6945224358062508e-14 "GTX 1060 (mobile)" 4.194304e+06 1.3222269984079185e+08 1.6945213680693907e-14 "GTX 980" 4.194304e+06 1.4212466910080725e+08 1.6945198787132785e-14 "Quadro M5000" 4.194304e+06 1.6411238877464202e+08 1.6945208085986004e-14 "GTX 1660" 4.194304e+06 1.702974441584964e+08 1.6945224358062508e-14 "GTX 1070" 4.194304e+06 1.8968187897582138e+08 1.6945207552204512e-14 "GTX 1080" 4.194304e+06 1.90372141272333e+08 1.6945210988316847e-14 "Tesla T4" 4.194304e+06 1.974787604705023e+08 1.6945207552204512e-14 "Tesla M40" 4.194304e+06 1.9961744689897743e+08 1.6945207552204512e-14 "GTX 980 Ti" 4.194304e+06 2.040880708528955e+08 1.6945207552204512e-14 "RTX 2000 Ada" 4.194304e+06 2.063497693886217e+08 1.6945203523231987e-14 "RTX 3050 Ti" 4.194304e+06 2.0445838079431638e+08 1.6945210988316847e-14 "Quadro P5000" 4.194304e+06 2.3944869412188095e+08 1.6945207552204512e-14 "Tesla P40" 4.194304e+06 2.474492276179104e+08 1.6945216799964015e-14 "GTX 1660 Ti" 4.194304e+06 2.747775864824991e+08 1.694521108780564e-14 "GTX TITAN X (Pascal)" 4.194304e+06 2.7516254149838316e+08 1.6945224358062508e-14 "GTX 1080 Ti" 4.194304e+06 2.846817297049518e+08 1.6945216799964015e-14 "RTX 2060" 4.194304e+06 2.977666528431264e+08 1.6945208085986004e-14 "RTX 2070" 4.194304e+06 2.9988944905569583e+08 1.6945203523231987e-14 "RTX 4060 Ti" 4.194304e+06 3.230257416451932e+08 1.6945203523231987e-14 "RTX A4000 (ECC On)" 4.194304e+06 3.413751301174529e+08 1.6945224358062508e-14 "GTX TITAN Xp" 4.194304e+06 3.6200963642011607e+08 1.6945207552204512e-14 "Tesla P100" 4.194304e+06 3.715163455786957e+08 1.6945208085986004e-14 "RTX 2060 SUPER" 4.194304e+06 3.798252250152153e+08 1.6945216799964015e-14 "RTX 2080" 4.194304e+06 3.987991717041427e+08 
1.6945207552204512e-14 "Tesla P100 SXM2" 4.194304e+06 4.008247314332439e+08 1.6945216799964015e-14 "RTX 3080 (mobile)" 4.194304e+06 4.244706977117220e+08 1.6945216799964015e-14 "RTX 2080 SUPER" 4.194304e+06 4.283830343840416e+08 1.6945210988316847e-14 "RTX 2080 Ti" 4.194304e+06 4.3533767228392416e+08 1.6945203523231978e-14 "RTX 3060 Ti" 4.194304e+06 4.687789231921795e+08 1.6945210988316847e-14 "TITAN V" 4.194304e+06 4.987517543189467e+08 1.6945203523231987e-14 "RTX 4070" 4.194304e+06 5.0719408253287315e+08 1.6945216799964015e-14 "RTX 2080 Ti OC" 4.194304e+06 5.295717591997097e+08 1.6945216799964015e-14 "TITAN RTX" 4.194304e+06 5.470237783548311e+08 1.6945203523231987e-14 "RTX 3070 Ti Lite Hash Rate" 4.194304e+06 5.581471966505249e+08 1.6945203523231987e-14 "RTX A6000 (ECC On)" 4.194304e+06 5.74603595118363e+08 1.6945205118530127e-14 "A40" 4.194304e+06 6.118821270070031e+08 1.6945207552204512e-14 "V100-SXM2-32GB" 4.194304e+06 6.87872396290744e+08 1.6945203523231987e-14 "RTX 3080" 4.194304e+06 7.078338805905465e+08 1.6945203523231987e-14 "RTX A6000 (ECC Off)" 4.194304e+06 7.557652428864068e+08 1.6945203523231987e-14 "RTX 3080 (12Gb)" 4.194304e+06 8.359251312500654e+08 1.6945203523231987e-14 "RTX 3090" 4.194304e+06 9.514998390683259e+08 1.6945210988316847e-14 "A100" 4.194304e+06 1.2279813058227684e+09 1.6945203523231987e-14 "RTX 4090" 4.194304e+06 1.917053016568692e+09 1.6945203523231987e-14 "RTX 5090" 4.194304e+06 1.9800477627588277e+09 1.6945198886452648e-14 "H100" 3-3.11.1/bench/oommf4M.txt000066400000000000000000000000161503346766200151060ustar00rootroot000000000000002048 3 35 3-3.11.1/cmd/000077500000000000000000000000001503346766200125165ustar00rootroot000000000000003-3.11.1/cmd/gccgorun000077500000000000000000000003531503346766200142540ustar00rootroot00000000000000#! /bin/bash # wrapper for "go run" using gccgo with flags for speed. 
echo go run -compiler=gccgo -gccgoflags '-static-libgcc -O4 -Ofast -march=native' $@ go run -compiler=gccgo -gccgoflags '-static-libgcc -O4 -Ofast -march=native' $@ 3-3.11.1/cmd/mumax3-convert/000077500000000000000000000000001503346766200154065ustar00rootroot000000000000003-3.11.1/cmd/mumax3-convert/.gitignore000066400000000000000000000000241503346766200173720ustar00rootroot00000000000000main mumax3-convert 3-3.11.1/cmd/mumax3-convert/Makefile000066400000000000000000000000241503346766200170420ustar00rootroot00000000000000all: go install -v 3-3.11.1/cmd/mumax3-convert/csv.go000066400000000000000000000006711503346766200165340ustar00rootroot00000000000000package main import ( "fmt" "io" "github.com/mumax/3/data" ) // comma-separated values func dumpCSV(f *data.Slice, info data.Meta, out io.Writer) { f2 := ", " + *flag_format a := f.Tensors() for _, a := range a { for _, a := range a { for _, a := range a { fmt.Fprintf(out, *flag_format, a[0]) for i := 1; i < len(a); i++ { fmt.Fprintf(out, f2, a[i]) } fmt.Fprintln(out) } fmt.Fprintln(out) } } } 3-3.11.1/cmd/mumax3-convert/gnuplot.go000066400000000000000000000015621503346766200174310ustar00rootroot00000000000000package main // Output for gnuplot's "splot" import ( "bufio" "fmt" "io" "github.com/mumax/3/data" ) const DELIM = "\t" func dumpGnuplot(f *data.Slice, m data.Meta, out io.Writer) { buf := bufio.NewWriter(out) defer buf.Flush() data := f.Tensors() cellsize := m.CellSize // If no cell size is set, use generic cell index. 
if cellsize == [3]float64{0, 0, 0} { cellsize = [3]float64{1, 1, 1} } ncomp := f.NComp() for iz := range data[0] { z := float64(iz) * cellsize[Z] for iy := range data[0][iz] { y := float64(iy) * cellsize[Y] for ix := range data[0][iz][iy] { x := float64(ix) * cellsize[X] fmt.Fprint(buf, x, DELIM, y, DELIM, z, DELIM) for c := 0; c < ncomp-1; c++ { fmt.Fprint(buf, data[c][iz][iy][ix], DELIM) } fmt.Fprint(buf, data[ncomp-1][iz][iy][ix]) fmt.Fprint(buf, "\n") } fmt.Fprint(buf, "\n") } } } 3-3.11.1/cmd/mumax3-convert/json.go000066400000000000000000000002771503346766200167140ustar00rootroot00000000000000package main import ( "encoding/json" "io" "github.com/mumax/3/data" ) func dumpJSON(f *data.Slice, info data.Meta, out io.Writer) { w := json.NewEncoder(out) w.Encode(f.Tensors()) } 3-3.11.1/cmd/mumax3-convert/main.go000066400000000000000000000324751503346766200166740ustar00rootroot00000000000000/* mumax3-convert converts mumax3 output files to various formats and images. It also provides basic manipulations like data rescale etc. # Usage Command-line flags must always precede the input files: mumax3-convert [flags] files For an overview of flags, run: mumax3-convert -help ## Converting file formats In any mumax3-convert command, the output format must be specified. Multiple output formats can be provided at the same time. Use -show to print the output in the console (optional: format with -f). Saving to file supports file types -csv, -dump, -gif, -gplot, -jpg, -json, -numpy, -omf, -ovf, -ovf2, -png, -svg, -svgz and -vtk. Converting to OVF or OVF2 requires specifying "text" or "binary", VTK requires "ascii" or "binary". 
Example: convert all .ovf files to PNG: mumax3-convert -png *.ovf Example: convert legacy .dump files to binary .ovf: mumax3-convert -ovf2 binary *.dump Example: show file contents in the console to two decimal places, meanwhile convert to JPEG: mumax3-convert -show -f "%.2f" -jpg file.ovf ## Converting to image For scalar data, the color scale is automatically stretched to cover all values. The values corresponding to minimum and maximum color can be overridden by the -min and -max flags. Values falling outside of this range will be clipped. E.g. unit range: mumax3-convert -png -min=0 -max=1 file.ovf The default scalar color map is black,gray,white (minimum value maps to black, maximum to white). This can be overridden by -color. Valid colors are white, gray, black, transparent, (light)red, (light)green, (light)blue, (dark)yellow, (dark)cyan and (dark)magenta. E.g., a rather colorful map: mumax3-convert -png -color black,blue,cyan,green,yellow,red,white file.ovf For vector data, arrows can be shown in the image using -arrows. E.g. place arrows 16 pixels apart: mumax3-convert -png -arrows 16 file.ovf ## Manipulating data NOTE: when the output format is the same as the input file, you must specify an output directory using -o, which avoids overwriting the original file. Vector data can be normalized (-normalize) or rescaled such that the largest vector has unit length (-normpeak), e.g.: mumax3-convert -normalize -ovf binary -o "normalized" file.ovf A single component of a vector field can be selected (-comp), e.g. save only the X component when converting all .ovf files to VTK binary: mumax3-convert -comp 0 -vtk binary *.ovf The grid can be resized (-resize), e.g. resize data to a 32 x 32 x 1 mesh and convert the result to OOMMF binary output: mumax3-convert -resize 32x32x1 -ovf2 binary -o "resized" file.ovf A subset of the data can be cut out (-xrange, -yrange, -zrange) between min:max. 
NOTE: max is an exclusive bound, and bounds can be omitted (default: 0 lower bound, maximum upper bound). E.g.: mumax3-convert -xrange 50:100 -yrange :100 -png file.ovf Example: select the bottom layer mumax3-convert -zrange :1 -ovf2 binary -o "bottom" file.ovf Output file names are automatically assigned. */ package main import ( "compress/gzip" "flag" "fmt" "image/color" "io" "log" "os" "path" "path/filepath" "strconv" "strings" "github.com/mumax/3/data" "github.com/mumax/3/draw" "github.com/mumax/3/dump" "github.com/mumax/3/httpfs" "github.com/mumax/3/oommf" "github.com/mumax/3/util" ) var ( flag_comp = flag.String("comp", "", "Select a component of vector data. (0,1,2 or x,y,z)") flag_show = flag.Bool("show", false, "Human-readible output to stdout") flag_format = flag.String("f", "%v", "Printf format string") flag_png = flag.Bool("png", false, "PNG output") flag_jpeg = flag.Bool("jpg", false, "JPEG output") flag_gif = flag.Bool("gif", false, "GIF output") flag_svg = flag.Bool("svg", false, "SVG output") flag_svgz = flag.Bool("svgz", false, "SVGZ output (compressed)") flag_gnuplot = flag.Bool("gplot", false, "Gnuplot-compatible output") flag_ovf1 = flag.String("ovf", "", `"text" or "binary" OVF1 output`) flag_omf = flag.String("omf", "", `"text" or "binary" OVF1 output`) flag_ovf2 = flag.String("ovf2", "", `"text" or "binary" OVF2 output`) flag_vtk = flag.String("vtk", "", `"ascii" or "binary" VTK output`) flag_dump = flag.Bool("dump", false, `output in dump format`) flag_csv = flag.Bool("csv", false, `output in CSV format`) flag_numpy = flag.Bool("numpy", false, "Numpy output") flag_json = flag.Bool("json", false, `output in JSON format`) flag_min = flag.String("min", "auto", `Minimum of color scale: "auto" or value.`) flag_max = flag.String("max", "auto", `Maximum of color scale: "auto" or value.`) flag_normalize = flag.Bool("normalize", false, `Normalize vector data to unit length`) flag_normpeak = flag.Bool("normpeak", false, `Scale vector data, maximum to 
unit length`) flag_resize = flag.String("resize", "", "Resize. E.g.: 128x128x4") flag_cropx = flag.String("xrange", "", "Crop x range min:max (both optional, max=exclusive)") flag_cropy = flag.String("yrange", "", "Crop y range min:max (both optional, max=exclusive)") flag_cropz = flag.String("zrange", "", "Crop z range min:max (both optional, max=exclusive)") flag_dir = flag.String("o", "", "Save all output in this directory") flag_arrows = flag.Int("arrows", 0, "Arrow size for vector bitmap image output") flag_color = flag.String("color", "black,gray,white", "Colormap for scalar image output.") ) var ( colormap []draw.ColorMapSpec ) type task struct { *data.Slice info data.Meta fname string } func main() { log.SetFlags(0) flag.Parse() if flag.NArg() == 0 { log.Fatal("no input files") } colormap = make([]draw.ColorMapSpec, 1, 1) colormap[0].Cmap = parseColors(*flag_color) // politely try to make the output directory if *flag_dir != "" { _ = os.Mkdir(*flag_dir, 0777) } // determine which outputs we want var wantOut []output for flag, out := range outputs { if *flag { wantOut = append(wantOut, out) } } switch { case *flag_ovf1 != "": wantOut = append(wantOut, output{".ovf", outputOVF1}) case *flag_omf != "": wantOut = append(wantOut, output{".omf", outputOMF}) case *flag_ovf2 != "": wantOut = append(wantOut, output{".ovf", outputOVF2}) case *flag_vtk != "": wantOut = append(wantOut, output{".vts", outputVTK}) } if len(wantOut) == 0 && *flag_show == false { log.Fatal("no output format specified (e.g.: -png)") } // expand wildcards which are not expanded by the shell // (pointing a finger at cmd.exe) var fnames []string for _, input := range flag.Args() { fmt.Println(input) expanded, _ := filepath.Glob(input) fnames = append(fnames, expanded...) 
} // read all input files and put them in the task queue for _, fname := range fnames { for _, outp := range wantOut { fname := fname // closure caveats outp := outp Queue(func() { doFile(fname, outp) }) } } // wait for work to finish Wait() fmt.Println(succeeded, "files converted, ", skipped, "skipped, ", failed, "failed") if failed > 0 { os.Exit(1) } } var ( failed, skipped, succeeded util.Atom ) func doFile(infname string, outp output) { // determine output file outfname := util.NoExt(infname) + outp.Ext if *flag_dir != "" { outfname = filepath.Join(*flag_dir, filepath.Base(outfname)) } msg := infname + "\t-> " + outfname defer func() { log.Println(msg) }() if infname == outfname { msg = fail(msg, "input and output file are the same") return } defer func() { if err := recover(); err != nil { msg = fail(msg, err) os.Remove(outfname) } }() if !(strings.HasPrefix(infname, "http://") || strings.HasPrefix(outfname, "http://")) { inStat, errS := os.Stat(infname) if errS != nil { panic(errS) } outStat, errO := os.Stat(outfname) if errO == nil && outStat.ModTime().Sub(inStat.ModTime()) > 0 { msg = "[skip] " + msg + ": skipped based on time stamps" skipped.Add(1) return } } var slice *data.Slice var info data.Meta var err error in, errI := httpfs.Open(infname) if errI != nil { msg = fail(msg, errI) return } defer in.Close() switch path.Ext(infname) { default: msg = fail(msg, ": skipping unsupported type: "+path.Ext(infname)) return case ".ovf", ".omf", ".ovf2": slice, info, err = oommf.Read(in) case ".dump": slice, info, err = dump.Read(in) } if err != nil { msg = fail(msg, err) return } out, err := httpfs.Create(outfname) if err != nil { msg = fail(msg, err) return } defer out.Close() preprocess(slice) outp.Convert(slice, info, panicWriter{out}) succeeded.Add(1) msg = "[ ok ] " + msg } func fail(msg string, x ...interface{}) string { failed.Add(1) return "[fail] " + msg + ": " + fmt.Sprint(x...) 
} // writer that panics on error, so we don't have to check it type panicWriter struct { io.Writer } func (w panicWriter) Write(p []byte) (int, error) { n, err := w.Writer.Write(p) if err != nil { panic(err) } return n, nil } type output struct { Ext string Convert func(*data.Slice, data.Meta, io.Writer) } var outputs = map[*bool]output{ flag_png: {".png", renderPNG}, flag_jpeg: {".jpg", renderJPG}, flag_gif: {".gif", renderGIF}, flag_svg: {".svg", renderSVG}, flag_svgz: {".svgz", renderSVGZ}, flag_gnuplot: {".gplot", dumpGnuplot}, flag_dump: {".dump", outputDUMP}, flag_csv: {".csv", dumpCSV}, flag_numpy: {".npy", dumpNUMPY}, flag_json: {".json", dumpJSON}, flag_show: {"", show}, } func renderPNG(f *data.Slice, info data.Meta, out io.Writer) { draw.RenderFormat(out, f, *flag_min, *flag_max, *flag_arrows, ".png", colormap...) } func renderJPG(f *data.Slice, info data.Meta, out io.Writer) { draw.RenderFormat(out, f, *flag_min, *flag_max, *flag_arrows, ".jpg", colormap...) } func renderGIF(f *data.Slice, info data.Meta, out io.Writer) { draw.RenderFormat(out, f, *flag_min, *flag_max, *flag_arrows, ".gif", colormap...) 
} func renderSVG(f *data.Slice, info data.Meta, out io.Writer) { draw.SVG(out, f.Vectors()) } func renderSVGZ(f *data.Slice, info data.Meta, out io.Writer) { out2 := gzip.NewWriter(out) defer out2.Close() draw.SVG(out2, f.Vectors()) } func outputOVF1(f *data.Slice, info data.Meta, out io.Writer) { oommf.WriteOVF1(out, f, info, *flag_ovf1) } func outputOMF(f *data.Slice, info data.Meta, out io.Writer) { oommf.WriteOVF1(out, f, info, *flag_omf) } func outputOVF2(f *data.Slice, info data.Meta, out io.Writer) { oommf.WriteOVF2(out, f, info, *flag_ovf2) } func outputVTK(f *data.Slice, info data.Meta, out io.Writer) { dumpVTK(out, f, info, *flag_vtk) } func outputDUMP(f *data.Slice, info data.Meta, out io.Writer) { dump.Write(out, f, info) } // does not output to out, just prints to stdout func show(f *data.Slice, info data.Meta, out io.Writer) { fmt.Println(info) util.Fprintf(os.Stdout, *flag_format, f.Tensors()) } func preprocess(f *data.Slice) { if *flag_normalize { normalize(f, 1) } if *flag_normpeak { normpeak(f) } colormap[0].Ccomp = -1 if *flag_comp != "" { c := parseComp(*flag_comp) colormap[0].Ccomp = c if *flag_arrows == 0 { *f = *f.Comp(c) } } crop(f) if *flag_resize != "" { resize(f, *flag_resize) } } func parseComp(c string) int { if i, err := strconv.Atoi(c); err == nil { return i } switch c { default: log.Fatal("illegal component:", c, "(need x, y or z)") panic(0) case "x", "X": return 0 case "y", "Y": return 1 case "z", "Z": return 2 } } func crop(f *data.Slice) { N := f.Size() // default ranges x1, x2 := 0, N[X] y1, y2 := 0, N[Y] z1, z2 := 0, N[Z] havework := false if *flag_cropz != "" { z1, z2 = parseRange(*flag_cropz, N[Z]) havework = true } if *flag_cropy != "" { y1, y2 = parseRange(*flag_cropy, N[Y]) havework = true } if *flag_cropx != "" { x1, x2 = parseRange(*flag_cropx, N[X]) havework = true } if havework { *f = *data.Crop(f, x1, x2, y1, y2, z1, z2) } } func parseRange(r string, max int) (int, int) { a, b := 0, max spl := strings.Split(r, ":") if 
len(spl) != 2 { log.Fatal("range needs min:max syntax, have:", r) } if spl[0] != "" { a = atoi(spl[0]) } if spl[1] != "" { b = atoi(spl[1]) } return a, b } func atoi(a string) int { i, err := strconv.Atoi(a) if err != nil { panic(err) } return i } const ( X = data.X Y = data.Y Z = data.Z ) func parseColors(s string) (m []color.RGBA) { words := strings.Split(s, ",") for _, w := range words { m = append(m, parseColor(w)) } return } func parseColor(s string) color.RGBA { if c, ok := colors[s]; ok { return c } str := fmt.Sprintln("Refusing to use ugly color '" + s + "', options are:") for k := range colors { str += fmt.Sprintf("%s,", k) } util.Fatal(strings.Trim(str, ",")) return color.RGBA{} } var colors = map[string]color.RGBA{ "white": {R: 255, G: 255, B: 255, A: 255}, "black": {R: 0, G: 0, B: 0, A: 255}, "transparent": {R: 0, G: 0, B: 0, A: 0}, "red": {R: 255, G: 0, B: 0, A: 255}, "green": {R: 0, G: 255, B: 0, A: 255}, "blue": {R: 0, G: 0, B: 255, A: 255}, "lightred": {R: 255, G: 127, B: 127, A: 255}, "lightgreen": {R: 127, G: 255, B: 127, A: 255}, "lightblue": {R: 127, G: 127, B: 255, A: 255}, "yellow": {R: 255, G: 255, B: 0, A: 255}, "darkyellow": {R: 127, G: 127, B: 0, A: 255}, "cyan": {R: 0, G: 255, B: 255, A: 255}, "darkcyan": {R: 0, G: 127, B: 127, A: 255}, "magenta": {R: 255, G: 0, B: 255, A: 255}, "darkmagenta": {R: 127, G: 0, B: 127, A: 255}, "gray": {R: 127, G: 127, B: 127, A: 255}, } 3-3.11.1/cmd/mumax3-convert/normalize.go000066400000000000000000000021641503346766200177400ustar00rootroot00000000000000package main import ( "math" "github.com/mumax/3/data" ) // normalize vector data to given length func normalize(f *data.Slice, length float64) { a := f.Vectors() for i := range a[0] { for j := range a[0][i] { for k := range a[0][i][j] { x, y, z := a[0][i][j][k], a[1][i][j][k], a[2][i][j][k] norm := math.Sqrt(float64(x*x + y*y + z*z)) invnorm := float32(1) if norm != 0 { invnorm = float32(length / norm) } a[0][i][j][k] *= invnorm a[1][i][j][k] *= invnorm 
a[2][i][j][k] *= invnorm } } } } func normpeak(f *data.Slice) { a := f.Vectors() maxnorm := 0. for i := range a[0] { for j := range a[0][i] { for k := range a[0][i][j] { x, y, z := a[0][i][j][k], a[1][i][j][k], a[2][i][j][k] norm := math.Sqrt(float64(x*x + y*y + z*z)) if norm > maxnorm { maxnorm = norm } } } } scale(f, float32(1/maxnorm)) } func scale(f *data.Slice, factor float32) { a := f.Vectors() for i := range a[0] { for j := range a[0][i] { for k := range a[0][i][j] { a[0][i][j][k] *= factor a[1][i][j][k] *= factor a[2][i][j][k] *= factor } } } } 3-3.11.1/cmd/mumax3-convert/numpy.go000066400000000000000000000020521503346766200171040ustar00rootroot00000000000000package main import ( "encoding/binary" "fmt" "io" "github.com/mumax/3/data" ) func dumpNUMPY(f *data.Slice, info data.Meta, out io.Writer) { // see npy format: https://www.numpy.org/devdocs/reference/generated/numpy.lib.format.html // write the first 10 bytes of the 128 byte header fmt.Fprintf(out, "\x93NUMPY") // magic string fmt.Fprintf(out, "\x01\x00") // npy format version binary.Write(out, binary.LittleEndian, uint16(118)) // length of the actual header data (128-10) // write the actual header data (118 bytes) shapestr := fmt.Sprintf("(%d,%d,%d,%d)", f.NComp(), f.Size()[2], f.Size()[1], f.Size()[0]) headerData := fmt.Sprintf("{'descr': '") _, err = fmt.Fprintln(out, "") _, err = fmt.Fprintf(out, "\t\n", gridsize[0]-1, gridsize[1]-1, gridsize[2]-1) _, err = fmt.Fprintf(out, "\t\t\n", gridsize[0]-1, gridsize[1]-1, gridsize[2]-1) return } func writeVTKPoints(out io.Writer, q *data.Slice, dataformat string, info data.Meta) (err error) { _, err = fmt.Fprintln(out, "\t\t\t") fmt.Fprintf(out, "\t\t\t\t\n\t\t\t\t\t", dataformat) gridsize := q.Size() cellsize := info.CellSize switch dataformat { case "ascii": for k := 0; k < gridsize[2]; k++ { for j := 0; j < gridsize[1]; j++ { for i := 0; i < gridsize[0]; i++ { x := (float32)(i) * (float32)(cellsize[0]) y := (float32)(j) * (float32)(cellsize[1]) z := 
(float32)(k) * (float32)(cellsize[2]) _, err = fmt.Fprint(out, x, " ", y, " ", z, " ") } } } case "binary": buffer := new(bytes.Buffer) for k := 0; k < gridsize[2]; k++ { for j := 0; j < gridsize[1]; j++ { for i := 0; i < gridsize[0]; i++ { x := (float32)(i) * (float32)(cellsize[0]) y := (float32)(j) * (float32)(cellsize[1]) z := (float32)(k) * (float32)(cellsize[2]) binary.Write(buffer, binary.LittleEndian, x) binary.Write(buffer, binary.LittleEndian, y) binary.Write(buffer, binary.LittleEndian, z) } } } b64len := uint32(len(buffer.Bytes())) bufLen := new(bytes.Buffer) binary.Write(bufLen, binary.LittleEndian, b64len) base64out := base64.NewEncoder(base64.StdEncoding, out) base64out.Write(bufLen.Bytes()) base64out.Write(buffer.Bytes()) base64out.Close() default: log.Fatalf("Illegal VTK data format: %v. Options are: ascii, binary", dataformat) } _, err = fmt.Fprintln(out, "\n\t\t\t\t") _, err = fmt.Fprintln(out, "\t\t\t") return } func writeVTKCellData(out io.Writer, q *data.Slice, meta data.Meta, dataformat string) (err error) { N := q.NComp() data := q.Tensors() switch N { case 1: fmt.Fprintf(out, "\t\t\t\n", meta.Name) fmt.Fprintf(out, "\t\t\t\t\n\t\t\t\t\t", meta.Name, N, dataformat) case 3: fmt.Fprintf(out, "\t\t\t\n", meta.Name) fmt.Fprintf(out, "\t\t\t\t\n\t\t\t\t\t", meta.Name, N, dataformat) case 6, 9: fmt.Fprintf(out, "\t\t\t\n", meta.Name) fmt.Fprintf(out, "\t\t\t\t\n\t\t\t\t\t", meta.Name, 9, dataformat) // must be 9! 
default: log.Fatalf("vtk: cannot handle %v components", N) } gridsize := q.Size() switch dataformat { case "ascii": for k := 0; k < gridsize[2]; k++ { for j := 0; j < gridsize[1]; j++ { for i := 0; i < gridsize[0]; i++ { // if symmetric tensor manage it separately to write the full 9 components if N == 6 { fmt.Fprint(out, data[0][k][j][i], " ") fmt.Fprint(out, data[1][k][j][i], " ") fmt.Fprint(out, data[2][k][j][i], " ") fmt.Fprint(out, data[1][k][j][i], " ") fmt.Fprint(out, data[3][k][j][i], " ") fmt.Fprint(out, data[4][k][j][i], " ") fmt.Fprint(out, data[2][k][j][i], " ") fmt.Fprint(out, data[4][k][j][i], " ") fmt.Fprint(out, data[5][k][j][i], " ") } else { for c := 0; c < N; c++ { fmt.Fprint(out, data[c][k][j][i], " ") } } } } } case "binary": // Inlined for performance, terabytes of data will pass here... buffer := new(bytes.Buffer) for k := 0; k < gridsize[2]; k++ { for j := 0; j < gridsize[1]; j++ { for i := 0; i < gridsize[0]; i++ { // if symmetric tensor manage it separately to write the full 9 components if N == 6 { binary.Write(buffer, binary.LittleEndian, data[0][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[1][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[2][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[1][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[3][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[4][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[2][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[4][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[5][k][j][i]) } else { for c := 0; c < N; c++ { binary.Write(buffer, binary.LittleEndian, data[c][k][j][i]) } } } } } b64len := uint32(len(buffer.Bytes())) bufLen := new(bytes.Buffer) binary.Write(bufLen, binary.LittleEndian, b64len) base64out := base64.NewEncoder(base64.StdEncoding, out) base64out.Write(bufLen.Bytes()) base64out.Write(buffer.Bytes()) base64out.Close() default: panic(fmt.Errorf("vtk: illegal data format " + 
dataformat + ". Options are: ascii, binary")) } fmt.Fprintln(out, "\n\t\t\t\t") fmt.Fprintln(out, "\t\t\t") return } func writeVTKFooter(out io.Writer) (err error) { _, err = fmt.Fprintln(out, "\t\t") _, err = fmt.Fprintln(out, "\t") _, err = fmt.Fprintln(out, "") return } 3-3.11.1/cmd/mumax3-httpfsd/000077500000000000000000000000001503346766200154025ustar00rootroot000000000000003-3.11.1/cmd/mumax3-httpfsd/Makefile000066400000000000000000000000211503346766200170330ustar00rootroot00000000000000all: go install 3-3.11.1/cmd/mumax3-httpfsd/main.go000066400000000000000000000013321503346766200166540ustar00rootroot00000000000000/* httpfs server, useful for debugging mumax3-server. # Usage Start mumax3-httpfsd in a certain working directory. $ ls file.mx3 $ mumax3-server -l :35362 Then you can remotely run mumax3 input files: $ cd elsewhere $ mumax3 http://localhost:35362/file.mx3 */ package main import ( "flag" "log" "net/http" _ "net/http/pprof" "github.com/mumax/3/httpfs" ) var ( flag_addr = flag.String("l", ":35360", "Listen and serve at this network address") flag_log = flag.Bool("log", false, "log debug output") ) func main() { flag.Parse() log.Println("serving at", *flag_addr) httpfs.Logging = *flag_log httpfs.RegisterHandlers() err := http.ListenAndServe(*flag_addr, nil) if err != nil { log.Fatal(err) } } 3-3.11.1/cmd/mumax3-plot/000077500000000000000000000000001503346766200147045ustar00rootroot000000000000003-3.11.1/cmd/mumax3-plot/.gitignore000066400000000000000000000000141503346766200166670ustar00rootroot00000000000000mumax3-plot 3-3.11.1/cmd/mumax3-plot/Makefile000066400000000000000000000000241503346766200163400ustar00rootroot00000000000000all: go install -v 3-3.11.1/cmd/mumax3-plot/main.go000066400000000000000000000044451503346766200161660ustar00rootroot00000000000000/* The mumax3-plot utility uses gnuplot to automatically plot mumax3 data tables. mumax3-plot table.txt Creates graphs of all columns as .svg files. 
*/ package main import ( "bufio" "flag" "fmt" "log" "os" "os/exec" "path" "strings" ) func main() { log.SetFlags(0) flag.Parse() for _, f := range flag.Args() { plotFile(f) } } func plotFile(fname string) { hdr := readHeader(fname) // quantities grouped by vector Qs := []*Q{{[]string{"t"}, "s", []int{1}}} prev := Qs[0] quants := strings.Split(hdr, "\t") for i := 1; i < len(quants); i++ { spl := strings.Split(quants[i], " ") name := spl[0] unit := spl[1] if unit == "()" { unit = "" } if name[:len(name)-1] == prev.name[0][:len(prev.name[0])-1] { prev.cols = append(prev.cols, i+1) prev.name = append(prev.name, name) } else { n := &Q{[]string{name}, unit, []int{i + 1}} Qs = append(Qs, n) prev = n } } log.Println(Qs) for i := 1; i < len(Qs); i++ { makePlot(fname, Qs[i]) } } func makePlot(fname string, q *Q) { term := "svg" outf := path.Dir(fname) + "/" + q.vecname() cmd := fmt.Sprintf(`set term %v noenhanced size 400 300 font 'Arial,10'; set output "%v.%v";`, term, outf, term) cmd += fmt.Sprintf(`set xlabel "t(ns)";`) cmd += fmt.Sprintf(`set ylabel "%v %v";`, q.vecname(), q.unit) cmd += fmt.Sprint(`set format y "%g";`) cmd += fmt.Sprint(`plot "`, fname, `" u ($1*1e9):`, q.cols[0], ` w li title "`, q.name[0], `"`) for i := 1; i < len(q.cols); i++ { cmd += fmt.Sprint(`, "`, fname, `" u ($1*1e9):`, q.cols[i], ` w li title "`, q.name[i], `"`) } cmd += "; set output;" out, err := exec.Command("gnuplot", "-e", cmd).CombinedOutput() os.Stderr.Write(out) check(err) } type Q struct { name []string unit string cols []int } func (q *Q) String() string { return fmt.Sprint(q.name, "(", q.unit, ")", q.cols) } func (q *Q) vecname() string { if len(q.cols) > 1 { return q.name[0][:len(q.name[0])-1] } else { return q.name[0] } } func readHeader(fname string) string { f, err := os.Open(fname) check(err) defer f.Close() in := bufio.NewReader(f) hdrBytes, _, err2 := in.ReadLine() check(err2) hdr := string(hdrBytes) if hdr[0] != '#' { log.Fatal("invalid table header:", hdr) } hdr = hdr[2:] 
return hdr } func check(err error) { if err != nil { log.Fatal(err) } } 3-3.11.1/cmd/mumax3-script/000077500000000000000000000000001503346766200152325ustar00rootroot000000000000003-3.11.1/cmd/mumax3-script/.gitignore000066400000000000000000000000161503346766200172170ustar00rootroot00000000000000mumax3-script 3-3.11.1/cmd/mumax3-script/Makefile000066400000000000000000000000241503346766200166660ustar00rootroot00000000000000all: go install -v 3-3.11.1/cmd/mumax3-script/main.go000066400000000000000000000022311503346766200165030ustar00rootroot00000000000000/* Toy interpreter executes scripts or stdin. */ package main import ( "bufio" "flag" "fmt" "github.com/mumax/3/script" "io" "log" "os" ) var debug = flag.Bool("g", false, "print debug output") var ( world *script.World ps1 string ) func main() { log.SetFlags(0) flag.Parse() world = script.NewWorld() world.Func("exit", exit) script.Debug = *debug if flag.NArg() > 1 { check(fmt.Errorf("need 0 or 1 input files")) } if flag.NArg() == 1 { src, err := os.Open(flag.Arg(0)) check(err) ps1 = ">" interpret(src) } else { ps1 = "" interpret(os.Stdin) } } func interpret(in io.Reader) { scanner := bufio.NewScanner(in) for scanner.Scan() { safecall(scanner.Text()) } check(scanner.Err()) } func safecall(code string) { if code == "" { return } defer func() { err := recover() if err != nil { fmt.Fprintln(os.Stderr, "panic:", err) } }() tree, err := world.Compile(code) if err == nil { for _, stmt := range tree.Child() { fmt.Println(stmt.Eval()) } } else { fmt.Fprintln(os.Stderr, err) } } func check(e error) { if e != nil { fmt.Fprintln(os.Stderr, e) os.Exit(1) } } func exit() { os.Exit(0) } 3-3.11.1/cmd/mumax3-script/mumax3-int000077500000000000000000000000551503346766200171620ustar00rootroot00000000000000#! 
/bin/bash rlwrap -m -S '> ' mumax3-script 3-3.11.1/cmd/mumax3-server/000077500000000000000000000000001503346766200152345ustar00rootroot000000000000003-3.11.1/cmd/mumax3-server/Makefile000066400000000000000000000000211503346766200166650ustar00rootroot00000000000000all: go install 3-3.11.1/cmd/mumax3-server/compute.go000066400000000000000000000142211503346766200172370ustar00rootroot00000000000000package main /* Compute service runs jobs on this node's GPUs, if any. */ import ( "fmt" "io" "log" "os/exec" "strings" "time" "github.com/mumax/3/httpfs" "github.com/mumax/3/util" ) var ( MumaxVersion string GPUs []string Processes = make(map[string]*Process) // job id -> process ) // Process is a running simulation process type Process struct { *exec.Cmd Start time.Time Out io.WriteCloser ID string OutputURL string GUI string Killed bool } func (p *Process) Host() string { return JobHost(p.OutputURL) } // Runs a compute service on this node, if GPUs are available. // The compute service asks storage nodes for a job, runs it, // saves results over httpfs and notifies storage when ready. 
func RunComputeService() { if len(GPUs) == 0 { return } // queue of available GPU numbers idle := make(chan int, len(GPUs)) for i := range GPUs { idle <- i } for { gpu := <-idle // take an available GPU GUIAddr := fmt.Sprint(thisHost+":", GUI_PORT+gpu) ID := WaitForJob() // take an available job go func() { defer func() { // remove from "running" list WLock() delete(Processes, ID) WUnlock() // add GPU number back to idle stack idle <- gpu }() p := NewProcess(ID, gpu, GUIAddr) if p == nil { return } WLock() Processes[ID] = p WUnlock() p.Run() _, err := RPCCall(JobHost(ID), "UpdateJob", ID) if err != nil { log.Println(err) } }() } } func WaitForJob() string { ID := FindJob() for ID == "" { time.Sleep(2 * time.Second) // TODO: don't poll ID = FindJob() } return ID } func FindJob() string { // quickly list peers first RLock() p := make([]string, 0, len(peers)) for addr := range peers { p = append(p, addr) } RUnlock() // TODO: pick peers fairly // then do slow RPC calls without blocking the rest of the program for _, addr := range p { ID, _ := RPCCall(addr, "GiveJob", thisAddr) if ID != "" { return ID } } return "" } // RPC-callable function kills job corresponding to given job id. // The job has to be running on this node. 
func Kill(id string) string { log.Println("KILL", id) WLock() // modifies Cmd state defer WUnlock() job := Processes[id] if job == nil { return fmt.Sprintf("kill %v: job not running.", id) } job.Killed = true err := job.Cmd.Process.Kill() if err != nil { return err.Error() } return "" // OK } // prepare exec.Cmd to run mumax3 compute process func NewProcess(ID string, gpu int, webAddr string) *Process { // prepare command inputURL := "http://" + ID command := *flag_mumax gpuFlag := fmt.Sprint(`-gpu=`, gpu) httpFlag := fmt.Sprint(`-http=`, webAddr) cacheFlag := fmt.Sprint(`-cache=`, *flag_cachedir) forceFlag := `-f=0` cmd := exec.Command(command, gpuFlag, httpFlag, cacheFlag, forceFlag, inputURL) // Pipe stdout, stderr to log file over httpfs outDir := util.NoExt(inputURL) + ".out" errMkdir := httpfs.Mkdir(outDir) if errMkdir != nil { SetJobError(ID, errMkdir) log.Println("makeProcess", errMkdir) j := JobByName(ID) if j != nil { j.Reque() } return nil } out, errD := httpfs.Create(outDir + "/stdout.txt") if errD != nil { SetJobError(ID, errD) log.Println("makeProcess", errD) j := JobByName(ID) if j != nil { j.Reque() } return nil } cmd.Stderr = out cmd.Stdout = out return &Process{ID: ID, Cmd: cmd, Start: time.Now(), Out: out, OutputURL: OutputDir(inputURL), GUI: webAddr} } func (p *Process) Run() { log.Println("=> exec ", p.Path, p.Args) defer p.Out.Close() httpfs.Put(p.OutputURL+"host", []byte(thisAddr)) startTime := AskTime(p.Host()) httpfs.Put(p.OutputURL+"start", []byte(startTime.Format(time.UnixDate))) WLock() // Cmd.Start() modifies state err1 := p.Cmd.Start() // err? 
WUnlock() if err1 != nil { SetJobError(p.ID, err1) } timeOffset := time.Now().Sub(startTime) // our clock is most likely out-of-sync with host tick := time.NewTicker(KeepaliveInterval) // need initial alive in case watchdog sniffs between start and first alive tick httpfs.Put(p.OutputURL+"alive", []byte(time.Now().Add(timeOffset).Format(time.UnixDate))) go func() { for t := range tick.C { httpfs.Put(p.OutputURL+"alive", []byte(t.Add(timeOffset).Format(time.UnixDate))) } }() err2 := p.Cmd.Wait() if err1 == nil && err2 != nil { SetJobError(p.ID, err2) } tick.Stop() status := -1 // TODO: determine proper status number if err1 != nil || err2 != nil { log.Println(p.Path, p.Args, err1, err2) status = 1 } else { status = 0 } if p.Killed { httpfs.Put(p.OutputURL+"killed", []byte(time.Now().Format(time.UnixDate))) } else { httpfs.Put(p.OutputURL+"exitstatus", []byte(fmt.Sprint(status))) } stopTime := AskTime(p.Host()) nanos := stopTime.Sub(startTime).Nanoseconds() httpfs.Put(p.OutputURL+"duration", []byte(fmt.Sprint(nanos))) if status == 0 { ret, err := RPCCall(p.Host(), "AddFairShare", JobUser(p.ID)+"/"+fmt.Sprint(nanos/1e9)) if err != nil || ret != "" { log.Println("***ERR: AddFairShare", JobUser(p.ID), ret, err) } } return } func (p *Process) Duration() time.Duration { return Since(time.Now(), p.Start) } func DetectGPUs() { if GPUs != nil { panic("multiple DetectGPUs() calls") } for i := 0; i < MAXGPU; i++ { gpuflag := fmt.Sprint("-gpu=", i) out, err := exec.Command(*flag_mumax, "-test", gpuflag).Output() if err == nil { info := string(out) if strings.HasSuffix(info, "\n") { info = info[:len(info)-1] } log.Println("gpu", i, ":", info) GPUs = append(GPUs, info) } } } func DetectMumax() { out, err := exec.Command(*flag_mumax, "-test", "-v").CombinedOutput() info := string(out) if err == nil { split := strings.SplitN(info, "\n", 2) version := split[0] log.Println("have", version) MumaxVersion = version } else { MumaxVersion = fmt.Sprint(*flag_mumax, "-test", ": ", err, 
info) } } // RPC-callable function, answers by this node's time func WhatsTheTime(string) string { return time.Now().Format(time.UnixDate) } func AskTime(host string) time.Time { str, _ := RPCCall(host, "WhatsTheTime", "") return parseTime(str) } func parseTime(str string) time.Time { t, _ := time.Parse(time.UnixDate, str) return t } 3-3.11.1/cmd/mumax3-server/doc.go000066400000000000000000000111201503346766200163230ustar00rootroot00000000000000/* Easy-to-use cluster management tool for mumax3, with auto-configuration and web interface. When nodes are connected behind a home router, mumax3-server can run without any configuration. Otherwise only the IP address range where the other nodes reside has to be specified. # Input files Upon starting mumax3-server, it scans the current working directory for input files. These should be organised in directories corresponding to user names. E.g.: john/file1.mx3 john/file2.mx3 ... kate/file1.mx3 kate/file2.mx3 ... Other files will be ignored. These input files will run on all available nodes in the network. After adding/removing files, you should click "rescan" in the web interface, or wait for a few minutes. # Web interface mumax3-server serves a web interface at http://localhost:35360 (you have overridden the port, see below). Depending on your OS you may need to use your exact IP address instead of localhost, e.g.: http://192.168.0.1:35360. The web interface shows you the queued jobs, running jobs, output files, etc., and allows to re-scan for new job files or kill running jobs. # Compute nodes Each node that runs mumax3-server and has a working mumax3 installation will automatically serve as a compute node (even if it stores input files as well). The web interface will show the mumax version and available GPUs. The -exec flag may be used to override which mumax3 binary to use. 
E.g.: mumax3-server -exec /usr/local/mumax3/mumax3-cuda6.5 #override mumax3 binary # Scan for other nodes Upon starting, mumax3-server will automatically scan for other nodes in the local network. These will automatically start running jobs (if they have a GPU and mumax3 installed), or may serve job files to be executed by other nodes. By default, we search for nodes with IP addresses in the range 192.168.0.1-128 (local network behind, e.g., a router). This can be changed by the -scan flag. E.g.: mumax3-server -scan 127.0.0.1,169.254.0-1.1-254 mumax3-server -ports 35360-35369 Even when a new node appears on the network after the port scan, it should still be automatically detected. If not, hit "rescan" in the web interface. The -ports flag may be used to change the port numbers being scanned, in case the server uses a non-standard port (-l flag). # Override port number mumax3-server uses TCP port 35360, which needs to be accessible (e.g., through your firewall). This port and the service's IP address can be overridden with the -l flag: mumax3-server -l :35361 #serves at non-standard port mumax3-server -l 192.168.1.1:35360 #serves at specific IP address, e.g. for dual-link machines # Fault tolerance mumax3-server makes a great effort to recover from failed nodes, network outages, reboots, etc. If a simulation is interrupted for any such reason, it should be re-queued and automatically re-started later. In that case the web interface will show [1x requeued] to indicate that the job has been interrupted, but it will run later nevertheless.
# Command line flags Usage of mumax3-server: -cache="": mumax3 kernel cache path -exec="mumax3": mumax3 executable -halflife=24h0m0s: share decay half-life -l=":35360": Listen and serve at this network address -log=true: log debug output -ports="35360-35361": Scan these ports for other servers -scan="192.168.0.1-128": Scan these IP address for other servers -timeout=2s: Portscan timeout # Web interface example http://localhost:35360 157.193.57.146:35360 Uptime: 27h45m38s Peer nodes scan 157.193.57.2-254: 35360-35361 ports 35360-35361 (Rescan) 157.193.57.146:35360 157.193.57.228:35360 Compute service mumax: mumax 3.6 beta2 linux_amd64 go1.3.3 (gc) GPU0: CUDA 6 GeForce GTX 680(2047MB) cc3.0 GPU1: CUDA 6 GeForce GTX 680(2047MB) cc3.0 GPU2: CUDA 6 GeForce GTX 680(2047MB) cc3.0 Running jobs [157.193.57.146:35360/john/b_ext_add.mx3] [3s] [GUI] [kill] [157.193.57.146:35360/john/demag2D.mx3] [2s] [GUI] [kill] [157.193.57.146:35360/john/demag2Dpbc.mx3] [1s] [GUI] [kill] Queue service Users john 589 GPU-seconds has queued jobs kate 0 GPU-seconds no queued jobs Next job for: john Jobs [Reload all] [Wake-up Watchdog] john [Reload] [john/anisenergy.mx3] [.out] [157.193.57.146:35360] [ OK ] [1s] [john/anisenergyconservation.mx3] [.out] [157.193.57.146:35360] [ OK ] [2s] [john/anisenergyconservation2.mx3] [.out] [157.193.57.146:35360] [ OK ] [2s] [john/anisenergyconservation3.mx3] [.out] [157.193.57.228:35360] [ OK ] [1s] [john/anisenergyconservation4.mx3] [.out] [157.193.57.146:35360] [ OK ] [2s] kate [Reload] */ package main 3-3.11.1/cmd/mumax3-server/job.go000066400000000000000000000125621503346766200163430ustar00rootroot00000000000000package main import ( "log" "os" "strconv" "time" "github.com/mumax/3/httpfs" "github.com/mumax/3/util" ) const MaxRequeue = 10 // maximum number of re-queues, don't run job if re-queued too many times // compute Job type Job struct { ID string // host/path of the input file, e.g., hostname:port/user/inputfile.mx3 // in-memory properties: 
RequeCount int // how many times requeued. Error interface{} // error that cannot be consolidated to disk // all of this is cache: Output string // if exists, points to output ID Host string // node address in host file (=last host who started this job) ExitStatus string // what's in the exitstatus file Start time.Time // When this job was started, if applicable Alive time.Time // Last time when this job was seen alive duration time.Duration } // Find job belonging to ID func JobByName(ID string) *Job { user := Users[BaseDir(LocalPath(ID))] if user == nil { log.Println("JobByName: no user for", ID) return nil } jobs := user.Jobs low := 0 high := len(jobs) - 1 mid := -1 for low <= high { mid = (low + high) / 2 switch { case jobs[mid].ID > ID: high = mid - 1 case jobs[mid].ID < ID: low = mid + 1 default: low = high + 1 // break for loop :-( } } if mid >= 0 && mid < len(jobs) && jobs[mid].ID == ID { return jobs[mid] } else { log.Println("JobByName: not found:", ID) return nil } } // read job files from storage and update status cache func (j *Job) Update() { out := j.LocalOutputDir() if exists(out) { j.Output = thisAddr + "/" + out } else { j.Output = "" j.ExitStatus = "" j.Start = time.Time{} j.Alive = time.Time{} j.duration = 0 } if j.Output != "" { j.Host = httpfsRead(out + "host") j.ExitStatus = httpfsRead(out + "exitstatus") j.Start = parseTime(httpfsRead(out + "start")) j.Alive = parseTime(httpfsRead(out + "alive")) j.duration = time.Duration(atoi(httpfsRead(out + "duration"))) } } // Put job back in queue for later, e.g., when killed. func (j *Job) Reque() { log.Println("requeue", j.ID) j.RequeCount++ httpfs.Remove(j.LocalOutputDir()) j.Update() } func SetJobError(ID string, err interface{}) { log.Println("SetJobErr", ID, err) WLock() defer WUnlock() j := JobByName(ID) if j == nil { return } j.Error = err } // How long job has been running, if running. 
func (j *Job) Duration() time.Duration { if j.Start.IsZero() { return 0 } if j.duration != 0 { return j.duration } if j.IsRunning() { return Since(time.Now(), j.Start) } return 0 // unknown duration } // user name for this job ID func (j *Job) User() string { return JobUser(j.ID) } // user name for this job ID func JobUser(ID string) string { return BaseDir(LocalPath(ID)) } // local path of input file func (j *Job) LocalPath() string { return LocalPath(j.ID) } // local path of input file, without host prefix. E.g.: // // host:123/user/file.mx3 -> user/file.mx3 func LocalPath(ID string) string { host := JobHost(ID) if len(host)+1 >= len(ID) { log.Println("Invalid LocalPath call on", ID) return "" } return ID[len(host)+1:] } // local path of output dir func (j *Job) LocalOutputDir() string { return OutputDir(j.LocalPath()) } // output directory for input file func OutputDir(path string) string { return util.NoExt(path) + ".out/" } // insert "/fs" in front of url path func (*Job) FS(id string) string { return FS(id) } // insert "/fs" in front of url path func FS(id string) string { return BaseDir(id) + "/fs/" + LocalPath(id) } // is job queued? func (j *Job) IsQueued() bool { return j.Output == "" && j.RequeCount < MaxRequeue } // is job running? func (j *Job) IsRunning() bool { return j.Output != "" && j.ExitStatus == "" && j.Host != "" } // Host of job with this ID (=first path element). E.g.: // // host:123/user/file.mx3 -> host:123 func JobHost(ID string) string { return BaseDir(ID) } // Job status number queued, running,... 
type Status int const ( QUEUED Status = iota RUNNING FINISHED FAILED ) var statusString = map[Status]string{ QUEUED: "QUEUED", RUNNING: "RUNNING", FINISHED: "FINISHED", FAILED: "FAILED", } func (s Status) String() string { return statusString[s] } // human-readable status string (for gui) func (j *Job) Status() string { if j.IsQueued() { return QUEUED.String() } if j.ExitStatus == "0" { return FINISHED.String() } if j.ExitStatus == "" && j.Host == "" { return FINISHED.String() } if j.Host != "" && j.ExitStatus == "" { return RUNNING.String() } if j.ExitStatus != "" && j.ExitStatus != "0" { return FAILED.String() } return "UNKNOWN" } // remove job output func Rm(URL string) string { err := httpfs.Remove("http://" + OutputDir(URL)) // update status after output removal UpdateJob(URL) if err != nil { return err.Error() } // report re-queue // handy if others remove your jobs job := JobByName(URL) if job != nil { job.RequeCount++ } // make sure job runs again quickly user := JobUser(URL) u := Users[user] if u != nil { u.nextPtr = 0 } return "" } // check if path exists func exists(path string) bool { _, err := os.Stat(path) return err == nil } // atoi, does not return error func atoi(a string) int64 { i, _ := strconv.ParseInt(a, 10, 64) return i } // return file content as string, no errors func httpfsRead(fname string) string { data, err := httpfs.Read(fname) if err != nil { return "" } return string(data) } 3-3.11.1/cmd/mumax3-server/main.go000066400000000000000000000123441503346766200165130ustar00rootroot00000000000000package main import ( "flag" "fmt" "log" "net" "net/http" _ "net/http/pprof" "os" "strconv" "strings" "sync" "time" "github.com/mumax/3/httpfs" "github.com/mumax/3/util" ) var ( flag_addr = flag.String("l", ":35360", "Listen and serve at this network address") flag_scan = flag.String("scan", "192.168.0.1-128", "Scan these IP address for other servers") flag_ports = flag.String("ports", "35360-35361", "Scan these ports for other servers") flag_timeout = 
flag.Duration("timeout", 2*time.Second, "Portscan timeout") flag_mumax = flag.String("exec", "mumax3", "mumax3 executable") flag_cachedir = flag.String("cache", "", "mumax3 kernel cache path") flag_log = flag.Bool("log", true, "log debug output") flag_halflife = flag.Duration("halflife", 24*time.Hour, "share decay half-life") ) const ( MaxIPs = 1024 // maximum number of IP addresses to portscan N_SCANNERS = 32 // number of parallel portscan goroutines MAXGPU = 16 // maximum number of GPU's to check for KeepaliveInterval = 10 * time.Second // signal process liveness every KeepaliveInterval ) var ( thisAddr string // unique address of this node, e.g., name:1234 thisHost string // unique hostname of this node, e.g., name IPs []string MinPort, MaxPort int global_lock sync.RWMutex ) func RLock() { global_lock.RLock() } func RUnlock() { global_lock.RUnlock() } func WLock() { global_lock.Lock() } func WUnlock() { global_lock.Unlock() } const GUI_PORT = 35367 // base port number for GUI (to be incremented by GPU number) func main() { flag.Parse() IPs = parseIPs() MinPort, MaxPort = parsePorts() thisAddr = canonicalAddr(*flag_addr, IPs) var err error thisHost, _, err = net.SplitHostPort(thisAddr) util.FatalErr(err) DetectMumax() DetectGPUs() LoadJobs() http.HandleFunc("/do/", HandleRPC) http.HandleFunc("/", HandleStatus) httpfs.RegisterHandlers() // Listen and serve on all interfaces go func() { log.Println("serving at", thisAddr) // Resolve the IPs for thisHost thisIP, err := net.LookupHost(thisHost) Fatal(err) // try to listen and serve on all interfaces other than thisAddr // this is for convenience, errors are not fatal. 
_, p, err := net.SplitHostPort(thisAddr) Fatal(err) ips := util.InterfaceAddrs() for _, ip := range ips { addr := net.JoinHostPort(ip, p) if !contains(thisIP, ip) { // skip thisIP, will start later and is fatal on error go func() { log.Println("serving at", addr) err := http.ListenAndServe(addr, nil) if err != nil { log.Println("info:", err, "(but still serving other interfaces)") } }() } } // only on thisAddr, this server's unique address, // we HAVE to be listening. Fatal(http.ListenAndServe(thisAddr, nil)) }() ProbePeer(thisAddr) // make sure we have ourself as peer go FindPeers(IPs, MinPort, MaxPort) go RunComputeService() go LoopWatchdog() go RunShareDecay() // re-load jobs every hour so we don't stall on very exceptional circumstances go func() { for { time.Sleep(1 * time.Hour) LoadJobs() } }() <-make(chan struct{}) // wait forever } // replace laddr by a canonical form, as it will serve as unique ID func canonicalAddr(laddr string, IPs []string) string { // safe initial guess: hostname:port h, p, err := net.SplitHostPort(laddr) Fatal(err) if h == "" { h, _ = os.Hostname() } name := net.JoinHostPort(h, p) ips := util.InterfaceAddrs() for _, ip := range ips { if contains(IPs, ip) { return net.JoinHostPort(ip, p) } } return name } func contains(arr []string, x string) bool { for _, s := range arr { if x == s { return true } } return false } // Parse port range flag. 
E.g.: // // 1234-1237 -> 1234, 1237 func parsePorts() (minPort, maxPort int) { p := *flag_ports split := strings.Split(p, "-") if len(split) > 2 { log.Fatal("invalid port range:", p) } minPort, _ = strconv.Atoi(split[0]) if len(split) > 1 { maxPort, _ = strconv.Atoi(split[1]) } if maxPort == 0 { maxPort = minPort } if minPort == 0 || maxPort == 0 || maxPort < minPort { log.Fatal("invalid port range:", p) } return } // init IPs from flag func parseIPs() []string { var IPs []string defer func() { if err := recover(); err != nil { log.Fatal("invalid IP range:", *flag_scan) } }() p := *flag_scan split := strings.Split(p, ",") for _, s := range split { split := strings.Split(s, ".") if len(split) != 4 { log.Fatal("invalid IP address range:", s) } var start, stop [4]uint for i, s := range split { split := strings.Split(s, "-") first := atobyte(split[0]) start[i], stop[i] = first, first if len(split) > 1 { stop[i] = atobyte(split[1]) } } for A := start[0]; A <= stop[0]; A++ { for B := start[1]; B <= stop[1]; B++ { for C := start[2]; C <= stop[2]; C++ { for D := start[3]; D <= stop[3]; D++ { if len(IPs) > MaxIPs { log.Fatal("too many IP addresses to scan in", p) } IPs = append(IPs, fmt.Sprintf("%v.%v.%v.%v", A, B, C, D)) } } } } } return IPs } func atobyte(a string) uint { i, err := strconv.Atoi(a) if err != nil { panic(err) } if int(byte(i)) != i { panic("too large") } return uint(i) } 3-3.11.1/cmd/mumax3-server/peers.go000066400000000000000000000024411503346766200167020ustar00rootroot00000000000000package main // Peer management: // portscan for peers // ping peers import ( "fmt" "log" ) var ( peers = make(map[string]*Peer) ) type Peer struct { } func AddPeer(pAddr string) { WLock() defer WUnlock() if _, ok := peers[pAddr]; !ok { log.Println("add new peer:", pAddr) peers[pAddr] = NewPeer() } } func NewPeer() *Peer { return &Peer{} } // RPC-called func Ping(peerAddr string) string { WLock() defer WUnlock() // Somebody just called my status, // add him as a peer (if not 
yet so). if _, ok := peers[peerAddr]; !ok { peers[peerAddr] = NewPeer() } return thisAddr } // Ping peer at address, add to peers list if he responds and is not yet added func ProbePeer(addr string) { ret, _ := RPCCall(addr, "Ping", thisAddr) if ret != "" { AddPeer(ret) } } // Scan IPs and port range for peers that respond to Ping, // add them to peers list. func FindPeers(IPs []string, minPort, maxPort int) { //log.Println("Portscan start") scanners := make(chan func()) for i := 0; i < N_SCANNERS; i++ { go func() { for f := range scanners { f() } }() } for _, ip := range IPs { for port := minPort; port <= maxPort; port++ { addr := fmt.Sprint(ip, ":", port) scanners <- func() { ProbePeer(addr) } } } close(scanners) log.Println("-- portscan done") } 3-3.11.1/cmd/mumax3-server/que.go000066400000000000000000000064171503346766200163650ustar00rootroot00000000000000package main import ( "log" "math" "os" "path/filepath" "sort" "strings" "time" ) /* Queue service scans the working directory for job files. The working directory should contain per-user subdirectories. E.g.: arne/ bartel/ ... The in-memory representation is a cache and can be out-of-date at any point. The queue service decides which job to hand out to a node if asked so. */ var ( Users = make(map[string]*User) // maps user -> joblist ) // RPC-callable method: picks a job of the queue returns it // for the node to run it. 
func GiveJob(nodeAddr string) string { WLock() defer WUnlock() user := nextUser() if user == "" { return "" } Users[user].FairShare += 1 // 1 second penalty because a job has started return Users[user].giveJob(nodeAddr).ID } func AddFairShare(s string) string { username := BaseDir(s) share := atoi(s[len(username)+1:]) WLock() defer WUnlock() u := Users[username] if u == nil { return "no user " + username } log.Println("AddFairShare", username, share) u.FairShare += float64(share) return "" // ok } func nextUser() string { // search user with least share and jobs in queue leastShare := math.Inf(1) var bestUser string for n, u := range Users { if u.HasJob() && u.FairShare < leastShare { leastShare = u.FairShare bestUser = n } } return bestUser } // (Re-)load all jobs in the working directory. // Called upon program startup. func LoadJobs() { dir, err := os.Open(".") Fatal(err) subdirs, err2 := dir.Readdir(-1) Fatal(err2) for _, d := range subdirs { if d.IsDir() { LoadUserJobs(d.Name()) } } } // (Re-)load all jobs in the user's subdirectory. func LoadUserJobs(dir string) string { log.Println("LoadUserJobs", dir) var newJobs []*Job err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { if strings.HasSuffix(path, ".mx3") && !strings.HasPrefix(info.Name(), ".") { ID := thisAddr + "/" + path log.Println("addingJob", ID) job := &Job{ID: ID} job.Update() newJobs = append(newJobs, job) } return nil }) l := joblist(newJobs) sort.Sort(&l) Fatal(err) // TODO: recover? WLock() defer WUnlock() if _, ok := Users[dir]; !ok { Users[dir] = NewUser() } Users[dir].Jobs = newJobs Users[dir].nextPtr = 0 return "" } type joblist []*Job func (l *joblist) Len() int { return len(*l) } func (l *joblist) Less(i, j int) bool { return (*l)[i].ID < (*l)[j].ID } func (l *joblist) Swap(i, j int) { (*l)[i], (*l)[j] = (*l)[j], (*l)[i] } // RPC-callable function. Refreshes the in-memory cached info about this job. // Called, e.g., after a node has finished a job. 
func UpdateJob(jobURL string) string { WLock() defer WUnlock() j := JobByName(jobURL) if j == nil { log.Println("update", jobURL, ": no such job") return "" // empty conventionally means error } j.Update() return "updated " + jobURL // not used, but handy if called by Human. } // Periodically updates user's usedShare so they decay // exponentially according to flag_halflife func RunShareDecay() { halflife := *flag_halflife quantum := halflife / 100 // several updates per half-life gives smooth decay reduce := math.Pow(0.5, float64(quantum)/float64(halflife)) for { time.Sleep(quantum) WLock() for _, u := range Users { u.FairShare *= reduce } WUnlock() } } 3-3.11.1/cmd/mumax3-server/rpc.go000066400000000000000000000040601503346766200163470ustar00rootroot00000000000000package main import ( "fmt" "io" "log" "net/http" "strings" "time" ) type RPCFunc func(string) string var methods = map[string]RPCFunc{ "AddFairShare": AddFairShare, "GiveJob": GiveJob, "Kill": Kill, "LoadJobs": wrap(LoadJobs), "LoadUserJobs": LoadUserJobs, "Ping": Ping, "UpdateJob": UpdateJob, "Rescan": func(string) string { go FindPeers(IPs, MinPort, MaxPort); return "" }, "WhatsTheTime": WhatsTheTime, "WakeupWatchdog": WakeupWatchdog, "rm": Rm, } func wrap(f func()) RPCFunc { return func(string) string { f(); return "" } } func HandleRPC(w http.ResponseWriter, r *http.Request) { var ret string defer func() { //log.Println(" < call ", r.Host, r.URL.Path, "->", ret) if err := recover(); err != nil { log.Println("*** RPC panic: ", r.URL.Path, ":", err) http.Error(w, "Does not compute: "+r.URL.Path, http.StatusBadRequest) } }() request := r.URL.Path[len("/do/"):] slashPos := strings.Index(request, "/") method := request[:slashPos] arg := request[slashPos+1:] m, ok := methods[method] if !ok { log.Println("*** RPC no such method", r.URL.Path) http.Error(w, "Does not compute: "+method, http.StatusBadRequest) return } ret = m(arg) fmt.Fprint(w, ret) } // re-usable http client for making RPC calls var 
httpClient = http.Client{Timeout: 2 * time.Second} // make RPC call to method on node with given address. func RPCCall(addr, method, arg string) (ret string, err error) { //defer func() { log.Println(" > call ", addr, method, arg, "->", ret, err) }() //TODO: escape args? resp, err := httpClient.Get("http://" + addr + "/do/" + method + "/" + arg) if err != nil { //log.Println("*** RPC error: ", err) return "", err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { log.Println("*** RPC error: ", resp.Status) return "", fmt.Errorf("http status %v", resp.Status) } if b, err := io.ReadAll(resp.Body); err != nil { log.Println("*** RPC read error: ", err) return "", err } else { return string(b), nil } } 3-3.11.1/cmd/mumax3-server/status.go000066400000000000000000000126271503346766200171160ustar00rootroot00000000000000package main // Serves human-readable status information over http. import ( "html/template" "net/http" "time" ) var ( templ = template.Must(template.New("status").Parse(templText)) upSince = time.Now() ) func HandleStatus(w http.ResponseWriter, r *http.Request) { RLock() defer RUnlock() if r.URL.Path != "/" { http.Error(w, "Does not compute", http.StatusNotFound) return } err := templ.Execute(w, &status{}) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) } } type status struct{} // dummy type to define template methods on func (*status) IPRange() string { return *flag_scan + ": " + *flag_ports } func (*status) Ports() string { return *flag_ports } func (*status) ThisAddr() string { return thisAddr } func (*status) Uptime() time.Duration { return Since(time.Now(), upSince) } func (*status) MumaxVersion() string { return MumaxVersion } func (*status) GPUs() []string { return GPUs } func (*status) Processes() map[string]*Process { return Processes } func (*status) Users() map[string]*User { return Users } func (*status) NextUser() string { return nextUser() } func (*status) Peers() map[string]*Peer { return peers } func 
(*status) FS(a string) string { return FS(a) } const templText = ` {{define "Job"}} [{{.LocalPath}}] [{{with .Output}}.out{{end}}] [{{with .Output}}rm{{end}}] [{{with .Host}}{{.}}{{end}}] [{{with .ExitStatus}}{{if eq . "0"}} OK {{else}}FAIL{{end}}{{end}}] [{{with .Output}}{{$.Duration}}{{end}}{{with .RequeCount}} {{.}}x re-queued{{end}}{{with .Error}} {{.}}{{end}}] {{end}}

{{.ThisAddr}}

Uptime: {{.Uptime}}

Peer nodes

scan {{.IPRange}}
ports {{.Ports}}

{{range $k,$v := .Peers}} {{$k}}
{{end}}

Compute service

mumax: {{with .MumaxVersion}} {{.}} {{else}} not available
{{end}}
{{with .GPUs}} {{range $i, $v := .}} GPU{{$i}}: {{$v}}
{{end}} {{else}} No GPUs available
{{end}}

Running jobs

{{range $k,$v := .Processes}} {{end}}
[{{$k}}] [{{$v.Duration}}] [GUI]

Queue service

Users

{{range $k,$v := .Users}} {{end}}
{{$k}}{{$v.FairShare}} GPU-seconds{{with .HasJob}} has {{else}} no {{end}} queued jobs
Next job for: {{.NextUser}}

Jobs

(consider reloading just your own files).
(re-queue dead simulations right now). {{range $k,$v := .Users}}

▾ {{$k}}

Jobs (only needed when you changed your files on disk) {{range $v.Jobs}} {{template "Job" .}} {{end}}

{{end}}

` 3-3.11.1/cmd/mumax3-server/user.go000066400000000000000000000017151503346766200165450ustar00rootroot00000000000000package main import "time" type User struct { Jobs []*Job FairShare float64 // Used-up compute time in the past (decays) nextPtr int // pointer suggesting next job to start. Reset on re-scan. len(Jobs) means no queued job } func NewUser() *User { return &User{} } // nextJob looks for the next free job in the list. // it does a tiny bit of linear search, starting from nextPtr. func (u *User) giveJob(node string) *Job { index := u.nextJobPtr() if index >= len(u.Jobs) { return nil } u.nextPtr++ j := u.Jobs[index] // all below are preliminary, to get rapid gui response. // may be overwritten by update j.Host = node j.Output = OutputDir(j.ID) j.Start = time.Now() return j } func (u *User) HasJob() bool { i := u.nextJobPtr() return i < len(u.Jobs) } // returns func (u *User) nextJobPtr() int { for ; u.nextPtr < len(u.Jobs); u.nextPtr++ { j := u.Jobs[u.nextPtr] if j.IsQueued() { return u.nextPtr } } return u.nextPtr } 3-3.11.1/cmd/mumax3-server/utitl.go000066400000000000000000000015211503346766200167230ustar00rootroot00000000000000package main import ( "log" "net/url" "strings" "time" ) // BaseDir returns the first path element, without slashes and ignoring http:// . 
E.g.: // // /home/user/file -> home // user/file -> user // http://home/user/file -> home func BaseDir(dir string) string { if strings.HasPrefix(dir, "http://") { return BaseDir(dir[len("http://"):]) } firstSlash := strings.Index(dir, "/") switch { case firstSlash < 0: return dir case firstSlash == 0: return BaseDir(dir[1:]) default: return dir[:firstSlash] } } func Fatal(err error) { if err != nil { log.Fatal(err) } } // rounded up to 1s precission func Since(a, b time.Time) time.Duration { d := a.Sub(b) return (d/1e9)*1e9 + 1e9 } // Parse URL, panic on error func MustParseURL(URL string) *url.URL { u, err := url.Parse(URL) if err != nil { panic(err) } return u } 3-3.11.1/cmd/mumax3-server/watchdog.go000066400000000000000000000021521503346766200173630ustar00rootroot00000000000000package main import ( "log" "time" ) var runWatchdog = make(chan struct{}) func init() { // run watchdog daemon in background go func() { for { <-runWatchdog // wait for start DoWatchdog() } }() } func LoopWatchdog() { for { WakeupWatchdog("") time.Sleep(3 * KeepaliveInterval) } } func WakeupWatchdog(string) string { select { default: return "already running" case runWatchdog <- struct{}{}: return "" // ok } } // single watchdog run: // re-queues all dead processes func DoWatchdog() { //log.Println("Watchdog wake-up") WLock() defer WUnlock() for _, u := range Users { for _, j := range u.Jobs { id := j.ID //log.Println(id, "running:", j.IsRunning(), "alive:", time.Since(j.Alive)) if j.IsRunning() && time.Since(j.Alive) > 3*KeepaliveInterval { j.Update() lastHeartbeat := time.Since(j.Alive) if lastHeartbeat > 3*KeepaliveInterval { log.Println("*** Re-queue", id, "after", lastHeartbeat, "inactivity") j.Reque() } } } // re-set nextPtr to beginning so we can start re-queued jobs if u.nextPtr >= len(u.Jobs) { u.nextPtr = 0 } } } 
3-3.11.1/cmd/mumax3/000077500000000000000000000000001503346766200137305ustar00rootroot000000000000003-3.11.1/cmd/mumax3/.gitignore000066400000000000000000000000071503346766200157150ustar00rootroot00000000000000mumax3 3-3.11.1/cmd/mumax3/Makefile000066400000000000000000000004211503346766200153650ustar00rootroot00000000000000all: @COMMIT_HASH=$(shell git rev-parse --short HEAD 2>/dev/null || echo "unknown"); \ if [ "$$COMMIT_HASH" = "unknown" ]; then \ echo "Warning: Could not determine Git commit hash. Using 'unknown'."; \ fi; \ go install -ldflags "-X main.commitHash=$$COMMIT_HASH" -v 3-3.11.1/cmd/mumax3/browser.go000066400000000000000000000007401503346766200157430ustar00rootroot00000000000000package main import ( "fmt" "os/exec" ) // Try to open url in a browser. Instruct to do so if it fails. func openbrowser(url string) { for _, cmd := range browsers { err := exec.Command(cmd, url).Start() if err == nil { fmt.Println("//opened web interface in", cmd) return } } fmt.Println("//please open ", url, " in a browser") } // list of browsers to try. 
var browsers = []string{"x-www-browser", "google-chrome", "chromium-browser", "firefox", "explorer"} 3-3.11.1/cmd/mumax3/main.go000066400000000000000000000164541503346766200152150ustar00rootroot00000000000000// mumax3 main command package main import ( "bufio" "bytes" "flag" "fmt" "log" "os" "os/exec" "path" "runtime" "strings" "time" "github.com/mumax/3/cuda" "github.com/mumax/3/engine" "github.com/mumax/3/script" "github.com/mumax/3/util" ) var ( flag_failfast = flag.Bool("failfast", false, "If one simulation fails, stop entire batch immediately") flag_test = flag.Bool("test", false, "Cuda test (internal)") flag_version = flag.Bool("v", true, "Print version") flag_vet = flag.Bool("vet", false, "Check input files for errors, but don't run them") // more flags in engine/gofiles.go commitHash string ) func main() { flag.Parse() log.SetPrefix("") log.SetFlags(0) cuda.Init(*engine.Flag_gpu) cuda.Synchronous = *engine.Flag_sync if *flag_version { printVersion() } // used by bootstrap launcher to test cuda // successful exit means cuda was initialized fine if *flag_test { os.Exit(0) } defer engine.Close() // flushes pending output, if any if *flag_vet { vet() return } switch flag.NArg() { case 0: if *engine.Flag_interactive { runInteractive() } case 1: runFileAndServe(flag.Arg(0)) default: RunQueue(flag.Args()) } } func runInteractive() { fmt.Println("//no input files: starting interactive session") //initEngine() // setup output dir now := time.Now() outdir := fmt.Sprintf("mumax-%v-%02d-%02d_%02dh%02d.out", now.Year(), int(now.Month()), now.Day(), now.Hour(), now.Minute()) engine.InitIO(outdir, outdir, *engine.Flag_forceclean) engine.Timeout = 365 * 24 * time.Hour // basically forever // set up some sensible start configuration engine.Eval(`SetGridSize(128, 64, 1) SetCellSize(4e-9, 4e-9, 4e-9) Msat = 1e6 Aex = 10e-12 alpha = 1 m = RandomMag()`) addr := goServeGUI() openbrowser("http://127.0.0.1" + addr) engine.RunInteractive() } func runFileAndServe(fname string) { if 
path.Ext(fname) == ".go" { runGoFile(fname) } else { runScript(fname) } } func runScript(fname string) { outDir := util.NoExt(fname) + ".out" if *engine.Flag_od != "" { outDir = *engine.Flag_od } engine.InitIO(fname, outDir, *engine.Flag_forceclean) fname = engine.InputFile var code *script.BlockStmt var err2 error if fname != "" { // first we compile the entire file into an executable tree code, err2 = engine.CompileFile(fname) util.FatalErr(err2) } // now the parser is not used anymore so it can handle web requests addr := goServeGUI() if *engine.Flag_interactive { openbrowser("http://127.0.0.1" + addr) } // start executing the tree, possibly injecting commands from web gui engine.EvalFile(code) if *engine.Flag_interactive { engine.RunInteractive() } } func runGoFile(fname string) { // pass through flags flags := []string{"run", fname} flag.Visit(func(f *flag.Flag) { if f.Name != "o" { flags = append(flags, fmt.Sprintf("-%v=%v", f.Name, f.Value)) } }) if *engine.Flag_od != "" { flags = append(flags, fmt.Sprintf("-o=%v", *engine.Flag_od)) } cmd := exec.Command("go", flags...) 
log.Println("go", flags) cmd.Stdout = os.Stdout cmd.Stdin = os.Stdin cmd.Stderr = os.Stderr err := cmd.Run() if err != nil { os.Exit(1) } } // start GUI server and return server address func goServeGUI() string { if *engine.Flag_port == "" { log.Println(`//not starting GUI (-http="")`) return "" } addr := engine.GoServe(*engine.Flag_port) fmt.Print("//starting GUI at http://127.0.0.1", addr, "\n") return addr } // print version to stdout func printVersion() { engine.LogOut(engine.UNAME) engine.LogOut(fmt.Sprintf("commit hash: %s", commitHash)) engine.LogOut(getCPUInfo()) engine.LogOut(fmt.Sprintf("GPU info: %s, using cc=%d PTX", cuda.GPUInfo, cuda.UseCC)) osInfo := fmt.Sprintf("OS info: %s, Hostname: %s", getOSInfo(), getHostname()) engine.LogOut(osInfo) engine.LogOut(fmt.Sprintf("Timestamp: %s", time.Now().Format("2006-01-02 15:04:05"))) engine.LogOut("(c) Arne Vansteenkiste, Dynamat LAB, Ghent University, Belgium") engine.LogOut("This is free software without any warranty. See license.txt") engine.LogOut("********************************************************************//") engine.LogOut(" If you use mumax in any work or publication, //") engine.LogOut(" we kindly ask you to cite the references in references.bib //") engine.LogOut("********************************************************************//") } func getHostname() string { hostname, err := os.Hostname() if err != nil { return "Unknown" } return hostname } func getOSInfo() string { // Check the runtime operating system switch runtime.GOOS { case "windows": return "Windows OS" case "linux": return getLinuxOSInfo() // Add more cases for other operating systems if needed default: return fmt.Sprintf("Unknown OS: %s", runtime.GOOS) } } func getLinuxOSInfo() string { // Check if the file exists file, err := os.Open("/etc/os-release") if err != nil { return fmt.Sprintf("Unknown OS, Error: %s", err.Error()) } defer file.Close() // Scan the file line by line scanner := bufio.NewScanner(file) for scanner.Scan() 
{ line := scanner.Text() parts := strings.SplitN(line, "=", 2) if len(parts) == 2 && parts[0] == "PRETTY_NAME" { // Remove surrounding quotes and return return strings.Trim(parts[1], `"`) } } if err := scanner.Err(); err != nil { return fmt.Sprintf("Unknown OS, Error: %s", err.Error()) } return "Unknown OS" } func getCPUInfo() string { // Check the runtime operating system switch runtime.GOOS { case "windows": return getWindowsCPUInfo() case "linux": return getLinuxCPUInfo() // Add more cases for other operating systems if needed default: return fmt.Sprintf("CPU info: Unknown OS: %s", runtime.GOOS) } } func getWindowsCPUInfo() string { // Get CPU model name cmd := exec.Command("wmic", "cpu", "get", "Name") var out bytes.Buffer cmd.Stdout = &out if err := cmd.Run(); err != nil { return fmt.Sprintf("CPU info: Unknown, Error: %s", err.Error()) } output := strings.Split(out.String(), "\n") cpuModel := "Unknown model" if len(output) > 1 { cpuModel = strings.TrimSpace(output[1]) } // Get CPU number of cores cpuCores := runtime.NumCPU() // Get CPU speed cmd = exec.Command("wmic", "cpu", "get", "MaxClockSpeed") out.Reset() cmd.Stdout = &out cpuMHz := "Unknown clock frequency" if err := cmd.Run(); err == nil { output = strings.Split(out.String(), "\n") if len(output) > 1 { cpuMHz = strings.TrimSpace(output[1]) + " MHz" } } return fmt.Sprintf("CPU info: %s, Cores: %d, %s", cpuModel, cpuCores, cpuMHz) } func getLinuxCPUInfo() string { file, err := os.Open("/proc/cpuinfo") if err != nil { return fmt.Sprintf("CPU info: Unknown, Error: %s", err.Error()) } defer file.Close() scanner := bufio.NewScanner(file) var cpuDetails []string var cpuModel, cpuCores, cpuMHz string for scanner.Scan() { line := scanner.Text() fields := strings.Split(line, ":") if len(fields) != 2 { continue } key := strings.TrimSpace(fields[0]) value := strings.TrimSpace(fields[1]) switch key { case "model name": cpuModel = value case "cpu cores": cpuCores = value case "cpu MHz": cpuMHz = value } } if cpuModel 
!= "" && cpuCores != "" && cpuMHz != "" { cpuDetails = append(cpuDetails, fmt.Sprintf("CPU info: %s, Cores: %s, MHz: %s", cpuModel, cpuCores, cpuMHz)) } return strings.Join(cpuDetails, "; ") } 3-3.11.1/cmd/mumax3/mumax3.sh000077500000000000000000000004561503346766200155060ustar00rootroot00000000000000#! /bin/bash # # This script adds the current directory to your library path # and launches mumax3 using the shipped cuda libraries. # # When you have correctly set-up cuda, you can just run # mumax directly without this wrapper. # export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(pwd) ./mumax3-cuda6.0 $@ 3-3.11.1/cmd/mumax3/queue.go000066400000000000000000000111701503346766200154030ustar00rootroot00000000000000package main // File queue for distributing multiple input files over GPUs. import ( "flag" "fmt" "io" "log" "net/http" "os" "os/exec" "sync" "sync/atomic" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/engine" ) var ( exitStatus atom = 0 numOK, numFailed atom = 0, 0 ) func RunQueue(files []string) { s := NewStateTab(files) s.PrintTo(os.Stdout) go s.ListenAndServe(*engine.Flag_port) fmt.Print("//Realtime queue overview available at http://127.0.0.1", *engine.Flag_port, "\n") s.Run() fmt.Println(numOK.get(), "OK, ", numFailed.get(), "failed") os.Exit(int(exitStatus)) } // StateTab holds the queue state (list of jobs + statuses). // All operations are atomic. type stateTab struct { lock sync.Mutex jobs []job next int } // Job info. type job struct { inFile string // input file to run webAddr string // http address for gui of running process uid int } // NewStateTab constructs a queue for the given input files. // After construction, it is accessed atomically. func NewStateTab(inFiles []string) *stateTab { s := new(stateTab) s.jobs = make([]job, len(inFiles)) for i, f := range inFiles { s.jobs[i] = job{inFile: f, uid: i} } return s } // StartNext advances the next job and marks it running, setting its webAddr to indicate the GUI url. 
// A copy of the job info is returned, the original remains unmodified. // ok is false if there is no next job. func (s *stateTab) StartNext(webAddr string) (next job, ok bool) { s.lock.Lock() defer s.lock.Unlock() if s.next >= len(s.jobs) { return job{}, false } s.jobs[s.next].webAddr = webAddr jobCopy := s.jobs[s.next] s.next++ return jobCopy, true } // Finish marks the job with j's uid as finished. func (s *stateTab) Finish(j job) { s.lock.Lock() defer s.lock.Unlock() s.jobs[j.uid].webAddr = "" } // Runs all the jobs in stateTab. func (s *stateTab) Run() { idle, nGPU := initGPUs() for { gpu := <-idle addr := fmt.Sprint(":", 35368+gpu) j, ok := s.StartNext(addr) if !ok { break } go func() { run(j.inFile, gpu, j.webAddr) s.Finish(j) idle <- gpu }() } // drain remaining tasks (one already done) for i := 1; i < nGPU; i++ { <-idle } } type atom int32 func (a *atom) set(v int) { atomic.StoreInt32((*int32)(a), int32(v)) } func (a *atom) get() int { return int(atomic.LoadInt32((*int32)(a))) } func (a *atom) inc() { atomic.AddInt32((*int32)(a), 1) } func run(inFile string, gpu int, webAddr string) { // overridden flags gpuFlag := fmt.Sprint(`-gpu=`, gpu) httpFlag := fmt.Sprint(`-http=`, webAddr) // pass through flags flags := []string{gpuFlag, httpFlag} flag.Visit(func(f *flag.Flag) { if f.Name != "gpu" && f.Name != "http" && f.Name != "failfast" { flags = append(flags, fmt.Sprintf("-%v=%v", f.Name, f.Value)) } }) flags = append(flags, inFile) cmd := exec.Command(os.Args[0], flags...) log.Println(os.Args[0], flags) output, err := cmd.CombinedOutput() if err != nil { log.Println(inFile, err) log.Printf("%s\n", output) exitStatus.set(1) numFailed.inc() if *flag_failfast { os.Exit(1) } } else { numOK.inc() } } // Creates a concurrent channel containing the available GPU IDs for jobs. // Returns the channel and the number of available GPUs for the queue. 
func initGPUs() (chan int, int) { nGpu := cu.DeviceGetCount() if nGpu == 0 { log.Fatal("no GPUs available") } singleGPU := engine.FlagPassed("gpu") if singleGPU { nGpu = 1 } idle := make(chan int, nGpu) if singleGPU { idle <- *engine.Flag_gpu } else { for i := 0; i < nGpu; i++ { idle <- i } } return idle, nGpu } func (s *stateTab) PrintTo(w io.Writer) { s.lock.Lock() defer s.lock.Unlock() for i, j := range s.jobs { fmt.Fprintf(w, "%3d %v %v\n", i, j.inFile, j.webAddr) } } func (s *stateTab) RenderHTML(w io.Writer) { s.lock.Lock() defer s.lock.Unlock() fmt.Fprintln(w, ` `+engine.CSS+` mumax3 queue status

`)

	hostname := "localhost"
	hostname, _ = os.Hostname()
	for _, j := range s.jobs {
		if j.webAddr != "" {
			fmt.Fprint(w, ``, j.uid, ` `, j.inFile, " ", j.webAddr, "\n")
		} else {
			fmt.Fprint(w, j.uid, " ", j.inFile, "\n")
		}
	}

	fmt.Fprintln(w, `

`) } func (s *stateTab) ListenAndServe(addr string) { http.Handle("/", s) go http.ListenAndServe(addr, nil) } func (s *stateTab) ServeHTTP(w http.ResponseWriter, r *http.Request) { s.RenderHTML(w) } 3-3.11.1/cmd/mumax3/vet.go000066400000000000000000000010341503346766200150530ustar00rootroot00000000000000package main import ( "flag" "fmt" "os" "github.com/mumax/3/engine" "github.com/mumax/3/util" ) // check all input files for errors, don't run. func vet() { status := 0 for _, f := range flag.Args() { src, ioerr := os.ReadFile(f) util.FatalErr(ioerr) engine.World.EnterScope() // avoid name collisions between separate files _, err := engine.World.Compile(string(src)) engine.World.ExitScope() if err != nil { fmt.Println(f, ":", err) status = 1 } else { fmt.Println(f, ":", "OK") } } os.Exit(status) } 3-3.11.1/cuda/000077500000000000000000000000001503346766200126675ustar00rootroot000000000000003-3.11.1/cuda/.gitignore000066400000000000000000000000411503346766200146520ustar00rootroot00000000000000*.ptx cuda2go cuda2go.exe *.orig 3-3.11.1/cuda/Makefile000066400000000000000000000054521503346766200143350ustar00rootroot00000000000000# Builds mumax3 cuda kernels and create GO wrappers for the compute capabilities listed in $CUDA_CC. # If $CUDA_CC is not defined, then $CUDA_CC is set to "50". # # The ${CUDA_HOME}/bin/nvcc compiler is used to compile the cuda kernels. If CUDA_HOME is not defined # it will look for an nvidia compiler in $PATH instead. # # Examples: # # make # make CUDA_CC=70 # make CUDA_CC="50 52 53 60 61 62 70 72 75 80 86" # make CUDA_HOME="/usr/local/cuda-12.6" CUDA_CC="50 52 53 60 61 62 70 72 75 80 86" # # Different CUDA versions support different compute capabilities, as shown in the list below. See https://stackoverflow.com/a/28933055. 
# CUDA SDK 10.0 support for compute capability 30 32 35 37 50 52 53 60 61 62 70 72 75 # CUDA SDK 10.1 support for compute capability 30 32 35 37 50 52 53 60 61 62 70 72 75 # CUDA SDK 10.2 support for compute capability 30 32 35 37 50 52 53 60 61 62 70 72 75 # CUDA SDK 11.0 support for compute capability 35 37 50 52 53 60 61 62 70 72 75 80 # CUDA SDK 11.1-11.7 support for compute capability 35 37 50 52 53 60 61 62 70 72 75 80 86 # CUDA SDK 11.8 support for compute capability 35 37 50 52 53 60 61 62 70 72 75 80 86 87 89 # CUDA SDK 12.0+ support for compute capability 50 52 53 60 61 62 70 72 75 80 86 87 89 90 SHELL = /bin/bash # When CUDA_HOME is not an environment variable and is not set on the command line, use the nvcc compiler # from the PATH ifeq ($(CUDA_HOME),) NVCC=nvcc else NVCC=${CUDA_HOME}/bin/nvcc endif # When CUDA_CC is not an environment variable and is not set on the command line, use compute capability 5.0 ifeq ($(CUDA_CC),) CUDA_CC = 50 # Lowest supported CC for mumax3.11 endif # The gcc host compiler for nvcc ifeq ($(NVCC_CCBIN),) override NVCC_CCBIN:=/usr/bin/gcc endif CUDA_VERSION := $(shell $(NVCC) --version | grep "Cuda compilation" | grep -Eo '[+-]?[0-9]+([.][0-9]+)?' 
| head -n 1) NVCC_COMPATIBILITY_FLAGS := -std=c++03 ifneq (,$(filter 7.0 7.5 8.0,$(CUDA_VERSION))) NVCC_COMPATIBILITY_FLAGS := endif NVCCFLAGS = $(NVCC_COMPATIBILITY_FLAGS) -ccbin=$(NVCC_CCBIN) --compiler-options -Werror --compiler-options -Wall -Xptxas -O3 -ptx CUDAFILES := $(wildcard *.cu) WRAPPERS := $(CUDAFILES:.cu=_wrapper.go) .PHONY: all wrappers clean realclean all: wrappers @echo "Built with CUDA version ${CUDA_VERSION}" go install -v wrappers: $(WRAPPERS) %_wrapper.go: %.cu cuda2go @ rm -f $(basename $<)*.ptx @ for cc in $(CUDA_CC); do \ echo $(NVCC) $(NVCCFLAGS) -arch=compute_$$cc -code=sm_$$cc $< -o $(basename $<)_$$cc.ptx ;\ $(NVCC) $(NVCCFLAGS) -arch=compute_$$cc -code=sm_$$cc $< -o $(basename $<)_$$cc.ptx ;\ done @ ./cuda2go $< > /dev/null @ gofmt -w -s -l $@ > /dev/null cuda2go: cuda2go.go go build $< clean: rm -vf *.ptx realclean: rm -vf *_wrapper.go *.ptx cuda2go3-3.11.1/cuda/alloc.go000066400000000000000000000010421503346766200143050ustar00rootroot00000000000000package cuda import ( "log" "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/data" ) // Wrapper for cu.MemAlloc, fatal exit on out of memory. func MemAlloc(bytes int64) unsafe.Pointer { defer func() { err := recover() if err == cu.ERROR_OUT_OF_MEMORY { log.Fatal(err) } if err != nil { panic(err) } }() return unsafe.Pointer(uintptr(cu.MemAlloc(bytes))) } // Returns a copy of in, allocated on GPU. func GPUCopy(in *data.Slice) *data.Slice { s := NewSlice(in.NComp(), in.Size()) data.Copy(s, in) return s } 3-3.11.1/cuda/amul.h000066400000000000000000000013701503346766200137770ustar00rootroot00000000000000#ifndef _AMUL_H_ #define _AMUL_H_ #include "float3.h" // Returns mul * arr[i], or mul when arr == NULL; inline __device__ float amul(float *arr, float mul, int i) { return (arr == NULL)? 
// AddCubicAnisotropy2 adds the cubic magnetocrystalline anisotropy field to
// Beff by launching the k_addcubicanisotropy2_async kernel wrapper
// (see cubicanisotropy2.cu).
//
// Beff and m must have the same size; that is the only check made here.
// Msat, k1, k2 and k3 are passed as component 0 (device pointer plus
// multiplier); c1 and c2 are passed as their X, Y and Z components.
//
// NOTE(review): unlike AddUniaxialAnisotropy2, the MSlice arguments are not
// validated with checkSize here -- confirm whether that is intended.
func AddCubicAnisotropy2(Beff, m *data.Slice, Msat, k1, k2, k3, c1, c2 MSlice) {
	util.Argument(Beff.Size() == m.Size())

	// 1D launch configuration over Beff.Len() elements.
	N := Beff.Len()
	cfg := make1DConf(N)

	// The argument order below must match the generated kernel wrapper exactly.
	k_addcubicanisotropy2_async(
		Beff.DevPtr(X), Beff.DevPtr(Y), Beff.DevPtr(Z),
		m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z),
		Msat.DevPtr(0), Msat.Mul(0),
		k1.DevPtr(0), k1.Mul(0),
		k2.DevPtr(0), k2.Mul(0),
		k3.DevPtr(0), k3.Mul(0),
		c1.DevPtr(X), c1.Mul(X),
		c1.DevPtr(Y), c1.Mul(Y),
		c1.DevPtr(Z), c1.Mul(Z),
		c2.DevPtr(X), c2.Mul(X),
		c2.DevPtr(Y), c2.Mul(Y),
		c2.DevPtr(Z), c2.Mul(Z),
		N, cfg)
}
// AddUniaxialAnisotropy2 adds the uniaxial magnetocrystalline anisotropy field
// to Beff by launching the k_adduniaxialanisotropy2_async kernel wrapper
// (see uniaxialanisotropy2.cu).
//
// Beff and m must have the same size. Msat, k1 and k2 are passed as
// component 0 (device pointer plus multiplier); the axis u is passed as its
// X, Y and Z components.
func AddUniaxialAnisotropy2(Beff, m *data.Slice, Msat, k1, k2, u MSlice) {
	util.Argument(Beff.Size() == m.Size())
	// NOTE(review): presumably checkSize verifies that all arguments share
	// Beff's size, which would make the check above redundant -- confirm
	// against the checkSize implementation.
	checkSize(Beff, m, k1, k2, u, Msat)

	// 1D launch configuration over Beff.Len() elements.
	N := Beff.Len()
	cfg := make1DConf(N)

	// The argument order below must match the generated kernel wrapper exactly.
	k_adduniaxialanisotropy2_async(
		Beff.DevPtr(X), Beff.DevPtr(Y), Beff.DevPtr(Z),
		m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z),
		Msat.DevPtr(0), Msat.Mul(0),
		k1.DevPtr(0), k1.Mul(0),
		k2.DevPtr(0), k2.Mul(0),
		u.DevPtr(X), u.Mul(X),
		u.DevPtr(Y), u.Mul(Y),
		u.DevPtr(Z), u.Mul(Z),
		N, cfg)
}
To be returned to the pool with Recycle func Buffer(nComp int, size [3]int) *data.Slice { if Synchronous { Sync() } ptrs := make([]unsafe.Pointer, nComp) // re-use as many buffers as possible form our stack N := prod(size) pool := buf_pool[N] nFromPool := iMin(nComp, len(pool)) for i := 0; i < nFromPool; i++ { ptrs[i] = pool[len(pool)-i-1] } buf_pool[N] = pool[:len(pool)-nFromPool] // allocate as much new memory as needed for i := nFromPool; i < nComp; i++ { if len(buf_check) >= buf_max { log.Panic("too many buffers in use, possible memory leak") } ptrs[i] = MemAlloc(int64(cu.SIZEOF_FLOAT32 * N)) buf_check[ptrs[i]] = struct{}{} // mark this pointer as mine } return data.SliceFromPtrs(size, data.GPUMemory, ptrs) } // Returns a buffer obtained from GetBuffer to the pool. func Recycle(s *data.Slice) { if Synchronous { Sync() } N := s.Len() pool := buf_pool[N] // put each component buffer back on the stack for i := 0; i < s.NComp(); i++ { ptr := s.DevPtr(i) if ptr == unsafe.Pointer(uintptr(0)) { continue } if _, ok := buf_check[ptr]; !ok { log.Panic("recyle: was not obtained with getbuffer") } pool = append(pool, ptr) } s.Disable() // make it unusable, protect against accidental use after recycle buf_pool[N] = pool } // Frees all buffers. Called after mesh resize. 
func FreeBuffers() { Sync() for _, size := range buf_pool { for i := range size { cu.DevicePtr(uintptr(size[i])).Free() size[i] = nil } } buf_pool = make(map[int][]unsafe.Pointer) buf_check = make(map[unsafe.Pointer]struct{}) } 3-3.11.1/cuda/buffer_test.go000066400000000000000000000010561503346766200155300ustar00rootroot00000000000000package cuda import "testing" // In case of memory leak, this will crash func TestBuffer(t *testing.T) { m1 := [3]int{2, 1024, 2048} m2 := [3]int{4, 1024, 2048} a := Buffer(3, m1) b := Buffer(3, m2) c := Buffer(1, m1) d := Buffer(2, m2) Recycle(a) Recycle(b) Recycle(c) Recycle(d) for i := 0; i < 10000; i++ { b := Buffer(3, m2) Recycle(b) } } func BenchmarkBuffer(b *testing.B) { b.StopTimer() m := [3]int{2, 1024, 2048} a := Buffer(3, m) Recycle(a) b.StartTimer() for i := 0; i < b.N; i++ { a := Buffer(3, m) Recycle(a) } } 3-3.11.1/cuda/bytes.go000066400000000000000000000034061503346766200143470ustar00rootroot00000000000000package cuda // This file provides GPU byte slices, used to store regions. import ( "log" "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/util" ) // 3D byte slice, used for region lookup. type Bytes struct { Ptr unsafe.Pointer Len int } // Construct new byte slice with given length, // initialised to zeros. func NewBytes(Len int) *Bytes { ptr := cu.MemAlloc(int64(Len)) cu.MemsetD8(cu.DevicePtr(ptr), 0, int64(Len)) return &Bytes{unsafe.Pointer(uintptr(ptr)), Len} } // Upload src (host) to dst (gpu). func (dst *Bytes) Upload(src []byte) { util.Argument(dst.Len == len(src)) MemCpyHtoD(dst.Ptr, unsafe.Pointer(&src[0]), int64(dst.Len)) } // Copy on device: dst = src. func (dst *Bytes) Copy(src *Bytes) { util.Argument(dst.Len == src.Len) MemCpy(dst.Ptr, src.Ptr, int64(dst.Len)) } // Copy to host: dst = src. func (src *Bytes) Download(dst []byte) { util.Argument(src.Len == len(dst)) MemCpyDtoH(unsafe.Pointer(&dst[0]), src.Ptr, int64(src.Len)) } // Set one element to value. 
// data.Index can be used to find the index for x,y,z. func (dst *Bytes) Set(index int, value byte) { if index < 0 || index >= dst.Len { log.Panic("Bytes.Set: index out of range:", index) } src := value MemCpyHtoD(unsafe.Pointer(uintptr(dst.Ptr)+uintptr(index)), unsafe.Pointer(&src), 1) } // Get one element. // data.Index can be used to find the index for x,y,z. func (src *Bytes) Get(index int) byte { if index < 0 || index >= src.Len { log.Panic("Bytes.Set: index out of range:", index) } var dst byte MemCpyDtoH(unsafe.Pointer(&dst), unsafe.Pointer(uintptr(src.Ptr)+uintptr(index)), 1) return dst } // Frees the GPU memory and disables the slice. func (b *Bytes) Free() { if b.Ptr != nil { cu.MemFree(cu.DevicePtr(uintptr(b.Ptr))) } b.Ptr = nil b.Len = 0 } 3-3.11.1/cuda/constants.h000066400000000000000000000006061503346766200150560ustar00rootroot00000000000000#ifndef _CONSTANTS_H_ #define _CONSTANTS_H_ #define PI 3.1415926535897932384626433 #define MU0 (4*PI*1e-7) // Permeability of vacuum in Tm/A #define QE 1.60217646E-19 // Electron charge in C #define MUB 9.2740091523E-24 // Bohr magneton in J/T #define GAMMA0 1.7595e11 // Gyromagnetic ratio of electron, in rad/Ts #define HBAR 1.05457173E-34 #endif 3-3.11.1/cuda/conv_common.go000066400000000000000000000032311503346766200155320ustar00rootroot00000000000000package cuda // common code for all convolutions. import ( "log" "math" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Output size of R2C FFT with given logic size, expressed in floats. func fftR2COutputSizeFloats(logicSize [3]int) [3]int { return [3]int{2 * (logicSize[X]/2 + 1), logicSize[Y], logicSize[Z]} } // product of elements func prod(size [3]int) int { return size[X] * size[Y] * size[Z] } // Extract real parts, copy them from src to dst. // In the meanwhile, check if imaginary parts are nearly zero // and scale the kernel to compensate for unnormalized FFTs. // scale = 1/N, with N the FFT logical size. 
// fabs returns the absolute value of a float32.
// Values that do not compare below zero are returned unchanged.
func fabs(x float32) float32 {
	abs := x
	if x < 0 {
		abs = -x
	}
	return abs
}
func copyPadMul(dst, src, vol *data.Slice, dstsize, srcsize [3]int, Msat MSlice) { util.Argument(dst.NComp() == 1 && src.NComp() == 1) util.Assert(dst.Len() == prod(dstsize) && src.Len() == prod(srcsize)) cfg := make3DConf(srcsize) k_copypadmul2_async(dst.DevPtr(0), dstsize[X], dstsize[Y], dstsize[Z], src.DevPtr(0), srcsize[X], srcsize[Y], srcsize[Z], Msat.DevPtr(0), Msat.Mul(0), vol.DevPtr(0), cfg) } 3-3.11.1/cuda/conv_demag.go000066400000000000000000000137411503346766200153260ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Stores the necessary state to perform FFT-accelerated convolution // with magnetostatic kernel (or other kernel of same symmetry). type DemagConvolution struct { inputSize [3]int // 3D size of the input/output data realKernSize [3]int // Size of kernel and logical FFT size. fftKernLogicSize [3]int // logic size FFTed kernel, real parts only, we store less fftRBuf [3]*data.Slice // FFT input buf; 2D: Z shares storage with X. fftCBuf [3]*data.Slice // FFT output buf; 2D: Z shares storage with X. kern [3][3]*data.Slice // FFT kernel on device fwPlan fft3DR2CPlan // Forward FFT (1 component) bwPlan fft3DC2RPlan // Backward FFT (1 component) } // Initializes a convolution to evaluate the demag field for the given mesh geometry. // Sanity-checked if test == true (slow-ish for large meshes). func NewDemag(inputSize, PBC [3]int, kernel [3][3]*data.Slice, test bool) *DemagConvolution { c := new(DemagConvolution) c.inputSize = inputSize c.realKernSize = kernel[X][X].Size() c.init(kernel) if test { testConvolution(c, PBC, kernel) } return c } // Calculate the demag field of m * vol * Msat, store result in B. 
// // m: magnetization normalized to unit length // vol: unitless mask used to scale m's length, may be nil // Msat: saturation magnetization in A/m // B: resulting demag field, in Tesla func (c *DemagConvolution) Exec(B, m, vol *data.Slice, Msat MSlice) { util.Argument(B.Size() == c.inputSize && m.Size() == c.inputSize) if c.is2D() { c.exec2D(B, m, vol, Msat) } else { c.exec3D(B, m, vol, Msat) } } func (c *DemagConvolution) exec3D(outp, inp, vol *data.Slice, Msat MSlice) { for i := 0; i < 3; i++ { // FW FFT c.fwFFT(i, inp, vol, Msat) } // kern mul kernMulRSymm3D_async(c.fftCBuf, c.kern[X][X], c.kern[Y][Y], c.kern[Z][Z], c.kern[Y][Z], c.kern[X][Z], c.kern[X][Y], c.fftKernLogicSize[X], c.fftKernLogicSize[Y], c.fftKernLogicSize[Z]) for i := 0; i < 3; i++ { // BW FFT c.bwFFT(i, outp) } } func (c *DemagConvolution) exec2D(outp, inp, vol *data.Slice, Msat MSlice) { // Convolution is separated into // a 1D convolution for z and a 2D convolution for xy. // So only 2 FFT buffers are needed at the same time. 
Nx, Ny := c.fftKernLogicSize[X], c.fftKernLogicSize[Y] // Z c.fwFFT(Z, inp, vol, Msat) kernMulRSymm2Dz_async(c.fftCBuf[Z], c.kern[Z][Z], Nx, Ny) c.bwFFT(Z, outp) // XY c.fwFFT(X, inp, vol, Msat) c.fwFFT(Y, inp, vol, Msat) kernMulRSymm2Dxy_async(c.fftCBuf[X], c.fftCBuf[Y], c.kern[X][X], c.kern[Y][Y], c.kern[X][Y], Nx, Ny) c.bwFFT(X, outp) c.bwFFT(Y, outp) } func (c *DemagConvolution) is2D() bool { return c.inputSize[Z] == 1 } // zero 1-component slice func zero1_async(dst *data.Slice) { cu.MemsetD32Async(cu.DevicePtr(uintptr(dst.DevPtr(0))), 0, int64(dst.Len()), stream0) } // forward FFT component i func (c *DemagConvolution) fwFFT(i int, inp, vol *data.Slice, Msat MSlice) { zero1_async(c.fftRBuf[i]) in := inp.Comp(i) copyPadMul(c.fftRBuf[i], in, vol, c.realKernSize, c.inputSize, Msat) c.fwPlan.ExecAsync(c.fftRBuf[i], c.fftCBuf[i]) } // backward FFT component i func (c *DemagConvolution) bwFFT(i int, outp *data.Slice) { c.bwPlan.ExecAsync(c.fftCBuf[i], c.fftRBuf[i]) out := outp.Comp(i) copyUnPad(out, c.fftRBuf[i], c.inputSize, c.realKernSize) } func (c *DemagConvolution) init(realKern [3][3]*data.Slice) { // init device buffers // 2D re-uses fftBuf[X] as fftBuf[Z], 3D needs all 3 fftBufs. 
nc := fftR2COutputSizeFloats(c.realKernSize) c.fftCBuf[X] = NewSlice(1, nc) c.fftCBuf[Y] = NewSlice(1, nc) if c.is2D() { c.fftCBuf[Z] = c.fftCBuf[X] } else { c.fftCBuf[Z] = NewSlice(1, nc) } c.fftRBuf[X] = NewSlice(1, c.realKernSize) c.fftRBuf[Y] = NewSlice(1, c.realKernSize) if c.is2D() { c.fftRBuf[Z] = c.fftRBuf[X] } else { c.fftRBuf[Z] = NewSlice(1, c.realKernSize) } // init FFT plans c.fwPlan = newFFT3DR2C(c.realKernSize[X], c.realKernSize[Y], c.realKernSize[Z]) c.bwPlan = newFFT3DC2R(c.realKernSize[X], c.realKernSize[Y], c.realKernSize[Z]) // init FFT kernel // logic size of FFT(kernel): store real parts only c.fftKernLogicSize = fftR2COutputSizeFloats(c.realKernSize) util.Assert(c.fftKernLogicSize[X]%2 == 0) c.fftKernLogicSize[X] /= 2 // physical size of FFT(kernel): store only non-redundant part exploiting Y, Z mirror symmetry // X mirror symmetry already exploited: FFT(kernel) is purely real. physKSize := [3]int{c.fftKernLogicSize[X], c.fftKernLogicSize[Y]/2 + 1, c.fftKernLogicSize[Z]/2 + 1} output := c.fftCBuf[0] input := c.fftRBuf[0] fftKern := data.NewSlice(1, physKSize) kfull := data.NewSlice(1, output.Size()) // not yet exploiting symmetry kfulls := kfull.Scalars() kCSize := physKSize kCSize[X] *= 2 // size of kernel after removing Y,Z redundant parts, but still complex kCmplx := data.NewSlice(1, kCSize) // not yet exploiting X symmetry kc := kCmplx.Scalars() for i := 0; i < 3; i++ { for j := i; j < 3; j++ { // upper triangular part if realKern[i][j] != nil { // ignore 0's // FW FFT data.Copy(input, realKern[i][j]) c.fwPlan.ExecAsync(input, output) data.Copy(kfull, output) // extract non-redundant part (Y,Z symmetry) for iz := 0; iz < kCSize[Z]; iz++ { for iy := 0; iy < kCSize[Y]; iy++ { for ix := 0; ix < kCSize[X]; ix++ { kc[iz][iy][ix] = kfulls[iz][iy][ix] } } } // extract real parts (X symmetry) scaleRealParts(fftKern, kCmplx, 1/float32(c.fwPlan.InputLen())) c.kern[i][j] = GPUCopy(fftKern) } } } } func (c *DemagConvolution) Free() { if c == nil { 
return } c.inputSize = [3]int{} c.realKernSize = [3]int{} for i := 0; i < 3; i++ { c.fftCBuf[i].Free() c.fftRBuf[i].Free() c.fftCBuf[i] = nil c.fftRBuf[i] = nil for j := 0; j < 3; j++ { c.kern[i][j].Free() c.kern[i][j] = nil } c.fwPlan.Free() c.bwPlan.Free() cudaCtx.SetCurrent() } } 3-3.11.1/cuda/conv_kernmul.go000066400000000000000000000034161503346766200157240ustar00rootroot00000000000000package cuda // Kernel multiplication for purely real kernel, symmetric around Y axis (apart from first row). // Launch configs range over all complex elements of fft input. This could be optimized: range only over kernel. import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) // kernel multiplication for 3D demag convolution, exploiting full kernel symmetry. func kernMulRSymm3D_async(fftM [3]*data.Slice, Kxx, Kyy, Kzz, Kyz, Kxz, Kxy *data.Slice, Nx, Ny, Nz int) { util.Argument(fftM[X].NComp() == 1 && Kxx.NComp() == 1) cfg := make3DConf([3]int{Nx, Ny, Nz}) k_kernmulRSymm3D_async(fftM[X].DevPtr(0), fftM[Y].DevPtr(0), fftM[Z].DevPtr(0), Kxx.DevPtr(0), Kyy.DevPtr(0), Kzz.DevPtr(0), Kyz.DevPtr(0), Kxz.DevPtr(0), Kxy.DevPtr(0), Nx, Ny, Nz, cfg) } // kernel multiplication for 2D demag convolution on X and Y, exploiting full kernel symmetry. func kernMulRSymm2Dxy_async(fftMx, fftMy, Kxx, Kyy, Kxy *data.Slice, Nx, Ny int) { util.Argument(fftMy.NComp() == 1 && Kxx.NComp() == 1) cfg := make3DConf([3]int{Nx, Ny, 1}) k_kernmulRSymm2Dxy_async(fftMx.DevPtr(0), fftMy.DevPtr(0), Kxx.DevPtr(0), Kyy.DevPtr(0), Kxy.DevPtr(0), Nx, Ny, cfg) } // kernel multiplication for 2D demag convolution on Z, exploiting full kernel symmetry. func kernMulRSymm2Dz_async(fftMz, Kzz *data.Slice, Nx, Ny int) { util.Argument(fftMz.NComp() == 1 && Kzz.NComp() == 1) cfg := make3DConf([3]int{Nx, Ny, 1}) k_kernmulRSymm2Dz_async(fftMz.DevPtr(0), Kzz.DevPtr(0), Nx, Ny, cfg) } // kernel multiplication for general 1D convolution. Does not assume any symmetry. // Used for MFM images. 
func kernMulC_async(fftM, K *data.Slice, Nx, Ny int) { util.Argument(fftM.NComp() == 1 && K.NComp() == 1) cfg := make3DConf([3]int{Nx, Ny, 1}) k_kernmulC_async(fftM.DevPtr(0), K.DevPtr(0), Nx, Ny, cfg) } 3-3.11.1/cuda/conv_mfm.go000066400000000000000000000056541503346766200150340ustar00rootroot00000000000000package cuda // Generation of Magnetic Force Microscopy images. import ( "github.com/mumax/3/data" "github.com/mumax/3/mag" ) // Stores the necessary state to perform FFT-accelerated convolution type MFMConvolution struct { size [3]int // 3D size of the input/output data kernSize [3]int // Size of kernel and logical FFT size. fftKernSize [3]int // fftRBuf *data.Slice // FFT input buf for FFT, shares storage with fftCBuf. fftCBuf *data.Slice // FFT output buf, shares storage with fftRBuf gpuFFTKern [3]*data.Slice // FFT kernel on device fwPlan fft3DR2CPlan // Forward FFT (1 component) bwPlan fft3DC2RPlan // Backward FFT (1 component) kern [3]*data.Slice // Real-space kernel (host) mesh *data.Mesh } func (c *MFMConvolution) Free() { if c == nil { return } c.size = [3]int{} c.kernSize = [3]int{} c.fftCBuf.Free() // shared with fftRbuf c.fftCBuf = nil c.fftRBuf = nil for j := 0; j < 3; j++ { c.gpuFFTKern[j].Free() c.gpuFFTKern[j] = nil c.kern[j] = nil } c.fwPlan.Free() c.bwPlan.Free() cudaCtx.SetCurrent() } func (c *MFMConvolution) init() { // init FFT plans padded := c.kernSize c.fwPlan = newFFT3DR2C(padded[X], padded[Y], padded[Z]) c.bwPlan = newFFT3DC2R(padded[X], padded[Y], padded[Z]) // init device buffers nc := fftR2COutputSizeFloats(c.kernSize) c.fftCBuf = NewSlice(1, nc) c.fftRBuf = NewSlice(1, c.kernSize) c.gpuFFTKern[X] = NewSlice(1, nc) c.gpuFFTKern[Y] = NewSlice(1, nc) c.gpuFFTKern[Z] = NewSlice(1, nc) c.initFFTKern3D() } func (c *MFMConvolution) initFFTKern3D() { c.fftKernSize = fftR2COutputSizeFloats(c.kernSize) for i := 0; i < 3; i++ { zero1_async(c.fftRBuf) data.Copy(c.fftRBuf, c.kern[i]) c.fwPlan.ExecAsync(c.fftRBuf, c.fftCBuf) scale := 2 / 
float32(c.fwPlan.InputLen()) // ?? zero1_async(c.gpuFFTKern[i]) Madd2(c.gpuFFTKern[i], c.gpuFFTKern[i], c.fftCBuf, 0, scale) } } // store MFM image in output, based on magnetization in inp. func (c *MFMConvolution) Exec(outp, inp, vol *data.Slice, Msat MSlice) { for i := 0; i < 3; i++ { zero1_async(c.fftRBuf) copyPadMul(c.fftRBuf, inp.Comp(i), vol, c.kernSize, c.size, Msat) c.fwPlan.ExecAsync(c.fftRBuf, c.fftCBuf) Nx, Ny := c.fftKernSize[X]/2, c.fftKernSize[Y] // ?? kernMulC_async(c.fftCBuf, c.gpuFFTKern[i], Nx, Ny) c.bwPlan.ExecAsync(c.fftCBuf, c.fftRBuf) copyUnPad(outp.Comp(i), c.fftRBuf, c.size, c.kernSize) } } func (c *MFMConvolution) Reinit(lift, tipsize float64, cachedir string) { c.kern = mag.MFMKernel(c.mesh, lift, tipsize, cachedir) c.initFFTKern3D() } // Initializes a convolution to evaluate the demag field for the given mesh geometry. func NewMFM(mesh *data.Mesh, lift, tipsize float64, cachedir string) *MFMConvolution { k := mag.MFMKernel(mesh, lift, tipsize, cachedir) size := mesh.Size() c := new(MFMConvolution) c.size = size c.kern = k c.kernSize = k[X].Size() c.init() c.mesh = mesh return c } 3-3.11.1/cuda/conv_selftest.go000066400000000000000000000072611503346766200161020ustar00rootroot00000000000000package cuda // Convolution self-test, performed once at the start of each simulation import ( "math/rand" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Compares FFT-accelerated convolution against brute-force on sparse data. // This is not really needed but very quickly uncovers newly introduced bugs. func testConvolution(c *DemagConvolution, PBC [3]int, realKern [3][3]*data.Slice) { if PBC != [3]int{0, 0, 0} { // the brute-force method does not work for pbc. 
util.Log("skipping convolution self-test for PBC") return } util.Log("//convolution self-test...") inhost := data.NewSlice(3, c.inputSize) initConvTestInput(inhost.Vectors()) gpu := NewSlice(3, c.inputSize) defer gpu.Free() data.Copy(gpu, inhost) Msat := NewSlice(1, [3]int{1, 1, 256}) defer Msat.Free() Memset(Msat, 1) vol := data.NilSlice(1, c.inputSize) c.Exec(gpu, gpu, vol, ToMSlice(Msat)) output := gpu.HostCopy() brute := data.NewSlice(3, c.inputSize) bruteConv(inhost.Vectors(), brute.Vectors(), realKern) a, b := output.Host(), brute.Host() err := float32(0) for c := range a { for i := range a[c] { if fabs(a[c][i]-b[c][i]) > err { err = fabs(a[c][i] - b[c][i]) } } } if err > CONV_TOLERANCE { util.Fatal("convolution self-test tolerance: ", err, " FAIL") } } // Maximum tolerable error on demag convolution self-test. const CONV_TOLERANCE = 1e-6 // Brute-force O(N²) vector convolution on CPU. // Used to verify GPU FFT convolution. // Input better be sparse. // A nil kernel element is interpreted as all 0s. // Kernel indices are destination index, source index. 
// wrap maps number onto the half-open range [0, max), like Python's modulus:
// the result is never negative, and max itself wraps to 0.
//
// max must be positive. The previous implementation added or subtracted max
// in a loop, which took O(|number|/max) steps and never terminated for
// max <= 0; the remainder operator gives the same result for every
// positive max in constant time.
func wrap(number, max int) int {
	r := number % max
	if r < 0 {
		// Go's % takes the sign of the dividend; shift into [0, max).
		r += max
	}
	return r
}
func initConvTestInput(input [3][][][]float32) { rng := rand.New(rand.NewSource(0)) // reproducible tests size := sizeOf(input[0]) Nx, Ny, Nz := size[X], size[Y], size[Z] ixs := [...]int{0, Nx / 5, Nx / 2, Nx - 1} iys := [...]int{0, Ny / 7, Ny / 2, Ny - 1} izs := [...]int{0, Nz / 11, Nz / 2, Nz - 1} for c := range input { for _, i := range izs { for _, j := range iys { for _, k := range ixs { input[c][i][j][k] = 1 - 2*rng.Float32() } } } } } // Returns the x, y, z size of block func sizeOf(block [][][]float32) [3]int { return [3]int{len(block[0][0]), len(block[0]), len(block)} } 3-3.11.1/cuda/copypadmul2.cu000066400000000000000000000014761503346766200154670ustar00rootroot00000000000000#include "amul.h" #include "constants.h" #include "stencil.h" #include // Copy src (size S, smaller) into dst (size D, larger), // and multiply by Bsat * vol extern "C" __global__ void copypadmul2(float* __restrict__ dst, int Dx, int Dy, int Dz, float* __restrict__ src, int Sx, int Sy, int Sz, float* __restrict__ Ms_, float Ms_mul, float* __restrict__ vol) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, 
%r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; $L__BB0_3: setp.eq.s64 %p7, %rd4, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; bra.uni $L__BB0_6; $L__BB0_5: mov.f32 %f13, 0f3F800000; $L__BB0_6: cvta.to.global.u64 %rd11, %rd2; mul.wide.s32 %rd12, %r4, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f8, [%rd13]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f13, %f9; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f11; $L__BB0_7: ret; } ` copypadmul2_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, 
[copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; $L__BB0_3: setp.eq.s64 %p7, %rd4, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; bra.uni $L__BB0_6; $L__BB0_5: mov.f32 %f13, 0f3F800000; $L__BB0_6: cvta.to.global.u64 %rd11, %rd2; mul.wide.s32 %rd12, %r4, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f8, [%rd13]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f13, %f9; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f11; $L__BB0_7: ret; } ` copypadmul2_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 
) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; $L__BB0_3: setp.eq.s64 %p7, %rd4, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; bra.uni $L__BB0_6; $L__BB0_5: mov.f32 %f13, 0f3F800000; $L__BB0_6: cvta.to.global.u64 %rd11, %rd2; mul.wide.s32 %rd12, %r4, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f8, [%rd13]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f13, %f9; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f11; $L__BB0_7: ret; } ` copypadmul2_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl 
copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; $L__BB0_3: setp.eq.s64 %p7, %rd4, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; bra.uni $L__BB0_6; $L__BB0_5: mov.f32 %f13, 0f3F800000; $L__BB0_6: cvta.to.global.u64 %rd11, %rd2; mul.wide.s32 %rd12, %r4, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f8, [%rd13]; cvt.f64.f32 %fd1, 
%f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f13, %f9; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f11; $L__BB0_7: ret; } ` copypadmul2_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; 
mul.f32 %f12, %f6, %f12; $L__BB0_3: setp.eq.s64 %p7, %rd4, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; bra.uni $L__BB0_6; $L__BB0_5: mov.f32 %f13, 0f3F800000; $L__BB0_6: cvta.to.global.u64 %rd11, %rd2; mul.wide.s32 %rd12, %r4, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f8, [%rd13]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f13, %f9; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f11; $L__BB0_7: ret; } ` copypadmul2_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; 
mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; $L__BB0_3: setp.eq.s64 %p7, %rd4, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; bra.uni $L__BB0_6; $L__BB0_5: mov.f32 %f13, 0f3F800000; $L__BB0_6: cvta.to.global.u64 %rd11, %rd2; mul.wide.s32 %rd12, %r4, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f8, [%rd13]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f13, %f9; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f11; $L__BB0_7: ret; } ` copypadmul2_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; 
ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; $L__BB0_3: setp.eq.s64 %p7, %rd4, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; bra.uni $L__BB0_6; $L__BB0_5: mov.f32 %f13, 0f3F800000; $L__BB0_6: cvta.to.global.u64 %rd11, %rd2; mul.wide.s32 %rd12, %r4, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f8, [%rd13]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f13, %f9; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f11; $L__BB0_7: ret; } ` copypadmul2_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 
copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; $L__BB0_3: setp.eq.s64 %p7, %rd4, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; bra.uni $L__BB0_6; $L__BB0_5: mov.f32 %f13, 0f3F800000; $L__BB0_6: cvta.to.global.u64 %rd11, %rd2; mul.wide.s32 %rd12, %r4, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f8, [%rd13]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f13, %f9; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f11; $L__BB0_7: ret; } ` copypadmul2_ptx_75 = ` .version 8.5 .target sm_75 
.address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; $L__BB0_3: setp.eq.s64 %p7, %rd4, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; bra.uni $L__BB0_6; $L__BB0_5: mov.f32 %f13, 0f3F800000; $L__BB0_6: cvta.to.global.u64 %rd11, %rd2; mul.wide.s32 %rd12, %r4, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f8, 
[%rd13]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f13, %f9; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f11; $L__BB0_7: ret; } ` copypadmul2_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; 
ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; $L__BB0_3: setp.eq.s64 %p7, %rd4, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; bra.uni $L__BB0_6; $L__BB0_5: mov.f32 %f13, 0f3F800000; $L__BB0_6: cvta.to.global.u64 %rd11, %rd2; mul.wide.s32 %rd12, %r4, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f8, [%rd13]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f13, %f9; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f11; $L__BB0_7: ret; } ` copypadmul2_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, 
%ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; $L__BB0_3: setp.eq.s64 %p7, %rd4, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; bra.uni $L__BB0_6; $L__BB0_5: mov.f32 %f13, 0f3F800000; $L__BB0_6: cvta.to.global.u64 %rd11, %rd2; mul.wide.s32 %rd12, %r4, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f8, [%rd13]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f13, %f9; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f11; $L__BB0_7: ret; } ` copypadmul2_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; 
ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; $L__BB0_3: setp.eq.s64 %p7, %rd4, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; bra.uni $L__BB0_6; $L__BB0_5: mov.f32 %f13, 0f3F800000; $L__BB0_6: cvta.to.global.u64 %rd11, %rd2; mul.wide.s32 %rd12, %r4, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f8, [%rd13]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f13, %f9; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f11; $L__BB0_7: ret; } ` copypadmul2_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param 
.f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; $L__BB0_3: setp.eq.s64 %p7, %rd4, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; bra.uni $L__BB0_6; $L__BB0_5: mov.f32 %f13, 0f3F800000; $L__BB0_6: cvta.to.global.u64 %rd11, %rd2; mul.wide.s32 %rd12, %r4, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f8, [%rd13]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f13, %f9; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f11; $L__BB0_7: ret; } ` copypadmul2_ptx_90 
= ` .version 8.5 .target sm_90 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; $L__BB0_3: setp.eq.s64 %p7, %rd4, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; bra.uni $L__BB0_6; $L__BB0_5: mov.f32 %f13, 0f3F800000; $L__BB0_6: cvta.to.global.u64 %rd11, %rd2; mul.wide.s32 %rd12, %r4, 4; add.s64 %rd13, 
%rd11, %rd12; ld.global.nc.f32 %f8, [%rd13]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f13, %f9; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f11; $L__BB0_7: ret; } ` ) 3-3.11.1/cuda/copyunpad.cu000066400000000000000000000007741503346766200152320ustar00rootroot00000000000000#include "stencil.h" // Copy src (size S, larger) to dst (size D, smaller) extern "C" __global__ void copyunpad(float* __restrict__ dst, int Dx, int Dy, int Dz, float* __restrict__ src, int Sx, int Sy, int Sz) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; 
$L__BB0_2: ret; } ` copyunpad_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` copyunpad_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; 
ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` copyunpad_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 
%r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` copyunpad_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; 
mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` copyunpad_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` copyunpad_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 
copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` copyunpad_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, 
[copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` copyunpad_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 
bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` copyunpad_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` copyunpad_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // 
.globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` copyunpad_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 
%r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` copyunpad_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; 
mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` copyunpad_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; 
mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/crop.cu000066400000000000000000000007751503346766200141740ustar00rootroot00000000000000#include "stencil.h" // See crop.go extern "C" __global__ void crop(float* __restrict__ dst, int Dx, int Dy, int Dz, float* __restrict__ src, int Sx, int Sy, int Sz, int Offx, int Offy, int Offz) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r12, %r13, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r16, %r15, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r19, %r18, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` crop_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 
crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r12, %r13, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r16, %r15, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r19, %r18, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` crop_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; 
.reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r12, %r13, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r16, %r15, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r19, %r18, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` crop_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; 
ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r12, %r13, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r16, %r15, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r19, %r18, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` crop_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r12, %r13, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 
%r2, %r16, %r15, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r19, %r18, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` crop_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r12, %r13, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r16, %r15, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r19, %r18, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; 
cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` crop_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r12, %r13, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r16, %r15, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r19, %r18, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, 
%r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` crop_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r12, %r13, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r16, %r15, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r19, %r18, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` crop_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl crop .visible .entry crop( .param .u64 
crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r12, %r13, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r16, %r15, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r19, %r18, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` crop_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg 
.pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r12, %r13, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r16, %r15, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r19, %r18, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` crop_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 
%r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r12, %r13, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r16, %r15, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r19, %r18, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` crop_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r12, %r13, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 
%r17, %tid.y; mad.lo.s32 %r2, %r16, %r15, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r19, %r18, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` crop_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r12, %r13, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r16, %r15, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r19, %r18, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 
bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` crop_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r12, %r13, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r16, %r15, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r19, %r18, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 
%r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/crossproduct.cu000066400000000000000000000011631503346766200157530ustar00rootroot00000000000000#include "float3.h" extern "C" __global__ void crossproduct(float* __restrict__ dstx, float* __restrict__ dsty, float* __restrict__ dstz, float* __restrict__ ax, float* __restrict__ ay, float* __restrict__ az, float* __restrict__ bx, float* __restrict__ by, float* __restrict__ bz, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { float3 A = {ax[i], ay[i], az[i]}; float3 B = {bx[i], by[i], bz[i]}; float3 AxB = cross(A, B); dstx[i] = AxB.x; dsty[i] = AxB.y; dstz[i] = AxB.z; } } 3-3.11.1/cuda/crossproduct.go000066400000000000000000000007051503346766200157520ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) func CrossProduct(dst, a, b *data.Slice) { util.Argument(dst.NComp() == 3 && a.NComp() == 3 && b.NComp() == 3) util.Argument(dst.Len() == a.Len() && dst.Len() == b.Len()) N := dst.Len() cfg := make1DConf(N) k_crossproduct_async(dst.DevPtr(X), dst.DevPtr(Y), dst.DevPtr(Z), a.DevPtr(X), a.DevPtr(Y), a.DevPtr(Z), b.DevPtr(X), b.DevPtr(Y), b.DevPtr(Z), N, cfg) } 3-3.11.1/cuda/crossproduct_wrapper.go000066400000000000000000001101541503346766200175120ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for crossproduct kernel var crossproduct_code cu.Function // Stores the arguments for crossproduct kernel invocation type crossproduct_args_t struct { arg_dstx unsafe.Pointer arg_dsty unsafe.Pointer arg_dstz unsafe.Pointer arg_ax unsafe.Pointer arg_ay unsafe.Pointer arg_az unsafe.Pointer arg_bx unsafe.Pointer arg_by unsafe.Pointer arg_bz unsafe.Pointer arg_N int argptr [10]unsafe.Pointer sync.Mutex } // Stores the arguments for crossproduct kernel invocation var crossproduct_args crossproduct_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. crossproduct_args.argptr[0] = unsafe.Pointer(&crossproduct_args.arg_dstx) crossproduct_args.argptr[1] = unsafe.Pointer(&crossproduct_args.arg_dsty) crossproduct_args.argptr[2] = unsafe.Pointer(&crossproduct_args.arg_dstz) crossproduct_args.argptr[3] = unsafe.Pointer(&crossproduct_args.arg_ax) crossproduct_args.argptr[4] = unsafe.Pointer(&crossproduct_args.arg_ay) crossproduct_args.argptr[5] = unsafe.Pointer(&crossproduct_args.arg_az) crossproduct_args.argptr[6] = unsafe.Pointer(&crossproduct_args.arg_bx) crossproduct_args.argptr[7] = unsafe.Pointer(&crossproduct_args.arg_by) crossproduct_args.argptr[8] = unsafe.Pointer(&crossproduct_args.arg_bz) crossproduct_args.argptr[9] = unsafe.Pointer(&crossproduct_args.arg_N) } // Wrapper for crossproduct CUDA kernel, asynchronous. 
func k_crossproduct_async(dstx unsafe.Pointer, dsty unsafe.Pointer, dstz unsafe.Pointer, ax unsafe.Pointer, ay unsafe.Pointer, az unsafe.Pointer, bx unsafe.Pointer, by unsafe.Pointer, bz unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("crossproduct") } crossproduct_args.Lock() defer crossproduct_args.Unlock() if crossproduct_code == 0 { crossproduct_code = fatbinLoad(crossproduct_map, "crossproduct") } crossproduct_args.arg_dstx = dstx crossproduct_args.arg_dsty = dsty crossproduct_args.arg_dstz = dstz crossproduct_args.arg_ax = ax crossproduct_args.arg_ay = ay crossproduct_args.arg_az = az crossproduct_args.arg_bx = bx crossproduct_args.arg_by = by crossproduct_args.arg_bz = bz crossproduct_args.arg_N = N args := crossproduct_args.argptr[:] cu.LaunchKernel(crossproduct_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("crossproduct") } } // maps compute capability on PTX code for crossproduct kernel. var crossproduct_map = map[int]string{0: "", 50: crossproduct_ptx_50, 52: crossproduct_ptx_52, 53: crossproduct_ptx_53, 60: crossproduct_ptx_60, 61: crossproduct_ptx_61, 62: crossproduct_ptx_62, 70: crossproduct_ptx_70, 72: crossproduct_ptx_72, 75: crossproduct_ptx_75, 80: crossproduct_ptx_80, 86: crossproduct_ptx_86, 87: crossproduct_ptx_87, 89: crossproduct_ptx_89, 90: crossproduct_ptx_90} // crossproduct PTX code for various compute capabilities. 
const ( crossproduct_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; 
cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; $L__BB0_2: ret; } ` crossproduct_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; 
ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; $L__BB0_2: ret; } ` crossproduct_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 
%rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; $L__BB0_2: ret; } ` crossproduct_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; 
setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; $L__BB0_2: ret; } ` crossproduct_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, 
[crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; $L__BB0_2: ret; } ` crossproduct_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; 
ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; $L__BB0_2: ret; } ` crossproduct_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 
crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; $L__BB0_2: ret; } ` crossproduct_ptx_72 = ` 
.version 8.5 .target sm_72 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; 
add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; $L__BB0_2: ret; } ` crossproduct_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, 
%f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; $L__BB0_2: ret; } ` crossproduct_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; 
cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; $L__BB0_2: ret; } ` crossproduct_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra 
$L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; $L__BB0_2: ret; } ` crossproduct_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, 
[crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; $L__BB0_2: ret; } ` crossproduct_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; 
ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; $L__BB0_2: ret; } ` crossproduct_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 
crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; $L__BB0_2: ret; } ` ) 
3-3.11.1/cuda/cu/000077500000000000000000000000001503346766200132765ustar00rootroot000000000000003-3.11.1/cuda/cu/Makefile000066400000000000000000000006351503346766200147420ustar00rootroot00000000000000all: 6g gccgo doc 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. -compiler $(GCCGO) clean: go clean doc: godoc github.com/barnex/cuda5/cu > README 3-3.11.1/cuda/cu/README000066400000000000000000000551521503346766200141660ustar00rootroot00000000000000PACKAGE DOCUMENTATION package cu import "github.com/barnex/cuda5/cu" Go bindings for the CUDA driver API. CONSTANTS const ( // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor. CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO // Spin when waiting for results from the GPU. CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN // Yield its thread when waiting for results from the GPU. CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD // Block the CPU thread on a synchronization primitive when waiting for the GPU to finish work. CTX_BLOCKING_SYNC // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. CTX_MAP_HOST = C.CU_CTX_MAP_HOST //Do not reduce local memory after resizing local memory for a kernel. CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX ) Flags for CtxCreate const ( SIZEOF_FLOAT32 = 4 SIZEOF_FLOAT64 = 8 SIZEOF_COMPLEX64 = 8 SIZEOF_COMPLEX128 = 16 ) Type size in bytes FUNCTIONS func CtxDestroy(ctx *Context) Destroys the CUDA context specified by ctx. If the context usage count is not equal to 1, or the context is current to any CPU thread other than the current one, this function fails. 
Floating contexts (detached from a CPU thread via cuCtxPopCurrent()) may be destroyed by this function. func CtxDisablePeerAccess(peer Context) Reverses CtxEnablePeerAccess(). func CtxEnablePeerAccess(peer Context) Makes allocations from the peer Context available to the current context. func CtxGetApiVersion(ctx Context) (version int) Returns the API version used to create the context. func CtxSetCurrent(ctx Context) Sets the current active context. func CtxSynchronize() Blocks until the device has completed all preceding requested tasks, if the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. func DeviceCanAccessPeer(dev, peer Device) bool Returns true if CtxEnablePeerAccess can be called on a context for dev and peer. func DeviceComputeCapability(device Device) (major, minor int) Returns the compute capability of the device. func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int Gets the value of a device attribute. func DeviceGetCount() int Returns the number of devices with compute capability greater than or equal to 1.0 that are available for execution. func DeviceGetName(dev Device) string Gets the name of the device. func DeviceTotalMem(device Device) int64 Returns the total amount of memory available on the device in bytes. func FuncGetAttribute(attrib FunctionAttribute, function Function) int func Init(flags int) Initializes the CUDA driver API. Currently, flags must be 0. If Init() has not been called, any function from the driver API will panic with ERROR_NOT_INITIALIZED. func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) func MemAllocHost(bytes int64) unsafe.Pointer func MemFree(p DevicePtr) Frees device memory allocated by MemAlloc(). It is safe to double-free. 
func MemFreeHost(ptr unsafe.Pointer) func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func MemGetInfo() (free, total int64) Returns the free and total amount of memory in the current Context (in bytes). func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) Page-locks memory specified by the pointer and bytes. The pointer and byte size must be aligned to the host page size (4KB). See also: MemHostUnregister() func MemHostUnregister(ptr unsafe.Pointer) Unmaps memory locked by MemHostRegister(). func Memcpy(dst, src DevicePtr, bytes int64) Copies a number of bytes on the current device. Requires unified addressing to be supported. See also: MemcpyDtoD(). TODO(a): is actually an auto copy for device and/or host memory func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) Asynchronously copies a number of bytes on the current device. func MemcpyDtoD(dst, src DevicePtr, bytes int64) Copies a number of bytes from device to device. func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) Asynchronously copies a number of bytes from device to device. func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) Copies a number of bytes from device to host. func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) Asynchronously copies a number of bytes from device to host. The host memory must be page-locked (see MemHostRegister) func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) Copies a number of bytes from host to device. func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) Asynchronously copies a number of bytes from host to device. The host memory must be page-locked (see MemHostRegister) func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) Copies from device memory in one context (device) to another.
func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) Asynchronously copies from device memory in one context (device) to another. func MemsetD32(deviceptr DevicePtr, value uint32, N int64) Sets the first N 32-bit values of dst array to value. Asynchronous. func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) Asynchronously sets the first N 32-bit values of dst array to value. func MemsetD8(deviceptr DevicePtr, value uint8, N int64) Sets the first N 8-bit values of dst array to value. Asynchronous. func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) Asynchronously sets the first N 8-bit values of dst array to value. func StreamDestroy(stream *Stream) Destroys an asynchronous stream. func StreamSynchronize(stream Stream) Blocks until the stream has completed. func Version() int Returns the CUDA driver version. TYPES type Context uintptr CUDA context. func CtxCreate(flags uint, dev Device) Context Create a CUDA context. func CtxGetCurrent() Context Gets the current active context. func (ctx Context) ApiVersion() (version int) Returns the API version used to create the context. func (ctx *Context) Destroy() Destroys the CUDA context. func (peer Context) DisablePeerAccess() Reverses EnablePeerAccess(). func (peer Context) EnablePeerAccess() Make allocations from the peer Context available to the current context. func (ctx Context) SetCurrent() Sets the current active context. type DevProp struct { MaxThreadsPerBlock int MaxThreadsDim [3]int MaxGridSize [3]int SharedMemPerBlock int TotalConstantMemory int SIMDWidth int MemPitch int RegsPerBlock int ClockRate int TextureAlign int } Device properties func DeviceGetProperties(dev Device) (prop DevProp) Returns the device's properties. type Device int CUDA Device number. func CtxGetDevice() Device Returns the ordinal of the current context's device.
func DeviceGet(ordinal int) Device Returns in a device handle given an ordinal in the range [0, DeviceGetCount()-1]. func (dev Device) Attribute(attrib DeviceAttribute) int Gets the value of a device attribute. func (dev Device) CanAccessPeer(peer Device) bool Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. func (device Device) ComputeCapability() (major, minor int) Returns the compute capability of the device. func (dev Device) Name() string Gets the name of the device. func (dev Device) Properties() DevProp Returns the device's properties. func (device Device) TotalMem() int64 Returns the total amount of memory available on the device in bytes. type DeviceAttribute int const ( MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies MAX_REGISTERS_PER_BLOCK DeviceAttribute = 
C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details) MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = 
C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is using TCC driver model MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device uses shares a unified address space with the host MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture ) type DevicePtr uintptr func MemAlloc(bytes int64) DevicePtr Allocates a number of bytes of device memory. func (ptr DevicePtr) Bytes() (bytes int64) Returns the size of the allocation (by MemAlloc) that contains the input pointer ptr. 
func (ptr DevicePtr) Free() Frees device memory allocated by MemAlloc(). Overwrites the pointer with NULL. It is safe to double-free. func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr DevicePtr) MemoryType() MemoryType Returns the physical memory type that ptr addresses. func (p DevicePtr) String() string type Dim3 struct { X, Y, Z int } type Function uintptr Represents a CUDA CUfunction, a reference to a function within a module. func ModuleGetFunction(module Module, name string) Function Returns a Function handle. func (f Function) GetAttribute(attrib FunctionAttribute) int type FunctionAttribute int const ( FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail. FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function. FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function. FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function. FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function. FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled. FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled. ) type MemHostRegisterFlag int const ( // Memory is pinned in all CUDA contexts. 
MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer() MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP ) Flag for MemHostRegister type MemoryType uint Physical memory type of device pointer. const ( MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED ) func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) Returns the physical memory type that ptr addresses. func (t MemoryType) String() string type Module uintptr Represents a CUDA CUmodule, a reference to executable device code. func ModuleLoad(fname string) Module Loads a compute module from file func ModuleLoadData(image string) Module Loads a compute module from string func (m Module) GetFunction(name string) Function Returns a Function handle. type Result int CUDA error status. CUDA error statuses are not returned by functions but checked and passed to panic() when not successful. If desired, they can be caught by recover(). 
const ( SUCCESS Result = C.CUDA_SUCCESS ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM ERROR_INVALID_HANDLE Result = 
C.CUDA_ERROR_INVALID_HANDLE ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED ERROR_HARDWARE_STACK_ERROR Result = 714 //C.CUDA_ERROR_HARDWARE_STACK_ERROR ERROR_ILLEGAL_INSTRUCTION Result = 715 //C.CUDA_ERROR_ILLEGAL_INSTRUCTION ERROR_MISALIGNED_ADDRESS Result = 716 //C.CUDA_ERROR_MISALIGNED_ADDRESS ERROR_INVALID_ADDRESS_SPACE Result = 717 //C.CUDA_ERROR_INVALID_ADDRESS_SPACE ERROR_INVALID_PC Result = 718 //C.CUDA_ERROR_INVALID_PC ERROR_NOT_PERMITTED Result = 800 //C.CUDA_ERROR_NOT_PERMITTED ERROR_NOT_SUPPORTED Result = 801 //C.CUDA_ERROR_NOT_SUPPORTED ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN ) func StreamQuery(stream Stream) Result Returns Success if all operations have completed, ErrorNotReady otherwise func (err Result) String() string Message string for the error type Stream uintptr CUDA stream. 
func StreamCreate() Stream Creates an asynchronous stream func (stream *Stream) Destroy() Destroys the asynchronous stream func (stream Stream) Query() Result Returns Success if all operations have completed, ErrorNotReady otherwise func (stream Stream) Synchronize() Blocks until the stream has completed. 3-3.11.1/cuda/cu/cgoflags.go000066400000000000000000000010131503346766200154050ustar00rootroot00000000000000package cu // This file provides CGO flags to find CUDA libraries and headers. //#cgo LDFLAGS:-lcuda // ////default location: //#cgo LDFLAGS:-L/usr/local/cuda/lib64/stubs/ //#cgo CFLAGS: -I/usr/local/cuda/include/ // ////Ubuntu 15.04: //#cgo LDFLAGS:-L/usr/lib/x86_64-linux-gnu/ //#cgo CFLAGS: -I/usr/include // ////arch linux: //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib //#cgo CFLAGS: -I/opt/cuda/include // ////WINDOWS: //#cgo windows LDFLAGS:-LC:/cuda/lib/x64 //#cgo windows CFLAGS: -IC:/cuda/include import "C" 3-3.11.1/cuda/cu/context.go000066400000000000000000000060261503346766200153150ustar00rootroot00000000000000package cu // This file implements CUDA driver context management //#include import "C" import "unsafe" // CUDA context. type Context uintptr // Create a CUDA context. func CtxCreate(flags uint, dev Device) Context { var ctx C.CUcontext err := Result(C.cuCtxCreate(&ctx, C.uint(flags), C.CUdevice(dev))) if err != SUCCESS { panic(err) } return Context(uintptr(unsafe.Pointer(ctx))) } // Destroys the CUDA context specified by ctx. If the context usage count is not equal to 1, or the context is current to any CPU thread other than the current one, this function fails. Floating contexts (detached from a CPU thread via cuCtxPopCurrent()) may be destroyed by this function. func CtxDestroy(ctx *Context) { err := Result(C.cuCtxDestroy(C.CUcontext(unsafe.Pointer(uintptr(*ctx))))) *ctx = 0 if err != SUCCESS { panic(err) } } // Destroys the CUDA context. func (ctx *Context) Destroy() { CtxDestroy(ctx) } // Returns the API version to create the context. 
func CtxGetApiVersion(ctx Context) (version int) { var cversion C.uint err := Result(C.cuCtxGetApiVersion(C.CUcontext(unsafe.Pointer(uintptr(ctx))), &cversion)) if err != SUCCESS { panic(err) } version = int(cversion) return } // Returns the API version to create the context. func (ctx Context) ApiVersion() (version int) { return CtxGetApiVersion(ctx) } // Gets the current active context. func CtxGetCurrent() Context { var ctx C.CUcontext err := Result(C.cuCtxGetCurrent(&ctx)) if err != SUCCESS { panic(err) } return Context(uintptr(unsafe.Pointer(ctx))) } // Returns the ordinal of the current context's device. func CtxGetDevice() Device { var dev C.CUdevice err := Result(C.cuCtxGetDevice(&dev)) if err != SUCCESS { panic(err) } return Device(dev) } // Sets the current active context. func CtxSetCurrent(ctx Context) { err := Result(C.cuCtxSetCurrent(C.CUcontext(unsafe.Pointer(uintptr(ctx))))) if err != SUCCESS { panic(err) } } // Sets the current active context. func (ctx Context) SetCurrent() { CtxSetCurrent(ctx) } // Blocks until the device has completed all preceding requested tasks, if the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. func CtxSynchronize() { err := Result(C.cuCtxSynchronize()) if err != SUCCESS { panic(err) } } // Flags for CtxCreate const ( // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor. CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO // Spin when waiting for results from the GPU. CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN // Yield its thread when waiting for results from the GPU. CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work. CTX_BLOCKING_SYNC // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. 
CTX_MAP_HOST = C.CU_CTX_MAP_HOST //Do not reduce local memory after resizing local memory for a kernel. CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX ) 3-3.11.1/cuda/cu/context_test.go000066400000000000000000000012111503346766200163430ustar00rootroot00000000000000package cu import ( "fmt" "testing" ) func TestContext(t *testing.T) { fmt.Println("CtxCreate") ctx := CtxCreate(CTX_SCHED_AUTO, 0) fmt.Println("CtxSetCurrent") CtxSetCurrent(ctx) fmt.Println("CtxGetApiVersion:", ctx.ApiVersion()) fmt.Println("CtxGetDevice:", CtxGetDevice()) (&ctx).Destroy() } func BenchmarkGetContext(b *testing.B) { b.StopTimer() ctx := CtxCreate(CTX_SCHED_AUTO, 0) CtxSetCurrent(ctx) b.StartTimer() for i := 0; i < b.N; i++ { CtxGetCurrent() } } func BenchmarkSetContext(b *testing.B) { b.StopTimer() ctx := CtxCreate(CTX_SCHED_AUTO, 0) b.StartTimer() for i := 0; i < b.N; i++ { ctx.SetCurrent() } } 3-3.11.1/cuda/cu/device.go000066400000000000000000000234161503346766200150720ustar00rootroot00000000000000package cu // This file implements CUDA driver device management //#include import "C" // CUDA Device number. type Device int // Returns the compute capability of the device. func DeviceComputeCapability(device Device) (major, minor int) { major = device.Attribute(COMPUTE_CAPABILITY_MAJOR) minor = device.Attribute(COMPUTE_CAPABILITY_MINOR) return } // Returns the compute capability of the device. func (device Device) ComputeCapability() (major, minor int) { return DeviceComputeCapability(device) } // Returns in a device handle given an ordinal in the range [0, DeviceGetCount()-1]. func DeviceGet(ordinal int) Device { var device C.CUdevice err := Result(C.cuDeviceGet(&device, C.int(ordinal))) if err != SUCCESS { panic(err) } return Device(device) } // Gets the value of a device attribute. 
func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int { var attr C.int err := Result(C.cuDeviceGetAttribute(&attr, C.CUdevice_attribute(attrib), C.CUdevice(dev))) if err != SUCCESS { panic(err) } return int(attr) } // Gets the value of a device attribute. func (dev Device) Attribute(attrib DeviceAttribute) int { return DeviceGetAttribute(attrib, dev) } // Returns the number of devices with compute capability greater than or equal to 1.0 that are available for execution. func DeviceGetCount() int { var count C.int err := Result(C.cuDeviceGetCount(&count)) if err != SUCCESS { panic(err) } return int(count) } // Gets the name of the device. func DeviceGetName(dev Device) string { size := 256 buf := make([]byte, size) cstr := C.CString(string(buf)) err := Result(C.cuDeviceGetName(cstr, C.int(size), C.CUdevice(dev))) if err != SUCCESS { panic(err) } return C.GoString(cstr) } // Gets the name of the device. func (dev Device) Name() string { return DeviceGetName(dev) } // Device properties type DevProp struct { MaxThreadsPerBlock int MaxThreadsDim [3]int MaxGridSize [3]int SharedMemPerBlock int TotalConstantMemory int SIMDWidth int MemPitch int RegsPerBlock int ClockRate int TextureAlign int } // Returns the dev's properties. 
func DeviceGetProperties(dev Device) (prop DevProp) { prop.MaxThreadsPerBlock = dev.Attribute(MAX_THREADS_PER_BLOCK) prop.MaxThreadsDim[0] = dev.Attribute(MAX_BLOCK_DIM_X) prop.MaxThreadsDim[1] = dev.Attribute(MAX_BLOCK_DIM_Y) prop.MaxThreadsDim[2] = dev.Attribute(MAX_BLOCK_DIM_Z) prop.MaxGridSize[0] = dev.Attribute(MAX_GRID_DIM_X) prop.MaxGridSize[1] = dev.Attribute(MAX_GRID_DIM_Y) prop.MaxGridSize[2] = dev.Attribute(MAX_GRID_DIM_Z) prop.SharedMemPerBlock = dev.Attribute(MAX_SHARED_MEMORY_PER_BLOCK) prop.TotalConstantMemory = dev.Attribute(TOTAL_CONSTANT_MEMORY) prop.SIMDWidth = dev.Attribute(WARP_SIZE) prop.MemPitch = dev.Attribute(MAX_PITCH) prop.RegsPerBlock = dev.Attribute(MAX_REGISTERS_PER_BLOCK) prop.ClockRate = dev.Attribute(CLOCK_RATE) prop.TextureAlign = dev.Attribute(TEXTURE_ALIGNMENT) return } // Returns the device's properties. func (dev Device) Properties() DevProp { return DeviceGetProperties(dev) } // Returns the total amount of memory available on the device in bytes. func (device Device) TotalMem() int64 { return DeviceTotalMem(device) } // Returns the total amount of memory available on the device in bytes. 
func DeviceTotalMem(device Device) int64 { var bytes C.size_t err := Result(C.cuDeviceTotalMem(&bytes, C.CUdevice(device))) if err != SUCCESS { panic(err) } return int64(bytes) } type DeviceAttribute int const ( MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device KERNEL_EXEC_TIMEOUT DeviceAttribute = 
C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details) MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID 
// PCI bus ID of the device PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is using TCC driver model MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device uses shares a unified address space with the host MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture COMPUTE_CAPABILITY_MAJOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR // Major compute capability version number COMPUTE_CAPABILITY_MINOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR // Minor compute capability version number ) 3-3.11.1/cuda/cu/device_test.go000066400000000000000000000106541503346766200161310ustar00rootroot00000000000000package cu import ( "fmt" "testing" ) func TestDevice(t *testing.T) { fmt.Println("DeviceGetCount:", DeviceGetCount()) for i := 0; i < DeviceGetCount(); i++ { fmt.Println("DeviceGet", i) dev := DeviceGet(i) major, minor := dev.ComputeCapability() fmt.Println("Name: ", dev.Name()) fmt.Println("ComputeCapability: ", major, minor) fmt.Println("TotalMem: ", 
dev.TotalMem()) fmt.Println("ATTRIBUTE_MAX_THREADS_PER_BLOCK :", dev.Attribute(MAX_THREADS_PER_BLOCK)) fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_X :", dev.Attribute(MAX_BLOCK_DIM_X)) fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_Y :", dev.Attribute(MAX_BLOCK_DIM_Y)) fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_Z :", dev.Attribute(MAX_BLOCK_DIM_Z)) fmt.Println("ATTRIBUTE_MAX_GRID_DIM_X :", dev.Attribute(MAX_GRID_DIM_X)) fmt.Println("ATTRIBUTE_MAX_GRID_DIM_Y :", dev.Attribute(MAX_GRID_DIM_Y)) fmt.Println("ATTRIBUTE_MAX_GRID_DIM_Z :", dev.Attribute(MAX_GRID_DIM_Z)) fmt.Println("ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK :", dev.Attribute(MAX_SHARED_MEMORY_PER_BLOCK)) fmt.Println("ATTRIBUTE_TOTAL_CONSTANT_MEMORY :", dev.Attribute(TOTAL_CONSTANT_MEMORY)) fmt.Println("ATTRIBUTE_WARP_SIZE :", dev.Attribute(WARP_SIZE)) fmt.Println("ATTRIBUTE_MAX_PITCH :", dev.Attribute(MAX_PITCH)) fmt.Println("ATTRIBUTE_MAX_REGISTERS_PER_BLOCK :", dev.Attribute(MAX_REGISTERS_PER_BLOCK)) fmt.Println("ATTRIBUTE_CLOCK_RATE :", dev.Attribute(CLOCK_RATE)) fmt.Println("ATTRIBUTE_TEXTURE_ALIGNMENT :", dev.Attribute(TEXTURE_ALIGNMENT)) fmt.Println("ATTRIBUTE_MULTIPROCESSOR_COUNT :", dev.Attribute(MULTIPROCESSOR_COUNT)) fmt.Println("ATTRIBUTE_KERNEL_EXEC_TIMEOUT :", dev.Attribute(KERNEL_EXEC_TIMEOUT)) fmt.Println("ATTRIBUTE_INTEGRATED :", dev.Attribute(INTEGRATED)) fmt.Println("ATTRIBUTE_CAN_MAP_HOST_MEMORY :", dev.Attribute(CAN_MAP_HOST_MEMORY)) fmt.Println("ATTRIBUTE_COMPUTE_MODE :", dev.Attribute(COMPUTE_MODE)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE1D_WIDTH)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE2D_WIDTH)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT :", dev.Attribute(MAXIMUM_TEXTURE2D_HEIGHT)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE3D_WIDTH)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT :", dev.Attribute(MAXIMUM_TEXTURE3D_HEIGHT)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH :", 
dev.Attribute(MAXIMUM_TEXTURE3D_DEPTH)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_WIDTH)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_LAYERS)) fmt.Println("ATTRIBUTE_SURFACE_ALIGNMENT :", dev.Attribute(SURFACE_ALIGNMENT)) fmt.Println("ATTRIBUTE_CONCURRENT_KERNELS :", dev.Attribute(CONCURRENT_KERNELS)) fmt.Println("ATTRIBUTE_ECC_ENABLED :", dev.Attribute(ECC_ENABLED)) fmt.Println("ATTRIBUTE_PCI_BUS_ID :", dev.Attribute(PCI_BUS_ID)) fmt.Println("ATTRIBUTE_PCI_DEVICE_ID :", dev.Attribute(PCI_DEVICE_ID)) fmt.Println("ATTRIBUTE_TCC_DRIVER :", dev.Attribute(TCC_DRIVER)) fmt.Println("ATTRIBUTE_MEMORY_CLOCK_RATE :", dev.Attribute(MEMORY_CLOCK_RATE)) fmt.Println("ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH :", dev.Attribute(GLOBAL_MEMORY_BUS_WIDTH)) fmt.Println("ATTRIBUTE_L2_CACHE_SIZE :", dev.Attribute(L2_CACHE_SIZE)) fmt.Println("ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR :", dev.Attribute(MAX_THREADS_PER_MULTIPROCESSOR)) fmt.Println("ATTRIBUTE_ASYNC_ENGINE_COUNT :", dev.Attribute(ASYNC_ENGINE_COUNT)) fmt.Println("ATTRIBUTE_UNIFIED_ADDRESSING :", dev.Attribute(UNIFIED_ADDRESSING)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_WIDTH)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_LAYERS)) fmt.Printf("Properties:%#v\n", dev.Properties()) } } 3-3.11.1/cuda/cu/dim3.go000066400000000000000000000000561503346766200144620ustar00rootroot00000000000000package cu type Dim3 struct { X, Y, Z int } 3-3.11.1/cuda/cu/doc.go000066400000000000000000000000631503346766200143710ustar00rootroot00000000000000// Go bindings for the CUDA driver API. 
package cu 3-3.11.1/cuda/cu/execution.go000066400000000000000000000025041503346766200156310ustar00rootroot00000000000000package cu // This file implements execution of CUDA kernels //#include import "C" import ( "unsafe" ) const pointerSize = 8 // sorry, 64 bits only. func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) { // Since Go 1.6, a cgo argument cannot have a Go pointer to Go pointer, // so we copy the argument values go C memory first. argv := C.malloc(C.size_t(len(kernelParams) * pointerSize)) argp := C.malloc(C.size_t(len(kernelParams) * pointerSize)) defer C.free(argv) defer C.free(argp) for i := range kernelParams { *((*unsafe.Pointer)(offset(argp, i))) = offset(argv, i) // argp[i] = &argv[i] *((*uint64)(offset(argv, i))) = *((*uint64)(kernelParams[i])) // argv[i] = *kernelParams[i] } err := Result(C.cuLaunchKernel( C.CUfunction(unsafe.Pointer(uintptr(f))), C.uint(gridDimX), C.uint(gridDimY), C.uint(gridDimZ), C.uint(blockDimX), C.uint(blockDimY), C.uint(blockDimZ), C.uint(sharedMemBytes), C.CUstream(unsafe.Pointer(uintptr(stream))), (*unsafe.Pointer)(argp), (*unsafe.Pointer)(unsafe.Pointer(uintptr(0))))) if err != SUCCESS { panic(err) } } func offset(ptr unsafe.Pointer, i int) unsafe.Pointer { return unsafe.Pointer(uintptr(ptr) + pointerSize*uintptr(i)) } 3-3.11.1/cuda/cu/function.go000066400000000000000000000034451503346766200154600ustar00rootroot00000000000000package cu // This file implements manipulations on CUDA functions //#include import "C" import ( "unsafe" ) // Represents a CUDA CUfunction, a reference to a function within a module. 
type Function uintptr func FuncGetAttribute(attrib FunctionAttribute, function Function) int { var attr C.int err := Result(C.cuFuncGetAttribute(&attr, C.CUfunction_attribute(attrib), C.CUfunction(unsafe.Pointer(uintptr(function))))) if err != SUCCESS { panic(err) } return int(attr) } func (f Function) GetAttribute(attrib FunctionAttribute) int { return FuncGetAttribute(attrib, f) } type FunctionAttribute int const ( FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail. FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function. FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function. FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function. FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function. FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled. FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled. ) 3-3.11.1/cuda/cu/init.go000066400000000000000000000005621503346766200145730ustar00rootroot00000000000000package cu // This file implements CUDA driver initialization //#include import "C" // Initialize the CUDA driver API. // Currently, flags must be 0. // If Init() has not been called, any function from the driver API will panic with ERROR_NOT_INITIALIZED. 
func Init(flags int) { err := Result(C.cuInit(C.uint(flags))) if err != SUCCESS { panic(err) } } 3-3.11.1/cuda/cu/init_test.go000066400000000000000000000002651503346766200156320ustar00rootroot00000000000000package cu import ( "fmt" ) // needed for all other tests. func init() { Init(0) ctx := CtxCreate(CTX_SCHED_AUTO, 0) CtxSetCurrent(ctx) fmt.Println("Created CUDA context") } 3-3.11.1/cuda/cu/memory.go000066400000000000000000000170371503346766200151450ustar00rootroot00000000000000package cu // This file implements CUDA memory management on the driver level //#include import "C" import ( "fmt" "unsafe" ) type DevicePtr uintptr // Allocates a number of bytes of device memory. func MemAlloc(bytes int64) DevicePtr { var devptr C.CUdeviceptr err := Result(C.cuMemAlloc(&devptr, C.size_t(bytes))) if err != SUCCESS { panic(err) } return DevicePtr(devptr) } // Frees device memory allocated by MemAlloc(). // It is safe to double-free. func MemFree(p DevicePtr) { if p == DevicePtr(uintptr(0)) { return // Allready freed } err := Result(C.cuMemFree(C.CUdeviceptr(p))) if err != SUCCESS { panic(err) } } // Frees device memory allocated by MemAlloc(). // Overwrites the pointer with NULL. // It is safe to double-free. func (ptr DevicePtr) Free() { MemFree(ptr) } // Copies a number of bytes on the current device. // Requires unified addressing to be supported. // See also: MemcpyDtoD(). func Memcpy(dst, src DevicePtr, bytes int64) { err := Result(C.cuMemcpy(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes))) if err != SUCCESS { panic(err) } } // Asynchronously copies a number of bytes on the current device. func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) { err := Result(C.cuMemcpyAsync(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Copies a number of bytes from host to device. 
func MemcpyDtoD(dst, src DevicePtr, bytes int64) { err := Result(C.cuMemcpyDtoD(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes))) if err != SUCCESS { panic(err) } } // Asynchronously copies a number of bytes from host to device. func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) { err := Result(C.cuMemcpyDtoDAsync(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Copies a number of bytes from host to device. func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) { err := Result(C.cuMemcpyHtoD(C.CUdeviceptr(dst), src, C.size_t(bytes))) if err != SUCCESS { panic(err) } } // Asynchronously copies a number of bytes from host to device. // The host memory must be page-locked (see MemRegister) func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) { err := Result(C.cuMemcpyHtoDAsync(C.CUdeviceptr(dst), src, C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Copies a number of bytes from device to host. func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) { err := Result(C.cuMemcpyDtoH(dst, C.CUdeviceptr(src), C.size_t(bytes))) if err != SUCCESS { panic(err) } } // Asynchronously copies a number of bytes device host to host. // The host memory must be page-locked (see MemRegister) func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) { err := Result(C.cuMemcpyDtoHAsync(dst, C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Copies from device memory in one context (device) to another. 
func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) { err := Result(C.cuMemcpyPeer(C.CUdeviceptr(dst), C.CUcontext(unsafe.Pointer(uintptr(dstCtx))), C.CUdeviceptr(src), C.CUcontext(unsafe.Pointer(uintptr(srcCtx))), C.size_t(bytes))) if err != SUCCESS { panic(err) } } // Asynchronously copies from device memory in one context (device) to another. func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) { err := Result(C.cuMemcpyPeerAsync(C.CUdeviceptr(dst), C.CUcontext(unsafe.Pointer(uintptr(dstCtx))), C.CUdeviceptr(src), C.CUcontext(unsafe.Pointer(uintptr(srcCtx))), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) { var cbytes C.size_t var cptr C.CUdeviceptr err := Result(C.cuMemGetAddressRange(&cptr, &cbytes, C.CUdeviceptr(ptr))) if err != SUCCESS { panic(err) } bytes = int64(cbytes) base = DevicePtr(cptr) return } // Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) { return MemGetAddressRange(ptr) } // Returns the size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr DevicePtr) Bytes() (bytes int64) { bytes, _ = MemGetAddressRange(ptr) return } // Returns the free and total amount of memroy in the current Context (in bytes). func MemGetInfo() (free, total int64) { var cfree, ctotal C.size_t err := Result(C.cuMemGetInfo(&cfree, &ctotal)) if err != SUCCESS { panic(err) } free = int64(cfree) total = int64(ctotal) return } // Page-locks memory specified by the pointer and bytes. 
// The pointer and byte size must be aligned to the host page size (4KB) // See also: MemHostUnregister() // doesn't link with cuda6.5 //func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) { // err := Result(C.cuMemHostRegister(ptr, C.size_t(bytes), C.uint(flags))) // if err != SUCCESS { // panic(err) // } //} // Unmaps memory locked by MemHostRegister(). // doesn't link with cuda6.5 //func MemHostUnregister(ptr unsafe.Pointer) { // err := Result(C.cuMemHostUnregister(ptr)) // if err != SUCCESS { // panic(err) // } //} func MemAllocHost(bytes int64) unsafe.Pointer { var p unsafe.Pointer err := Result(C.cuMemAllocHost(&p, C.size_t(bytes))) if err != SUCCESS { panic(err) } return p } func MemFreeHost(ptr unsafe.Pointer) { err := Result(C.cuMemFreeHost(ptr)) if err != SUCCESS { panic(err) } } type MemHostRegisterFlag int // Flag for MemHostRegister const ( // Memory is pinned in all CUDA contexts. MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer() MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP ) func (p DevicePtr) String() string { return fmt.Sprint(unsafe.Pointer(uintptr(p))) } // Type size in bytes const ( SIZEOF_FLOAT32 = 4 SIZEOF_FLOAT64 = 8 SIZEOF_COMPLEX64 = 8 SIZEOF_COMPLEX128 = 16 ) // Physical memory type of device pointer. 
type MemoryType uint const ( MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED ) var memorytype = map[MemoryType]string{ MemoryTypeHost: "MemoryTypeHost", MemoryTypeDevice: "MemoryTypeDevice", MemoryTypeArray: "MemoryTypeArray", MemoryTypeUnified: "MemoryTypeUnified"} func (t MemoryType) String() string { if s, ok := memorytype[t]; ok { return s } return "MemoryTypeUnknown" } // Returns the physical memory type that ptr addresses. func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) { var typ uint64 // foresee enough memory just to be safe err = Result(C.cuPointerGetAttribute(unsafe.Pointer(&typ), C.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, C.CUdeviceptr(uintptr(ptr)))) return MemoryType(uint(typ)), err } // Returns the physical memory type that ptr addresses. func (ptr DevicePtr) MemoryType() MemoryType { t, err := PointerGetAttributeMemoryType(ptr) if err != SUCCESS { panic(err) } return t } 3-3.11.1/cuda/cu/memory_test.go000066400000000000000000000103671503346766200162030ustar00rootroot00000000000000package cu import ( "fmt" "math" "testing" "unsafe" ) func TestMalloc(t *testing.T) { for i := 0; i < 1024; i++ { pointer := MemAlloc(16 * 1024 * 1024) pointer.Free() } for i := 0; i < 1024; i++ { pointer := MemAlloc(16 * 1024 * 1024) MemFree(pointer) } } func BenchmarkMallocFree1B(b *testing.B) { for i := 0; i < b.N; i++ { m := MemAlloc(1) m.Free() } } func BenchmarkMallocFree1kB(b *testing.B) { for i := 0; i < b.N; i++ { m := MemAlloc(1024) m.Free() } } func BenchmarkMallocFree1MB(b *testing.B) { for i := 0; i < b.N; i++ { m := MemAlloc(1024 * 1024) m.Free() } } func TestMemAddressRange(t *testing.T) { N := 12345 ptr := MemAlloc(int64(N)) size, base := MemGetAddressRange(ptr) if size != int64(N) { t.Fail() } if base != ptr { t.Fail() } size, base = 0, DevicePtr(0) size, base = 
ptr.GetAddressRange() if ptr.Bytes() != int64(N) { t.Fail() } } func TestMemGetInfo(t *testing.T) { free, total := MemGetInfo() fmt.Println("MemGetInfo: ", free, "/", total) if free > total { t.Fail() } if total == 0 { t.Fail() } } func TestMemsetAsync(t *testing.T) { N := int64(32 * 1024) host1 := make([]float32, N) for i := range host1 { host1[i] = float32(i) } host2 := make([]float32, N) dev1 := MemAlloc(int64(4 * N)) MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) str := StreamCreate() MemsetD32Async(dev1, math.Float32bits(42), N, str) MemsetD32Async(dev1, math.Float32bits(21), N/2, str) MemcpyDtoH((unsafe.Pointer(&host2[0])), dev1, 4*N) str.Synchronize() (&str).Destroy() for i := 0; i < len(host2)/2; i++ { if host2[i] != 21 { t.Fail() } } for i := len(host2) / 2; i < len(host2); i++ { if host2[i] != 42 { t.Fail() } } dev1.Free() } func TestMemset(t *testing.T) { N := int64(32 * 1024) host1 := make([]float32, N) for i := range host1 { host1[i] = float32(i) } host2 := make([]float32, N) dev1 := MemAlloc(int64(4 * N)) MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) MemsetD32(dev1, math.Float32bits(42), N) MemsetD32(dev1, math.Float32bits(21), N/2) MemcpyDtoH((unsafe.Pointer(&host2[0])), dev1, 4*N) for i := 0; i < len(host2)/2; i++ { if host2[i] != 21 { t.Fail() } } for i := len(host2) / 2; i < len(host2); i++ { if host2[i] != 42 { t.Fail() } } dev1.Free() } func TestMemcpy(t *testing.T) { N := int64(32 * 1024) host1 := make([]float32, N) for i := range host1 { host1[i] = float32(i) } host2 := make([]float32, N) dev1 := MemAlloc(int64(4 * N)) dev2 := MemAlloc(int64(4 * N)) MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) MemcpyDtoD(dev2, dev1, 4*N) MemcpyDtoH((unsafe.Pointer(&host2[0])), dev2, 4*N) for i := range host2 { if host2[i] != float32(i) { t.Fail() } } dev1.Free() dev2.Free() } func TestMemcpyAsync(t *testing.T) { N := int64(32 * 1024) host1 := make([]float32, N) for i := range host1 { host1[i] = float32(i) } host2 := make([]float32, N) dev1 := 
MemAlloc(int64(4 * N)) dev2 := MemAlloc(int64(4 * N)) stream := StreamCreate() MemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream) MemcpyDtoDAsync(dev2, dev1, 4*N, stream) MemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream) stream.Synchronize() for i := range host2 { if host2[i] != float32(i) { t.Fail() } } dev1.Free() dev2.Free() } func TestMemcpyAsyncRegistered(t *testing.T) { N := int64(32 * 1024) host1 := make([]float32, N) for i := range host1 { host1[i] = float32(i) } host2 := make([]float32, N) dev1 := MemAlloc(int64(4 * N)) dev2 := MemAlloc(int64(4 * N)) stream := StreamCreate() MemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream) MemcpyDtoDAsync(dev2, dev1, 4*N, stream) MemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream) stream.Synchronize() for i := range host2 { if host2[i] != float32(i) { t.Fail() } } dev1.Free() dev2.Free() } func BenchmarkMemcpy(b *testing.B) { b.StopTimer() N := int64(32 * 1024 * 1024) host1 := make([]float32, N) host2 := make([]float32, N) dev1 := MemAlloc(int64(4 * N)) defer dev1.Free() dev2 := MemAlloc(int64(4 * N)) defer dev2.Free() b.SetBytes(4 * N) b.StartTimer() for i := 0; i < b.N; i++ { MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) MemcpyDtoD(dev2, dev1, 4*N) MemcpyDtoH((unsafe.Pointer(&host2[0])), dev2, 4*N) } } 3-3.11.1/cuda/cu/memset.go000066400000000000000000000024001503346766200151130ustar00rootroot00000000000000package cu // This file implements CUDA memset functions. //#include import "C" import ( "unsafe" ) // Sets the first N 32-bit values of dst array to value. // Asynchronous. func MemsetD32(deviceptr DevicePtr, value uint32, N int64) { err := Result(C.cuMemsetD32(C.CUdeviceptr(deviceptr), C.uint(value), C.size_t(N))) if err != SUCCESS { panic(err) } } // Asynchronously sets the first N 32-bit values of dst array to value. 
func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) { err := Result(C.cuMemsetD32Async(C.CUdeviceptr(deviceptr), C.uint(value), C.size_t(N), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Sets the first N 8-bit values of dst array to value. // Asynchronous. func MemsetD8(deviceptr DevicePtr, value uint8, N int64) { err := Result(C.cuMemsetD8(C.CUdeviceptr(deviceptr), C.uchar(value), C.size_t(N))) if err != SUCCESS { panic(err) } } // Asynchronously sets the first N 32-bit values of dst array to value. func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) { err := Result(C.cuMemsetD8Async(C.CUdeviceptr(deviceptr), C.uchar(value), C.size_t(N), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } 3-3.11.1/cuda/cu/module.go000066400000000000000000000022711503346766200151140ustar00rootroot00000000000000package cu // This file implements loading of CUDA ptx modules //#include import "C" import ( "unsafe" ) // Represents a CUDA CUmodule, a reference to executable device code. type Module uintptr // Loads a compute module from file func ModuleLoad(fname string) Module { //fmt.Fprintln(os.Stderr, "driver.ModuleLoad", fname) var mod C.CUmodule err := Result(C.cuModuleLoad(&mod, C.CString(fname))) if err != SUCCESS { panic(err) } return Module(uintptr(unsafe.Pointer(mod))) } // Loads a compute module from string func ModuleLoadData(image string) Module { var mod C.CUmodule err := Result(C.cuModuleLoadData(&mod, unsafe.Pointer(C.CString(image)))) if err != SUCCESS { panic(err) } return Module(uintptr(unsafe.Pointer(mod))) } // Returns a Function handle. func ModuleGetFunction(module Module, name string) Function { var function C.CUfunction err := Result(C.cuModuleGetFunction( &function, C.CUmodule(unsafe.Pointer(uintptr(module))), C.CString(name))) if err != SUCCESS { panic(err) } return Function(uintptr(unsafe.Pointer(function))) } // Returns a Function handle. 
func (m Module) GetFunction(name string) Function { return ModuleGetFunction(m, name) } 3-3.11.1/cuda/cu/module_test.go000066400000000000000000000015161503346766200161540ustar00rootroot00000000000000package cu import ( "testing" "unsafe" //"fmt" ) func TestModule(test *testing.T) { mod := ModuleLoad("/testdata/testmodule.ptx") f := mod.GetFunction("testMemset") N := 1000 N4 := 4 * int64(N) a := make([]float32, N) A := MemAlloc(N4) defer A.Free() aptr := unsafe.Pointer(&a[0]) MemcpyHtoD(A, aptr, N4) var value float32 value = 42 var n int n = N / 2 block := 128 grid := DivUp(N, block) shmem := 0 args := []unsafe.Pointer{unsafe.Pointer(&A), unsafe.Pointer(&value), unsafe.Pointer(&n)} LaunchKernel(f, grid, 1, 1, block, 1, 1, shmem, 0, args) MemcpyDtoH(aptr, A, N4) for i := 0; i < N/2; i++ { if a[i] != 42 { test.Fail() } } for i := N / 2; i < N; i++ { if a[i] != 0 { test.Fail() } } //fmt.Println(a) } // Integer division rounded up. func DivUp(x, y int) int { return ((x - 1) / y) + 1 } 3-3.11.1/cuda/cu/peer.go000066400000000000000000000024571503346766200145700ustar00rootroot00000000000000package cu // This file implements CUDA unified addressing. //#include import "C" import ( "unsafe" ) // Make allocations from the peer Context available to the current context. func CtxEnablePeerAccess(peer Context) { err := Result(C.cuCtxEnablePeerAccess(C.CUcontext(unsafe.Pointer(uintptr(peer))), C.uint(0))) if err != SUCCESS { panic(err) } } // Make allocations from the peer Context available to the current context. func (peer Context) EnablePeerAccess() { CtxEnablePeerAccess(peer) } // Reverses CtxEnablePeerAccess(). func CtxDisablePeerAccess(peer Context) { err := Result(C.cuCtxDisablePeerAccess(C.CUcontext(unsafe.Pointer(uintptr(peer))))) if err != SUCCESS { panic(err) } } // Reverses EnablePeerAccess(). func (peer Context) DisablePeerAccess() { CtxDisablePeerAccess(peer) } // Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. 
func DeviceCanAccessPeer(dev, peer Device) bool { var canAccessPeer C.int err := Result(C.cuDeviceCanAccessPeer(&canAccessPeer, C.CUdevice(dev), C.CUdevice(peer))) if err != SUCCESS { panic(err) } return int(canAccessPeer) != 0 } // Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. func (dev Device) CanAccessPeer(peer Device) bool { return DeviceCanAccessPeer(dev, peer) } 3-3.11.1/cuda/cu/result.go000066400000000000000000000207721503346766200151530ustar00rootroot00000000000000package cu // This file provides access to CUDA driver error statuses (type CUresult). //#include import "C" import ( "fmt" ) // CUDA error status. // CUDA error statuses are not returned by functions but checked and passed to // panic() when not successful. If desired, they can be caught by // recover(). type Result int // Message string for the error func (err Result) String() string { str, ok := errorString[err] if !ok { return "Unknown CUresult: " + fmt.Sprint(int(err)) } return str } const ( SUCCESS Result = C.CUDA_SUCCESS ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED ERROR_ARRAY_IS_MAPPED 
Result = C.CUDA_ERROR_ARRAY_IS_MAPPED ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM ERROR_INVALID_HANDLE Result = C.CUDA_ERROR_INVALID_HANDLE ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED ERROR_HARDWARE_STACK_ERROR 
Result = 714 //C.CUDA_ERROR_HARDWARE_STACK_ERROR ERROR_ILLEGAL_INSTRUCTION Result = 715 //C.CUDA_ERROR_ILLEGAL_INSTRUCTION ERROR_MISALIGNED_ADDRESS Result = 716 //C.CUDA_ERROR_MISALIGNED_ADDRESS ERROR_INVALID_ADDRESS_SPACE Result = 717 //C.CUDA_ERROR_INVALID_ADDRESS_SPACE ERROR_INVALID_PC Result = 718 //C.CUDA_ERROR_INVALID_PC ERROR_NOT_PERMITTED Result = 800 //C.CUDA_ERROR_NOT_PERMITTED ERROR_NOT_SUPPORTED Result = 801 //C.CUDA_ERROR_NOT_SUPPORTED ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN ) // Map with error strings for Result error numbers var errorString map[Result]string = map[Result]string{ SUCCESS: "CUDA_SUCCESS", ERROR_INVALID_VALUE: "CUDA_ERROR_INVALID_VALUE", ERROR_OUT_OF_MEMORY: "CUDA_ERROR_OUT_OF_MEMORY", ERROR_NOT_INITIALIZED: "CUDA_ERROR_NOT_INITIALIZED", ERROR_DEINITIALIZED: "CUDA_ERROR_DEINITIALIZED", ERROR_PROFILER_DISABLED: "CUDA_ERROR_PROFILER_DISABLED", ERROR_PROFILER_NOT_INITIALIZED: "CUDA_ERROR_PROFILER_NOT_INITIALIZED", ERROR_PROFILER_ALREADY_STARTED: "CUDA_ERROR_PROFILER_ALREADY_STARTED", ERROR_PROFILER_ALREADY_STOPPED: "CUDA_ERROR_PROFILER_ALREADY_STOPPED", ERROR_NO_DEVICE: "CUDA_ERROR_NO_DEVICE", ERROR_INVALID_DEVICE: "CUDA_ERROR_INVALID_DEVICE", ERROR_INVALID_IMAGE: "CUDA_ERROR_INVALID_IMAGE", ERROR_INVALID_CONTEXT: "CUDA_ERROR_INVALID_CONTEXT", ERROR_CONTEXT_ALREADY_CURRENT: "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", ERROR_MAP_FAILED: "CUDA_ERROR_MAP_FAILED", ERROR_UNMAP_FAILED: "CUDA_ERROR_UNMAP_FAILED", ERROR_ARRAY_IS_MAPPED: "CUDA_ERROR_ARRAY_IS_MAPPED", ERROR_ALREADY_MAPPED: "CUDA_ERROR_ALREADY_MAPPED", ERROR_NO_BINARY_FOR_GPU: "CUDA_ERROR_NO_BINARY_FOR_GPU", ERROR_ALREADY_ACQUIRED: "CUDA_ERROR_ALREADY_ACQUIRED", ERROR_NOT_MAPPED: "CUDA_ERROR_NOT_MAPPED", ERROR_NOT_MAPPED_AS_ARRAY: "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", ERROR_NOT_MAPPED_AS_POINTER: "CUDA_ERROR_NOT_MAPPED_AS_POINTER", ERROR_ECC_UNCORRECTABLE: "CUDA_ERROR_ECC_UNCORRECTABLE", ERROR_UNSUPPORTED_LIMIT: "CUDA_ERROR_UNSUPPORTED_LIMIT", ERROR_CONTEXT_ALREADY_IN_USE: 
"CUDA_ERROR_CONTEXT_ALREADY_IN_USE", ERROR_INVALID_SOURCE: "CUDA_ERROR_INVALID_SOURCE", ERROR_FILE_NOT_FOUND: "CUDA_ERROR_FILE_NOT_FOUND", ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", ERROR_SHARED_OBJECT_INIT_FAILED: "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", ERROR_OPERATING_SYSTEM: "CUDA_ERROR_OPERATING_SYSTEM", ERROR_INVALID_HANDLE: "CUDA_ERROR_INVALID_HANDLE", ERROR_NOT_FOUND: "CUDA_ERROR_NOT_FOUND", ERROR_NOT_READY: "CUDA_ERROR_NOT_READY", ERROR_LAUNCH_OUT_OF_RESOURCES: "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", ERROR_LAUNCH_TIMEOUT: "CUDA_ERROR_LAUNCH_TIMEOUT", ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", ERROR_PEER_ACCESS_ALREADY_ENABLED: "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", ERROR_PEER_ACCESS_NOT_ENABLED: "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", ERROR_PRIMARY_CONTEXT_ACTIVE: "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", ERROR_CONTEXT_IS_DESTROYED: "CUDA_ERROR_CONTEXT_IS_DESTROYED", ERROR_ASSERT: "CUDA_ERROR_ASSERT", ERROR_TOO_MANY_PEERS: "CUDA_ERROR_TOO_MANY_PEERS", ERROR_HOST_MEMORY_ALREADY_REGISTERED: "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", ERROR_HOST_MEMORY_NOT_REGISTERED: "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", ERROR_HARDWARE_STACK_ERROR: "CUDA_ERROR_HARDWARE_STACK_ERROR", ERROR_ILLEGAL_INSTRUCTION: "CUDA_ERROR_ILLEGAL_INSTRUCTION", ERROR_MISALIGNED_ADDRESS: "CUDA_ERROR_MISALIGNED_ADDRESS", ERROR_INVALID_ADDRESS_SPACE: "CUDA_ERROR_INVALID_ADDRESS_SPACE", ERROR_INVALID_PC: "CUDA_ERROR_INVALID_PC", ERROR_LAUNCH_FAILED: "CUDA_ERROR_LAUNCH_FAILED", ERROR_NOT_PERMITTED: "CUDA_ERROR_NOT_PERMITTED", ERROR_NOT_SUPPORTED: "CUDA_ERROR_NOT_SUPPORTED", ERROR_UNKNOWN: "CUDA_ERROR_UNKNOWN"} 3-3.11.1/cuda/cu/stream.go000066400000000000000000000024751503346766200151300ustar00rootroot00000000000000package cu // This file implements CUDA streams //#include import "C" import "unsafe" // CUDA stream. 
type Stream uintptr // Creates an asynchronous stream func StreamCreate() Stream { var stream C.CUstream err := Result(C.cuStreamCreate(&stream, C.uint(0))) // flags has to be zero if err != SUCCESS { panic(err) } return Stream(uintptr(unsafe.Pointer(stream))) } // Destroys the asynchronous stream func (stream *Stream) Destroy() { str := *stream err := Result(C.cuStreamDestroy(C.CUstream(unsafe.Pointer(uintptr(str))))) *stream = 0 if err != SUCCESS { panic(err) } } // Destroys an asynchronous stream func StreamDestroy(stream *Stream) { stream.Destroy() } // Blocks until the stream has completed. func (stream Stream) Synchronize() { err := Result(C.cuStreamSynchronize(C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Returns Success if all operations have completed, ErrorNotReady otherwise func (stream Stream) Query() Result { return Result(C.cuStreamQuery(C.CUstream(unsafe.Pointer(uintptr(stream))))) } // Returns Success if all operations have completed, ErrorNotReady otherwise func StreamQuery(stream Stream) Result { return stream.Query() } // Blocks until the stream has completed. func StreamSynchronize(stream Stream) { stream.Synchronize() } 3-3.11.1/cuda/cu/testdata/000077500000000000000000000000001503346766200151075ustar00rootroot000000000000003-3.11.1/cuda/cu/testdata/testmodule.cu000066400000000000000000000006731503346766200176330ustar00rootroot00000000000000/* * Module to test CUDA module loading and execution. * To be compiled with: * nvcc -ptx testmodule.cu */ #ifdef __cplusplus extern "C" { #endif #define threadindex ( ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x ) /// Sets the first N elements of array to value. 
__global__ void testMemset(float* array, float value, int N){ int i = threadindex; if(i < N){ array[i] = value; } } #ifdef __cplusplus } #endif 3-3.11.1/cuda/cu/testdata/testmodule.ptx000066400000000000000000000062551503346766200200410ustar00rootroot00000000000000 .version 1.4 .target sm_10, map_f64_to_f32 // compiled with /usr/local/cuda/open64/lib//be // nvopencc 4.0 built on 2011-02-18 //----------------------------------------------------------- // Compiling /tmp/tmpxft_00000e56_00000000-9_testmodule.cpp3.i (/tmp/ccBI#.rDLD4T) //----------------------------------------------------------- //----------------------------------------------------------- // Options: //----------------------------------------------------------- // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64 // -O3 (Optimization level) // -g0 (Debug level) // -m2 (Report advisories) //----------------------------------------------------------- .file 1 "" .file 2 "/tmp/tmpxft_00000e56_00000000-8_testmodule.cudafe2.gpu" .file 3 "/usr/lib/gcc/x86_64-linux-gnu/4.4.3/include/stddef.h" .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h" .file 5 "/usr/local/cuda/bin/../include/host_defines.h" .file 6 "/usr/local/cuda/bin/../include/builtin_types.h" .file 7 "/usr/local/cuda/bin/../include/device_types.h" .file 8 "/usr/local/cuda/bin/../include/driver_types.h" .file 9 "/usr/local/cuda/bin/../include/surface_types.h" .file 10 "/usr/local/cuda/bin/../include/texture_types.h" .file 11 "/usr/local/cuda/bin/../include/vector_types.h" .file 12 "/usr/local/cuda/bin/../include/device_launch_parameters.h" .file 13 "/usr/local/cuda/bin/../include/crt/storage_class.h" .file 14 "/usr/include/bits/types.h" .file 15 "/usr/include/time.h" .file 16 "testmodule.cu" .file 17 "/usr/local/cuda/bin/../include/common_functions.h" .file 18 "/usr/local/cuda/bin/../include/math_functions.h" .file 19 "/usr/local/cuda/bin/../include/math_constants.h" .file 20 "/usr/local/cuda/bin/../include/device_functions.h" .file 
21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h" .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h" .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h" .file 24 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h" .file 25 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h" .file 26 "/usr/local/cuda/bin/../include/surface_functions.h" .file 27 "/usr/local/cuda/bin/../include/texture_fetch_functions.h" .file 28 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h" .entry testMemset ( .param .u64 __cudaparm_testMemset_array, .param .f32 __cudaparm_testMemset_value, .param .s32 __cudaparm_testMemset_N) { .reg .u16 %rh<4>; .reg .u32 %r<10>; .reg .u64 %rd<6>; .reg .f32 %f<3>; .reg .pred %p<3>; .loc 16 7 0 $LDWbegin_testMemset: mov.u16 %rh1, %nctaid.x; mov.u16 %rh2, %ctaid.y; mul.wide.u16 %r1, %rh1, %rh2; cvt.u32.u16 %r2, %ctaid.x; add.u32 %r3, %r2, %r1; cvt.u32.u16 %r4, %ntid.x; mul.lo.u32 %r5, %r4, %r3; cvt.u32.u16 %r6, %tid.x; add.u32 %r7, %r6, %r5; ld.param.s32 %r8, [__cudaparm_testMemset_N]; setp.le.s32 %p1, %r8, %r7; @%p1 bra $Lt_0_1026; .loc 16 10 0 ld.param.f32 %f1, [__cudaparm_testMemset_value]; ld.param.u64 %rd1, [__cudaparm_testMemset_array]; cvt.s64.s32 %rd2, %r7; mul.wide.s32 %rd3, %r7, 4; add.u64 %rd4, %rd1, %rd3; st.global.f32 [%rd4+0], %f1; $Lt_0_1026: .loc 16 12 0 exit; $LDWend_testMemset: } // testMemset 3-3.11.1/cuda/cu/version.go000066400000000000000000000005001503346766200153050ustar00rootroot00000000000000package cu // This file implements CUDA driver version management //#include import "C" const CUDA_VERSION = C.CUDA_VERSION // Returns the CUDA driver version. 
func Version() int { var version C.int err := Result(C.cuDriverGetVersion(&version)) if err != SUCCESS { panic(err) } return int(version) } 3-3.11.1/cuda/cu/version_test.go000066400000000000000000000001761503346766200163550ustar00rootroot00000000000000package cu import ( "fmt" "testing" ) func TestVersion(t *testing.T) { fmt.Println("CUDA driver version: ", Version()) } 3-3.11.1/cuda/cubicanisotropy2.cu000066400000000000000000000051231503346766200165200ustar00rootroot00000000000000#include "amul.h" #include "float3.h" #include // add cubic anisotropy field to B. // B: effective field in T // m: reduced magnetization (unit length) // Ms: saturation magnetization in A/m. // K1: Kc1 in J/m3 // K2: Kc2 in T/m3 // C1, C2: anisotropy axes // // based on http://www.southampton.ac.uk/~fangohr/software/oxs_cubic8.html extern "C" __global__ void addcubicanisotropy2(float* __restrict__ Bx, float* __restrict__ By, float* __restrict__ Bz, float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float* __restrict__ Ms_, float Ms_mul, float* __restrict__ k1_, float k1_mul, float* __restrict__ k2_, float k2_mul, float* __restrict__ k3_, float k3_mul, float* __restrict__ c1x_, float c1x_mul, float* __restrict__ c1y_, float c1y_mul, float* __restrict__ c1z_, float c1z_mul, float* __restrict__ c2x_, float c2x_mul, float* __restrict__ c2y_, float c2y_mul, float* __restrict__ c2z_, float c2z_mul, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { float invMs = inv_Msat(Ms_, Ms_mul, i); float k1 = amul(k1_, k1_mul, i) * invMs; float k2 = amul(k2_, k2_mul, i) * invMs; float k3 = amul(k3_, k3_mul, i) * invMs; float3 u1 = normalized(vmul(c1x_, c1y_, c1z_, c1x_mul, c1y_mul, c1z_mul, i)); float3 u2 = normalized(vmul(c2x_, c2y_, c2z_, c2x_mul, c2y_mul, c2z_mul, i)); float3 u3 = cross(u1, u2); // 3rd axis perpendicular to u1,u2 float3 m = make_float3(mx[i], my[i], mz[i]); float u1m = dot(u1, m); float u2m = dot(u2, m); float u3m = 
dot(u3, m); float3 B = -2.0f*k1*((pow2(u2m) + pow2(u3m)) * ( (u1m) * u1) + (pow2(u1m) + pow2(u3m)) * ( (u2m) * u2) + (pow2(u1m) + pow2(u2m)) * ( (u3m) * u3))- 2.0f*k2*((pow2(u2m) * pow2(u3m)) * ( (u1m) * u1) + (pow2(u1m) * pow2(u3m)) * ( (u2m) * u2) + (pow2(u1m) * pow2(u2m)) * ( (u3m) * u3))- 4.0f*k3*((pow4(u2m) + pow4(u3m)) * (pow3(u1m) * u1) + (pow4(u1m) + pow4(u3m)) * (pow3(u2m) * u2) + (pow4(u1m) + pow4(u2m)) * (pow3(u3m) * u3)); Bx[i] += B.x; By[i] += B.y; Bz[i] += B.z; } } 3-3.11.1/cuda/cubicanisotropy2_wrapper.go000066400000000000000000004446521503346766200202740ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for addcubicanisotropy2 kernel var addcubicanisotropy2_code cu.Function // Stores the arguments for addcubicanisotropy2 kernel invocation type addcubicanisotropy2_args_t struct { arg_Bx unsafe.Pointer arg_By unsafe.Pointer arg_Bz unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_Ms_ unsafe.Pointer arg_Ms_mul float32 arg_k1_ unsafe.Pointer arg_k1_mul float32 arg_k2_ unsafe.Pointer arg_k2_mul float32 arg_k3_ unsafe.Pointer arg_k3_mul float32 arg_c1x_ unsafe.Pointer arg_c1x_mul float32 arg_c1y_ unsafe.Pointer arg_c1y_mul float32 arg_c1z_ unsafe.Pointer arg_c1z_mul float32 arg_c2x_ unsafe.Pointer arg_c2x_mul float32 arg_c2y_ unsafe.Pointer arg_c2y_mul float32 arg_c2z_ unsafe.Pointer arg_c2z_mul float32 arg_N int argptr [27]unsafe.Pointer sync.Mutex } // Stores the arguments for addcubicanisotropy2 kernel invocation var addcubicanisotropy2_args addcubicanisotropy2_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
addcubicanisotropy2_args.argptr[0] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Bx) addcubicanisotropy2_args.argptr[1] = unsafe.Pointer(&addcubicanisotropy2_args.arg_By) addcubicanisotropy2_args.argptr[2] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Bz) addcubicanisotropy2_args.argptr[3] = unsafe.Pointer(&addcubicanisotropy2_args.arg_mx) addcubicanisotropy2_args.argptr[4] = unsafe.Pointer(&addcubicanisotropy2_args.arg_my) addcubicanisotropy2_args.argptr[5] = unsafe.Pointer(&addcubicanisotropy2_args.arg_mz) addcubicanisotropy2_args.argptr[6] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Ms_) addcubicanisotropy2_args.argptr[7] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Ms_mul) addcubicanisotropy2_args.argptr[8] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k1_) addcubicanisotropy2_args.argptr[9] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k1_mul) addcubicanisotropy2_args.argptr[10] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k2_) addcubicanisotropy2_args.argptr[11] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k2_mul) addcubicanisotropy2_args.argptr[12] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k3_) addcubicanisotropy2_args.argptr[13] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k3_mul) addcubicanisotropy2_args.argptr[14] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1x_) addcubicanisotropy2_args.argptr[15] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1x_mul) addcubicanisotropy2_args.argptr[16] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1y_) addcubicanisotropy2_args.argptr[17] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1y_mul) addcubicanisotropy2_args.argptr[18] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1z_) addcubicanisotropy2_args.argptr[19] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1z_mul) addcubicanisotropy2_args.argptr[20] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2x_) addcubicanisotropy2_args.argptr[21] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2x_mul) addcubicanisotropy2_args.argptr[22] = 
unsafe.Pointer(&addcubicanisotropy2_args.arg_c2y_) addcubicanisotropy2_args.argptr[23] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2y_mul) addcubicanisotropy2_args.argptr[24] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2z_) addcubicanisotropy2_args.argptr[25] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2z_mul) addcubicanisotropy2_args.argptr[26] = unsafe.Pointer(&addcubicanisotropy2_args.arg_N) } // Wrapper for addcubicanisotropy2 CUDA kernel, asynchronous. func k_addcubicanisotropy2_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, k1_ unsafe.Pointer, k1_mul float32, k2_ unsafe.Pointer, k2_mul float32, k3_ unsafe.Pointer, k3_mul float32, c1x_ unsafe.Pointer, c1x_mul float32, c1y_ unsafe.Pointer, c1y_mul float32, c1z_ unsafe.Pointer, c1z_mul float32, c2x_ unsafe.Pointer, c2x_mul float32, c2y_ unsafe.Pointer, c2y_mul float32, c2z_ unsafe.Pointer, c2z_mul float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("addcubicanisotropy2") } addcubicanisotropy2_args.Lock() defer addcubicanisotropy2_args.Unlock() if addcubicanisotropy2_code == 0 { addcubicanisotropy2_code = fatbinLoad(addcubicanisotropy2_map, "addcubicanisotropy2") } addcubicanisotropy2_args.arg_Bx = Bx addcubicanisotropy2_args.arg_By = By addcubicanisotropy2_args.arg_Bz = Bz addcubicanisotropy2_args.arg_mx = mx addcubicanisotropy2_args.arg_my = my addcubicanisotropy2_args.arg_mz = mz addcubicanisotropy2_args.arg_Ms_ = Ms_ addcubicanisotropy2_args.arg_Ms_mul = Ms_mul addcubicanisotropy2_args.arg_k1_ = k1_ addcubicanisotropy2_args.arg_k1_mul = k1_mul addcubicanisotropy2_args.arg_k2_ = k2_ addcubicanisotropy2_args.arg_k2_mul = k2_mul addcubicanisotropy2_args.arg_k3_ = k3_ addcubicanisotropy2_args.arg_k3_mul = k3_mul addcubicanisotropy2_args.arg_c1x_ = c1x_ addcubicanisotropy2_args.arg_c1x_mul = c1x_mul addcubicanisotropy2_args.arg_c1y_ = c1y_ 
addcubicanisotropy2_args.arg_c1y_mul = c1y_mul addcubicanisotropy2_args.arg_c1z_ = c1z_ addcubicanisotropy2_args.arg_c1z_mul = c1z_mul addcubicanisotropy2_args.arg_c2x_ = c2x_ addcubicanisotropy2_args.arg_c2x_mul = c2x_mul addcubicanisotropy2_args.arg_c2y_ = c2y_ addcubicanisotropy2_args.arg_c2y_mul = c2y_mul addcubicanisotropy2_args.arg_c2z_ = c2z_ addcubicanisotropy2_args.arg_c2z_mul = c2z_mul addcubicanisotropy2_args.arg_N = N args := addcubicanisotropy2_args.argptr[:] cu.LaunchKernel(addcubicanisotropy2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("addcubicanisotropy2") } } // maps compute capability on PTX code for addcubicanisotropy2 kernel. var addcubicanisotropy2_map = map[int]string{0: "", 50: addcubicanisotropy2_ptx_50, 52: addcubicanisotropy2_ptx_52, 53: addcubicanisotropy2_ptx_53, 60: addcubicanisotropy2_ptx_60, 61: addcubicanisotropy2_ptx_61, 62: addcubicanisotropy2_ptx_62, 70: addcubicanisotropy2_ptx_70, 72: addcubicanisotropy2_ptx_72, 75: addcubicanisotropy2_ptx_75, 80: addcubicanisotropy2_ptx_80, 86: addcubicanisotropy2_ptx_86, 87: addcubicanisotropy2_ptx_87, 89: addcubicanisotropy2_ptx_89, 90: addcubicanisotropy2_ptx_90} // addcubicanisotropy2 PTX code for various compute capabilities. 
const ( addcubicanisotropy2_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<9>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, 
[addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r2, [addcubicanisotropy2_param_26]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; $L__BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f175, %f174; $L__BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r1, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; $L__BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r1, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; $L__BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r1, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; $L__BB0_11: mul.f32 %f11, %f175, %f178; setp.eq.s64 %p7, %rd11, 0; mul.f32 %f12, 
%f175, %f176; mul.f32 %f13, %f175, %f177; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r1, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; $L__BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r1, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; $L__BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r1, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; $L__BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; setp.eq.f32 %p10, %f20, 0f00000000; mov.f32 %f182, 0f00000000; @%p10 bra $L__BB0_19; rcp.rn.f32 %f182, %f20; $L__BB0_19: mul.f32 %f23, %f179, %f182; mul.f32 %f24, %f180, %f182; mul.f32 %f25, %f181, %f182; setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r1, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; $L__BB0_21: setp.eq.s64 %p12, %rd15, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r1, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; $L__BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra $L__BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r1, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; $L__BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; setp.eq.f32 %p14, %f32, 0f00000000; mov.f32 %f186, 0f00000000; @%p14 bra $L__BB0_27; rcp.rn.f32 %f186, %f32; $L__BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f24, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f25, %f66; sub.f32 %f68, %f65, %f67; 
mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f25, %f69; mul.f32 %f71, %f23, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f23, %f66; mul.f32 %f74, %f24, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r1, 4; add.s64 %rd49, %rd47, %rd48; ld.global.nc.f32 %f76, [%rd49]; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f24, %f77; fma.rn.f32 %f79, %f23, %f76, %f78; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f25, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f23, %f81; mul.f32 %f92, %f24, %f81; mul.f32 %f93, %f25, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f106, %f105, %f102; fma.rn.f32 %f110, %f107, %f105, %f103; fma.rn.f32 %f111, %f108, %f105, %f104; mul.f32 %f112, %f12, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f106, %f124, %f121; fma.rn.f32 %f126, %f107, %f124, %f122; fma.rn.f32 %f127, %f108, %f124, %f123; 
add.f32 %f128, %f13, %f13; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f23, %f138; mul.f32 %f140, %f24, %f138; mul.f32 %f141, %f25, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f11, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; $L__BB0_28: ret; } ` addcubicanisotropy2_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, 
.param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<9>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, 
[addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r2, [addcubicanisotropy2_param_26]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; $L__BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f175, %f174; $L__BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r1, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; $L__BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r1, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; $L__BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r1, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; $L__BB0_11: mul.f32 %f11, %f175, %f178; setp.eq.s64 %p7, %rd11, 0; mul.f32 %f12, %f175, %f176; mul.f32 %f13, %f175, %f177; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r1, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; $L__BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd12; 
mul.wide.s32 %rd33, %r1, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; $L__BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r1, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; $L__BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; setp.eq.f32 %p10, %f20, 0f00000000; mov.f32 %f182, 0f00000000; @%p10 bra $L__BB0_19; rcp.rn.f32 %f182, %f20; $L__BB0_19: mul.f32 %f23, %f179, %f182; mul.f32 %f24, %f180, %f182; mul.f32 %f25, %f181, %f182; setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r1, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; $L__BB0_21: setp.eq.s64 %p12, %rd15, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r1, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; $L__BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra $L__BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r1, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; $L__BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; setp.eq.f32 %p14, %f32, 0f00000000; mov.f32 %f186, 0f00000000; @%p14 bra $L__BB0_27; rcp.rn.f32 %f186, %f32; $L__BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f24, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f25, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f25, %f69; mul.f32 %f71, %f23, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f23, %f66; mul.f32 %f74, %f24, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r1, 4; add.s64 %rd49, %rd47, %rd48; ld.global.nc.f32 %f76, [%rd49]; 
cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f24, %f77; fma.rn.f32 %f79, %f23, %f76, %f78; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f25, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f23, %f81; mul.f32 %f92, %f24, %f81; mul.f32 %f93, %f25, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f106, %f105, %f102; fma.rn.f32 %f110, %f107, %f105, %f103; fma.rn.f32 %f111, %f108, %f105, %f104; mul.f32 %f112, %f12, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f106, %f124, %f121; fma.rn.f32 %f126, %f107, %f124, %f122; fma.rn.f32 %f127, %f108, %f124, %f123; add.f32 %f128, %f13, %f13; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, 
%f94; mul.f32 %f139, %f23, %f138; mul.f32 %f140, %f24, %f138; mul.f32 %f141, %f25, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f11, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; $L__BB0_28: ret; } ` addcubicanisotropy2_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 
addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<9>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, 
[addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r2, [addcubicanisotropy2_param_26]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; $L__BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f175, %f174; $L__BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r1, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; $L__BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r1, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; $L__BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r1, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; $L__BB0_11: mul.f32 %f11, %f175, %f178; setp.eq.s64 %p7, %rd11, 0; mul.f32 %f12, %f175, %f176; mul.f32 %f13, %f175, %f177; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r1, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; $L__BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r1, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; $L__BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r1, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, 
%f181; $L__BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; setp.eq.f32 %p10, %f20, 0f00000000; mov.f32 %f182, 0f00000000; @%p10 bra $L__BB0_19; rcp.rn.f32 %f182, %f20; $L__BB0_19: mul.f32 %f23, %f179, %f182; mul.f32 %f24, %f180, %f182; mul.f32 %f25, %f181, %f182; setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r1, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; $L__BB0_21: setp.eq.s64 %p12, %rd15, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r1, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; $L__BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra $L__BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r1, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; $L__BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; setp.eq.f32 %p14, %f32, 0f00000000; mov.f32 %f186, 0f00000000; @%p14 bra $L__BB0_27; rcp.rn.f32 %f186, %f32; $L__BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f24, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f25, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f25, %f69; mul.f32 %f71, %f23, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f23, %f66; mul.f32 %f74, %f24, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r1, 4; add.s64 %rd49, %rd47, %rd48; ld.global.nc.f32 %f76, [%rd49]; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f24, %f77; fma.rn.f32 %f79, %f23, %f76, %f78; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f25, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, 
%f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f23, %f81; mul.f32 %f92, %f24, %f81; mul.f32 %f93, %f25, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f106, %f105, %f102; fma.rn.f32 %f110, %f107, %f105, %f103; fma.rn.f32 %f111, %f108, %f105, %f104; mul.f32 %f112, %f12, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f106, %f124, %f121; fma.rn.f32 %f126, %f107, %f124, %f122; fma.rn.f32 %f127, %f108, %f124, %f123; add.f32 %f128, %f13, %f13; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f23, %f138; mul.f32 %f140, %f24, %f138; mul.f32 %f141, %f25, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; 
fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f11, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; $L__BB0_28: ret; } ` addcubicanisotropy2_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param 
.f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<9>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r2, [addcubicanisotropy2_param_26]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra 
$L__BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; $L__BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f175, %f174; $L__BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r1, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; $L__BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r1, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; $L__BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r1, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; $L__BB0_11: mul.f32 %f11, %f175, %f178; setp.eq.s64 %p7, %rd11, 0; mul.f32 %f12, %f175, %f176; mul.f32 %f13, %f175, %f177; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r1, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; $L__BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r1, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; $L__BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r1, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; $L__BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; setp.eq.f32 %p10, %f20, 0f00000000; mov.f32 %f182, 0f00000000; @%p10 bra $L__BB0_19; rcp.rn.f32 %f182, %f20; $L__BB0_19: mul.f32 %f23, %f179, %f182; mul.f32 %f24, %f180, %f182; mul.f32 %f25, %f181, 
%f182; setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r1, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; $L__BB0_21: setp.eq.s64 %p12, %rd15, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r1, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; $L__BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra $L__BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r1, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; $L__BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; setp.eq.f32 %p14, %f32, 0f00000000; mov.f32 %f186, 0f00000000; @%p14 bra $L__BB0_27; rcp.rn.f32 %f186, %f32; $L__BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f24, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f25, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f25, %f69; mul.f32 %f71, %f23, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f23, %f66; mul.f32 %f74, %f24, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r1, 4; add.s64 %rd49, %rd47, %rd48; ld.global.nc.f32 %f76, [%rd49]; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f24, %f77; fma.rn.f32 %f79, %f23, %f76, %f78; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f25, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f23, %f81; mul.f32 %f92, %f24, %f81; mul.f32 %f93, %f25, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, 
%f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f106, %f105, %f102; fma.rn.f32 %f110, %f107, %f105, %f103; fma.rn.f32 %f111, %f108, %f105, %f104; mul.f32 %f112, %f12, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f106, %f124, %f121; fma.rn.f32 %f126, %f107, %f124, %f122; fma.rn.f32 %f127, %f108, %f124, %f123; add.f32 %f128, %f13, %f13; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f23, %f138; mul.f32 %f140, %f24, %f138; mul.f32 %f141, %f25, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; 
fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f11, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; $L__BB0_28: ret; } ` addcubicanisotropy2_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred 
%p<15>; .reg .f32 %f<187>; .reg .b32 %r<9>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r2, [addcubicanisotropy2_param_26]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; $L__BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f175, %f174; $L__BB0_5: 
setp.eq.s64 %p4, %rd8, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r1, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; $L__BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r1, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; $L__BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r1, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; $L__BB0_11: mul.f32 %f11, %f175, %f178; setp.eq.s64 %p7, %rd11, 0; mul.f32 %f12, %f175, %f176; mul.f32 %f13, %f175, %f177; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r1, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; $L__BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r1, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; $L__BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r1, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; $L__BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; setp.eq.f32 %p10, %f20, 0f00000000; mov.f32 %f182, 0f00000000; @%p10 bra $L__BB0_19; rcp.rn.f32 %f182, %f20; $L__BB0_19: mul.f32 %f23, %f179, %f182; mul.f32 %f24, %f180, %f182; mul.f32 %f25, %f181, %f182; setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r1, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; $L__BB0_21: setp.eq.s64 %p12, %rd15, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r1, 4; add.s64 
%rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; $L__BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra $L__BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r1, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; $L__BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; setp.eq.f32 %p14, %f32, 0f00000000; mov.f32 %f186, 0f00000000; @%p14 bra $L__BB0_27; rcp.rn.f32 %f186, %f32; $L__BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f24, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f25, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f25, %f69; mul.f32 %f71, %f23, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f23, %f66; mul.f32 %f74, %f24, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r1, 4; add.s64 %rd49, %rd47, %rd48; ld.global.nc.f32 %f76, [%rd49]; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f24, %f77; fma.rn.f32 %f79, %f23, %f76, %f78; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f25, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f23, %f81; mul.f32 %f92, %f24, %f81; mul.f32 %f93, %f25, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; 
mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f106, %f105, %f102; fma.rn.f32 %f110, %f107, %f105, %f103; fma.rn.f32 %f111, %f108, %f105, %f104; mul.f32 %f112, %f12, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f106, %f124, %f121; fma.rn.f32 %f126, %f107, %f124, %f122; fma.rn.f32 %f127, %f108, %f124, %f123; add.f32 %f128, %f13, %f13; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f23, %f138; mul.f32 %f140, %f24, %f138; mul.f32 %f141, %f25, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f11, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; 
add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; $L__BB0_28: ret; } ` addcubicanisotropy2_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<9>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, 
[addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r2, [addcubicanisotropy2_param_26]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; $L__BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f175, %f174; $L__BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r1, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; $L__BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r1, 4; add.s64 %rd25, %rd23, 
%rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; $L__BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r1, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; $L__BB0_11: mul.f32 %f11, %f175, %f178; setp.eq.s64 %p7, %rd11, 0; mul.f32 %f12, %f175, %f176; mul.f32 %f13, %f175, %f177; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r1, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; $L__BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r1, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; $L__BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r1, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; $L__BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; setp.eq.f32 %p10, %f20, 0f00000000; mov.f32 %f182, 0f00000000; @%p10 bra $L__BB0_19; rcp.rn.f32 %f182, %f20; $L__BB0_19: mul.f32 %f23, %f179, %f182; mul.f32 %f24, %f180, %f182; mul.f32 %f25, %f181, %f182; setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r1, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; $L__BB0_21: setp.eq.s64 %p12, %rd15, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r1, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; $L__BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra $L__BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r1, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; $L__BB0_25: mul.f32 %f61, %f184, %f184; 
fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; setp.eq.f32 %p14, %f32, 0f00000000; mov.f32 %f186, 0f00000000; @%p14 bra $L__BB0_27; rcp.rn.f32 %f186, %f32; $L__BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f24, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f25, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f25, %f69; mul.f32 %f71, %f23, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f23, %f66; mul.f32 %f74, %f24, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r1, 4; add.s64 %rd49, %rd47, %rd48; ld.global.nc.f32 %f76, [%rd49]; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f24, %f77; fma.rn.f32 %f79, %f23, %f76, %f78; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f25, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f23, %f81; mul.f32 %f92, %f24, %f81; mul.f32 %f93, %f25, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f106, %f105, %f102; fma.rn.f32 %f110, %f107, %f105, %f103; fma.rn.f32 %f111, %f108, %f105, %f104; mul.f32 %f112, %f12, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, 
%f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f106, %f124, %f121; fma.rn.f32 %f126, %f107, %f124, %f122; fma.rn.f32 %f127, %f108, %f124, %f123; add.f32 %f128, %f13, %f13; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f23, %f138; mul.f32 %f140, %f24, %f138; mul.f32 %f141, %f25, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f11, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; 
st.global.f32 [%rd59], %f173; $L__BB0_28: ret; } ` addcubicanisotropy2_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<9>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; 
ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r2, [addcubicanisotropy2_param_26]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; $L__BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f175, %f174; $L__BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r1, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; $L__BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r1, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; $L__BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r1, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; $L__BB0_11: mul.f32 %f11, %f175, %f178; setp.eq.s64 %p7, %rd11, 
0; mul.f32 %f12, %f175, %f176; mul.f32 %f13, %f175, %f177; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r1, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; $L__BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r1, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; $L__BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r1, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; $L__BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; setp.eq.f32 %p10, %f20, 0f00000000; mov.f32 %f182, 0f00000000; @%p10 bra $L__BB0_19; rcp.rn.f32 %f182, %f20; $L__BB0_19: mul.f32 %f23, %f179, %f182; mul.f32 %f24, %f180, %f182; mul.f32 %f25, %f181, %f182; setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r1, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; $L__BB0_21: setp.eq.s64 %p12, %rd15, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r1, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; $L__BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra $L__BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r1, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; $L__BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; setp.eq.f32 %p14, %f32, 0f00000000; mov.f32 %f186, 0f00000000; @%p14 bra $L__BB0_27; rcp.rn.f32 %f186, %f32; $L__BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f24, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f25, %f66; sub.f32 %f68, 
%f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f25, %f69; mul.f32 %f71, %f23, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f23, %f66; mul.f32 %f74, %f24, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r1, 4; add.s64 %rd49, %rd47, %rd48; ld.global.nc.f32 %f76, [%rd49]; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f24, %f77; fma.rn.f32 %f79, %f23, %f76, %f78; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f25, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f23, %f81; mul.f32 %f92, %f24, %f81; mul.f32 %f93, %f25, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f106, %f105, %f102; fma.rn.f32 %f110, %f107, %f105, %f103; fma.rn.f32 %f111, %f108, %f105, %f104; mul.f32 %f112, %f12, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f106, %f124, %f121; fma.rn.f32 %f126, %f107, %f124, %f122; fma.rn.f32 %f127, %f108, 
%f124, %f123; add.f32 %f128, %f13, %f13; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f23, %f138; mul.f32 %f140, %f24, %f138; mul.f32 %f141, %f25, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f11, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; $L__BB0_28: ret; } ` addcubicanisotropy2_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 
addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<9>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 
%rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r2, [addcubicanisotropy2_param_26]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; $L__BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f175, %f174; $L__BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r1, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; $L__BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r1, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; $L__BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r1, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; $L__BB0_11: mul.f32 %f11, %f175, %f178; setp.eq.s64 %p7, %rd11, 0; mul.f32 %f12, %f175, %f176; mul.f32 %f13, %f175, %f177; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r1, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; $L__BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd12; 
mul.wide.s32 %rd33, %r1, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; $L__BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r1, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; $L__BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; setp.eq.f32 %p10, %f20, 0f00000000; mov.f32 %f182, 0f00000000; @%p10 bra $L__BB0_19; rcp.rn.f32 %f182, %f20; $L__BB0_19: mul.f32 %f23, %f179, %f182; mul.f32 %f24, %f180, %f182; mul.f32 %f25, %f181, %f182; setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r1, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; $L__BB0_21: setp.eq.s64 %p12, %rd15, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r1, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; $L__BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra $L__BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r1, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; $L__BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; setp.eq.f32 %p14, %f32, 0f00000000; mov.f32 %f186, 0f00000000; @%p14 bra $L__BB0_27; rcp.rn.f32 %f186, %f32; $L__BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f24, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f25, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f25, %f69; mul.f32 %f71, %f23, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f23, %f66; mul.f32 %f74, %f24, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r1, 4; add.s64 %rd49, %rd47, %rd48; ld.global.nc.f32 %f76, [%rd49]; 
cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f24, %f77; fma.rn.f32 %f79, %f23, %f76, %f78; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f25, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f23, %f81; mul.f32 %f92, %f24, %f81; mul.f32 %f93, %f25, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f106, %f105, %f102; fma.rn.f32 %f110, %f107, %f105, %f103; fma.rn.f32 %f111, %f108, %f105, %f104; mul.f32 %f112, %f12, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f106, %f124, %f121; fma.rn.f32 %f126, %f107, %f124, %f122; fma.rn.f32 %f127, %f108, %f124, %f123; add.f32 %f128, %f13, %f13; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, 
%f94; mul.f32 %f139, %f23, %f138; mul.f32 %f140, %f24, %f138; mul.f32 %f141, %f25, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f11, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; $L__BB0_28: ret; } ` addcubicanisotropy2_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 
addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<9>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, 
[addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r2, [addcubicanisotropy2_param_26]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; $L__BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f175, %f174; $L__BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r1, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; $L__BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r1, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; $L__BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r1, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; $L__BB0_11: mul.f32 %f11, %f175, %f178; setp.eq.s64 %p7, %rd11, 0; mul.f32 %f12, %f175, %f176; mul.f32 %f13, %f175, %f177; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r1, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; $L__BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r1, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; $L__BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r1, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, 
%f181; $L__BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; setp.eq.f32 %p10, %f20, 0f00000000; mov.f32 %f182, 0f00000000; @%p10 bra $L__BB0_19; rcp.rn.f32 %f182, %f20; $L__BB0_19: mul.f32 %f23, %f179, %f182; mul.f32 %f24, %f180, %f182; mul.f32 %f25, %f181, %f182; setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r1, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; $L__BB0_21: setp.eq.s64 %p12, %rd15, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r1, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; $L__BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra $L__BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r1, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; $L__BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; setp.eq.f32 %p14, %f32, 0f00000000; mov.f32 %f186, 0f00000000; @%p14 bra $L__BB0_27; rcp.rn.f32 %f186, %f32; $L__BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f24, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f25, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f25, %f69; mul.f32 %f71, %f23, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f23, %f66; mul.f32 %f74, %f24, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r1, 4; add.s64 %rd49, %rd47, %rd48; ld.global.nc.f32 %f76, [%rd49]; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f24, %f77; fma.rn.f32 %f79, %f23, %f76, %f78; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f25, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, 
%f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f23, %f81; mul.f32 %f92, %f24, %f81; mul.f32 %f93, %f25, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f106, %f105, %f102; fma.rn.f32 %f110, %f107, %f105, %f103; fma.rn.f32 %f111, %f108, %f105, %f104; mul.f32 %f112, %f12, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f106, %f124, %f121; fma.rn.f32 %f126, %f107, %f124, %f122; fma.rn.f32 %f127, %f108, %f124, %f123; add.f32 %f128, %f13, %f13; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f23, %f138; mul.f32 %f140, %f24, %f138; mul.f32 %f141, %f25, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; 
fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f11, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; $L__BB0_28: ret; } ` addcubicanisotropy2_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param 
.f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<9>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r2, [addcubicanisotropy2_param_26]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra 
$L__BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; $L__BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f175, %f174; $L__BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r1, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; $L__BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r1, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; $L__BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r1, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; $L__BB0_11: mul.f32 %f11, %f175, %f178; setp.eq.s64 %p7, %rd11, 0; mul.f32 %f12, %f175, %f176; mul.f32 %f13, %f175, %f177; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r1, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; $L__BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r1, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; $L__BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r1, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; $L__BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; setp.eq.f32 %p10, %f20, 0f00000000; mov.f32 %f182, 0f00000000; @%p10 bra $L__BB0_19; rcp.rn.f32 %f182, %f20; $L__BB0_19: mul.f32 %f23, %f179, %f182; mul.f32 %f24, %f180, %f182; mul.f32 %f25, %f181, 
%f182; setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r1, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; $L__BB0_21: setp.eq.s64 %p12, %rd15, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r1, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; $L__BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra $L__BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r1, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; $L__BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; setp.eq.f32 %p14, %f32, 0f00000000; mov.f32 %f186, 0f00000000; @%p14 bra $L__BB0_27; rcp.rn.f32 %f186, %f32; $L__BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f24, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f25, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f25, %f69; mul.f32 %f71, %f23, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f23, %f66; mul.f32 %f74, %f24, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r1, 4; add.s64 %rd49, %rd47, %rd48; ld.global.nc.f32 %f76, [%rd49]; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f24, %f77; fma.rn.f32 %f79, %f23, %f76, %f78; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f25, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f23, %f81; mul.f32 %f92, %f24, %f81; mul.f32 %f93, %f25, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, 
%f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f106, %f105, %f102; fma.rn.f32 %f110, %f107, %f105, %f103; fma.rn.f32 %f111, %f108, %f105, %f104; mul.f32 %f112, %f12, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f106, %f124, %f121; fma.rn.f32 %f126, %f107, %f124, %f122; fma.rn.f32 %f127, %f108, %f124, %f123; add.f32 %f128, %f13, %f13; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f23, %f138; mul.f32 %f140, %f24, %f138; mul.f32 %f141, %f25, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; 
fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f11, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; $L__BB0_28: ret; } ` addcubicanisotropy2_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred 
%p<15>; .reg .f32 %f<187>; .reg .b32 %r<9>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r2, [addcubicanisotropy2_param_26]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; $L__BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f175, %f174; $L__BB0_5: 
setp.eq.s64 %p4, %rd8, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r1, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; $L__BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r1, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; $L__BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r1, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; $L__BB0_11: mul.f32 %f11, %f175, %f178; setp.eq.s64 %p7, %rd11, 0; mul.f32 %f12, %f175, %f176; mul.f32 %f13, %f175, %f177; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r1, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; $L__BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r1, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; $L__BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r1, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; $L__BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; setp.eq.f32 %p10, %f20, 0f00000000; mov.f32 %f182, 0f00000000; @%p10 bra $L__BB0_19; rcp.rn.f32 %f182, %f20; $L__BB0_19: mul.f32 %f23, %f179, %f182; mul.f32 %f24, %f180, %f182; mul.f32 %f25, %f181, %f182; setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r1, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; $L__BB0_21: setp.eq.s64 %p12, %rd15, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r1, 4; add.s64 
%rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; $L__BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra $L__BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r1, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; $L__BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; setp.eq.f32 %p14, %f32, 0f00000000; mov.f32 %f186, 0f00000000; @%p14 bra $L__BB0_27; rcp.rn.f32 %f186, %f32; $L__BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f24, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f25, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f25, %f69; mul.f32 %f71, %f23, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f23, %f66; mul.f32 %f74, %f24, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r1, 4; add.s64 %rd49, %rd47, %rd48; ld.global.nc.f32 %f76, [%rd49]; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f24, %f77; fma.rn.f32 %f79, %f23, %f76, %f78; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f25, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f23, %f81; mul.f32 %f92, %f24, %f81; mul.f32 %f93, %f25, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; 
mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f106, %f105, %f102; fma.rn.f32 %f110, %f107, %f105, %f103; fma.rn.f32 %f111, %f108, %f105, %f104; mul.f32 %f112, %f12, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f106, %f124, %f121; fma.rn.f32 %f126, %f107, %f124, %f122; fma.rn.f32 %f127, %f108, %f124, %f123; add.f32 %f128, %f13, %f13; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f23, %f138; mul.f32 %f140, %f24, %f138; mul.f32 %f141, %f25, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f11, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; 
add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; $L__BB0_28: ret; } ` addcubicanisotropy2_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<9>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, 
[addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r2, [addcubicanisotropy2_param_26]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; $L__BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f175, %f174; $L__BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r1, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; $L__BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r1, 4; add.s64 %rd25, %rd23, 
%rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; $L__BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r1, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; $L__BB0_11: mul.f32 %f11, %f175, %f178; setp.eq.s64 %p7, %rd11, 0; mul.f32 %f12, %f175, %f176; mul.f32 %f13, %f175, %f177; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r1, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; $L__BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r1, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; $L__BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r1, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; $L__BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; setp.eq.f32 %p10, %f20, 0f00000000; mov.f32 %f182, 0f00000000; @%p10 bra $L__BB0_19; rcp.rn.f32 %f182, %f20; $L__BB0_19: mul.f32 %f23, %f179, %f182; mul.f32 %f24, %f180, %f182; mul.f32 %f25, %f181, %f182; setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r1, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; $L__BB0_21: setp.eq.s64 %p12, %rd15, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r1, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; $L__BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra $L__BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r1, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; $L__BB0_25: mul.f32 %f61, %f184, %f184; 
fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; setp.eq.f32 %p14, %f32, 0f00000000; mov.f32 %f186, 0f00000000; @%p14 bra $L__BB0_27; rcp.rn.f32 %f186, %f32; $L__BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f24, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f25, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f25, %f69; mul.f32 %f71, %f23, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f23, %f66; mul.f32 %f74, %f24, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r1, 4; add.s64 %rd49, %rd47, %rd48; ld.global.nc.f32 %f76, [%rd49]; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f24, %f77; fma.rn.f32 %f79, %f23, %f76, %f78; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f25, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f23, %f81; mul.f32 %f92, %f24, %f81; mul.f32 %f93, %f25, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f106, %f105, %f102; fma.rn.f32 %f110, %f107, %f105, %f103; fma.rn.f32 %f111, %f108, %f105, %f104; mul.f32 %f112, %f12, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, 
%f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f106, %f124, %f121; fma.rn.f32 %f126, %f107, %f124, %f122; fma.rn.f32 %f127, %f108, %f124, %f123; add.f32 %f128, %f13, %f13; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f23, %f138; mul.f32 %f140, %f24, %f138; mul.f32 %f141, %f25, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f11, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; 
st.global.f32 [%rd59], %f173; $L__BB0_28: ret; } ` addcubicanisotropy2_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<9>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; 
ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r2, [addcubicanisotropy2_param_26]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; $L__BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f175, %f174; $L__BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r1, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; $L__BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r1, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; $L__BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r1, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; $L__BB0_11: mul.f32 %f11, %f175, %f178; setp.eq.s64 %p7, %rd11, 
0; mul.f32 %f12, %f175, %f176; mul.f32 %f13, %f175, %f177; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r1, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; $L__BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r1, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; $L__BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r1, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; $L__BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; setp.eq.f32 %p10, %f20, 0f00000000; mov.f32 %f182, 0f00000000; @%p10 bra $L__BB0_19; rcp.rn.f32 %f182, %f20; $L__BB0_19: mul.f32 %f23, %f179, %f182; mul.f32 %f24, %f180, %f182; mul.f32 %f25, %f181, %f182; setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r1, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; $L__BB0_21: setp.eq.s64 %p12, %rd15, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r1, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; $L__BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra $L__BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r1, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; $L__BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; setp.eq.f32 %p14, %f32, 0f00000000; mov.f32 %f186, 0f00000000; @%p14 bra $L__BB0_27; rcp.rn.f32 %f186, %f32; $L__BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f24, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f25, %f66; sub.f32 %f68, 
%f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f25, %f69; mul.f32 %f71, %f23, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f23, %f66; mul.f32 %f74, %f24, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r1, 4; add.s64 %rd49, %rd47, %rd48; ld.global.nc.f32 %f76, [%rd49]; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f24, %f77; fma.rn.f32 %f79, %f23, %f76, %f78; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f25, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f23, %f81; mul.f32 %f92, %f24, %f81; mul.f32 %f93, %f25, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f106, %f105, %f102; fma.rn.f32 %f110, %f107, %f105, %f103; fma.rn.f32 %f111, %f108, %f105, %f104; mul.f32 %f112, %f12, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f106, %f124, %f121; fma.rn.f32 %f126, %f107, %f124, %f122; fma.rn.f32 %f127, %f108, 
%f124, %f123; add.f32 %f128, %f13, %f13; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f23, %f138; mul.f32 %f140, %f24, %f138; mul.f32 %f141, %f25, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f11, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; $L__BB0_28: ret; } ` addcubicanisotropy2_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 
addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<9>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 
%rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r2, [addcubicanisotropy2_param_26]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; $L__BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f175, %f174; $L__BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r1, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; $L__BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r1, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; $L__BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r1, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; $L__BB0_11: mul.f32 %f11, %f175, %f178; setp.eq.s64 %p7, %rd11, 0; mul.f32 %f12, %f175, %f176; mul.f32 %f13, %f175, %f177; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r1, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; $L__BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd12; 
mul.wide.s32 %rd33, %r1, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; $L__BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r1, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; $L__BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; setp.eq.f32 %p10, %f20, 0f00000000; mov.f32 %f182, 0f00000000; @%p10 bra $L__BB0_19; rcp.rn.f32 %f182, %f20; $L__BB0_19: mul.f32 %f23, %f179, %f182; mul.f32 %f24, %f180, %f182; mul.f32 %f25, %f181, %f182; setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r1, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; $L__BB0_21: setp.eq.s64 %p12, %rd15, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r1, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; $L__BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra $L__BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r1, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; $L__BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; setp.eq.f32 %p14, %f32, 0f00000000; mov.f32 %f186, 0f00000000; @%p14 bra $L__BB0_27; rcp.rn.f32 %f186, %f32; $L__BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f24, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f25, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f25, %f69; mul.f32 %f71, %f23, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f23, %f66; mul.f32 %f74, %f24, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r1, 4; add.s64 %rd49, %rd47, %rd48; ld.global.nc.f32 %f76, [%rd49]; 
cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f24, %f77; fma.rn.f32 %f79, %f23, %f76, %f78; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f25, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f23, %f81; mul.f32 %f92, %f24, %f81; mul.f32 %f93, %f25, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f106, %f105, %f102; fma.rn.f32 %f110, %f107, %f105, %f103; fma.rn.f32 %f111, %f108, %f105, %f104; mul.f32 %f112, %f12, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f106, %f124, %f121; fma.rn.f32 %f126, %f107, %f124, %f122; fma.rn.f32 %f127, %f108, %f124, %f123; add.f32 %f128, %f13, %f13; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, 
%f94; mul.f32 %f139, %f23, %f138; mul.f32 %f140, %f24, %f138; mul.f32 %f141, %f25, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f11, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; $L__BB0_28: ret; } ` ) 3-3.11.1/cuda/cuda2go.go000066400000000000000000000127341503346766200145510ustar00rootroot00000000000000//go:build ignore // +build ignore // This program generates Go wrappers for cuda sources. // The cuda file should contain exactly one __global__ void. package main import ( "bufio" "bytes" "flag" "fmt" "io" "log" "os" "regexp" "strconv" "text/scanner" "text/template" "github.com/mumax/3/util" ) func main() { flag.Parse() for _, fname := range flag.Args() { cuda2go(fname) } } // generate cuda wrapper for file. 
func cuda2go(fname string) { // open cuda file f, err := os.Open(fname) util.PanicErr(err) defer f.Close() // read tokens var token []string var s scanner.Scanner s.Init(f) tok := s.Scan() for tok != scanner.EOF { if !filter(s.TokenText()) { token = append(token, s.TokenText()) } tok = s.Scan() } // find function name and arguments funcname := "" argstart, argstop := -1, -1 for i := 0; i < len(token); i++ { if token[i] == "__global__" { funcname = token[i+2] argstart = i + 4 } if argstart > 0 && token[i] == ")" { argstop = i + 1 break } } argl := token[argstart:argstop] // isolate individual arguments var args [][]string start := 0 for i, a := range argl { if a == "," || a == ")" { args = append(args, argl[start:i]) start = i + 1 } } // separate arg names/types and make pointers Go-style argn := make([]string, len(args)) argt := make([]string, len(args)) for i := range args { if args[i][1] == "*" { args[i] = []string{args[i][0] + "*", args[i][2]} } argt[i] = typemap(args[i][0]) argn[i] = args[i][1] } wrapgen(fname, funcname, argt, argn) } // translate C type to Go type. 
func typemap(ctype string) string { if gotype, ok := tm[ctype]; ok { return gotype } panic(fmt.Errorf("unsupported cuda type: %v", ctype)) } var tm = map[string]string{"float*": "unsafe.Pointer", "float": "float32", "int": "int", "uint8_t*": "unsafe.Pointer", "uint8_t": "byte"} // template data type Kernel struct { Name string ArgT []string ArgN []string PTX map[int]string } var ls []string // generate wrapper code from template func wrapgen(filename, funcname string, argt, argn []string) { kernel := &Kernel{funcname, argt, argn, make(map[int]string)} // find corresponding .PTX files if ls == nil { dir, errd := os.Open(".") defer dir.Close() util.PanicErr(errd) var errls error ls, errls = dir.Readdirnames(-1) util.PanicErr(errls) } basename := util.NoExt(filename) for _, f := range ls { match, e := regexp.MatchString(`^`+basename+`_[0-9]+\.ptx`, f) util.PanicErr(e) if match { loc := regexp.MustCompile(`_[0-9]+\.ptx`).FindStringIndex(f) // Start and end indices of "_CC.ptx" cc, ei := strconv.Atoi(f[loc[0]+1 : loc[1]-len(".ptx")]) util.PanicErr(ei) fmt.Println(basename, cc) kernel.PTX[cc] = filterptx(f) } } if len(kernel.PTX) == 0 { log.Fatal("no PTX files for ", filename) } wrapfname := basename + "_wrapper.go" wrapout, err := os.OpenFile(wrapfname, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666) util.PanicErr(err) defer wrapout.Close() util.PanicErr(templ.Execute(wrapout, kernel)) } // wrapper code template text const templText = `package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import( "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" ) // CUDA handle for {{.Name}} kernel var {{.Name}}_code cu.Function // Stores the arguments for {{.Name}} kernel invocation type {{.Name}}_args_t struct{ {{range $i, $_ := .ArgN}} arg_{{.}} {{index $.ArgT $i}} {{end}} argptr [{{len .ArgN}}]unsafe.Pointer sync.Mutex } // Stores the arguments for {{.Name}} kernel invocation var {{.Name}}_args {{.Name}}_args_t func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. {{range $i, $t := .ArgN}} {{$.Name}}_args.argptr[{{$i}}] = unsafe.Pointer(&{{$.Name}}_args.arg_{{.}}) {{end}} } // Wrapper for {{.Name}} CUDA kernel, asynchronous. func k_{{.Name}}_async ( {{range $i, $t := .ArgT}}{{index $.ArgN $i}} {{$t}}, {{end}} cfg *config) { if Synchronous{ // debug Sync() timer.Start("{{.Name}}") } {{.Name}}_args.Lock() defer {{.Name}}_args.Unlock() if {{.Name}}_code == 0{ {{.Name}}_code = fatbinLoad({{.Name}}_map, "{{.Name}}") } {{range $i, $t := .ArgN}} {{$.Name}}_args.arg_{{.}} = {{.}} {{end}} args := {{.Name}}_args.argptr[:] cu.LaunchKernel({{.Name}}_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous{ // debug Sync() timer.Stop("{{.Name}}") } } // maps compute capability on PTX code for {{.Name}} kernel. var {{.Name}}_map = map[int]string{ 0: "" {{range $k, $v := .PTX}}, {{$k}}: {{$.Name}}_ptx_{{$k}} {{end}} } // {{.Name}} PTX code for various compute capabilities. const( {{range $k, $v := .PTX}} {{$.Name}}_ptx_{{$k}} = {{$v}} {{end}}) ` // wrapper code template var templ = template.Must(template.New("wrap").Parse(templText)) // should token be filtered out of stream? func filter(token string) bool { switch token { case "__restrict__": return true } return false } // Filter comments and ".file" entries from ptx code. // They spoil the git history. 
func filterptx(fname string) string { f, err := os.Open(fname) util.PanicErr(err) defer f.Close() in := bufio.NewReader(f) var out bytes.Buffer out.Write(([]byte)("`")) line, err := in.ReadBytes('\n') for err != io.EOF { util.PanicErr(err) if !bytes.HasPrefix(line, []byte("//")) && !bytes.HasPrefix(line, []byte(" .file")) { out.Write(line) } line, err = in.ReadBytes('\n') } out.Write(([]byte)("`")) return out.String() } 3-3.11.1/cuda/cufft/000077500000000000000000000000001503346766200137765ustar00rootroot000000000000003-3.11.1/cuda/cufft/Makefile000066400000000000000000000006401503346766200154360ustar00rootroot00000000000000all: 6g gccgo doc 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. -compiler $(GCCGO) clean: go clean doc: godoc github.com/barnex/cuda5/cufft > README 3-3.11.1/cuda/cufft/README000066400000000000000000000056461503346766200146710ustar00rootroot00000000000000PACKAGE DOCUMENTATION package cufft import "github.com/barnex/cuda5/cufft" Go bindings for the CUDA CUFFT API. CONSTANTS const ( FORWARD = -1 // Forward FFT INVERSE = 1 // Inverse FFT ) TYPES type CompatibilityMode int CUFFT compatibility mode const ( COMPATIBILITY_FFTW_PADDING CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING ) func (t CompatibilityMode) String() string type Handle uintptr FFT plan handle, reference type to a plan func Plan1d(nx int, typ Type, batch int) Handle 1D FFT plan func Plan2d(nx, ny int, typ Type) Handle 2D FFT plan func Plan3d(nx, ny, nz int, typ Type) Handle 3D FFT plan func PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle 1D,2D or 3D FFT plan func (plan *Handle) Destroy() Destroys the plan. 
func (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int) Execute Complex-to-Complex plan func (plan Handle) ExecC2R(idata, odata cu.DevicePtr) Execute Complex-to-Real plan func (plan Handle) ExecD2Z(idata, odata cu.DevicePtr) Execute Double Real-to-Complex plan func (plan Handle) ExecR2C(idata, odata cu.DevicePtr) Execute Real-to-Complex plan func (plan Handle) ExecZ2D(idata, odata cu.DevicePtr) Execute Double Complex-to-Real plan func (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int) Execute Double Complex-to-Complex plan func (plan Handle) SetCompatibilityMode(mode CompatibilityMode) Sets the FFTW compatibility mode func (plan Handle) SetStream(stream cu.Stream) Sets the cuda stream for this plan type Result int FFT result const ( SUCCESS Result = C.CUFFT_SUCCESS INVALID_PLAN Result = C.CUFFT_INVALID_PLAN ALLOC_FAILED Result = C.CUFFT_ALLOC_FAILED INVALID_TYPE Result = C.CUFFT_INVALID_TYPE INVALID_VALUE Result = C.CUFFT_INVALID_VALUE INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR EXEC_FAILED Result = C.CUFFT_EXEC_FAILED SETUP_FAILED Result = C.CUFFT_SETUP_FAILED INVALID_SIZE Result = C.CUFFT_INVALID_SIZE UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA INCOMPLETE_PARAMETER_LIST Result = 0xA // cuda6 values copied to avoid dependency on cuda6/cufft.h INVALID_DEVICE Result = 0xB PARSE_ERROR Result = 0xC NO_WORKSPACE Result = 0xD ) FFT result value func (r Result) String() string type Type int FFT type const ( R2C Type = C.CUFFT_R2C // Real to Complex (interleaved) C2R Type = C.CUFFT_C2R // Complex (interleaved) to Real C2C Type = C.CUFFT_C2C // Complex to Complex, interleaved D2Z Type = C.CUFFT_D2Z // Double to Double-Complex Z2D Type = C.CUFFT_Z2D // Double-Complex to Double Z2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex ) func (t Type) String() string 3-3.11.1/cuda/cufft/cgoflags.go000066400000000000000000000010411503346766200161060ustar00rootroot00000000000000package cufft // This file provides CGO flags to find CUDA libraries 
and headers. //#cgo LDFLAGS:-lcufft // ////default location: //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib //#cgo CFLAGS: -I/usr/local/cuda/include/ // ////Ubuntu 15.04: //#cgo LDFLAGS:-L/usr/lib/x86_64-linux-gnu/ //#cgo CFLAGS: -I/usr/include // ////arch linux: //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib //#cgo CFLAGS: -I/opt/cuda/include // ////WINDOWS: //#cgo windows LDFLAGS:-LC:/cuda/lib/x64 //#cgo windows CFLAGS: -IC:/cuda/include -w import "C" 3-3.11.1/cuda/cufft/doc.go000066400000000000000000000000651503346766200150730ustar00rootroot00000000000000// Go bindings for the CUDA CUFFT API. package cufft 3-3.11.1/cuda/cufft/fft_test.go000066400000000000000000000014441503346766200161460ustar00rootroot00000000000000package cufft import ( "fmt" "testing" "unsafe" "github.com/mumax/3/cuda/cu" ) func TestExampleFFT1D(t *testing.T) { N := 8 hostIn := make([]float32, N) hostIn[0] = 1 devIn := cu.MemAlloc(int64(len(hostIn)) * cu.SIZEOF_FLOAT32) defer cu.MemFree(devIn) cu.MemcpyHtoD(devIn, unsafe.Pointer(&hostIn[0]), devIn.Bytes()) hostOut := make([]complex64, N/2+1) devOut := cu.MemAlloc(int64(len(hostOut)) * cu.SIZEOF_COMPLEX64) defer cu.MemFree(devOut) plan := Plan1d(N, R2C, 1) defer plan.Destroy() plan.ExecR2C(devIn, devOut) cu.MemcpyDtoH(unsafe.Pointer(&hostOut[0]), devOut, devOut.Bytes()) fmt.Println("hostIn:", hostIn) fmt.Println("hostOut:", hostOut) for i := 0; i < N; i++ { if hostOut[0] != 1+0i { t.Errorf("hostOut[%d]: got %f, want %f", i, hostOut[0], 1+0i) } } } 3-3.11.1/cuda/cufft/init_test.go000066400000000000000000000003421503346766200163260ustar00rootroot00000000000000package cufft import ( "fmt" "github.com/mumax/3/cuda/cu" ) // needed for all other tests. 
func init() { cu.Init(0) ctx := cu.CtxCreate(cu.CTX_SCHED_AUTO, 0) cu.CtxSetCurrent(ctx) fmt.Println("Created CUDA context") } 3-3.11.1/cuda/cufft/mode.go000066400000000000000000000010571503346766200152540ustar00rootroot00000000000000package cufft //#include import "C" import ( "fmt" ) // CUFFT compatibility mode type CompatibilityMode int const ( COMPATIBILITY_FFTW_PADDING CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING ) func (t CompatibilityMode) String() string { if str, ok := compatibilityModeString[t]; ok { return str } return fmt.Sprint("CUFFT Compatibility mode with unknown number:", int(t)) } var compatibilityModeString map[CompatibilityMode]string = map[CompatibilityMode]string{ COMPATIBILITY_FFTW_PADDING: "CUFFT_COMPATIBILITY_FFTW_PADDING"} 3-3.11.1/cuda/cufft/plan.go000066400000000000000000000103541503346766200152620ustar00rootroot00000000000000// Copyright 2011 Arne Vansteenkiste (barnex@gmail.com). All rights reserved. // Use of this source code is governed by a freeBSD // license that can be found in the LICENSE.txt file. 
package cufft

//#include <cufft.h>
import "C"

import (
	"unsafe"

	"github.com/mumax/3/cuda/cu"
)

// FFT plan handle, reference type to a plan
type Handle uintptr

// Plan1d creates a 1D FFT plan through cufftPlan1d.
// It panics with the returned Result when the call does not report SUCCESS.
func Plan1d(nx int, typ Type, batch int) Handle {
	var handle C.cufftHandle
	err := Result(C.cufftPlan1d(
		&handle,
		C.int(nx),
		C.cufftType(typ),
		C.int(batch)))
	if err != SUCCESS {
		panic(err)
	}
	return Handle(handle)
}

// Plan2d creates a 2D FFT plan through cufftPlan2d.
// It panics with the returned Result when the call does not report SUCCESS.
func Plan2d(nx, ny int, typ Type) Handle {
	var handle C.cufftHandle
	err := Result(C.cufftPlan2d(
		&handle,
		C.int(nx),
		C.int(ny),
		C.cufftType(typ)))
	if err != SUCCESS {
		panic(err)
	}
	return Handle(handle)
}

// Plan3d creates a 3D FFT plan through cufftPlan3d.
// It panics with the returned Result when the call does not report SUCCESS.
func Plan3d(nx, ny, nz int, typ Type) Handle {
	var handle C.cufftHandle
	err := Result(C.cufftPlan3d(
		&handle,
		C.int(nx),
		C.int(ny),
		C.int(nz),
		C.cufftType(typ)))
	if err != SUCCESS {
		panic(err)
	}
	return Handle(handle)
}

//cufftPlanMany(
//    cufftHandle *plan, int rank, int *n, int *inembed,
//    int istride, int idist, int *onembed, int ostride,
//    int odist, cufftType type, int batch );

// PlanMany creates a 1D, 2D or 3D FFT plan through cufftPlanMany.
// The rank is len(n). A nil or empty inembed/oembed is passed on as a NULL
// pointer with a distance of 0; otherwise the distance is the first element of
// the respective slice. It panics with INVALID_SIZE when n is empty and with
// the returned Result when the call does not report SUCCESS.
func PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle {
	if len(n) == 0 {
		panic(INVALID_SIZE)
	}

	// cufftPlanMany takes arrays of C int. A Go int need not have the same
	// size (it is 8 bytes on 64-bit platforms), so the slices are converted
	// element by element instead of reinterpreting their storage.
	toC := func(v []int) []C.int {
		out := make([]C.int, len(v))
		for i, x := range v {
			out[i] = C.int(x)
		}
		return out
	}

	dims := toC(n)

	var inembedptr *C.int // nil is handed to C as NULL
	idist := 0
	if len(inembed) > 0 {
		in := toC(inembed)
		inembedptr = &in[0]
		idist = inembed[0]
	}

	var oembedptr *C.int // nil is handed to C as NULL
	odist := 0
	if len(oembed) > 0 {
		out := toC(oembed)
		oembedptr = &out[0]
		odist = oembed[0]
	}

	var handle C.cufftHandle
	err := Result(C.cufftPlanMany(
		&handle,
		C.int(len(dims)), // rank
		&dims[0],         // n
		inembedptr,
		C.int(istride),
		C.int(idist),
		oembedptr,
		C.int(ostride),
		C.int(odist),
		C.cufftType(typ),
		C.int(batch)))
	if err != SUCCESS {
		panic(err)
	}
	return Handle(handle)
}

// ExecC2C executes a Complex-to-Complex plan in the given direction.
// It panics with the returned Result when the call does not report SUCCESS.
func (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int) {
	err := Result(C.cufftExecC2C(
		C.cufftHandle(plan),
		(*C.cufftComplex)(unsafe.Pointer(uintptr(idata))),
		(*C.cufftComplex)(unsafe.Pointer(uintptr(odata))),
		C.int(direction)))
	if err != SUCCESS {
		panic(err)
	}
}

// ExecR2C executes a Real-to-Complex plan.
// It panics with the returned Result when the call does not report SUCCESS.
func (plan Handle) ExecR2C(idata, odata cu.DevicePtr) {
	err := Result(C.cufftExecR2C(
		C.cufftHandle(plan),
		(*C.cufftReal)(unsafe.Pointer(uintptr(idata))),
		(*C.cufftComplex)(unsafe.Pointer(uintptr(odata)))))
	if err != SUCCESS {
		panic(err)
	}
}

// ExecC2R executes a Complex-to-Real plan.
// It panics with the returned Result when the call does not report SUCCESS.
func (plan Handle) ExecC2R(idata, odata cu.DevicePtr) {
	err := Result(C.cufftExecC2R(
		C.cufftHandle(plan),
		(*C.cufftComplex)(unsafe.Pointer(uintptr(idata))),
		(*C.cufftReal)(unsafe.Pointer(uintptr(odata)))))
	if err != SUCCESS {
		panic(err)
	}
}

// ExecZ2Z executes a Double Complex-to-Complex plan in the given direction.
// It panics with the returned Result when the call does not report SUCCESS.
func (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int) {
	err := Result(C.cufftExecZ2Z(
		C.cufftHandle(plan),
		(*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata))),
		(*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata))),
		C.int(direction)))
	if err != SUCCESS {
		panic(err)
	}
}

// ExecD2Z executes a Double Real-to-Complex plan.
// It panics with the returned Result when the call does not report SUCCESS.
func (plan Handle) ExecD2Z(idata, odata cu.DevicePtr) {
	err := Result(C.cufftExecD2Z(
		C.cufftHandle(plan),
		(*C.cufftDoubleReal)(unsafe.Pointer(uintptr(idata))),
		(*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata)))))
	if err != SUCCESS {
		panic(err)
	}
}

// ExecZ2D executes a Double Complex-to-Real plan.
// It panics with the returned Result when the call does not report SUCCESS.
func (plan Handle) ExecZ2D(idata, odata cu.DevicePtr) {
	err := Result(C.cufftExecZ2D(
		C.cufftHandle(plan),
		(*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata))),
		(*C.cufftDoubleReal)(unsafe.Pointer(uintptr(odata)))))
	if err != SUCCESS {
		panic(err)
	}
}

// Destroys the plan.
func (plan *Handle) Destroy() { err := Result(C.cufftDestroy(C.cufftHandle(*plan))) *plan = 0 // make sure plan is not used anymore if err != SUCCESS { panic(err) } } // Sets the cuda stream for this plan func (plan Handle) SetStream(stream cu.Stream) { err := Result(C.cufftSetStream( C.cufftHandle(plan), C.cudaStream_t(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } 3-3.11.1/cuda/cufft/result.go000066400000000000000000000034671503346766200156550ustar00rootroot00000000000000package cufft //#include import "C" import ( "fmt" ) // FFT result type Result int // FFT result value const ( SUCCESS Result = C.CUFFT_SUCCESS INVALID_PLAN Result = C.CUFFT_INVALID_PLAN ALLOC_FAILED Result = C.CUFFT_ALLOC_FAILED INVALID_TYPE Result = C.CUFFT_INVALID_TYPE INVALID_VALUE Result = C.CUFFT_INVALID_VALUE INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR EXEC_FAILED Result = C.CUFFT_EXEC_FAILED SETUP_FAILED Result = C.CUFFT_SETUP_FAILED INVALID_SIZE Result = C.CUFFT_INVALID_SIZE UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA INCOMPLETE_PARAMETER_LIST Result = 0xA // cuda6 values copied to avoid dependency on cuda6/cufft.h INVALID_DEVICE Result = 0xB PARSE_ERROR Result = 0xC NO_WORKSPACE Result = 0xD ) func (r Result) String() string { if str, ok := resultString[r]; ok { return str } return fmt.Sprint("CUFFT Result with unknown error number:", int(r)) } var resultString map[Result]string = map[Result]string{ SUCCESS: "CUFFT_SUCCESS", INVALID_PLAN: "CUFFT_INVALID_PLAN", ALLOC_FAILED: "CUFFT_ALLOC_FAILED", INVALID_TYPE: "CUFFT_INVALID_TYPE", INVALID_VALUE: "CUFFT_INVALID_VALUE", INTERNAL_ERROR: "CUFFT_INTERNAL_ERROR", EXEC_FAILED: "CUFFT_EXEC_FAILED", SETUP_FAILED: "CUFFT_SETUP_FAILED", INVALID_SIZE: "CUFFT_INVALID_SIZE", UNALIGNED_DATA: "CUFFT_UNALIGNED_DATA", INCOMPLETE_PARAMETER_LIST: "CUFFT_INCOMPLETE_PARAMETER_LIST", INVALID_DEVICE: "CUFFT_INVALID_DEVICE", PARSE_ERROR: "CUFFT_PARSE_ERROR", NO_WORKSPACE: "CUFFT_NO_WORKSPACE"} 
3-3.11.1/cuda/cufft/type.go000066400000000000000000000014671503346766200153160ustar00rootroot00000000000000package cufft //#include import "C" import ( "fmt" ) // FFT type type Type int const ( R2C Type = C.CUFFT_R2C // Real to Complex (interleaved) C2R Type = C.CUFFT_C2R // Complex (interleaved) to Real C2C Type = C.CUFFT_C2C // Complex to Complex, interleaved D2Z Type = C.CUFFT_D2Z // Double to Double-Complex Z2D Type = C.CUFFT_Z2D // Double-Complex to Double Z2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex ) const ( FORWARD = -1 // Forward FFT INVERSE = 1 // Inverse FFT ) func (t Type) String() string { if str, ok := typeString[t]; ok { return str } return fmt.Sprint("CUFFT Type with unknown number:", int(t)) } var typeString map[Type]string = map[Type]string{ R2C: "CUFFT_R2C", C2R: "CUFFT_C2R", C2C: "CUFFT_C2C", D2Z: "CUFFT_D2Z", Z2D: "CUFFT_Z2D", Z2Z: "CUFFT_Z2Z"} 3-3.11.1/cuda/curand/000077500000000000000000000000001503346766200141435ustar00rootroot000000000000003-3.11.1/cuda/curand/Makefile000066400000000000000000000006171503346766200156070ustar00rootroot00000000000000all: 6g gccgo doc 6g: go install -v gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. 
-compiler $(GCCGO) clean: go clean doc: godoc github.com/barnex/cuda5/curand > README 3-3.11.1/cuda/curand/README000066400000000000000000000045651503346766200150350ustar00rootroot00000000000000PACKAGE DOCUMENTATION package curand import "github.com/barnex/cuda5/curand" TYPES type Generator uintptr func CreateGenerator(rngType RngType) Generator func (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32) func (g Generator) SetSeed(seed int64) type RngType int const ( PSEUDO_DEFAULT RngType = C.CURAND_RNG_PSEUDO_DEFAULT // Default pseudorandom generator PSEUDO_XORWOW RngType = C.CURAND_RNG_PSEUDO_XORWOW // XORWOW pseudorandom generator QUASI_DEFAULT RngType = C.CURAND_RNG_QUASI_DEFAULT // Default quasirandom generator QUASI_SOBOL32 RngType = C.CURAND_RNG_QUASI_SOBOL32 // Sobol32 quasirandom generator QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // Scrambled Sobol32 quasirandom generator QUASI_SOBOL64 RngType = C.CURAND_RNG_QUASI_SOBOL64 // Sobol64 quasirandom generator QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator ) type Status int const ( SUCCESS Status = C.CURAND_STATUS_SUCCESS // No errors VERSION_MISMATCH Status = C.CURAND_STATUS_VERSION_MISMATCH // Header file and linked library version do not match NOT_INITIALIZED Status = C.CURAND_STATUS_NOT_INITIALIZED // Generator not initialized ALLOCATION_FAILED Status = C.CURAND_STATUS_ALLOCATION_FAILED // Memory allocation failed TYPE_ERROR Status = C.CURAND_STATUS_TYPE_ERROR // Generator is wrong type OUT_OF_RANGE Status = C.CURAND_STATUS_OUT_OF_RANGE // Argument out of range LENGTH_NOT_MULTIPLE Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE // Length requested is not a multple of dimension LAUNCH_FAILURE Status = C.CURAND_STATUS_LAUNCH_FAILURE // Kernel launch failure PREEXISTING_FAILURE Status = C.CURAND_STATUS_PREEXISTING_FAILURE // Preexisting failure on library entry INITIALIZATION_FAILED Status = 
C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed ARCH_MISMATCH Status = C.CURAND_STATUS_ARCH_MISMATCH // Architecture mismatch, GPU does not support requested feature INTERNAL_ERROR Status = C.CURAND_STATUS_INTERNAL_ERROR // Internal library error ) 3-3.11.1/cuda/curand/cgoflags.go000066400000000000000000000010431503346766200162550ustar00rootroot00000000000000package curand // This file provides CGO flags to find CUDA libraries and headers. //#cgo LDFLAGS:-lcurand // ////default location: //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib //#cgo CFLAGS: -I/usr/local/cuda/include/ // ////Ubuntu 15.04: //#cgo LDFLAGS:-L/usr/lib/x86_64-linux-gnu/ //#cgo CFLAGS: -I/usr/include // ////arch linux: //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib //#cgo CFLAGS: -I/opt/cuda/include // ////WINDOWS: //#cgo windows LDFLAGS:-LC:/cuda/lib/x64 //#cgo windows CFLAGS: -IC:/cuda/include -w import "C" 3-3.11.1/cuda/curand/generator.go000066400000000000000000000032261503346766200164630ustar00rootroot00000000000000package curand //#include import "C" import ( "unsafe" ) type Generator uintptr type RngType int const ( PSEUDO_DEFAULT RngType = C.CURAND_RNG_PSEUDO_DEFAULT // Default pseudorandom generator PSEUDO_XORWOW RngType = C.CURAND_RNG_PSEUDO_XORWOW // XORWOW pseudorandom generator QUASI_DEFAULT RngType = C.CURAND_RNG_QUASI_DEFAULT // Default quasirandom generator QUASI_SOBOL32 RngType = C.CURAND_RNG_QUASI_SOBOL32 // Sobol32 quasirandom generator QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // Scrambled Sobol32 quasirandom generator QUASI_SOBOL64 RngType = C.CURAND_RNG_QUASI_SOBOL64 // Sobol64 quasirandom generator QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator ) func CreateGenerator(rngType RngType) Generator { var rng C.curandGenerator_t err := Status(C.curandCreateGenerator(&rng, C.curandRngType_t(rngType))) if err != SUCCESS { panic(err) } return 
Generator(uintptr(unsafe.Pointer(rng))) // cgo } func (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32) { err := Status(C.curandGenerateNormal( C.curandGenerator_t(unsafe.Pointer(uintptr(g))), (*C.float)(unsafe.Pointer(output)), C.size_t(n), C.float(mean), C.float(stddev))) if err != SUCCESS { panic(err) } } func (g Generator) SetSeed(seed int64) { err := Status(C.curandSetPseudoRandomGeneratorSeed(C.curandGenerator_t(unsafe.Pointer(uintptr(g))), C.ulonglong(seed))) if err != SUCCESS { panic(err) } } // Documentation was taken from the curand headers. 3-3.11.1/cuda/curand/status.go000066400000000000000000000043411503346766200160170ustar00rootroot00000000000000package curand //#include import "C" import ( "fmt" ) type Status int const ( SUCCESS Status = C.CURAND_STATUS_SUCCESS // No errors VERSION_MISMATCH Status = C.CURAND_STATUS_VERSION_MISMATCH // Header file and linked library version do not match NOT_INITIALIZED Status = C.CURAND_STATUS_NOT_INITIALIZED // Generator not initialized ALLOCATION_FAILED Status = C.CURAND_STATUS_ALLOCATION_FAILED // Memory allocation failed TYPE_ERROR Status = C.CURAND_STATUS_TYPE_ERROR // Generator is wrong type OUT_OF_RANGE Status = C.CURAND_STATUS_OUT_OF_RANGE // Argument out of range LENGTH_NOT_MULTIPLE Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE // Length requested is not a multple of dimension LAUNCH_FAILURE Status = C.CURAND_STATUS_LAUNCH_FAILURE // Kernel launch failure PREEXISTING_FAILURE Status = C.CURAND_STATUS_PREEXISTING_FAILURE // Preexisting failure on library entry INITIALIZATION_FAILED Status = C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed ARCH_MISMATCH Status = C.CURAND_STATUS_ARCH_MISMATCH // Architecture mismatch, GPU does not support requested feature INTERNAL_ERROR Status = C.CURAND_STATUS_INTERNAL_ERROR // Internal library error ) func (s Status) String() string { if str, ok := statusStr[s]; ok { return str } else { return fmt.Sprint("CURAND ERROR NUMBER ", 
int(s)) } } var statusStr = map[Status]string{ SUCCESS: "CURAND_STATUS_SUCCESS", VERSION_MISMATCH: "CURAND_STATUS_VERSION_MISMATCH", NOT_INITIALIZED: "CURAND_STATUS_NOT_INITIALIZED", ALLOCATION_FAILED: "CURAND_STATUS_ALLOCATION_FAILED", TYPE_ERROR: "CURAND_STATUS_TYPE_ERROR", OUT_OF_RANGE: "CURAND_STATUS_OUT_OF_RANGE", LENGTH_NOT_MULTIPLE: "CURAND_STATUS_LENGTH_NOT_MULTIPLE", LAUNCH_FAILURE: "CURAND_STATUS_LAUNCH_FAILURE", PREEXISTING_FAILURE: "CURAND_STATUS_PREEXISTING_FAILURE", INITIALIZATION_FAILED: "CURAND_STATUS_INITIALIZATION_FAILED", ARCH_MISMATCH: "CURAND_STATUS_ARCH_MISMATCH", INTERNAL_ERROR: "CURAND_STATUS_INTERNAL_ERROR", } // Documentation was taken from the curand headers. 3-3.11.1/cuda/div.cu000066400000000000000000000005611503346766200140040ustar00rootroot00000000000000// dst[i] = a[i] / b[i] extern "C" __global__ void pointwise_div(float* __restrict__ dst, float* __restrict__ a, float* __restrict__ b, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if(i < N) { if (b[i] != 0.0f) { dst[i] = a[i] / b[i]; } else { dst[i] = 0.0f; } } } 3-3.11.1/cuda/div_wrapper.go000066400000000000000000000516441503346766200155520ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for pointwise_div kernel var pointwise_div_code cu.Function // Stores the arguments for pointwise_div kernel invocation type pointwise_div_args_t struct { arg_dst unsafe.Pointer arg_a unsafe.Pointer arg_b unsafe.Pointer arg_N int argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for pointwise_div kernel invocation var pointwise_div_args pointwise_div_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
pointwise_div_args.argptr[0] = unsafe.Pointer(&pointwise_div_args.arg_dst) pointwise_div_args.argptr[1] = unsafe.Pointer(&pointwise_div_args.arg_a) pointwise_div_args.argptr[2] = unsafe.Pointer(&pointwise_div_args.arg_b) pointwise_div_args.argptr[3] = unsafe.Pointer(&pointwise_div_args.arg_N) } // Wrapper for pointwise_div CUDA kernel, asynchronous. func k_pointwise_div_async(dst unsafe.Pointer, a unsafe.Pointer, b unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("pointwise_div") } pointwise_div_args.Lock() defer pointwise_div_args.Unlock() if pointwise_div_code == 0 { pointwise_div_code = fatbinLoad(pointwise_div_map, "pointwise_div") } pointwise_div_args.arg_dst = dst pointwise_div_args.arg_a = a pointwise_div_args.arg_b = b pointwise_div_args.arg_N = N args := pointwise_div_args.argptr[:] cu.LaunchKernel(pointwise_div_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("pointwise_div") } } // maps compute capability on PTX code for pointwise_div kernel. var pointwise_div_map = map[int]string{0: "", 50: pointwise_div_ptx_50, 52: pointwise_div_ptx_52, 53: pointwise_div_ptx_53, 60: pointwise_div_ptx_60, 61: pointwise_div_ptx_61, 62: pointwise_div_ptx_62, 70: pointwise_div_ptx_70, 72: pointwise_div_ptx_72, 75: pointwise_div_ptx_75, 80: pointwise_div_ptx_80, 86: pointwise_div_ptx_86, 87: pointwise_div_ptx_87, 89: pointwise_div_ptx_89, 90: pointwise_div_ptx_90} // pointwise_div PTX code for various compute capabilities. 
const ( pointwise_div_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd3, [pointwise_div_param_0]; ld.param.u64 %rd4, [pointwise_div_param_1]; ld.param.u64 %rd5, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd6, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f1, [%rd8]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd2, %rd9, %rd7; @%p2 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f2, [%rd12]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd2], %f3; bra.uni $L__BB0_4; $L__BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd2], %r9; $L__BB0_4: ret; } ` pointwise_div_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd3, [pointwise_div_param_0]; ld.param.u64 %rd4, [pointwise_div_param_1]; ld.param.u64 %rd5, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, 
%r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd6, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f1, [%rd8]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd2, %rd9, %rd7; @%p2 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f2, [%rd12]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd2], %f3; bra.uni $L__BB0_4; $L__BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd2], %r9; $L__BB0_4: ret; } ` pointwise_div_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd3, [pointwise_div_param_0]; ld.param.u64 %rd4, [pointwise_div_param_1]; ld.param.u64 %rd5, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd6, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f1, [%rd8]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd2, %rd9, %rd7; @%p2 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f2, [%rd12]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd2], %f3; bra.uni $L__BB0_4; $L__BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd2], %r9; $L__BB0_4: ret; } ` pointwise_div_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 
pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd3, [pointwise_div_param_0]; ld.param.u64 %rd4, [pointwise_div_param_1]; ld.param.u64 %rd5, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd6, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f1, [%rd8]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd2, %rd9, %rd7; @%p2 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f2, [%rd12]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd2], %f3; bra.uni $L__BB0_4; $L__BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd2], %r9; $L__BB0_4: ret; } ` pointwise_div_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd3, [pointwise_div_param_0]; ld.param.u64 %rd4, [pointwise_div_param_1]; ld.param.u64 %rd5, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd6, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; 
ld.global.nc.f32 %f1, [%rd8]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd2, %rd9, %rd7; @%p2 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f2, [%rd12]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd2], %f3; bra.uni $L__BB0_4; $L__BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd2], %r9; $L__BB0_4: ret; } ` pointwise_div_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd3, [pointwise_div_param_0]; ld.param.u64 %rd4, [pointwise_div_param_1]; ld.param.u64 %rd5, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd6, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f1, [%rd8]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd2, %rd9, %rd7; @%p2 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f2, [%rd12]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd2], %f3; bra.uni $L__BB0_4; $L__BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd2], %r9; $L__BB0_4: ret; } ` pointwise_div_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; 
.reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd3, [pointwise_div_param_0]; ld.param.u64 %rd4, [pointwise_div_param_1]; ld.param.u64 %rd5, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd6, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f1, [%rd8]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd2, %rd9, %rd7; @%p2 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f2, [%rd12]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd2], %f3; bra.uni $L__BB0_4; $L__BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd2], %r9; $L__BB0_4: ret; } ` pointwise_div_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd3, [pointwise_div_param_0]; ld.param.u64 %rd4, [pointwise_div_param_1]; ld.param.u64 %rd5, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd6, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f1, [%rd8]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd2, %rd9, %rd7; @%p2 bra $L__BB0_3; bra.uni $L__BB0_2; 
$L__BB0_3: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f2, [%rd12]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd2], %f3; bra.uni $L__BB0_4; $L__BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd2], %r9; $L__BB0_4: ret; } ` pointwise_div_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd3, [pointwise_div_param_0]; ld.param.u64 %rd4, [pointwise_div_param_1]; ld.param.u64 %rd5, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd6, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f1, [%rd8]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd2, %rd9, %rd7; @%p2 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f2, [%rd12]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd2], %f3; bra.uni $L__BB0_4; $L__BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd2], %r9; $L__BB0_4: ret; } ` pointwise_div_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd3, [pointwise_div_param_0]; ld.param.u64 %rd4, [pointwise_div_param_1]; ld.param.u64 %rd5, 
[pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd6, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f1, [%rd8]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd2, %rd9, %rd7; @%p2 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f2, [%rd12]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd2], %f3; bra.uni $L__BB0_4; $L__BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd2], %r9; $L__BB0_4: ret; } ` pointwise_div_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd3, [pointwise_div_param_0]; ld.param.u64 %rd4, [pointwise_div_param_1]; ld.param.u64 %rd5, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd6, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f1, [%rd8]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd2, %rd9, %rd7; @%p2 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f2, [%rd12]; div.rn.f32 %f3, %f2, %f1; 
st.global.f32 [%rd2], %f3; bra.uni $L__BB0_4; $L__BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd2], %r9; $L__BB0_4: ret; } ` pointwise_div_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd3, [pointwise_div_param_0]; ld.param.u64 %rd4, [pointwise_div_param_1]; ld.param.u64 %rd5, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd6, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f1, [%rd8]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd2, %rd9, %rd7; @%p2 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f2, [%rd12]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd2], %f3; bra.uni $L__BB0_4; $L__BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd2], %r9; $L__BB0_4: ret; } ` pointwise_div_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd3, [pointwise_div_param_0]; ld.param.u64 %rd4, [pointwise_div_param_1]; ld.param.u64 %rd5, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, 
%r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd6, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f1, [%rd8]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd2, %rd9, %rd7; @%p2 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f2, [%rd12]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd2], %f3; bra.uni $L__BB0_4; $L__BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd2], %r9; $L__BB0_4: ret; } ` pointwise_div_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd3, [pointwise_div_param_0]; ld.param.u64 %rd4, [pointwise_div_param_1]; ld.param.u64 %rd5, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd6, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f1, [%rd8]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd2, %rd9, %rd7; @%p2 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f2, [%rd12]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd2], %f3; bra.uni $L__BB0_4; $L__BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd2], %r9; $L__BB0_4: ret; } ` ) 
3-3.11.1/cuda/dmi.cu000066400000000000000000000170531503346766200137770ustar00rootroot00000000000000#include #include "exchange.h" #include "float3.h" #include "stencil.h" #include "amul.h" // Exchange + Dzyaloshinskii-Moriya interaction according to // Bagdanov and Röβler, PRL 87, 3, 2001. eq.8 (out-of-plane symmetry breaking). // Taking into account proper boundary conditions. // m: normalized magnetization // H: effective field in Tesla // D: dmi strength, in Tesla*m // A: Aex extern "C" __global__ void adddmi(float* __restrict__ Hx, float* __restrict__ Hy, float* __restrict__ Hz, float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float* __restrict__ Ms_, float Ms_mul, float* __restrict__ aLUT2d, float* __restrict__ dLUT2d, uint8_t* __restrict__ regions, float cx, float cy, float cz, int Nx, int Ny, int Nz, uint8_t PBC, uint8_t OpenBC) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix >= Nx || iy >= Ny || iz >= Nz) { return; } int I = idx(ix, iy, iz); // central cell index float3 h = make_float3(0.0,0.0,0.0); // add to H float3 m0 = make_float3(mx[I], my[I], mz[I]); // central m uint8_t r0 = regions[I]; int i_; // neighbor index if(is0(m0)) { return; } // x derivatives (along length) { float3 m1 = make_float3(0.0f, 0.0f, 0.0f); // left neighbor i_ = idx(lclampx(ix-1), iy, iz); // load neighbor m if inside grid, keep 0 otherwise if (ix-1 >= 0 || PBCx) { m1 = make_float3(mx[i_], my[i_], mz[i_]); } int r1 = is0(m1)? 
r0 : regions[i_]; // don't use inter region params if m1=0 float A1 = aLUT2d[symidx(r0, r1)]; // inter-region Aex float D1 = dLUT2d[symidx(r0, r1)]; // inter-region Dex if (!is0(m1) || !OpenBC){ // do nothing at an open boundary if (is0(m1)) { // neighbor missing m1.x = m0.x - (-cx * (0.5f*D1/A1) * m0.z); // extrapolate missing m from Neumann BC's m1.y = m0.y; m1.z = m0.z + (-cx * (0.5f*D1/A1) * m0.x); } h += (2.0f*A1/(cx*cx)) * (m1 - m0); // exchange h.x += (D1/cx)*(- m1.z); h.z -= (D1/cx)*(- m1.x); } } { float3 m2 = make_float3(0.0f, 0.0f, 0.0f); // right neighbor i_ = idx(hclampx(ix+1), iy, iz); if (ix+1 < Nx || PBCx) { m2 = make_float3(mx[i_], my[i_], mz[i_]); } int r2 = is0(m2)? r0 : regions[i_]; float A2 = aLUT2d[symidx(r0, r2)]; float D2 = dLUT2d[symidx(r0, r2)]; if (!is0(m2) || !OpenBC){ if (is0(m2)) { m2.x = m0.x - (cx * (0.5f*D2/A2) * m0.z); m2.y = m0.y; m2.z = m0.z + (cx * (0.5f*D2/A2) * m0.x); } h += (2.0f*A2/(cx*cx)) * (m2 - m0); h.x += (D2/cx)*(m2.z); h.z -= (D2/cx)*(m2.x); } } // y derivatives (along height) { float3 m1 = make_float3(0.0f, 0.0f, 0.0f); i_ = idx(ix, lclampy(iy-1), iz); if (iy-1 >= 0 || PBCy) { m1 = make_float3(mx[i_], my[i_], mz[i_]); } int r1 = is0(m1)? r0 : regions[i_]; float A1 = aLUT2d[symidx(r0, r1)]; float D1 = dLUT2d[symidx(r0, r1)]; if (!is0(m1) || !OpenBC){ if (is0(m1)) { m1.x = m0.x; m1.y = m0.y - (-cy * (0.5f*D1/A1) * m0.z); m1.z = m0.z + (-cy * (0.5f*D1/A1) * m0.y); } h += (2.0f*A1/(cy*cy)) * (m1 - m0); h.y += (D1/cy)*(- m1.z); h.z -= (D1/cy)*(- m1.y); } } { float3 m2 = make_float3(0.0f, 0.0f, 0.0f); i_ = idx(ix, hclampy(iy+1), iz); if (iy+1 < Ny || PBCy) { m2 = make_float3(mx[i_], my[i_], mz[i_]); } int r2 = is0(m2)? 
r0 : regions[i_]; float A2 = aLUT2d[symidx(r0, r2)]; float D2 = dLUT2d[symidx(r0, r2)]; if (!is0(m2) || !OpenBC){ if (is0(m2)) { m2.x = m0.x; m2.y = m0.y - (cy * (0.5f*D2/A2) * m0.z); m2.z = m0.z + (cy * (0.5f*D2/A2) * m0.y); } h += (2.0f*A2/(cy*cy)) * (m2 - m0); h.y += (D2/cy)*(m2.z); h.z -= (D2/cy)*(m2.y); } } // only take vertical derivative for 3D sim if (Nz != 1) { // bottom neighbor { i_ = idx(ix, iy, lclampz(iz-1)); float3 m1 = make_float3(mx[i_], my[i_], mz[i_]); m1 = ( is0(m1)? m0: m1 ); // Neumann BC float A1 = aLUT2d[symidx(r0, regions[i_])]; h += (2.0f*A1/(cz*cz)) * (m1 - m0); // Exchange only } // top neighbor { i_ = idx(ix, iy, hclampz(iz+1)); float3 m2 = make_float3(mx[i_], my[i_], mz[i_]); m2 = ( is0(m2)? m0: m2 ); float A2 = aLUT2d[symidx(r0, regions[i_])]; h += (2.0f*A2/(cz*cz)) * (m2 - m0); } } // write back, result is H + Hdmi + Hex float invMs = inv_Msat(Ms_, Ms_mul, I); Hx[I] += h.x*invMs; Hy[I] += h.y*invMs; Hz[I] += h.z*invMs; } // Note on boundary conditions. // // We need the derivative and laplacian of m in point A, but e.g. C lies out of the boundaries. // We use the boundary condition in B (derivative of the magnetization) to extrapolate m to point C: // m_C = m_A + (dm/dx)|_B * cellsize // // When point C is inside the boundary, we just use its actual value. // // Then we can take the central derivative in A: // (dm/dx)|_A = (m_C - m_D) / (2*cellsize) // And the laplacian: // lapl(m)|_A = (m_C + m_D - 2*m_A) / (cellsize^2) // // All these operations should be second order as they involve only central derivatives. 
// // ------------------------------------------------------------------ * // | | C | // | | ** | // | | *** | // | | *** | // | | *** | // | | *** | // | B | // | *** | | // | *** | | // | **** | | // | **** | | // | **** | | // | ** A | | // | ***** | | // | ****** | | // | ********* | | // |D ******** | | // | | | // +----------------+----------------+-----------------+---------------+ // -1 -0.5 0 0.5 1 // x 3-3.11.1/cuda/dmi.go000066400000000000000000000015321503346766200137700ustar00rootroot00000000000000package cuda import ( "unsafe" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Add effective field of Dzyaloshinskii-Moriya interaction to Beff (Tesla). // According to Bogdanov and Röβler, PRL 87, 3, 2001. eq.8 (out-of-plane symmetry breaking). // See dmi.cu func AddDMI(Beff *data.Slice, m *data.Slice, Aex, Dex SymmLUT, Msat MSlice, regions *Bytes, mesh *data.Mesh, OpenBC bool) { cellsize := mesh.CellSize() N := Beff.Size() util.Argument(m.Size() == N) cfg := make3DConf(N) var openBC byte if OpenBC { openBC = 1 } k_adddmi_async(Beff.DevPtr(X), Beff.DevPtr(Y), Beff.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), Msat.DevPtr(0), Msat.Mul(0), unsafe.Pointer(Aex), unsafe.Pointer(Dex), regions.Ptr, float32(cellsize[X]), float32(cellsize[Y]), float32(cellsize[Z]), N[X], N[Y], N[Z], mesh.PBC_code(), openBC, cfg) } 3-3.11.1/cuda/dmi_wrapper.go000066400000000000000000006721531503346766200155450ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for adddmi kernel var adddmi_code cu.Function // Stores the arguments for adddmi kernel invocation type adddmi_args_t struct { arg_Hx unsafe.Pointer arg_Hy unsafe.Pointer arg_Hz unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_Ms_ unsafe.Pointer arg_Ms_mul float32 arg_aLUT2d unsafe.Pointer arg_dLUT2d unsafe.Pointer arg_regions unsafe.Pointer arg_cx float32 arg_cy float32 arg_cz float32 arg_Nx int arg_Ny int arg_Nz int arg_PBC byte arg_OpenBC byte argptr [19]unsafe.Pointer sync.Mutex } // Stores the arguments for adddmi kernel invocation var adddmi_args adddmi_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. adddmi_args.argptr[0] = unsafe.Pointer(&adddmi_args.arg_Hx) adddmi_args.argptr[1] = unsafe.Pointer(&adddmi_args.arg_Hy) adddmi_args.argptr[2] = unsafe.Pointer(&adddmi_args.arg_Hz) adddmi_args.argptr[3] = unsafe.Pointer(&adddmi_args.arg_mx) adddmi_args.argptr[4] = unsafe.Pointer(&adddmi_args.arg_my) adddmi_args.argptr[5] = unsafe.Pointer(&adddmi_args.arg_mz) adddmi_args.argptr[6] = unsafe.Pointer(&adddmi_args.arg_Ms_) adddmi_args.argptr[7] = unsafe.Pointer(&adddmi_args.arg_Ms_mul) adddmi_args.argptr[8] = unsafe.Pointer(&adddmi_args.arg_aLUT2d) adddmi_args.argptr[9] = unsafe.Pointer(&adddmi_args.arg_dLUT2d) adddmi_args.argptr[10] = unsafe.Pointer(&adddmi_args.arg_regions) adddmi_args.argptr[11] = unsafe.Pointer(&adddmi_args.arg_cx) adddmi_args.argptr[12] = unsafe.Pointer(&adddmi_args.arg_cy) adddmi_args.argptr[13] = unsafe.Pointer(&adddmi_args.arg_cz) adddmi_args.argptr[14] = unsafe.Pointer(&adddmi_args.arg_Nx) adddmi_args.argptr[15] = unsafe.Pointer(&adddmi_args.arg_Ny) adddmi_args.argptr[16] = unsafe.Pointer(&adddmi_args.arg_Nz) adddmi_args.argptr[17] = unsafe.Pointer(&adddmi_args.arg_PBC) adddmi_args.argptr[18] = unsafe.Pointer(&adddmi_args.arg_OpenBC) } // Wrapper for adddmi CUDA 
kernel, asynchronous. func k_adddmi_async(Hx unsafe.Pointer, Hy unsafe.Pointer, Hz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, aLUT2d unsafe.Pointer, dLUT2d unsafe.Pointer, regions unsafe.Pointer, cx float32, cy float32, cz float32, Nx int, Ny int, Nz int, PBC byte, OpenBC byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("adddmi") } adddmi_args.Lock() defer adddmi_args.Unlock() if adddmi_code == 0 { adddmi_code = fatbinLoad(adddmi_map, "adddmi") } adddmi_args.arg_Hx = Hx adddmi_args.arg_Hy = Hy adddmi_args.arg_Hz = Hz adddmi_args.arg_mx = mx adddmi_args.arg_my = my adddmi_args.arg_mz = mz adddmi_args.arg_Ms_ = Ms_ adddmi_args.arg_Ms_mul = Ms_mul adddmi_args.arg_aLUT2d = aLUT2d adddmi_args.arg_dLUT2d = dLUT2d adddmi_args.arg_regions = regions adddmi_args.arg_cx = cx adddmi_args.arg_cy = cy adddmi_args.arg_cz = cz adddmi_args.arg_Nx = Nx adddmi_args.arg_Ny = Ny adddmi_args.arg_Nz = Nz adddmi_args.arg_PBC = PBC adddmi_args.arg_OpenBC = OpenBC args := adddmi_args.argptr[:] cu.LaunchKernel(adddmi_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("adddmi") } } // maps compute capability on PTX code for adddmi kernel. var adddmi_map = map[int]string{0: "", 50: adddmi_ptx_50, 52: adddmi_ptx_52, 53: adddmi_ptx_53, 60: adddmi_ptx_60, 61: adddmi_ptx_61, 62: adddmi_ptx_62, 70: adddmi_ptx_70, 72: adddmi_ptx_72, 75: adddmi_ptx_75, 80: adddmi_ptx_80, 86: adddmi_ptx_86, 87: adddmi_ptx_87, 89: adddmi_ptx_89, 90: adddmi_ptx_90} // adddmi PTX code for various compute capabilities. 
const ( adddmi_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<50>; .reg .b16 %rs<45>; .reg .f32 %f<303>; .reg .b32 %r<111>; .reg .b64 %rd<85>; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; ld.param.u64 %rd8, [adddmi_param_0]; ld.param.u64 %rd9, [adddmi_param_1]; ld.param.u64 %rd10, [adddmi_param_2]; ld.param.u64 %rd12, [adddmi_param_3]; ld.param.u64 %rd13, [adddmi_param_4]; ld.param.u64 %rd14, [adddmi_param_5]; ld.param.u64 %rd11, [adddmi_param_6]; ld.param.f32 %f301, [adddmi_param_7]; ld.param.u64 %rd15, [adddmi_param_8]; ld.param.u64 %rd16, [adddmi_param_9]; ld.param.u64 %rd17, [adddmi_param_10]; ld.param.f32 %f137, [adddmi_param_11]; ld.param.f32 %f138, [adddmi_param_12]; ld.param.f32 %f139, [adddmi_param_13]; ld.param.u32 %r34, [adddmi_param_14]; ld.param.u32 %r35, [adddmi_param_15]; ld.param.u32 %r36, [adddmi_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r37, %ntid.x; mov.u32 %r38, %ctaid.x; mov.u32 %r39, %tid.x; mad.lo.s32 %r1, %r38, %r37, %r39; mov.u32 %r40, %ntid.y; mov.u32 %r41, %ctaid.y; mov.u32 %r42, %tid.y; mad.lo.s32 %r2, %r41, %r40, %r42; mov.u32 %r43, %ntid.z; mov.u32 %r44, %ctaid.z; mov.u32 %r45, %tid.z; mad.lo.s32 %r3, %r44, %r43, %r45; setp.ge.s32 %p1, %r1, 
%r34; setp.ge.s32 %p2, %r2, %r35; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r36; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_59; mul.lo.s32 %r4, %r3, %r35; add.s32 %r46, %r4, %r2; mul.lo.s32 %r5, %r46, %r34; add.s32 %r47, %r5, %r1; cvt.s64.s32 %rd7, %r47; mul.wide.s32 %rd18, %r47, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; ld.global.nc.f32 %f6, [%rd21]; mul.f32 %f142, %f5, %f5; fma.rn.f32 %f143, %f1, %f1, %f142; fma.rn.f32 %f144, %f6, %f6, %f143; setp.eq.f32 %p6, %f144, 0f00000000; @%p6 bra $L__BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r48, %r6, %r34; add.s32 %r49, %r48, %r34; rem.s32 %r105, %r49, %r34; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f267, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f266, %f267; mov.f32 %f265, %f267; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f265, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f266, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f267, [%rd26]; $L__BB0_7: mul.f32 %f148, %f265, %f265; fma.rn.f32 %f149, %f266, %f266, %f148; fma.rn.f32 %f16, %f267, %f267, %f149; setp.eq.f32 %p11, %f16, 0f00000000; mov.u16 %rs41, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs41, [%rd28]; $L__BB0_9: min.u16 %rs17, %rs41, %rs1; cvt.u32.u16 %r50, %rs17; max.u16 %rs18, %rs41, %rs1; cvt.u32.u16 %r51, %rs18; add.s32 %r52, %r51, 1; mul.lo.s32 %r53, %r52, %r51; shr.u32 %r54, %r53, 1; add.s32 %r55, %r54, %r50; mul.wide.s32 %rd29, %r55, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f17, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f18, [%rd31]; setp.ne.s16 %p12, %rs14, 0; mov.f32 
%f277, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f278, %f277; mov.f32 %f279, %f277; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f16, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f153, %f18, 0f3F000000; div.rn.f32 %f154, %f153, %f17; mul.f32 %f155, %f154, %f137; fma.rn.f32 %f265, %f6, %f155, %f1; mul.f32 %f156, %f1, %f155; sub.f32 %f267, %f6, %f156; mov.f32 %f266, %f5; $L__BB0_12: mul.f32 %f157, %f137, %f137; add.f32 %f158, %f17, %f17; div.rn.f32 %f159, %f158, %f157; sub.f32 %f160, %f265, %f1; sub.f32 %f161, %f266, %f5; sub.f32 %f162, %f267, %f6; fma.rn.f32 %f163, %f159, %f160, 0f00000000; fma.rn.f32 %f278, %f159, %f161, 0f00000000; fma.rn.f32 %f164, %f159, %f162, 0f00000000; div.rn.f32 %f165, %f18, %f137; mul.f32 %f166, %f165, %f267; sub.f32 %f277, %f163, %f166; fma.rn.f32 %f279, %f165, %f265, %f164; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r58, %r34, -1; min.s32 %r106, %r11, %r58; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r56, %r11, %r34; add.s32 %r57, %r56, %r34; rem.s32 %r106, %r57, %r34; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r34; mov.f32 %f276, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f275, %f276; mov.f32 %f274, %f276; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f274, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f275, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f276, [%rd35]; $L__BB0_18: mul.f32 %f170, %f274, %f274; fma.rn.f32 %f171, %f275, %f275, %f170; fma.rn.f32 %f44, %f276, %f276, %f171; setp.eq.f32 %p20, %f44, 0f00000000; mov.u16 %rs42, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs42, [%rd37]; $L__BB0_20: min.u16 %rs21, %rs42, %rs1; cvt.u32.u16 %r59, %rs21; max.u16 %rs22, %rs42, %rs1; cvt.u32.u16 %r60, %rs22; add.s32 %r61, %r60, 1; mul.lo.s32 %r62, %r61, %r60; shr.u32 %r63, %r62, 1; add.s32 %r64, %r63, %r59; mul.wide.s32 %rd38, %r64, 4; add.s64 
%rd39, %rd2, %rd38; ld.global.nc.f32 %f45, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f46, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f44, 0f00000000; @%p24 bra $L__BB0_23; mul.f32 %f172, %f46, 0f3F000000; div.rn.f32 %f173, %f172, %f45; mul.f32 %f174, %f173, %f137; mul.f32 %f175, %f6, %f174; sub.f32 %f274, %f1, %f175; fma.rn.f32 %f276, %f1, %f174, %f6; mov.f32 %f275, %f5; $L__BB0_23: mul.f32 %f176, %f137, %f137; add.f32 %f177, %f45, %f45; div.rn.f32 %f178, %f177, %f176; sub.f32 %f179, %f274, %f1; sub.f32 %f180, %f275, %f5; sub.f32 %f181, %f276, %f6; fma.rn.f32 %f182, %f178, %f179, %f277; fma.rn.f32 %f278, %f178, %f180, %f278; fma.rn.f32 %f183, %f178, %f181, %f279; div.rn.f32 %f184, %f46, %f137; fma.rn.f32 %f277, %f184, %f276, %f182; mul.f32 %f185, %f184, %f274; sub.f32 %f279, %f183, %f185; $L__BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r65, %r16, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r107, %r66, %r35; $L__BB0_27: add.s32 %r67, %r107, %r4; mad.lo.s32 %r20, %r67, %r34, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f285, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f284, %f285; mov.f32 %f283, %f285; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f283, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f284, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f285, [%rd44]; $L__BB0_29: mul.f32 %f189, %f283, %f283; fma.rn.f32 %f190, %f284, %f284, %f189; fma.rn.f32 %f72, %f285, %f285, %f190; setp.eq.f32 %p29, %f72, 0f00000000; mov.u16 %rs43, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs43, [%rd46]; $L__BB0_31: min.u16 %rs25, %rs43, %rs1; cvt.u32.u16 %r68, %rs25; max.u16 %rs26, %rs43, %rs1; cvt.u32.u16 %r69, %rs26; add.s32 %r70, %r69, 1; mul.lo.s32 %r71, %r70, 
%r69; shr.u32 %r72, %r71, 1; add.s32 %r73, %r72, %r68; mul.wide.s32 %rd47, %r73, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f73, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f74, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f72, 0f00000000; @%p33 bra $L__BB0_34; mul.f32 %f191, %f74, 0f3F000000; div.rn.f32 %f192, %f191, %f73; mul.f32 %f193, %f192, %f138; fma.rn.f32 %f284, %f6, %f193, %f5; mul.f32 %f194, %f5, %f193; sub.f32 %f285, %f6, %f194; mov.f32 %f283, %f1; $L__BB0_34: mul.f32 %f195, %f138, %f138; add.f32 %f196, %f73, %f73; div.rn.f32 %f197, %f196, %f195; sub.f32 %f198, %f283, %f1; sub.f32 %f199, %f284, %f5; sub.f32 %f200, %f285, %f6; fma.rn.f32 %f277, %f197, %f198, %f277; fma.rn.f32 %f201, %f197, %f199, %f278; fma.rn.f32 %f202, %f197, %f200, %f279; div.rn.f32 %f203, %f74, %f138; mul.f32 %f204, %f203, %f285; sub.f32 %f278, %f201, %f204; fma.rn.f32 %f279, %f203, %f284, %f202; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r76, %r35, -1; min.s32 %r108, %r21, %r76; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r74, %r21, %r35; add.s32 %r75, %r74, %r35; rem.s32 %r108, %r75, %r35; $L__BB0_38: add.s32 %r77, %r108, %r4; mad.lo.s32 %r25, %r77, %r34, %r1; setp.ge.s32 %p35, %r21, %r35; mov.f32 %f294, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f293, %f294; mov.f32 %f292, %f294; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f292, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f293, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f294, [%rd53]; $L__BB0_40: mul.f32 %f208, %f292, %f292; fma.rn.f32 %f209, %f293, %f293, %f208; fma.rn.f32 %f100, %f294, %f294, %f209; setp.eq.f32 %p38, %f100, 0f00000000; mov.u16 %rs44, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs44, [%rd55]; $L__BB0_42: min.u16 %rs29, %rs44, %rs1; cvt.u32.u16 %r78, %rs29; max.u16 %rs30, %rs44, 
%rs1; cvt.u32.u16 %r79, %rs30; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd56, %r83, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f101, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f102, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f100, 0f00000000; @%p42 bra $L__BB0_45; mul.f32 %f210, %f102, 0f3F000000; div.rn.f32 %f211, %f210, %f101; mul.f32 %f212, %f211, %f138; mul.f32 %f213, %f6, %f212; sub.f32 %f293, %f5, %f213; fma.rn.f32 %f294, %f5, %f212, %f6; mov.f32 %f292, %f1; $L__BB0_45: mul.f32 %f214, %f138, %f138; add.f32 %f215, %f101, %f101; div.rn.f32 %f216, %f215, %f214; sub.f32 %f217, %f292, %f1; sub.f32 %f218, %f293, %f5; sub.f32 %f219, %f294, %f6; fma.rn.f32 %f277, %f216, %f217, %f277; fma.rn.f32 %f220, %f216, %f218, %f278; fma.rn.f32 %f221, %f216, %f219, %f279; div.rn.f32 %f222, %f102, %f138; fma.rn.f32 %f278, %f222, %f294, %f220; mul.f32 %f223, %f222, %f293; sub.f32 %f279, %f221, %f223; $L__BB0_46: setp.eq.s32 %p43, %r36, 1; @%p43 bra $L__BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p44, %rs12, 0; add.s32 %r26, %r3, -1; @%p44 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r84, %r26, %r36; add.s32 %r85, %r84, %r36; rem.s32 %r109, %r85, %r36; $L__BB0_50: mad.lo.s32 %r86, %r109, %r35, %r2; mad.lo.s32 %r87, %r86, %r34, %r1; cvt.s64.s32 %rd59, %r87; mul.wide.s32 %rd60, %r87, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f224, [%rd63]; ld.global.nc.f32 %f225, [%rd61]; ld.global.nc.f32 %f226, [%rd62]; mul.f32 %f227, %f226, %f226; fma.rn.f32 %f228, %f225, %f225, %f227; fma.rn.f32 %f229, %f224, %f224, %f228; setp.eq.f32 %p45, %f229, 0f00000000; selp.f32 %f230, %f6, %f224, %p45; selp.f32 %f231, %f5, %f226, %p45; selp.f32 %f232, %f1, %f225, %p45; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs31, [%rd64]; min.u16 %rs34, %rs31, 
%rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r88, %rs35; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs34; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd65, %r93, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f233, [%rd66]; add.f32 %f234, %f233, %f233; mul.f32 %f122, %f139, %f139; div.rn.f32 %f235, %f234, %f122; sub.f32 %f236, %f232, %f1; sub.f32 %f237, %f231, %f5; sub.f32 %f238, %f230, %f6; fma.rn.f32 %f123, %f236, %f235, %f277; fma.rn.f32 %f124, %f237, %f235, %f278; fma.rn.f32 %f125, %f238, %f235, %f279; add.s32 %r30, %r3, 1; @%p44 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r96, %r36, -1; min.s32 %r110, %r30, %r96; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r94, %r30, %r36; add.s32 %r95, %r94, %r36; rem.s32 %r110, %r95, %r36; $L__BB0_53: mad.lo.s32 %r97, %r110, %r35, %r2; mad.lo.s32 %r98, %r97, %r34, %r1; cvt.s64.s32 %rd67, %r98; mul.wide.s32 %rd68, %r98, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f239, [%rd71]; ld.global.nc.f32 %f240, [%rd69]; ld.global.nc.f32 %f241, [%rd70]; mul.f32 %f242, %f241, %f241; fma.rn.f32 %f243, %f240, %f240, %f242; fma.rn.f32 %f244, %f239, %f239, %f243; setp.eq.f32 %p47, %f244, 0f00000000; selp.f32 %f245, %f6, %f239, %p47; selp.f32 %f246, %f5, %f241, %p47; selp.f32 %f247, %f1, %f240, %p47; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs36, [%rd72]; min.u16 %rs39, %rs36, %rs1; max.u16 %rs40, %rs36, %rs1; cvt.u32.u16 %r99, %rs40; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs39; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd73, %r104, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f248, [%rd74]; add.f32 %f249, %f248, %f248; div.rn.f32 %f250, %f249, %f122; sub.f32 %f251, %f247, %f1; sub.f32 %f252, %f246, %f5; sub.f32 %f253, %f245, %f6; fma.rn.f32 %f277, %f251, %f250, %f123; fma.rn.f32 %f278, %f252, %f250, %f124; fma.rn.f32 %f279, %f253, %f250, %f125; 
$L__BB0_54: setp.eq.s64 %p48, %rd11, 0; @%p48 bra $L__BB0_56; cvta.to.global.u64 %rd75, %rd11; shl.b64 %rd76, %rd7, 2; add.s64 %rd77, %rd75, %rd76; ld.global.nc.f32 %f254, [%rd77]; mul.f32 %f301, %f254, %f301; $L__BB0_56: setp.eq.f32 %p49, %f301, 0f00000000; mov.f32 %f302, 0f00000000; @%p49 bra $L__BB0_58; rcp.rn.f32 %f302, %f301; $L__BB0_58: cvta.to.global.u64 %rd78, %rd8; shl.b64 %rd79, %rd7, 2; add.s64 %rd80, %rd78, %rd79; ld.global.f32 %f256, [%rd80]; fma.rn.f32 %f257, %f277, %f302, %f256; st.global.f32 [%rd80], %f257; cvta.to.global.u64 %rd81, %rd9; add.s64 %rd82, %rd81, %rd79; ld.global.f32 %f258, [%rd82]; fma.rn.f32 %f259, %f278, %f302, %f258; st.global.f32 [%rd82], %f259; cvta.to.global.u64 %rd83, %rd10; add.s64 %rd84, %rd83, %rd79; ld.global.f32 %f260, [%rd84]; fma.rn.f32 %f261, %f279, %f302, %f260; st.global.f32 [%rd84], %f261; $L__BB0_59: ret; } ` adddmi_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<50>; .reg .b16 %rs<45>; .reg .f32 %f<303>; .reg .b32 %r<111>; .reg .b64 %rd<85>; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; ld.param.u64 %rd8, [adddmi_param_0]; ld.param.u64 %rd9, [adddmi_param_1]; ld.param.u64 %rd10, [adddmi_param_2]; ld.param.u64 %rd12, [adddmi_param_3]; ld.param.u64 %rd13, [adddmi_param_4]; ld.param.u64 %rd14, [adddmi_param_5]; ld.param.u64 %rd11, [adddmi_param_6]; ld.param.f32 %f301, [adddmi_param_7]; 
ld.param.u64 %rd15, [adddmi_param_8]; ld.param.u64 %rd16, [adddmi_param_9]; ld.param.u64 %rd17, [adddmi_param_10]; ld.param.f32 %f137, [adddmi_param_11]; ld.param.f32 %f138, [adddmi_param_12]; ld.param.f32 %f139, [adddmi_param_13]; ld.param.u32 %r34, [adddmi_param_14]; ld.param.u32 %r35, [adddmi_param_15]; ld.param.u32 %r36, [adddmi_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r37, %ntid.x; mov.u32 %r38, %ctaid.x; mov.u32 %r39, %tid.x; mad.lo.s32 %r1, %r38, %r37, %r39; mov.u32 %r40, %ntid.y; mov.u32 %r41, %ctaid.y; mov.u32 %r42, %tid.y; mad.lo.s32 %r2, %r41, %r40, %r42; mov.u32 %r43, %ntid.z; mov.u32 %r44, %ctaid.z; mov.u32 %r45, %tid.z; mad.lo.s32 %r3, %r44, %r43, %r45; setp.ge.s32 %p1, %r1, %r34; setp.ge.s32 %p2, %r2, %r35; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r36; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_59; mul.lo.s32 %r4, %r3, %r35; add.s32 %r46, %r4, %r2; mul.lo.s32 %r5, %r46, %r34; add.s32 %r47, %r5, %r1; cvt.s64.s32 %rd7, %r47; mul.wide.s32 %rd18, %r47, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; ld.global.nc.f32 %f6, [%rd21]; mul.f32 %f142, %f5, %f5; fma.rn.f32 %f143, %f1, %f1, %f142; fma.rn.f32 %f144, %f6, %f6, %f143; setp.eq.f32 %p6, %f144, 0f00000000; @%p6 bra $L__BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r48, %r6, %r34; add.s32 %r49, %r48, %r34; rem.s32 %r105, %r49, %r34; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f267, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f266, %f267; mov.f32 %f265, %f267; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 
4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f265, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f266, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f267, [%rd26]; $L__BB0_7: mul.f32 %f148, %f265, %f265; fma.rn.f32 %f149, %f266, %f266, %f148; fma.rn.f32 %f16, %f267, %f267, %f149; setp.eq.f32 %p11, %f16, 0f00000000; mov.u16 %rs41, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs41, [%rd28]; $L__BB0_9: min.u16 %rs17, %rs41, %rs1; cvt.u32.u16 %r50, %rs17; max.u16 %rs18, %rs41, %rs1; cvt.u32.u16 %r51, %rs18; add.s32 %r52, %r51, 1; mul.lo.s32 %r53, %r52, %r51; shr.u32 %r54, %r53, 1; add.s32 %r55, %r54, %r50; mul.wide.s32 %rd29, %r55, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f17, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f18, [%rd31]; setp.ne.s16 %p12, %rs14, 0; mov.f32 %f277, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f278, %f277; mov.f32 %f279, %f277; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f16, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f153, %f18, 0f3F000000; div.rn.f32 %f154, %f153, %f17; mul.f32 %f155, %f154, %f137; fma.rn.f32 %f265, %f6, %f155, %f1; mul.f32 %f156, %f1, %f155; sub.f32 %f267, %f6, %f156; mov.f32 %f266, %f5; $L__BB0_12: mul.f32 %f157, %f137, %f137; add.f32 %f158, %f17, %f17; div.rn.f32 %f159, %f158, %f157; sub.f32 %f160, %f265, %f1; sub.f32 %f161, %f266, %f5; sub.f32 %f162, %f267, %f6; fma.rn.f32 %f163, %f159, %f160, 0f00000000; fma.rn.f32 %f278, %f159, %f161, 0f00000000; fma.rn.f32 %f164, %f159, %f162, 0f00000000; div.rn.f32 %f165, %f18, %f137; mul.f32 %f166, %f165, %f267; sub.f32 %f277, %f163, %f166; fma.rn.f32 %f279, %f165, %f265, %f164; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r58, %r34, -1; min.s32 %r106, %r11, %r58; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r56, %r11, %r34; add.s32 %r57, %r56, %r34; rem.s32 %r106, %r57, %r34; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r34; 
mov.f32 %f276, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f275, %f276; mov.f32 %f274, %f276; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f274, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f275, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f276, [%rd35]; $L__BB0_18: mul.f32 %f170, %f274, %f274; fma.rn.f32 %f171, %f275, %f275, %f170; fma.rn.f32 %f44, %f276, %f276, %f171; setp.eq.f32 %p20, %f44, 0f00000000; mov.u16 %rs42, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs42, [%rd37]; $L__BB0_20: min.u16 %rs21, %rs42, %rs1; cvt.u32.u16 %r59, %rs21; max.u16 %rs22, %rs42, %rs1; cvt.u32.u16 %r60, %rs22; add.s32 %r61, %r60, 1; mul.lo.s32 %r62, %r61, %r60; shr.u32 %r63, %r62, 1; add.s32 %r64, %r63, %r59; mul.wide.s32 %rd38, %r64, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f45, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f46, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f44, 0f00000000; @%p24 bra $L__BB0_23; mul.f32 %f172, %f46, 0f3F000000; div.rn.f32 %f173, %f172, %f45; mul.f32 %f174, %f173, %f137; mul.f32 %f175, %f6, %f174; sub.f32 %f274, %f1, %f175; fma.rn.f32 %f276, %f1, %f174, %f6; mov.f32 %f275, %f5; $L__BB0_23: mul.f32 %f176, %f137, %f137; add.f32 %f177, %f45, %f45; div.rn.f32 %f178, %f177, %f176; sub.f32 %f179, %f274, %f1; sub.f32 %f180, %f275, %f5; sub.f32 %f181, %f276, %f6; fma.rn.f32 %f182, %f178, %f179, %f277; fma.rn.f32 %f278, %f178, %f180, %f278; fma.rn.f32 %f183, %f178, %f181, %f279; div.rn.f32 %f184, %f46, %f137; fma.rn.f32 %f277, %f184, %f276, %f182; mul.f32 %f185, %f184, %f274; sub.f32 %f279, %f183, %f185; $L__BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r65, %r16, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r107, %r66, %r35; 
$L__BB0_27: add.s32 %r67, %r107, %r4; mad.lo.s32 %r20, %r67, %r34, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f285, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f284, %f285; mov.f32 %f283, %f285; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f283, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f284, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f285, [%rd44]; $L__BB0_29: mul.f32 %f189, %f283, %f283; fma.rn.f32 %f190, %f284, %f284, %f189; fma.rn.f32 %f72, %f285, %f285, %f190; setp.eq.f32 %p29, %f72, 0f00000000; mov.u16 %rs43, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs43, [%rd46]; $L__BB0_31: min.u16 %rs25, %rs43, %rs1; cvt.u32.u16 %r68, %rs25; max.u16 %rs26, %rs43, %rs1; cvt.u32.u16 %r69, %rs26; add.s32 %r70, %r69, 1; mul.lo.s32 %r71, %r70, %r69; shr.u32 %r72, %r71, 1; add.s32 %r73, %r72, %r68; mul.wide.s32 %rd47, %r73, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f73, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f74, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f72, 0f00000000; @%p33 bra $L__BB0_34; mul.f32 %f191, %f74, 0f3F000000; div.rn.f32 %f192, %f191, %f73; mul.f32 %f193, %f192, %f138; fma.rn.f32 %f284, %f6, %f193, %f5; mul.f32 %f194, %f5, %f193; sub.f32 %f285, %f6, %f194; mov.f32 %f283, %f1; $L__BB0_34: mul.f32 %f195, %f138, %f138; add.f32 %f196, %f73, %f73; div.rn.f32 %f197, %f196, %f195; sub.f32 %f198, %f283, %f1; sub.f32 %f199, %f284, %f5; sub.f32 %f200, %f285, %f6; fma.rn.f32 %f277, %f197, %f198, %f277; fma.rn.f32 %f201, %f197, %f199, %f278; fma.rn.f32 %f202, %f197, %f200, %f279; div.rn.f32 %f203, %f74, %f138; mul.f32 %f204, %f203, %f285; sub.f32 %f278, %f201, %f204; fma.rn.f32 %f279, %f203, %f284, %f202; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r76, %r35, -1; min.s32 %r108, %r21, %r76; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 
%r74, %r21, %r35; add.s32 %r75, %r74, %r35; rem.s32 %r108, %r75, %r35; $L__BB0_38: add.s32 %r77, %r108, %r4; mad.lo.s32 %r25, %r77, %r34, %r1; setp.ge.s32 %p35, %r21, %r35; mov.f32 %f294, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f293, %f294; mov.f32 %f292, %f294; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f292, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f293, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f294, [%rd53]; $L__BB0_40: mul.f32 %f208, %f292, %f292; fma.rn.f32 %f209, %f293, %f293, %f208; fma.rn.f32 %f100, %f294, %f294, %f209; setp.eq.f32 %p38, %f100, 0f00000000; mov.u16 %rs44, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs44, [%rd55]; $L__BB0_42: min.u16 %rs29, %rs44, %rs1; cvt.u32.u16 %r78, %rs29; max.u16 %rs30, %rs44, %rs1; cvt.u32.u16 %r79, %rs30; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd56, %r83, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f101, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f102, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f100, 0f00000000; @%p42 bra $L__BB0_45; mul.f32 %f210, %f102, 0f3F000000; div.rn.f32 %f211, %f210, %f101; mul.f32 %f212, %f211, %f138; mul.f32 %f213, %f6, %f212; sub.f32 %f293, %f5, %f213; fma.rn.f32 %f294, %f5, %f212, %f6; mov.f32 %f292, %f1; $L__BB0_45: mul.f32 %f214, %f138, %f138; add.f32 %f215, %f101, %f101; div.rn.f32 %f216, %f215, %f214; sub.f32 %f217, %f292, %f1; sub.f32 %f218, %f293, %f5; sub.f32 %f219, %f294, %f6; fma.rn.f32 %f277, %f216, %f217, %f277; fma.rn.f32 %f220, %f216, %f218, %f278; fma.rn.f32 %f221, %f216, %f219, %f279; div.rn.f32 %f222, %f102, %f138; fma.rn.f32 %f278, %f222, %f294, %f220; mul.f32 %f223, %f222, %f293; sub.f32 %f279, %f221, %f223; $L__BB0_46: setp.eq.s32 %p43, %r36, 1; @%p43 bra $L__BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 
%p44, %rs12, 0; add.s32 %r26, %r3, -1; @%p44 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r84, %r26, %r36; add.s32 %r85, %r84, %r36; rem.s32 %r109, %r85, %r36; $L__BB0_50: mad.lo.s32 %r86, %r109, %r35, %r2; mad.lo.s32 %r87, %r86, %r34, %r1; cvt.s64.s32 %rd59, %r87; mul.wide.s32 %rd60, %r87, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f224, [%rd63]; ld.global.nc.f32 %f225, [%rd61]; ld.global.nc.f32 %f226, [%rd62]; mul.f32 %f227, %f226, %f226; fma.rn.f32 %f228, %f225, %f225, %f227; fma.rn.f32 %f229, %f224, %f224, %f228; setp.eq.f32 %p45, %f229, 0f00000000; selp.f32 %f230, %f6, %f224, %p45; selp.f32 %f231, %f5, %f226, %p45; selp.f32 %f232, %f1, %f225, %p45; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs31, [%rd64]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r88, %rs35; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs34; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd65, %r93, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f233, [%rd66]; add.f32 %f234, %f233, %f233; mul.f32 %f122, %f139, %f139; div.rn.f32 %f235, %f234, %f122; sub.f32 %f236, %f232, %f1; sub.f32 %f237, %f231, %f5; sub.f32 %f238, %f230, %f6; fma.rn.f32 %f123, %f236, %f235, %f277; fma.rn.f32 %f124, %f237, %f235, %f278; fma.rn.f32 %f125, %f238, %f235, %f279; add.s32 %r30, %r3, 1; @%p44 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r96, %r36, -1; min.s32 %r110, %r30, %r96; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r94, %r30, %r36; add.s32 %r95, %r94, %r36; rem.s32 %r110, %r95, %r36; $L__BB0_53: mad.lo.s32 %r97, %r110, %r35, %r2; mad.lo.s32 %r98, %r97, %r34, %r1; cvt.s64.s32 %rd67, %r98; mul.wide.s32 %rd68, %r98, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f239, [%rd71]; ld.global.nc.f32 %f240, [%rd69]; ld.global.nc.f32 %f241, [%rd70]; 
mul.f32 %f242, %f241, %f241; fma.rn.f32 %f243, %f240, %f240, %f242; fma.rn.f32 %f244, %f239, %f239, %f243; setp.eq.f32 %p47, %f244, 0f00000000; selp.f32 %f245, %f6, %f239, %p47; selp.f32 %f246, %f5, %f241, %p47; selp.f32 %f247, %f1, %f240, %p47; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs36, [%rd72]; min.u16 %rs39, %rs36, %rs1; max.u16 %rs40, %rs36, %rs1; cvt.u32.u16 %r99, %rs40; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs39; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd73, %r104, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f248, [%rd74]; add.f32 %f249, %f248, %f248; div.rn.f32 %f250, %f249, %f122; sub.f32 %f251, %f247, %f1; sub.f32 %f252, %f246, %f5; sub.f32 %f253, %f245, %f6; fma.rn.f32 %f277, %f251, %f250, %f123; fma.rn.f32 %f278, %f252, %f250, %f124; fma.rn.f32 %f279, %f253, %f250, %f125; $L__BB0_54: setp.eq.s64 %p48, %rd11, 0; @%p48 bra $L__BB0_56; cvta.to.global.u64 %rd75, %rd11; shl.b64 %rd76, %rd7, 2; add.s64 %rd77, %rd75, %rd76; ld.global.nc.f32 %f254, [%rd77]; mul.f32 %f301, %f254, %f301; $L__BB0_56: setp.eq.f32 %p49, %f301, 0f00000000; mov.f32 %f302, 0f00000000; @%p49 bra $L__BB0_58; rcp.rn.f32 %f302, %f301; $L__BB0_58: cvta.to.global.u64 %rd78, %rd8; shl.b64 %rd79, %rd7, 2; add.s64 %rd80, %rd78, %rd79; ld.global.f32 %f256, [%rd80]; fma.rn.f32 %f257, %f277, %f302, %f256; st.global.f32 [%rd80], %f257; cvta.to.global.u64 %rd81, %rd9; add.s64 %rd82, %rd81, %rd79; ld.global.f32 %f258, [%rd82]; fma.rn.f32 %f259, %f278, %f302, %f258; st.global.f32 [%rd82], %f259; cvta.to.global.u64 %rd83, %rd10; add.s64 %rd84, %rd83, %rd79; ld.global.f32 %f260, [%rd84]; fma.rn.f32 %f261, %f279, %f302, %f260; st.global.f32 [%rd84], %f261; $L__BB0_59: ret; } ` adddmi_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 
adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<50>; .reg .b16 %rs<45>; .reg .f32 %f<303>; .reg .b32 %r<111>; .reg .b64 %rd<85>; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; ld.param.u64 %rd8, [adddmi_param_0]; ld.param.u64 %rd9, [adddmi_param_1]; ld.param.u64 %rd10, [adddmi_param_2]; ld.param.u64 %rd12, [adddmi_param_3]; ld.param.u64 %rd13, [adddmi_param_4]; ld.param.u64 %rd14, [adddmi_param_5]; ld.param.u64 %rd11, [adddmi_param_6]; ld.param.f32 %f301, [adddmi_param_7]; ld.param.u64 %rd15, [adddmi_param_8]; ld.param.u64 %rd16, [adddmi_param_9]; ld.param.u64 %rd17, [adddmi_param_10]; ld.param.f32 %f137, [adddmi_param_11]; ld.param.f32 %f138, [adddmi_param_12]; ld.param.f32 %f139, [adddmi_param_13]; ld.param.u32 %r34, [adddmi_param_14]; ld.param.u32 %r35, [adddmi_param_15]; ld.param.u32 %r36, [adddmi_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r37, %ntid.x; mov.u32 %r38, %ctaid.x; mov.u32 %r39, %tid.x; mad.lo.s32 %r1, %r38, %r37, %r39; mov.u32 %r40, %ntid.y; mov.u32 %r41, %ctaid.y; mov.u32 %r42, %tid.y; mad.lo.s32 %r2, %r41, %r40, %r42; mov.u32 %r43, %ntid.z; mov.u32 %r44, %ctaid.z; mov.u32 %r45, %tid.z; mad.lo.s32 %r3, %r44, %r43, %r45; setp.ge.s32 %p1, %r1, %r34; setp.ge.s32 %p2, %r2, %r35; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r36; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_59; mul.lo.s32 %r4, %r3, %r35; add.s32 %r46, %r4, %r2; mul.lo.s32 %r5, %r46, %r34; add.s32 %r47, %r5, %r1; cvt.s64.s32 %rd7, %r47; 
mul.wide.s32 %rd18, %r47, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; ld.global.nc.f32 %f6, [%rd21]; mul.f32 %f142, %f5, %f5; fma.rn.f32 %f143, %f1, %f1, %f142; fma.rn.f32 %f144, %f6, %f6, %f143; setp.eq.f32 %p6, %f144, 0f00000000; @%p6 bra $L__BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r48, %r6, %r34; add.s32 %r49, %r48, %r34; rem.s32 %r105, %r49, %r34; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f267, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f266, %f267; mov.f32 %f265, %f267; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f265, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f266, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f267, [%rd26]; $L__BB0_7: mul.f32 %f148, %f265, %f265; fma.rn.f32 %f149, %f266, %f266, %f148; fma.rn.f32 %f16, %f267, %f267, %f149; setp.eq.f32 %p11, %f16, 0f00000000; mov.u16 %rs41, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs41, [%rd28]; $L__BB0_9: min.u16 %rs17, %rs41, %rs1; cvt.u32.u16 %r50, %rs17; max.u16 %rs18, %rs41, %rs1; cvt.u32.u16 %r51, %rs18; add.s32 %r52, %r51, 1; mul.lo.s32 %r53, %r52, %r51; shr.u32 %r54, %r53, 1; add.s32 %r55, %r54, %r50; mul.wide.s32 %rd29, %r55, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f17, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f18, [%rd31]; setp.ne.s16 %p12, %rs14, 0; mov.f32 %f277, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f278, %f277; mov.f32 %f279, %f277; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f16, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f153, %f18, 0f3F000000; div.rn.f32 %f154, %f153, %f17; mul.f32 %f155, %f154, 
%f137; fma.rn.f32 %f265, %f6, %f155, %f1; mul.f32 %f156, %f1, %f155; sub.f32 %f267, %f6, %f156; mov.f32 %f266, %f5; $L__BB0_12: mul.f32 %f157, %f137, %f137; add.f32 %f158, %f17, %f17; div.rn.f32 %f159, %f158, %f157; sub.f32 %f160, %f265, %f1; sub.f32 %f161, %f266, %f5; sub.f32 %f162, %f267, %f6; fma.rn.f32 %f163, %f159, %f160, 0f00000000; fma.rn.f32 %f278, %f159, %f161, 0f00000000; fma.rn.f32 %f164, %f159, %f162, 0f00000000; div.rn.f32 %f165, %f18, %f137; mul.f32 %f166, %f165, %f267; sub.f32 %f277, %f163, %f166; fma.rn.f32 %f279, %f165, %f265, %f164; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r58, %r34, -1; min.s32 %r106, %r11, %r58; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r56, %r11, %r34; add.s32 %r57, %r56, %r34; rem.s32 %r106, %r57, %r34; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r34; mov.f32 %f276, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f275, %f276; mov.f32 %f274, %f276; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f274, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f275, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f276, [%rd35]; $L__BB0_18: mul.f32 %f170, %f274, %f274; fma.rn.f32 %f171, %f275, %f275, %f170; fma.rn.f32 %f44, %f276, %f276, %f171; setp.eq.f32 %p20, %f44, 0f00000000; mov.u16 %rs42, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs42, [%rd37]; $L__BB0_20: min.u16 %rs21, %rs42, %rs1; cvt.u32.u16 %r59, %rs21; max.u16 %rs22, %rs42, %rs1; cvt.u32.u16 %r60, %rs22; add.s32 %r61, %r60, 1; mul.lo.s32 %r62, %r61, %r60; shr.u32 %r63, %r62, 1; add.s32 %r64, %r63, %r59; mul.wide.s32 %rd38, %r64, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f45, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f46, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f44, 0f00000000; @%p24 bra $L__BB0_23; mul.f32 %f172, %f46, 0f3F000000; 
div.rn.f32 %f173, %f172, %f45; mul.f32 %f174, %f173, %f137; mul.f32 %f175, %f6, %f174; sub.f32 %f274, %f1, %f175; fma.rn.f32 %f276, %f1, %f174, %f6; mov.f32 %f275, %f5; $L__BB0_23: mul.f32 %f176, %f137, %f137; add.f32 %f177, %f45, %f45; div.rn.f32 %f178, %f177, %f176; sub.f32 %f179, %f274, %f1; sub.f32 %f180, %f275, %f5; sub.f32 %f181, %f276, %f6; fma.rn.f32 %f182, %f178, %f179, %f277; fma.rn.f32 %f278, %f178, %f180, %f278; fma.rn.f32 %f183, %f178, %f181, %f279; div.rn.f32 %f184, %f46, %f137; fma.rn.f32 %f277, %f184, %f276, %f182; mul.f32 %f185, %f184, %f274; sub.f32 %f279, %f183, %f185; $L__BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r65, %r16, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r107, %r66, %r35; $L__BB0_27: add.s32 %r67, %r107, %r4; mad.lo.s32 %r20, %r67, %r34, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f285, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f284, %f285; mov.f32 %f283, %f285; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f283, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f284, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f285, [%rd44]; $L__BB0_29: mul.f32 %f189, %f283, %f283; fma.rn.f32 %f190, %f284, %f284, %f189; fma.rn.f32 %f72, %f285, %f285, %f190; setp.eq.f32 %p29, %f72, 0f00000000; mov.u16 %rs43, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs43, [%rd46]; $L__BB0_31: min.u16 %rs25, %rs43, %rs1; cvt.u32.u16 %r68, %rs25; max.u16 %rs26, %rs43, %rs1; cvt.u32.u16 %r69, %rs26; add.s32 %r70, %r69, 1; mul.lo.s32 %r71, %r70, %r69; shr.u32 %r72, %r71, 1; add.s32 %r73, %r72, %r68; mul.wide.s32 %rd47, %r73, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f73, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f74, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; 
setp.neu.f32 %p33, %f72, 0f00000000; @%p33 bra $L__BB0_34; mul.f32 %f191, %f74, 0f3F000000; div.rn.f32 %f192, %f191, %f73; mul.f32 %f193, %f192, %f138; fma.rn.f32 %f284, %f6, %f193, %f5; mul.f32 %f194, %f5, %f193; sub.f32 %f285, %f6, %f194; mov.f32 %f283, %f1; $L__BB0_34: mul.f32 %f195, %f138, %f138; add.f32 %f196, %f73, %f73; div.rn.f32 %f197, %f196, %f195; sub.f32 %f198, %f283, %f1; sub.f32 %f199, %f284, %f5; sub.f32 %f200, %f285, %f6; fma.rn.f32 %f277, %f197, %f198, %f277; fma.rn.f32 %f201, %f197, %f199, %f278; fma.rn.f32 %f202, %f197, %f200, %f279; div.rn.f32 %f203, %f74, %f138; mul.f32 %f204, %f203, %f285; sub.f32 %f278, %f201, %f204; fma.rn.f32 %f279, %f203, %f284, %f202; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r76, %r35, -1; min.s32 %r108, %r21, %r76; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r74, %r21, %r35; add.s32 %r75, %r74, %r35; rem.s32 %r108, %r75, %r35; $L__BB0_38: add.s32 %r77, %r108, %r4; mad.lo.s32 %r25, %r77, %r34, %r1; setp.ge.s32 %p35, %r21, %r35; mov.f32 %f294, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f293, %f294; mov.f32 %f292, %f294; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f292, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f293, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f294, [%rd53]; $L__BB0_40: mul.f32 %f208, %f292, %f292; fma.rn.f32 %f209, %f293, %f293, %f208; fma.rn.f32 %f100, %f294, %f294, %f209; setp.eq.f32 %p38, %f100, 0f00000000; mov.u16 %rs44, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs44, [%rd55]; $L__BB0_42: min.u16 %rs29, %rs44, %rs1; cvt.u32.u16 %r78, %rs29; max.u16 %rs30, %rs44, %rs1; cvt.u32.u16 %r79, %rs30; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd56, %r83, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f101, [%rd57]; add.s64 %rd58, %rd1, %rd56; 
ld.global.nc.f32 %f102, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f100, 0f00000000; @%p42 bra $L__BB0_45; mul.f32 %f210, %f102, 0f3F000000; div.rn.f32 %f211, %f210, %f101; mul.f32 %f212, %f211, %f138; mul.f32 %f213, %f6, %f212; sub.f32 %f293, %f5, %f213; fma.rn.f32 %f294, %f5, %f212, %f6; mov.f32 %f292, %f1; $L__BB0_45: mul.f32 %f214, %f138, %f138; add.f32 %f215, %f101, %f101; div.rn.f32 %f216, %f215, %f214; sub.f32 %f217, %f292, %f1; sub.f32 %f218, %f293, %f5; sub.f32 %f219, %f294, %f6; fma.rn.f32 %f277, %f216, %f217, %f277; fma.rn.f32 %f220, %f216, %f218, %f278; fma.rn.f32 %f221, %f216, %f219, %f279; div.rn.f32 %f222, %f102, %f138; fma.rn.f32 %f278, %f222, %f294, %f220; mul.f32 %f223, %f222, %f293; sub.f32 %f279, %f221, %f223; $L__BB0_46: setp.eq.s32 %p43, %r36, 1; @%p43 bra $L__BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p44, %rs12, 0; add.s32 %r26, %r3, -1; @%p44 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r84, %r26, %r36; add.s32 %r85, %r84, %r36; rem.s32 %r109, %r85, %r36; $L__BB0_50: mad.lo.s32 %r86, %r109, %r35, %r2; mad.lo.s32 %r87, %r86, %r34, %r1; cvt.s64.s32 %rd59, %r87; mul.wide.s32 %rd60, %r87, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f224, [%rd63]; ld.global.nc.f32 %f225, [%rd61]; ld.global.nc.f32 %f226, [%rd62]; mul.f32 %f227, %f226, %f226; fma.rn.f32 %f228, %f225, %f225, %f227; fma.rn.f32 %f229, %f224, %f224, %f228; setp.eq.f32 %p45, %f229, 0f00000000; selp.f32 %f230, %f6, %f224, %p45; selp.f32 %f231, %f5, %f226, %p45; selp.f32 %f232, %f1, %f225, %p45; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs31, [%rd64]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r88, %rs35; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs34; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd65, %r93, 4; add.s64 %rd66, %rd2, %rd65; 
ld.global.nc.f32 %f233, [%rd66]; add.f32 %f234, %f233, %f233; mul.f32 %f122, %f139, %f139; div.rn.f32 %f235, %f234, %f122; sub.f32 %f236, %f232, %f1; sub.f32 %f237, %f231, %f5; sub.f32 %f238, %f230, %f6; fma.rn.f32 %f123, %f236, %f235, %f277; fma.rn.f32 %f124, %f237, %f235, %f278; fma.rn.f32 %f125, %f238, %f235, %f279; add.s32 %r30, %r3, 1; @%p44 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r96, %r36, -1; min.s32 %r110, %r30, %r96; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r94, %r30, %r36; add.s32 %r95, %r94, %r36; rem.s32 %r110, %r95, %r36; $L__BB0_53: mad.lo.s32 %r97, %r110, %r35, %r2; mad.lo.s32 %r98, %r97, %r34, %r1; cvt.s64.s32 %rd67, %r98; mul.wide.s32 %rd68, %r98, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f239, [%rd71]; ld.global.nc.f32 %f240, [%rd69]; ld.global.nc.f32 %f241, [%rd70]; mul.f32 %f242, %f241, %f241; fma.rn.f32 %f243, %f240, %f240, %f242; fma.rn.f32 %f244, %f239, %f239, %f243; setp.eq.f32 %p47, %f244, 0f00000000; selp.f32 %f245, %f6, %f239, %p47; selp.f32 %f246, %f5, %f241, %p47; selp.f32 %f247, %f1, %f240, %p47; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs36, [%rd72]; min.u16 %rs39, %rs36, %rs1; max.u16 %rs40, %rs36, %rs1; cvt.u32.u16 %r99, %rs40; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs39; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd73, %r104, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f248, [%rd74]; add.f32 %f249, %f248, %f248; div.rn.f32 %f250, %f249, %f122; sub.f32 %f251, %f247, %f1; sub.f32 %f252, %f246, %f5; sub.f32 %f253, %f245, %f6; fma.rn.f32 %f277, %f251, %f250, %f123; fma.rn.f32 %f278, %f252, %f250, %f124; fma.rn.f32 %f279, %f253, %f250, %f125; $L__BB0_54: setp.eq.s64 %p48, %rd11, 0; @%p48 bra $L__BB0_56; cvta.to.global.u64 %rd75, %rd11; shl.b64 %rd76, %rd7, 2; add.s64 %rd77, %rd75, %rd76; ld.global.nc.f32 %f254, [%rd77]; mul.f32 %f301, %f254, %f301; $L__BB0_56: setp.eq.f32 %p49, %f301, 
0f00000000; mov.f32 %f302, 0f00000000; @%p49 bra $L__BB0_58; rcp.rn.f32 %f302, %f301; $L__BB0_58: cvta.to.global.u64 %rd78, %rd8; shl.b64 %rd79, %rd7, 2; add.s64 %rd80, %rd78, %rd79; ld.global.f32 %f256, [%rd80]; fma.rn.f32 %f257, %f277, %f302, %f256; st.global.f32 [%rd80], %f257; cvta.to.global.u64 %rd81, %rd9; add.s64 %rd82, %rd81, %rd79; ld.global.f32 %f258, [%rd82]; fma.rn.f32 %f259, %f278, %f302, %f258; st.global.f32 [%rd82], %f259; cvta.to.global.u64 %rd83, %rd10; add.s64 %rd84, %rd83, %rd79; ld.global.f32 %f260, [%rd84]; fma.rn.f32 %f261, %f279, %f302, %f260; st.global.f32 [%rd84], %f261; $L__BB0_59: ret; } ` adddmi_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<50>; .reg .b16 %rs<45>; .reg .f32 %f<303>; .reg .b32 %r<111>; .reg .b64 %rd<85>; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; ld.param.u64 %rd8, [adddmi_param_0]; ld.param.u64 %rd9, [adddmi_param_1]; ld.param.u64 %rd10, [adddmi_param_2]; ld.param.u64 %rd12, [adddmi_param_3]; ld.param.u64 %rd13, [adddmi_param_4]; ld.param.u64 %rd14, [adddmi_param_5]; ld.param.u64 %rd11, [adddmi_param_6]; ld.param.f32 %f301, [adddmi_param_7]; ld.param.u64 %rd15, [adddmi_param_8]; ld.param.u64 %rd16, [adddmi_param_9]; ld.param.u64 %rd17, [adddmi_param_10]; ld.param.f32 %f137, [adddmi_param_11]; ld.param.f32 %f138, [adddmi_param_12]; ld.param.f32 %f139, [adddmi_param_13]; ld.param.u32 %r34, 
[adddmi_param_14]; ld.param.u32 %r35, [adddmi_param_15]; ld.param.u32 %r36, [adddmi_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r37, %ntid.x; mov.u32 %r38, %ctaid.x; mov.u32 %r39, %tid.x; mad.lo.s32 %r1, %r38, %r37, %r39; mov.u32 %r40, %ntid.y; mov.u32 %r41, %ctaid.y; mov.u32 %r42, %tid.y; mad.lo.s32 %r2, %r41, %r40, %r42; mov.u32 %r43, %ntid.z; mov.u32 %r44, %ctaid.z; mov.u32 %r45, %tid.z; mad.lo.s32 %r3, %r44, %r43, %r45; setp.ge.s32 %p1, %r1, %r34; setp.ge.s32 %p2, %r2, %r35; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r36; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_59; mul.lo.s32 %r4, %r3, %r35; add.s32 %r46, %r4, %r2; mul.lo.s32 %r5, %r46, %r34; add.s32 %r47, %r5, %r1; cvt.s64.s32 %rd7, %r47; mul.wide.s32 %rd18, %r47, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; ld.global.nc.f32 %f6, [%rd21]; mul.f32 %f142, %f5, %f5; fma.rn.f32 %f143, %f1, %f1, %f142; fma.rn.f32 %f144, %f6, %f6, %f143; setp.eq.f32 %p6, %f144, 0f00000000; @%p6 bra $L__BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r48, %r6, %r34; add.s32 %r49, %r48, %r34; rem.s32 %r105, %r49, %r34; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f267, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f266, %f267; mov.f32 %f265, %f267; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f265, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f266, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f267, [%rd26]; $L__BB0_7: mul.f32 %f148, %f265, %f265; fma.rn.f32 %f149, %f266, 
%f266, %f148; fma.rn.f32 %f16, %f267, %f267, %f149; setp.eq.f32 %p11, %f16, 0f00000000; mov.u16 %rs41, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs41, [%rd28]; $L__BB0_9: min.u16 %rs17, %rs41, %rs1; cvt.u32.u16 %r50, %rs17; max.u16 %rs18, %rs41, %rs1; cvt.u32.u16 %r51, %rs18; add.s32 %r52, %r51, 1; mul.lo.s32 %r53, %r52, %r51; shr.u32 %r54, %r53, 1; add.s32 %r55, %r54, %r50; mul.wide.s32 %rd29, %r55, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f17, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f18, [%rd31]; setp.ne.s16 %p12, %rs14, 0; mov.f32 %f277, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f278, %f277; mov.f32 %f279, %f277; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f16, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f153, %f18, 0f3F000000; div.rn.f32 %f154, %f153, %f17; mul.f32 %f155, %f154, %f137; fma.rn.f32 %f265, %f6, %f155, %f1; mul.f32 %f156, %f1, %f155; sub.f32 %f267, %f6, %f156; mov.f32 %f266, %f5; $L__BB0_12: mul.f32 %f157, %f137, %f137; add.f32 %f158, %f17, %f17; div.rn.f32 %f159, %f158, %f157; sub.f32 %f160, %f265, %f1; sub.f32 %f161, %f266, %f5; sub.f32 %f162, %f267, %f6; fma.rn.f32 %f163, %f159, %f160, 0f00000000; fma.rn.f32 %f278, %f159, %f161, 0f00000000; fma.rn.f32 %f164, %f159, %f162, 0f00000000; div.rn.f32 %f165, %f18, %f137; mul.f32 %f166, %f165, %f267; sub.f32 %f277, %f163, %f166; fma.rn.f32 %f279, %f165, %f265, %f164; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r58, %r34, -1; min.s32 %r106, %r11, %r58; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r56, %r11, %r34; add.s32 %r57, %r56, %r34; rem.s32 %r106, %r57, %r34; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r34; mov.f32 %f276, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f275, %f276; mov.f32 %f274, %f276; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f274, [%rd33]; add.s64 %rd34, %rd5, %rd32; 
ld.global.nc.f32 %f275, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f276, [%rd35]; $L__BB0_18: mul.f32 %f170, %f274, %f274; fma.rn.f32 %f171, %f275, %f275, %f170; fma.rn.f32 %f44, %f276, %f276, %f171; setp.eq.f32 %p20, %f44, 0f00000000; mov.u16 %rs42, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs42, [%rd37]; $L__BB0_20: min.u16 %rs21, %rs42, %rs1; cvt.u32.u16 %r59, %rs21; max.u16 %rs22, %rs42, %rs1; cvt.u32.u16 %r60, %rs22; add.s32 %r61, %r60, 1; mul.lo.s32 %r62, %r61, %r60; shr.u32 %r63, %r62, 1; add.s32 %r64, %r63, %r59; mul.wide.s32 %rd38, %r64, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f45, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f46, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f44, 0f00000000; @%p24 bra $L__BB0_23; mul.f32 %f172, %f46, 0f3F000000; div.rn.f32 %f173, %f172, %f45; mul.f32 %f174, %f173, %f137; mul.f32 %f175, %f6, %f174; sub.f32 %f274, %f1, %f175; fma.rn.f32 %f276, %f1, %f174, %f6; mov.f32 %f275, %f5; $L__BB0_23: mul.f32 %f176, %f137, %f137; add.f32 %f177, %f45, %f45; div.rn.f32 %f178, %f177, %f176; sub.f32 %f179, %f274, %f1; sub.f32 %f180, %f275, %f5; sub.f32 %f181, %f276, %f6; fma.rn.f32 %f182, %f178, %f179, %f277; fma.rn.f32 %f278, %f178, %f180, %f278; fma.rn.f32 %f183, %f178, %f181, %f279; div.rn.f32 %f184, %f46, %f137; fma.rn.f32 %f277, %f184, %f276, %f182; mul.f32 %f185, %f184, %f274; sub.f32 %f279, %f183, %f185; $L__BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r65, %r16, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r107, %r66, %r35; $L__BB0_27: add.s32 %r67, %r107, %r4; mad.lo.s32 %r20, %r67, %r34, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f285, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f284, %f285; mov.f32 %f283, %f285; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 
4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f283, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f284, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f285, [%rd44]; $L__BB0_29: mul.f32 %f189, %f283, %f283; fma.rn.f32 %f190, %f284, %f284, %f189; fma.rn.f32 %f72, %f285, %f285, %f190; setp.eq.f32 %p29, %f72, 0f00000000; mov.u16 %rs43, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs43, [%rd46]; $L__BB0_31: min.u16 %rs25, %rs43, %rs1; cvt.u32.u16 %r68, %rs25; max.u16 %rs26, %rs43, %rs1; cvt.u32.u16 %r69, %rs26; add.s32 %r70, %r69, 1; mul.lo.s32 %r71, %r70, %r69; shr.u32 %r72, %r71, 1; add.s32 %r73, %r72, %r68; mul.wide.s32 %rd47, %r73, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f73, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f74, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f72, 0f00000000; @%p33 bra $L__BB0_34; mul.f32 %f191, %f74, 0f3F000000; div.rn.f32 %f192, %f191, %f73; mul.f32 %f193, %f192, %f138; fma.rn.f32 %f284, %f6, %f193, %f5; mul.f32 %f194, %f5, %f193; sub.f32 %f285, %f6, %f194; mov.f32 %f283, %f1; $L__BB0_34: mul.f32 %f195, %f138, %f138; add.f32 %f196, %f73, %f73; div.rn.f32 %f197, %f196, %f195; sub.f32 %f198, %f283, %f1; sub.f32 %f199, %f284, %f5; sub.f32 %f200, %f285, %f6; fma.rn.f32 %f277, %f197, %f198, %f277; fma.rn.f32 %f201, %f197, %f199, %f278; fma.rn.f32 %f202, %f197, %f200, %f279; div.rn.f32 %f203, %f74, %f138; mul.f32 %f204, %f203, %f285; sub.f32 %f278, %f201, %f204; fma.rn.f32 %f279, %f203, %f284, %f202; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r76, %r35, -1; min.s32 %r108, %r21, %r76; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r74, %r21, %r35; add.s32 %r75, %r74, %r35; rem.s32 %r108, %r75, %r35; $L__BB0_38: add.s32 %r77, %r108, %r4; mad.lo.s32 %r25, %r77, %r34, %r1; setp.ge.s32 %p35, %r21, %r35; mov.f32 %f294, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f293, %f294; 
mov.f32 %f292, %f294; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f292, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f293, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f294, [%rd53]; $L__BB0_40: mul.f32 %f208, %f292, %f292; fma.rn.f32 %f209, %f293, %f293, %f208; fma.rn.f32 %f100, %f294, %f294, %f209; setp.eq.f32 %p38, %f100, 0f00000000; mov.u16 %rs44, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs44, [%rd55]; $L__BB0_42: min.u16 %rs29, %rs44, %rs1; cvt.u32.u16 %r78, %rs29; max.u16 %rs30, %rs44, %rs1; cvt.u32.u16 %r79, %rs30; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd56, %r83, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f101, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f102, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f100, 0f00000000; @%p42 bra $L__BB0_45; mul.f32 %f210, %f102, 0f3F000000; div.rn.f32 %f211, %f210, %f101; mul.f32 %f212, %f211, %f138; mul.f32 %f213, %f6, %f212; sub.f32 %f293, %f5, %f213; fma.rn.f32 %f294, %f5, %f212, %f6; mov.f32 %f292, %f1; $L__BB0_45: mul.f32 %f214, %f138, %f138; add.f32 %f215, %f101, %f101; div.rn.f32 %f216, %f215, %f214; sub.f32 %f217, %f292, %f1; sub.f32 %f218, %f293, %f5; sub.f32 %f219, %f294, %f6; fma.rn.f32 %f277, %f216, %f217, %f277; fma.rn.f32 %f220, %f216, %f218, %f278; fma.rn.f32 %f221, %f216, %f219, %f279; div.rn.f32 %f222, %f102, %f138; fma.rn.f32 %f278, %f222, %f294, %f220; mul.f32 %f223, %f222, %f293; sub.f32 %f279, %f221, %f223; $L__BB0_46: setp.eq.s32 %p43, %r36, 1; @%p43 bra $L__BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p44, %rs12, 0; add.s32 %r26, %r3, -1; @%p44 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r84, %r26, %r36; add.s32 %r85, %r84, %r36; rem.s32 %r109, %r85, %r36; $L__BB0_50: 
mad.lo.s32 %r86, %r109, %r35, %r2; mad.lo.s32 %r87, %r86, %r34, %r1; cvt.s64.s32 %rd59, %r87; mul.wide.s32 %rd60, %r87, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f224, [%rd63]; ld.global.nc.f32 %f225, [%rd61]; ld.global.nc.f32 %f226, [%rd62]; mul.f32 %f227, %f226, %f226; fma.rn.f32 %f228, %f225, %f225, %f227; fma.rn.f32 %f229, %f224, %f224, %f228; setp.eq.f32 %p45, %f229, 0f00000000; selp.f32 %f230, %f6, %f224, %p45; selp.f32 %f231, %f5, %f226, %p45; selp.f32 %f232, %f1, %f225, %p45; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs31, [%rd64]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r88, %rs35; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs34; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd65, %r93, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f233, [%rd66]; add.f32 %f234, %f233, %f233; mul.f32 %f122, %f139, %f139; div.rn.f32 %f235, %f234, %f122; sub.f32 %f236, %f232, %f1; sub.f32 %f237, %f231, %f5; sub.f32 %f238, %f230, %f6; fma.rn.f32 %f123, %f236, %f235, %f277; fma.rn.f32 %f124, %f237, %f235, %f278; fma.rn.f32 %f125, %f238, %f235, %f279; add.s32 %r30, %r3, 1; @%p44 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r96, %r36, -1; min.s32 %r110, %r30, %r96; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r94, %r30, %r36; add.s32 %r95, %r94, %r36; rem.s32 %r110, %r95, %r36; $L__BB0_53: mad.lo.s32 %r97, %r110, %r35, %r2; mad.lo.s32 %r98, %r97, %r34, %r1; cvt.s64.s32 %rd67, %r98; mul.wide.s32 %rd68, %r98, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f239, [%rd71]; ld.global.nc.f32 %f240, [%rd69]; ld.global.nc.f32 %f241, [%rd70]; mul.f32 %f242, %f241, %f241; fma.rn.f32 %f243, %f240, %f240, %f242; fma.rn.f32 %f244, %f239, %f239, %f243; setp.eq.f32 %p47, %f244, 0f00000000; selp.f32 %f245, %f6, %f239, %p47; selp.f32 %f246, %f5, %f241, %p47; selp.f32 %f247, %f1, %f240, %p47; 
add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs36, [%rd72]; min.u16 %rs39, %rs36, %rs1; max.u16 %rs40, %rs36, %rs1; cvt.u32.u16 %r99, %rs40; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs39; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd73, %r104, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f248, [%rd74]; add.f32 %f249, %f248, %f248; div.rn.f32 %f250, %f249, %f122; sub.f32 %f251, %f247, %f1; sub.f32 %f252, %f246, %f5; sub.f32 %f253, %f245, %f6; fma.rn.f32 %f277, %f251, %f250, %f123; fma.rn.f32 %f278, %f252, %f250, %f124; fma.rn.f32 %f279, %f253, %f250, %f125; $L__BB0_54: setp.eq.s64 %p48, %rd11, 0; @%p48 bra $L__BB0_56; cvta.to.global.u64 %rd75, %rd11; shl.b64 %rd76, %rd7, 2; add.s64 %rd77, %rd75, %rd76; ld.global.nc.f32 %f254, [%rd77]; mul.f32 %f301, %f254, %f301; $L__BB0_56: setp.eq.f32 %p49, %f301, 0f00000000; mov.f32 %f302, 0f00000000; @%p49 bra $L__BB0_58; rcp.rn.f32 %f302, %f301; $L__BB0_58: cvta.to.global.u64 %rd78, %rd8; shl.b64 %rd79, %rd7, 2; add.s64 %rd80, %rd78, %rd79; ld.global.f32 %f256, [%rd80]; fma.rn.f32 %f257, %f277, %f302, %f256; st.global.f32 [%rd80], %f257; cvta.to.global.u64 %rd81, %rd9; add.s64 %rd82, %rd81, %rd79; ld.global.f32 %f258, [%rd82]; fma.rn.f32 %f259, %f278, %f302, %f258; st.global.f32 [%rd82], %f259; cvta.to.global.u64 %rd83, %rd10; add.s64 %rd84, %rd83, %rd79; ld.global.f32 %f260, [%rd84]; fma.rn.f32 %f261, %f279, %f302, %f260; st.global.f32 [%rd84], %f261; $L__BB0_59: ret; } ` adddmi_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, 
.param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<50>; .reg .b16 %rs<45>; .reg .f32 %f<303>; .reg .b32 %r<111>; .reg .b64 %rd<85>; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; ld.param.u64 %rd8, [adddmi_param_0]; ld.param.u64 %rd9, [adddmi_param_1]; ld.param.u64 %rd10, [adddmi_param_2]; ld.param.u64 %rd12, [adddmi_param_3]; ld.param.u64 %rd13, [adddmi_param_4]; ld.param.u64 %rd14, [adddmi_param_5]; ld.param.u64 %rd11, [adddmi_param_6]; ld.param.f32 %f301, [adddmi_param_7]; ld.param.u64 %rd15, [adddmi_param_8]; ld.param.u64 %rd16, [adddmi_param_9]; ld.param.u64 %rd17, [adddmi_param_10]; ld.param.f32 %f137, [adddmi_param_11]; ld.param.f32 %f138, [adddmi_param_12]; ld.param.f32 %f139, [adddmi_param_13]; ld.param.u32 %r34, [adddmi_param_14]; ld.param.u32 %r35, [adddmi_param_15]; ld.param.u32 %r36, [adddmi_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r37, %ntid.x; mov.u32 %r38, %ctaid.x; mov.u32 %r39, %tid.x; mad.lo.s32 %r1, %r38, %r37, %r39; mov.u32 %r40, %ntid.y; mov.u32 %r41, %ctaid.y; mov.u32 %r42, %tid.y; mad.lo.s32 %r2, %r41, %r40, %r42; mov.u32 %r43, %ntid.z; mov.u32 %r44, %ctaid.z; mov.u32 %r45, %tid.z; mad.lo.s32 %r3, %r44, %r43, %r45; setp.ge.s32 %p1, %r1, %r34; setp.ge.s32 %p2, %r2, %r35; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r36; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_59; mul.lo.s32 %r4, %r3, %r35; add.s32 %r46, %r4, %r2; mul.lo.s32 %r5, %r46, %r34; add.s32 %r47, %r5, %r1; cvt.s64.s32 %rd7, %r47; mul.wide.s32 %rd18, %r47, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; ld.global.nc.f32 %f6, 
[%rd21]; mul.f32 %f142, %f5, %f5; fma.rn.f32 %f143, %f1, %f1, %f142; fma.rn.f32 %f144, %f6, %f6, %f143; setp.eq.f32 %p6, %f144, 0f00000000; @%p6 bra $L__BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r48, %r6, %r34; add.s32 %r49, %r48, %r34; rem.s32 %r105, %r49, %r34; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f267, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f266, %f267; mov.f32 %f265, %f267; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f265, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f266, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f267, [%rd26]; $L__BB0_7: mul.f32 %f148, %f265, %f265; fma.rn.f32 %f149, %f266, %f266, %f148; fma.rn.f32 %f16, %f267, %f267, %f149; setp.eq.f32 %p11, %f16, 0f00000000; mov.u16 %rs41, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs41, [%rd28]; $L__BB0_9: min.u16 %rs17, %rs41, %rs1; cvt.u32.u16 %r50, %rs17; max.u16 %rs18, %rs41, %rs1; cvt.u32.u16 %r51, %rs18; add.s32 %r52, %r51, 1; mul.lo.s32 %r53, %r52, %r51; shr.u32 %r54, %r53, 1; add.s32 %r55, %r54, %r50; mul.wide.s32 %rd29, %r55, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f17, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f18, [%rd31]; setp.ne.s16 %p12, %rs14, 0; mov.f32 %f277, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f278, %f277; mov.f32 %f279, %f277; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f16, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f153, %f18, 0f3F000000; div.rn.f32 %f154, %f153, %f17; mul.f32 %f155, %f154, %f137; fma.rn.f32 %f265, %f6, %f155, %f1; mul.f32 %f156, %f1, %f155; sub.f32 %f267, %f6, %f156; mov.f32 %f266, %f5; $L__BB0_12: mul.f32 %f157, %f137, %f137; add.f32 %f158, %f17, %f17; div.rn.f32 %f159, %f158, %f157; sub.f32 %f160, %f265, %f1; sub.f32 
%f161, %f266, %f5; sub.f32 %f162, %f267, %f6; fma.rn.f32 %f163, %f159, %f160, 0f00000000; fma.rn.f32 %f278, %f159, %f161, 0f00000000; fma.rn.f32 %f164, %f159, %f162, 0f00000000; div.rn.f32 %f165, %f18, %f137; mul.f32 %f166, %f165, %f267; sub.f32 %f277, %f163, %f166; fma.rn.f32 %f279, %f165, %f265, %f164; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r58, %r34, -1; min.s32 %r106, %r11, %r58; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r56, %r11, %r34; add.s32 %r57, %r56, %r34; rem.s32 %r106, %r57, %r34; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r34; mov.f32 %f276, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f275, %f276; mov.f32 %f274, %f276; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f274, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f275, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f276, [%rd35]; $L__BB0_18: mul.f32 %f170, %f274, %f274; fma.rn.f32 %f171, %f275, %f275, %f170; fma.rn.f32 %f44, %f276, %f276, %f171; setp.eq.f32 %p20, %f44, 0f00000000; mov.u16 %rs42, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs42, [%rd37]; $L__BB0_20: min.u16 %rs21, %rs42, %rs1; cvt.u32.u16 %r59, %rs21; max.u16 %rs22, %rs42, %rs1; cvt.u32.u16 %r60, %rs22; add.s32 %r61, %r60, 1; mul.lo.s32 %r62, %r61, %r60; shr.u32 %r63, %r62, 1; add.s32 %r64, %r63, %r59; mul.wide.s32 %rd38, %r64, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f45, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f46, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f44, 0f00000000; @%p24 bra $L__BB0_23; mul.f32 %f172, %f46, 0f3F000000; div.rn.f32 %f173, %f172, %f45; mul.f32 %f174, %f173, %f137; mul.f32 %f175, %f6, %f174; sub.f32 %f274, %f1, %f175; fma.rn.f32 %f276, %f1, %f174, %f6; mov.f32 %f275, %f5; $L__BB0_23: mul.f32 %f176, %f137, %f137; add.f32 %f177, %f45, %f45; div.rn.f32 %f178, 
%f177, %f176; sub.f32 %f179, %f274, %f1; sub.f32 %f180, %f275, %f5; sub.f32 %f181, %f276, %f6; fma.rn.f32 %f182, %f178, %f179, %f277; fma.rn.f32 %f278, %f178, %f180, %f278; fma.rn.f32 %f183, %f178, %f181, %f279; div.rn.f32 %f184, %f46, %f137; fma.rn.f32 %f277, %f184, %f276, %f182; mul.f32 %f185, %f184, %f274; sub.f32 %f279, %f183, %f185; $L__BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r65, %r16, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r107, %r66, %r35; $L__BB0_27: add.s32 %r67, %r107, %r4; mad.lo.s32 %r20, %r67, %r34, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f285, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f284, %f285; mov.f32 %f283, %f285; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f283, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f284, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f285, [%rd44]; $L__BB0_29: mul.f32 %f189, %f283, %f283; fma.rn.f32 %f190, %f284, %f284, %f189; fma.rn.f32 %f72, %f285, %f285, %f190; setp.eq.f32 %p29, %f72, 0f00000000; mov.u16 %rs43, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs43, [%rd46]; $L__BB0_31: min.u16 %rs25, %rs43, %rs1; cvt.u32.u16 %r68, %rs25; max.u16 %rs26, %rs43, %rs1; cvt.u32.u16 %r69, %rs26; add.s32 %r70, %r69, 1; mul.lo.s32 %r71, %r70, %r69; shr.u32 %r72, %r71, 1; add.s32 %r73, %r72, %r68; mul.wide.s32 %rd47, %r73, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f73, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f74, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f72, 0f00000000; @%p33 bra $L__BB0_34; mul.f32 %f191, %f74, 0f3F000000; div.rn.f32 %f192, %f191, %f73; mul.f32 %f193, %f192, %f138; fma.rn.f32 %f284, %f6, %f193, %f5; mul.f32 %f194, %f5, %f193; sub.f32 %f285, %f6, %f194; mov.f32 %f283, 
%f1; $L__BB0_34: mul.f32 %f195, %f138, %f138; add.f32 %f196, %f73, %f73; div.rn.f32 %f197, %f196, %f195; sub.f32 %f198, %f283, %f1; sub.f32 %f199, %f284, %f5; sub.f32 %f200, %f285, %f6; fma.rn.f32 %f277, %f197, %f198, %f277; fma.rn.f32 %f201, %f197, %f199, %f278; fma.rn.f32 %f202, %f197, %f200, %f279; div.rn.f32 %f203, %f74, %f138; mul.f32 %f204, %f203, %f285; sub.f32 %f278, %f201, %f204; fma.rn.f32 %f279, %f203, %f284, %f202; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r76, %r35, -1; min.s32 %r108, %r21, %r76; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r74, %r21, %r35; add.s32 %r75, %r74, %r35; rem.s32 %r108, %r75, %r35; $L__BB0_38: add.s32 %r77, %r108, %r4; mad.lo.s32 %r25, %r77, %r34, %r1; setp.ge.s32 %p35, %r21, %r35; mov.f32 %f294, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f293, %f294; mov.f32 %f292, %f294; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f292, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f293, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f294, [%rd53]; $L__BB0_40: mul.f32 %f208, %f292, %f292; fma.rn.f32 %f209, %f293, %f293, %f208; fma.rn.f32 %f100, %f294, %f294, %f209; setp.eq.f32 %p38, %f100, 0f00000000; mov.u16 %rs44, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs44, [%rd55]; $L__BB0_42: min.u16 %rs29, %rs44, %rs1; cvt.u32.u16 %r78, %rs29; max.u16 %rs30, %rs44, %rs1; cvt.u32.u16 %r79, %rs30; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd56, %r83, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f101, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f102, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f100, 0f00000000; @%p42 bra $L__BB0_45; mul.f32 %f210, %f102, 0f3F000000; div.rn.f32 %f211, %f210, %f101; mul.f32 %f212, %f211, %f138; mul.f32 %f213, %f6, %f212; 
sub.f32 %f293, %f5, %f213; fma.rn.f32 %f294, %f5, %f212, %f6; mov.f32 %f292, %f1; $L__BB0_45: mul.f32 %f214, %f138, %f138; add.f32 %f215, %f101, %f101; div.rn.f32 %f216, %f215, %f214; sub.f32 %f217, %f292, %f1; sub.f32 %f218, %f293, %f5; sub.f32 %f219, %f294, %f6; fma.rn.f32 %f277, %f216, %f217, %f277; fma.rn.f32 %f220, %f216, %f218, %f278; fma.rn.f32 %f221, %f216, %f219, %f279; div.rn.f32 %f222, %f102, %f138; fma.rn.f32 %f278, %f222, %f294, %f220; mul.f32 %f223, %f222, %f293; sub.f32 %f279, %f221, %f223; $L__BB0_46: setp.eq.s32 %p43, %r36, 1; @%p43 bra $L__BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p44, %rs12, 0; add.s32 %r26, %r3, -1; @%p44 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r84, %r26, %r36; add.s32 %r85, %r84, %r36; rem.s32 %r109, %r85, %r36; $L__BB0_50: mad.lo.s32 %r86, %r109, %r35, %r2; mad.lo.s32 %r87, %r86, %r34, %r1; cvt.s64.s32 %rd59, %r87; mul.wide.s32 %rd60, %r87, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f224, [%rd63]; ld.global.nc.f32 %f225, [%rd61]; ld.global.nc.f32 %f226, [%rd62]; mul.f32 %f227, %f226, %f226; fma.rn.f32 %f228, %f225, %f225, %f227; fma.rn.f32 %f229, %f224, %f224, %f228; setp.eq.f32 %p45, %f229, 0f00000000; selp.f32 %f230, %f6, %f224, %p45; selp.f32 %f231, %f5, %f226, %p45; selp.f32 %f232, %f1, %f225, %p45; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs31, [%rd64]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r88, %rs35; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs34; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd65, %r93, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f233, [%rd66]; add.f32 %f234, %f233, %f233; mul.f32 %f122, %f139, %f139; div.rn.f32 %f235, %f234, %f122; sub.f32 %f236, %f232, %f1; sub.f32 %f237, %f231, %f5; sub.f32 %f238, %f230, %f6; fma.rn.f32 %f123, %f236, %f235, %f277; fma.rn.f32 %f124, %f237, %f235, 
%f278; fma.rn.f32 %f125, %f238, %f235, %f279; add.s32 %r30, %r3, 1; @%p44 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r96, %r36, -1; min.s32 %r110, %r30, %r96; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r94, %r30, %r36; add.s32 %r95, %r94, %r36; rem.s32 %r110, %r95, %r36; $L__BB0_53: mad.lo.s32 %r97, %r110, %r35, %r2; mad.lo.s32 %r98, %r97, %r34, %r1; cvt.s64.s32 %rd67, %r98; mul.wide.s32 %rd68, %r98, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f239, [%rd71]; ld.global.nc.f32 %f240, [%rd69]; ld.global.nc.f32 %f241, [%rd70]; mul.f32 %f242, %f241, %f241; fma.rn.f32 %f243, %f240, %f240, %f242; fma.rn.f32 %f244, %f239, %f239, %f243; setp.eq.f32 %p47, %f244, 0f00000000; selp.f32 %f245, %f6, %f239, %p47; selp.f32 %f246, %f5, %f241, %p47; selp.f32 %f247, %f1, %f240, %p47; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs36, [%rd72]; min.u16 %rs39, %rs36, %rs1; max.u16 %rs40, %rs36, %rs1; cvt.u32.u16 %r99, %rs40; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs39; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd73, %r104, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f248, [%rd74]; add.f32 %f249, %f248, %f248; div.rn.f32 %f250, %f249, %f122; sub.f32 %f251, %f247, %f1; sub.f32 %f252, %f246, %f5; sub.f32 %f253, %f245, %f6; fma.rn.f32 %f277, %f251, %f250, %f123; fma.rn.f32 %f278, %f252, %f250, %f124; fma.rn.f32 %f279, %f253, %f250, %f125; $L__BB0_54: setp.eq.s64 %p48, %rd11, 0; @%p48 bra $L__BB0_56; cvta.to.global.u64 %rd75, %rd11; shl.b64 %rd76, %rd7, 2; add.s64 %rd77, %rd75, %rd76; ld.global.nc.f32 %f254, [%rd77]; mul.f32 %f301, %f254, %f301; $L__BB0_56: setp.eq.f32 %p49, %f301, 0f00000000; mov.f32 %f302, 0f00000000; @%p49 bra $L__BB0_58; rcp.rn.f32 %f302, %f301; $L__BB0_58: cvta.to.global.u64 %rd78, %rd8; shl.b64 %rd79, %rd7, 2; add.s64 %rd80, %rd78, %rd79; ld.global.f32 %f256, [%rd80]; fma.rn.f32 %f257, %f277, %f302, %f256; st.global.f32 [%rd80], 
%f257; cvta.to.global.u64 %rd81, %rd9; add.s64 %rd82, %rd81, %rd79; ld.global.f32 %f258, [%rd82]; fma.rn.f32 %f259, %f278, %f302, %f258; st.global.f32 [%rd82], %f259; cvta.to.global.u64 %rd83, %rd10; add.s64 %rd84, %rd83, %rd79; ld.global.f32 %f260, [%rd84]; fma.rn.f32 %f261, %f279, %f302, %f260; st.global.f32 [%rd84], %f261; $L__BB0_59: ret; } ` adddmi_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<50>; .reg .b16 %rs<45>; .reg .f32 %f<303>; .reg .b32 %r<111>; .reg .b64 %rd<85>; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; ld.param.u64 %rd8, [adddmi_param_0]; ld.param.u64 %rd9, [adddmi_param_1]; ld.param.u64 %rd10, [adddmi_param_2]; ld.param.u64 %rd12, [adddmi_param_3]; ld.param.u64 %rd13, [adddmi_param_4]; ld.param.u64 %rd14, [adddmi_param_5]; ld.param.u64 %rd11, [adddmi_param_6]; ld.param.f32 %f301, [adddmi_param_7]; ld.param.u64 %rd15, [adddmi_param_8]; ld.param.u64 %rd16, [adddmi_param_9]; ld.param.u64 %rd17, [adddmi_param_10]; ld.param.f32 %f137, [adddmi_param_11]; ld.param.f32 %f138, [adddmi_param_12]; ld.param.f32 %f139, [adddmi_param_13]; ld.param.u32 %r34, [adddmi_param_14]; ld.param.u32 %r35, [adddmi_param_15]; ld.param.u32 %r36, [adddmi_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, 
%rd12; mov.u32 %r37, %ntid.x; mov.u32 %r38, %ctaid.x; mov.u32 %r39, %tid.x; mad.lo.s32 %r1, %r38, %r37, %r39; mov.u32 %r40, %ntid.y; mov.u32 %r41, %ctaid.y; mov.u32 %r42, %tid.y; mad.lo.s32 %r2, %r41, %r40, %r42; mov.u32 %r43, %ntid.z; mov.u32 %r44, %ctaid.z; mov.u32 %r45, %tid.z; mad.lo.s32 %r3, %r44, %r43, %r45; setp.ge.s32 %p1, %r1, %r34; setp.ge.s32 %p2, %r2, %r35; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r36; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_59; mul.lo.s32 %r4, %r3, %r35; add.s32 %r46, %r4, %r2; mul.lo.s32 %r5, %r46, %r34; add.s32 %r47, %r5, %r1; cvt.s64.s32 %rd7, %r47; mul.wide.s32 %rd18, %r47, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; ld.global.nc.f32 %f6, [%rd21]; mul.f32 %f142, %f5, %f5; fma.rn.f32 %f143, %f1, %f1, %f142; fma.rn.f32 %f144, %f6, %f6, %f143; setp.eq.f32 %p6, %f144, 0f00000000; @%p6 bra $L__BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r48, %r6, %r34; add.s32 %r49, %r48, %r34; rem.s32 %r105, %r49, %r34; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f267, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f266, %f267; mov.f32 %f265, %f267; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f265, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f266, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f267, [%rd26]; $L__BB0_7: mul.f32 %f148, %f265, %f265; fma.rn.f32 %f149, %f266, %f266, %f148; fma.rn.f32 %f16, %f267, %f267, %f149; setp.eq.f32 %p11, %f16, 0f00000000; mov.u16 %rs41, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs41, [%rd28]; $L__BB0_9: min.u16 %rs17, %rs41, %rs1; cvt.u32.u16 %r50, %rs17; 
max.u16 %rs18, %rs41, %rs1; cvt.u32.u16 %r51, %rs18; add.s32 %r52, %r51, 1; mul.lo.s32 %r53, %r52, %r51; shr.u32 %r54, %r53, 1; add.s32 %r55, %r54, %r50; mul.wide.s32 %rd29, %r55, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f17, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f18, [%rd31]; setp.ne.s16 %p12, %rs14, 0; mov.f32 %f277, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f278, %f277; mov.f32 %f279, %f277; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f16, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f153, %f18, 0f3F000000; div.rn.f32 %f154, %f153, %f17; mul.f32 %f155, %f154, %f137; fma.rn.f32 %f265, %f6, %f155, %f1; mul.f32 %f156, %f1, %f155; sub.f32 %f267, %f6, %f156; mov.f32 %f266, %f5; $L__BB0_12: mul.f32 %f157, %f137, %f137; add.f32 %f158, %f17, %f17; div.rn.f32 %f159, %f158, %f157; sub.f32 %f160, %f265, %f1; sub.f32 %f161, %f266, %f5; sub.f32 %f162, %f267, %f6; fma.rn.f32 %f163, %f159, %f160, 0f00000000; fma.rn.f32 %f278, %f159, %f161, 0f00000000; fma.rn.f32 %f164, %f159, %f162, 0f00000000; div.rn.f32 %f165, %f18, %f137; mul.f32 %f166, %f165, %f267; sub.f32 %f277, %f163, %f166; fma.rn.f32 %f279, %f165, %f265, %f164; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r58, %r34, -1; min.s32 %r106, %r11, %r58; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r56, %r11, %r34; add.s32 %r57, %r56, %r34; rem.s32 %r106, %r57, %r34; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r34; mov.f32 %f276, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f275, %f276; mov.f32 %f274, %f276; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f274, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f275, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f276, [%rd35]; $L__BB0_18: mul.f32 %f170, %f274, %f274; fma.rn.f32 %f171, %f275, %f275, %f170; fma.rn.f32 %f44, %f276, %f276, %f171; setp.eq.f32 %p20, %f44, 0f00000000; mov.u16 %rs42, %rs1; @%p20 bra $L__BB0_20; 
cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs42, [%rd37]; $L__BB0_20: min.u16 %rs21, %rs42, %rs1; cvt.u32.u16 %r59, %rs21; max.u16 %rs22, %rs42, %rs1; cvt.u32.u16 %r60, %rs22; add.s32 %r61, %r60, 1; mul.lo.s32 %r62, %r61, %r60; shr.u32 %r63, %r62, 1; add.s32 %r64, %r63, %r59; mul.wide.s32 %rd38, %r64, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f45, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f46, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f44, 0f00000000; @%p24 bra $L__BB0_23; mul.f32 %f172, %f46, 0f3F000000; div.rn.f32 %f173, %f172, %f45; mul.f32 %f174, %f173, %f137; mul.f32 %f175, %f6, %f174; sub.f32 %f274, %f1, %f175; fma.rn.f32 %f276, %f1, %f174, %f6; mov.f32 %f275, %f5; $L__BB0_23: mul.f32 %f176, %f137, %f137; add.f32 %f177, %f45, %f45; div.rn.f32 %f178, %f177, %f176; sub.f32 %f179, %f274, %f1; sub.f32 %f180, %f275, %f5; sub.f32 %f181, %f276, %f6; fma.rn.f32 %f182, %f178, %f179, %f277; fma.rn.f32 %f278, %f178, %f180, %f278; fma.rn.f32 %f183, %f178, %f181, %f279; div.rn.f32 %f184, %f46, %f137; fma.rn.f32 %f277, %f184, %f276, %f182; mul.f32 %f185, %f184, %f274; sub.f32 %f279, %f183, %f185; $L__BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r65, %r16, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r107, %r66, %r35; $L__BB0_27: add.s32 %r67, %r107, %r4; mad.lo.s32 %r20, %r67, %r34, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f285, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f284, %f285; mov.f32 %f283, %f285; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f283, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f284, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f285, [%rd44]; $L__BB0_29: mul.f32 %f189, %f283, %f283; fma.rn.f32 %f190, %f284, %f284, %f189; fma.rn.f32 %f72, %f285, 
%f285, %f190; setp.eq.f32 %p29, %f72, 0f00000000; mov.u16 %rs43, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs43, [%rd46]; $L__BB0_31: min.u16 %rs25, %rs43, %rs1; cvt.u32.u16 %r68, %rs25; max.u16 %rs26, %rs43, %rs1; cvt.u32.u16 %r69, %rs26; add.s32 %r70, %r69, 1; mul.lo.s32 %r71, %r70, %r69; shr.u32 %r72, %r71, 1; add.s32 %r73, %r72, %r68; mul.wide.s32 %rd47, %r73, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f73, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f74, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f72, 0f00000000; @%p33 bra $L__BB0_34; mul.f32 %f191, %f74, 0f3F000000; div.rn.f32 %f192, %f191, %f73; mul.f32 %f193, %f192, %f138; fma.rn.f32 %f284, %f6, %f193, %f5; mul.f32 %f194, %f5, %f193; sub.f32 %f285, %f6, %f194; mov.f32 %f283, %f1; $L__BB0_34: mul.f32 %f195, %f138, %f138; add.f32 %f196, %f73, %f73; div.rn.f32 %f197, %f196, %f195; sub.f32 %f198, %f283, %f1; sub.f32 %f199, %f284, %f5; sub.f32 %f200, %f285, %f6; fma.rn.f32 %f277, %f197, %f198, %f277; fma.rn.f32 %f201, %f197, %f199, %f278; fma.rn.f32 %f202, %f197, %f200, %f279; div.rn.f32 %f203, %f74, %f138; mul.f32 %f204, %f203, %f285; sub.f32 %f278, %f201, %f204; fma.rn.f32 %f279, %f203, %f284, %f202; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r76, %r35, -1; min.s32 %r108, %r21, %r76; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r74, %r21, %r35; add.s32 %r75, %r74, %r35; rem.s32 %r108, %r75, %r35; $L__BB0_38: add.s32 %r77, %r108, %r4; mad.lo.s32 %r25, %r77, %r34, %r1; setp.ge.s32 %p35, %r21, %r35; mov.f32 %f294, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f293, %f294; mov.f32 %f292, %f294; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f292, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f293, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f294, [%rd53]; $L__BB0_40: mul.f32 %f208, %f292, 
%f292; fma.rn.f32 %f209, %f293, %f293, %f208; fma.rn.f32 %f100, %f294, %f294, %f209; setp.eq.f32 %p38, %f100, 0f00000000; mov.u16 %rs44, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs44, [%rd55]; $L__BB0_42: min.u16 %rs29, %rs44, %rs1; cvt.u32.u16 %r78, %rs29; max.u16 %rs30, %rs44, %rs1; cvt.u32.u16 %r79, %rs30; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd56, %r83, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f101, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f102, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f100, 0f00000000; @%p42 bra $L__BB0_45; mul.f32 %f210, %f102, 0f3F000000; div.rn.f32 %f211, %f210, %f101; mul.f32 %f212, %f211, %f138; mul.f32 %f213, %f6, %f212; sub.f32 %f293, %f5, %f213; fma.rn.f32 %f294, %f5, %f212, %f6; mov.f32 %f292, %f1; $L__BB0_45: mul.f32 %f214, %f138, %f138; add.f32 %f215, %f101, %f101; div.rn.f32 %f216, %f215, %f214; sub.f32 %f217, %f292, %f1; sub.f32 %f218, %f293, %f5; sub.f32 %f219, %f294, %f6; fma.rn.f32 %f277, %f216, %f217, %f277; fma.rn.f32 %f220, %f216, %f218, %f278; fma.rn.f32 %f221, %f216, %f219, %f279; div.rn.f32 %f222, %f102, %f138; fma.rn.f32 %f278, %f222, %f294, %f220; mul.f32 %f223, %f222, %f293; sub.f32 %f279, %f221, %f223; $L__BB0_46: setp.eq.s32 %p43, %r36, 1; @%p43 bra $L__BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p44, %rs12, 0; add.s32 %r26, %r3, -1; @%p44 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r84, %r26, %r36; add.s32 %r85, %r84, %r36; rem.s32 %r109, %r85, %r36; $L__BB0_50: mad.lo.s32 %r86, %r109, %r35, %r2; mad.lo.s32 %r87, %r86, %r34, %r1; cvt.s64.s32 %rd59, %r87; mul.wide.s32 %rd60, %r87, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f224, [%rd63]; ld.global.nc.f32 %f225, [%rd61]; ld.global.nc.f32 %f226, 
[%rd62]; mul.f32 %f227, %f226, %f226; fma.rn.f32 %f228, %f225, %f225, %f227; fma.rn.f32 %f229, %f224, %f224, %f228; setp.eq.f32 %p45, %f229, 0f00000000; selp.f32 %f230, %f6, %f224, %p45; selp.f32 %f231, %f5, %f226, %p45; selp.f32 %f232, %f1, %f225, %p45; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs31, [%rd64]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r88, %rs35; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs34; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd65, %r93, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f233, [%rd66]; add.f32 %f234, %f233, %f233; mul.f32 %f122, %f139, %f139; div.rn.f32 %f235, %f234, %f122; sub.f32 %f236, %f232, %f1; sub.f32 %f237, %f231, %f5; sub.f32 %f238, %f230, %f6; fma.rn.f32 %f123, %f236, %f235, %f277; fma.rn.f32 %f124, %f237, %f235, %f278; fma.rn.f32 %f125, %f238, %f235, %f279; add.s32 %r30, %r3, 1; @%p44 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r96, %r36, -1; min.s32 %r110, %r30, %r96; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r94, %r30, %r36; add.s32 %r95, %r94, %r36; rem.s32 %r110, %r95, %r36; $L__BB0_53: mad.lo.s32 %r97, %r110, %r35, %r2; mad.lo.s32 %r98, %r97, %r34, %r1; cvt.s64.s32 %rd67, %r98; mul.wide.s32 %rd68, %r98, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f239, [%rd71]; ld.global.nc.f32 %f240, [%rd69]; ld.global.nc.f32 %f241, [%rd70]; mul.f32 %f242, %f241, %f241; fma.rn.f32 %f243, %f240, %f240, %f242; fma.rn.f32 %f244, %f239, %f239, %f243; setp.eq.f32 %p47, %f244, 0f00000000; selp.f32 %f245, %f6, %f239, %p47; selp.f32 %f246, %f5, %f241, %p47; selp.f32 %f247, %f1, %f240, %p47; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs36, [%rd72]; min.u16 %rs39, %rs36, %rs1; max.u16 %rs40, %rs36, %rs1; cvt.u32.u16 %r99, %rs40; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs39; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd73, 
%r104, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f248, [%rd74]; add.f32 %f249, %f248, %f248; div.rn.f32 %f250, %f249, %f122; sub.f32 %f251, %f247, %f1; sub.f32 %f252, %f246, %f5; sub.f32 %f253, %f245, %f6; fma.rn.f32 %f277, %f251, %f250, %f123; fma.rn.f32 %f278, %f252, %f250, %f124; fma.rn.f32 %f279, %f253, %f250, %f125; $L__BB0_54: setp.eq.s64 %p48, %rd11, 0; @%p48 bra $L__BB0_56; cvta.to.global.u64 %rd75, %rd11; shl.b64 %rd76, %rd7, 2; add.s64 %rd77, %rd75, %rd76; ld.global.nc.f32 %f254, [%rd77]; mul.f32 %f301, %f254, %f301; $L__BB0_56: setp.eq.f32 %p49, %f301, 0f00000000; mov.f32 %f302, 0f00000000; @%p49 bra $L__BB0_58; rcp.rn.f32 %f302, %f301; $L__BB0_58: cvta.to.global.u64 %rd78, %rd8; shl.b64 %rd79, %rd7, 2; add.s64 %rd80, %rd78, %rd79; ld.global.f32 %f256, [%rd80]; fma.rn.f32 %f257, %f277, %f302, %f256; st.global.f32 [%rd80], %f257; cvta.to.global.u64 %rd81, %rd9; add.s64 %rd82, %rd81, %rd79; ld.global.f32 %f258, [%rd82]; fma.rn.f32 %f259, %f278, %f302, %f258; st.global.f32 [%rd82], %f259; cvta.to.global.u64 %rd83, %rd10; add.s64 %rd84, %rd83, %rd79; ld.global.f32 %f260, [%rd84]; fma.rn.f32 %f261, %f279, %f302, %f260; st.global.f32 [%rd84], %f261; $L__BB0_59: ret; } ` adddmi_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<50>; .reg .b16 %rs<45>; .reg .f32 %f<303>; .reg .b32 %r<111>; .reg .b64 %rd<85>; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, 
[adddmi_param_17]; ld.param.u64 %rd8, [adddmi_param_0]; ld.param.u64 %rd9, [adddmi_param_1]; ld.param.u64 %rd10, [adddmi_param_2]; ld.param.u64 %rd12, [adddmi_param_3]; ld.param.u64 %rd13, [adddmi_param_4]; ld.param.u64 %rd14, [adddmi_param_5]; ld.param.u64 %rd11, [adddmi_param_6]; ld.param.f32 %f301, [adddmi_param_7]; ld.param.u64 %rd15, [adddmi_param_8]; ld.param.u64 %rd16, [adddmi_param_9]; ld.param.u64 %rd17, [adddmi_param_10]; ld.param.f32 %f137, [adddmi_param_11]; ld.param.f32 %f138, [adddmi_param_12]; ld.param.f32 %f139, [adddmi_param_13]; ld.param.u32 %r34, [adddmi_param_14]; ld.param.u32 %r35, [adddmi_param_15]; ld.param.u32 %r36, [adddmi_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r37, %ntid.x; mov.u32 %r38, %ctaid.x; mov.u32 %r39, %tid.x; mad.lo.s32 %r1, %r38, %r37, %r39; mov.u32 %r40, %ntid.y; mov.u32 %r41, %ctaid.y; mov.u32 %r42, %tid.y; mad.lo.s32 %r2, %r41, %r40, %r42; mov.u32 %r43, %ntid.z; mov.u32 %r44, %ctaid.z; mov.u32 %r45, %tid.z; mad.lo.s32 %r3, %r44, %r43, %r45; setp.ge.s32 %p1, %r1, %r34; setp.ge.s32 %p2, %r2, %r35; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r36; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_59; mul.lo.s32 %r4, %r3, %r35; add.s32 %r46, %r4, %r2; mul.lo.s32 %r5, %r46, %r34; add.s32 %r47, %r5, %r1; cvt.s64.s32 %rd7, %r47; mul.wide.s32 %rd18, %r47, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; ld.global.nc.f32 %f6, [%rd21]; mul.f32 %f142, %f5, %f5; fma.rn.f32 %f143, %f1, %f1, %f142; fma.rn.f32 %f144, %f6, %f6, %f143; setp.eq.f32 %p6, %f144, 0f00000000; @%p6 bra $L__BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, 
%r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r48, %r6, %r34; add.s32 %r49, %r48, %r34; rem.s32 %r105, %r49, %r34; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f267, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f266, %f267; mov.f32 %f265, %f267; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f265, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f266, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f267, [%rd26]; $L__BB0_7: mul.f32 %f148, %f265, %f265; fma.rn.f32 %f149, %f266, %f266, %f148; fma.rn.f32 %f16, %f267, %f267, %f149; setp.eq.f32 %p11, %f16, 0f00000000; mov.u16 %rs41, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs41, [%rd28]; $L__BB0_9: min.u16 %rs17, %rs41, %rs1; cvt.u32.u16 %r50, %rs17; max.u16 %rs18, %rs41, %rs1; cvt.u32.u16 %r51, %rs18; add.s32 %r52, %r51, 1; mul.lo.s32 %r53, %r52, %r51; shr.u32 %r54, %r53, 1; add.s32 %r55, %r54, %r50; mul.wide.s32 %rd29, %r55, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f17, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f18, [%rd31]; setp.ne.s16 %p12, %rs14, 0; mov.f32 %f277, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f278, %f277; mov.f32 %f279, %f277; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f16, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f153, %f18, 0f3F000000; div.rn.f32 %f154, %f153, %f17; mul.f32 %f155, %f154, %f137; fma.rn.f32 %f265, %f6, %f155, %f1; mul.f32 %f156, %f1, %f155; sub.f32 %f267, %f6, %f156; mov.f32 %f266, %f5; $L__BB0_12: mul.f32 %f157, %f137, %f137; add.f32 %f158, %f17, %f17; div.rn.f32 %f159, %f158, %f157; sub.f32 %f160, %f265, %f1; sub.f32 %f161, %f266, %f5; sub.f32 %f162, %f267, %f6; fma.rn.f32 %f163, %f159, %f160, 0f00000000; fma.rn.f32 %f278, %f159, %f161, 0f00000000; fma.rn.f32 %f164, %f159, %f162, 0f00000000; div.rn.f32 %f165, %f18, %f137; mul.f32 %f166, %f165, %f267; sub.f32 %f277, %f163, %f166; fma.rn.f32 %f279, %f165, %f265, 
%f164; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r58, %r34, -1; min.s32 %r106, %r11, %r58; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r56, %r11, %r34; add.s32 %r57, %r56, %r34; rem.s32 %r106, %r57, %r34; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r34; mov.f32 %f276, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f275, %f276; mov.f32 %f274, %f276; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f274, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f275, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f276, [%rd35]; $L__BB0_18: mul.f32 %f170, %f274, %f274; fma.rn.f32 %f171, %f275, %f275, %f170; fma.rn.f32 %f44, %f276, %f276, %f171; setp.eq.f32 %p20, %f44, 0f00000000; mov.u16 %rs42, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs42, [%rd37]; $L__BB0_20: min.u16 %rs21, %rs42, %rs1; cvt.u32.u16 %r59, %rs21; max.u16 %rs22, %rs42, %rs1; cvt.u32.u16 %r60, %rs22; add.s32 %r61, %r60, 1; mul.lo.s32 %r62, %r61, %r60; shr.u32 %r63, %r62, 1; add.s32 %r64, %r63, %r59; mul.wide.s32 %rd38, %r64, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f45, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f46, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f44, 0f00000000; @%p24 bra $L__BB0_23; mul.f32 %f172, %f46, 0f3F000000; div.rn.f32 %f173, %f172, %f45; mul.f32 %f174, %f173, %f137; mul.f32 %f175, %f6, %f174; sub.f32 %f274, %f1, %f175; fma.rn.f32 %f276, %f1, %f174, %f6; mov.f32 %f275, %f5; $L__BB0_23: mul.f32 %f176, %f137, %f137; add.f32 %f177, %f45, %f45; div.rn.f32 %f178, %f177, %f176; sub.f32 %f179, %f274, %f1; sub.f32 %f180, %f275, %f5; sub.f32 %f181, %f276, %f6; fma.rn.f32 %f182, %f178, %f179, %f277; fma.rn.f32 %f278, %f178, %f180, %f278; fma.rn.f32 %f183, %f178, %f181, %f279; div.rn.f32 %f184, %f46, %f137; fma.rn.f32 %f277, %f184, %f276, %f182; mul.f32 %f185, 
%f184, %f274; sub.f32 %f279, %f183, %f185; $L__BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r65, %r16, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r107, %r66, %r35; $L__BB0_27: add.s32 %r67, %r107, %r4; mad.lo.s32 %r20, %r67, %r34, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f285, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f284, %f285; mov.f32 %f283, %f285; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f283, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f284, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f285, [%rd44]; $L__BB0_29: mul.f32 %f189, %f283, %f283; fma.rn.f32 %f190, %f284, %f284, %f189; fma.rn.f32 %f72, %f285, %f285, %f190; setp.eq.f32 %p29, %f72, 0f00000000; mov.u16 %rs43, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs43, [%rd46]; $L__BB0_31: min.u16 %rs25, %rs43, %rs1; cvt.u32.u16 %r68, %rs25; max.u16 %rs26, %rs43, %rs1; cvt.u32.u16 %r69, %rs26; add.s32 %r70, %r69, 1; mul.lo.s32 %r71, %r70, %r69; shr.u32 %r72, %r71, 1; add.s32 %r73, %r72, %r68; mul.wide.s32 %rd47, %r73, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f73, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f74, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f72, 0f00000000; @%p33 bra $L__BB0_34; mul.f32 %f191, %f74, 0f3F000000; div.rn.f32 %f192, %f191, %f73; mul.f32 %f193, %f192, %f138; fma.rn.f32 %f284, %f6, %f193, %f5; mul.f32 %f194, %f5, %f193; sub.f32 %f285, %f6, %f194; mov.f32 %f283, %f1; $L__BB0_34: mul.f32 %f195, %f138, %f138; add.f32 %f196, %f73, %f73; div.rn.f32 %f197, %f196, %f195; sub.f32 %f198, %f283, %f1; sub.f32 %f199, %f284, %f5; sub.f32 %f200, %f285, %f6; fma.rn.f32 %f277, %f197, %f198, %f277; fma.rn.f32 %f201, %f197, %f199, %f278; fma.rn.f32 %f202, %f197, %f200, 
%f279; div.rn.f32 %f203, %f74, %f138; mul.f32 %f204, %f203, %f285; sub.f32 %f278, %f201, %f204; fma.rn.f32 %f279, %f203, %f284, %f202; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r76, %r35, -1; min.s32 %r108, %r21, %r76; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r74, %r21, %r35; add.s32 %r75, %r74, %r35; rem.s32 %r108, %r75, %r35; $L__BB0_38: add.s32 %r77, %r108, %r4; mad.lo.s32 %r25, %r77, %r34, %r1; setp.ge.s32 %p35, %r21, %r35; mov.f32 %f294, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f293, %f294; mov.f32 %f292, %f294; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f292, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f293, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f294, [%rd53]; $L__BB0_40: mul.f32 %f208, %f292, %f292; fma.rn.f32 %f209, %f293, %f293, %f208; fma.rn.f32 %f100, %f294, %f294, %f209; setp.eq.f32 %p38, %f100, 0f00000000; mov.u16 %rs44, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs44, [%rd55]; $L__BB0_42: min.u16 %rs29, %rs44, %rs1; cvt.u32.u16 %r78, %rs29; max.u16 %rs30, %rs44, %rs1; cvt.u32.u16 %r79, %rs30; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd56, %r83, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f101, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f102, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f100, 0f00000000; @%p42 bra $L__BB0_45; mul.f32 %f210, %f102, 0f3F000000; div.rn.f32 %f211, %f210, %f101; mul.f32 %f212, %f211, %f138; mul.f32 %f213, %f6, %f212; sub.f32 %f293, %f5, %f213; fma.rn.f32 %f294, %f5, %f212, %f6; mov.f32 %f292, %f1; $L__BB0_45: mul.f32 %f214, %f138, %f138; add.f32 %f215, %f101, %f101; div.rn.f32 %f216, %f215, %f214; sub.f32 %f217, %f292, %f1; sub.f32 %f218, %f293, %f5; sub.f32 %f219, %f294, %f6; fma.rn.f32 %f277, %f216, 
%f217, %f277; fma.rn.f32 %f220, %f216, %f218, %f278; fma.rn.f32 %f221, %f216, %f219, %f279; div.rn.f32 %f222, %f102, %f138; fma.rn.f32 %f278, %f222, %f294, %f220; mul.f32 %f223, %f222, %f293; sub.f32 %f279, %f221, %f223; $L__BB0_46: setp.eq.s32 %p43, %r36, 1; @%p43 bra $L__BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p44, %rs12, 0; add.s32 %r26, %r3, -1; @%p44 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r84, %r26, %r36; add.s32 %r85, %r84, %r36; rem.s32 %r109, %r85, %r36; $L__BB0_50: mad.lo.s32 %r86, %r109, %r35, %r2; mad.lo.s32 %r87, %r86, %r34, %r1; cvt.s64.s32 %rd59, %r87; mul.wide.s32 %rd60, %r87, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f224, [%rd63]; ld.global.nc.f32 %f225, [%rd61]; ld.global.nc.f32 %f226, [%rd62]; mul.f32 %f227, %f226, %f226; fma.rn.f32 %f228, %f225, %f225, %f227; fma.rn.f32 %f229, %f224, %f224, %f228; setp.eq.f32 %p45, %f229, 0f00000000; selp.f32 %f230, %f6, %f224, %p45; selp.f32 %f231, %f5, %f226, %p45; selp.f32 %f232, %f1, %f225, %p45; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs31, [%rd64]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r88, %rs35; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs34; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd65, %r93, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f233, [%rd66]; add.f32 %f234, %f233, %f233; mul.f32 %f122, %f139, %f139; div.rn.f32 %f235, %f234, %f122; sub.f32 %f236, %f232, %f1; sub.f32 %f237, %f231, %f5; sub.f32 %f238, %f230, %f6; fma.rn.f32 %f123, %f236, %f235, %f277; fma.rn.f32 %f124, %f237, %f235, %f278; fma.rn.f32 %f125, %f238, %f235, %f279; add.s32 %r30, %r3, 1; @%p44 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r96, %r36, -1; min.s32 %r110, %r30, %r96; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r94, %r30, %r36; add.s32 %r95, %r94, %r36; rem.s32 %r110, %r95, %r36; 
$L__BB0_53: mad.lo.s32 %r97, %r110, %r35, %r2; mad.lo.s32 %r98, %r97, %r34, %r1; cvt.s64.s32 %rd67, %r98; mul.wide.s32 %rd68, %r98, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f239, [%rd71]; ld.global.nc.f32 %f240, [%rd69]; ld.global.nc.f32 %f241, [%rd70]; mul.f32 %f242, %f241, %f241; fma.rn.f32 %f243, %f240, %f240, %f242; fma.rn.f32 %f244, %f239, %f239, %f243; setp.eq.f32 %p47, %f244, 0f00000000; selp.f32 %f245, %f6, %f239, %p47; selp.f32 %f246, %f5, %f241, %p47; selp.f32 %f247, %f1, %f240, %p47; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs36, [%rd72]; min.u16 %rs39, %rs36, %rs1; max.u16 %rs40, %rs36, %rs1; cvt.u32.u16 %r99, %rs40; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs39; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd73, %r104, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f248, [%rd74]; add.f32 %f249, %f248, %f248; div.rn.f32 %f250, %f249, %f122; sub.f32 %f251, %f247, %f1; sub.f32 %f252, %f246, %f5; sub.f32 %f253, %f245, %f6; fma.rn.f32 %f277, %f251, %f250, %f123; fma.rn.f32 %f278, %f252, %f250, %f124; fma.rn.f32 %f279, %f253, %f250, %f125; $L__BB0_54: setp.eq.s64 %p48, %rd11, 0; @%p48 bra $L__BB0_56; cvta.to.global.u64 %rd75, %rd11; shl.b64 %rd76, %rd7, 2; add.s64 %rd77, %rd75, %rd76; ld.global.nc.f32 %f254, [%rd77]; mul.f32 %f301, %f254, %f301; $L__BB0_56: setp.eq.f32 %p49, %f301, 0f00000000; mov.f32 %f302, 0f00000000; @%p49 bra $L__BB0_58; rcp.rn.f32 %f302, %f301; $L__BB0_58: cvta.to.global.u64 %rd78, %rd8; shl.b64 %rd79, %rd7, 2; add.s64 %rd80, %rd78, %rd79; ld.global.f32 %f256, [%rd80]; fma.rn.f32 %f257, %f277, %f302, %f256; st.global.f32 [%rd80], %f257; cvta.to.global.u64 %rd81, %rd9; add.s64 %rd82, %rd81, %rd79; ld.global.f32 %f258, [%rd82]; fma.rn.f32 %f259, %f278, %f302, %f258; st.global.f32 [%rd82], %f259; cvta.to.global.u64 %rd83, %rd10; add.s64 %rd84, %rd83, %rd79; ld.global.f32 %f260, [%rd84]; fma.rn.f32 %f261, %f279, 
%f302, %f260; st.global.f32 [%rd84], %f261; $L__BB0_59: ret; } ` adddmi_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<50>; .reg .b16 %rs<45>; .reg .f32 %f<303>; .reg .b32 %r<111>; .reg .b64 %rd<85>; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; ld.param.u64 %rd8, [adddmi_param_0]; ld.param.u64 %rd9, [adddmi_param_1]; ld.param.u64 %rd10, [adddmi_param_2]; ld.param.u64 %rd12, [adddmi_param_3]; ld.param.u64 %rd13, [adddmi_param_4]; ld.param.u64 %rd14, [adddmi_param_5]; ld.param.u64 %rd11, [adddmi_param_6]; ld.param.f32 %f301, [adddmi_param_7]; ld.param.u64 %rd15, [adddmi_param_8]; ld.param.u64 %rd16, [adddmi_param_9]; ld.param.u64 %rd17, [adddmi_param_10]; ld.param.f32 %f137, [adddmi_param_11]; ld.param.f32 %f138, [adddmi_param_12]; ld.param.f32 %f139, [adddmi_param_13]; ld.param.u32 %r34, [adddmi_param_14]; ld.param.u32 %r35, [adddmi_param_15]; ld.param.u32 %r36, [adddmi_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r37, %ntid.x; mov.u32 %r38, %ctaid.x; mov.u32 %r39, %tid.x; mad.lo.s32 %r1, %r38, %r37, %r39; mov.u32 %r40, %ntid.y; mov.u32 %r41, %ctaid.y; mov.u32 %r42, %tid.y; mad.lo.s32 %r2, %r41, %r40, %r42; mov.u32 %r43, %ntid.z; mov.u32 %r44, %ctaid.z; mov.u32 %r45, %tid.z; 
mad.lo.s32 %r3, %r44, %r43, %r45; setp.ge.s32 %p1, %r1, %r34; setp.ge.s32 %p2, %r2, %r35; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r36; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_59; mul.lo.s32 %r4, %r3, %r35; add.s32 %r46, %r4, %r2; mul.lo.s32 %r5, %r46, %r34; add.s32 %r47, %r5, %r1; cvt.s64.s32 %rd7, %r47; mul.wide.s32 %rd18, %r47, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; ld.global.nc.f32 %f6, [%rd21]; mul.f32 %f142, %f5, %f5; fma.rn.f32 %f143, %f1, %f1, %f142; fma.rn.f32 %f144, %f6, %f6, %f143; setp.eq.f32 %p6, %f144, 0f00000000; @%p6 bra $L__BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r48, %r6, %r34; add.s32 %r49, %r48, %r34; rem.s32 %r105, %r49, %r34; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f267, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f266, %f267; mov.f32 %f265, %f267; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f265, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f266, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f267, [%rd26]; $L__BB0_7: mul.f32 %f148, %f265, %f265; fma.rn.f32 %f149, %f266, %f266, %f148; fma.rn.f32 %f16, %f267, %f267, %f149; setp.eq.f32 %p11, %f16, 0f00000000; mov.u16 %rs41, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs41, [%rd28]; $L__BB0_9: min.u16 %rs17, %rs41, %rs1; cvt.u32.u16 %r50, %rs17; max.u16 %rs18, %rs41, %rs1; cvt.u32.u16 %r51, %rs18; add.s32 %r52, %r51, 1; mul.lo.s32 %r53, %r52, %r51; shr.u32 %r54, %r53, 1; add.s32 %r55, %r54, %r50; mul.wide.s32 %rd29, %r55, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f17, [%rd30]; add.s64 %rd31, %rd1, %rd29; 
ld.global.nc.f32 %f18, [%rd31]; setp.ne.s16 %p12, %rs14, 0; mov.f32 %f277, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f278, %f277; mov.f32 %f279, %f277; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f16, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f153, %f18, 0f3F000000; div.rn.f32 %f154, %f153, %f17; mul.f32 %f155, %f154, %f137; fma.rn.f32 %f265, %f6, %f155, %f1; mul.f32 %f156, %f1, %f155; sub.f32 %f267, %f6, %f156; mov.f32 %f266, %f5; $L__BB0_12: mul.f32 %f157, %f137, %f137; add.f32 %f158, %f17, %f17; div.rn.f32 %f159, %f158, %f157; sub.f32 %f160, %f265, %f1; sub.f32 %f161, %f266, %f5; sub.f32 %f162, %f267, %f6; fma.rn.f32 %f163, %f159, %f160, 0f00000000; fma.rn.f32 %f278, %f159, %f161, 0f00000000; fma.rn.f32 %f164, %f159, %f162, 0f00000000; div.rn.f32 %f165, %f18, %f137; mul.f32 %f166, %f165, %f267; sub.f32 %f277, %f163, %f166; fma.rn.f32 %f279, %f165, %f265, %f164; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r58, %r34, -1; min.s32 %r106, %r11, %r58; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r56, %r11, %r34; add.s32 %r57, %r56, %r34; rem.s32 %r106, %r57, %r34; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r34; mov.f32 %f276, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f275, %f276; mov.f32 %f274, %f276; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f274, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f275, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f276, [%rd35]; $L__BB0_18: mul.f32 %f170, %f274, %f274; fma.rn.f32 %f171, %f275, %f275, %f170; fma.rn.f32 %f44, %f276, %f276, %f171; setp.eq.f32 %p20, %f44, 0f00000000; mov.u16 %rs42, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs42, [%rd37]; $L__BB0_20: min.u16 %rs21, %rs42, %rs1; cvt.u32.u16 %r59, %rs21; max.u16 %rs22, %rs42, %rs1; cvt.u32.u16 %r60, %rs22; add.s32 %r61, %r60, 1; mul.lo.s32 %r62, %r61, %r60; shr.u32 %r63, 
%r62, 1; add.s32 %r64, %r63, %r59; mul.wide.s32 %rd38, %r64, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f45, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f46, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f44, 0f00000000; @%p24 bra $L__BB0_23; mul.f32 %f172, %f46, 0f3F000000; div.rn.f32 %f173, %f172, %f45; mul.f32 %f174, %f173, %f137; mul.f32 %f175, %f6, %f174; sub.f32 %f274, %f1, %f175; fma.rn.f32 %f276, %f1, %f174, %f6; mov.f32 %f275, %f5; $L__BB0_23: mul.f32 %f176, %f137, %f137; add.f32 %f177, %f45, %f45; div.rn.f32 %f178, %f177, %f176; sub.f32 %f179, %f274, %f1; sub.f32 %f180, %f275, %f5; sub.f32 %f181, %f276, %f6; fma.rn.f32 %f182, %f178, %f179, %f277; fma.rn.f32 %f278, %f178, %f180, %f278; fma.rn.f32 %f183, %f178, %f181, %f279; div.rn.f32 %f184, %f46, %f137; fma.rn.f32 %f277, %f184, %f276, %f182; mul.f32 %f185, %f184, %f274; sub.f32 %f279, %f183, %f185; $L__BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r65, %r16, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r107, %r66, %r35; $L__BB0_27: add.s32 %r67, %r107, %r4; mad.lo.s32 %r20, %r67, %r34, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f285, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f284, %f285; mov.f32 %f283, %f285; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f283, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f284, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f285, [%rd44]; $L__BB0_29: mul.f32 %f189, %f283, %f283; fma.rn.f32 %f190, %f284, %f284, %f189; fma.rn.f32 %f72, %f285, %f285, %f190; setp.eq.f32 %p29, %f72, 0f00000000; mov.u16 %rs43, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs43, [%rd46]; $L__BB0_31: min.u16 %rs25, %rs43, %rs1; cvt.u32.u16 %r68, %rs25; max.u16 %rs26, %rs43, %rs1; 
cvt.u32.u16 %r69, %rs26; add.s32 %r70, %r69, 1; mul.lo.s32 %r71, %r70, %r69; shr.u32 %r72, %r71, 1; add.s32 %r73, %r72, %r68; mul.wide.s32 %rd47, %r73, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f73, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f74, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f72, 0f00000000; @%p33 bra $L__BB0_34; mul.f32 %f191, %f74, 0f3F000000; div.rn.f32 %f192, %f191, %f73; mul.f32 %f193, %f192, %f138; fma.rn.f32 %f284, %f6, %f193, %f5; mul.f32 %f194, %f5, %f193; sub.f32 %f285, %f6, %f194; mov.f32 %f283, %f1; $L__BB0_34: mul.f32 %f195, %f138, %f138; add.f32 %f196, %f73, %f73; div.rn.f32 %f197, %f196, %f195; sub.f32 %f198, %f283, %f1; sub.f32 %f199, %f284, %f5; sub.f32 %f200, %f285, %f6; fma.rn.f32 %f277, %f197, %f198, %f277; fma.rn.f32 %f201, %f197, %f199, %f278; fma.rn.f32 %f202, %f197, %f200, %f279; div.rn.f32 %f203, %f74, %f138; mul.f32 %f204, %f203, %f285; sub.f32 %f278, %f201, %f204; fma.rn.f32 %f279, %f203, %f284, %f202; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r76, %r35, -1; min.s32 %r108, %r21, %r76; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r74, %r21, %r35; add.s32 %r75, %r74, %r35; rem.s32 %r108, %r75, %r35; $L__BB0_38: add.s32 %r77, %r108, %r4; mad.lo.s32 %r25, %r77, %r34, %r1; setp.ge.s32 %p35, %r21, %r35; mov.f32 %f294, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f293, %f294; mov.f32 %f292, %f294; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f292, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f293, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f294, [%rd53]; $L__BB0_40: mul.f32 %f208, %f292, %f292; fma.rn.f32 %f209, %f293, %f293, %f208; fma.rn.f32 %f100, %f294, %f294, %f209; setp.eq.f32 %p38, %f100, 0f00000000; mov.u16 %rs44, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs44, [%rd55]; $L__BB0_42: min.u16 
%rs29, %rs44, %rs1; cvt.u32.u16 %r78, %rs29; max.u16 %rs30, %rs44, %rs1; cvt.u32.u16 %r79, %rs30; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd56, %r83, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f101, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f102, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f100, 0f00000000; @%p42 bra $L__BB0_45; mul.f32 %f210, %f102, 0f3F000000; div.rn.f32 %f211, %f210, %f101; mul.f32 %f212, %f211, %f138; mul.f32 %f213, %f6, %f212; sub.f32 %f293, %f5, %f213; fma.rn.f32 %f294, %f5, %f212, %f6; mov.f32 %f292, %f1; $L__BB0_45: mul.f32 %f214, %f138, %f138; add.f32 %f215, %f101, %f101; div.rn.f32 %f216, %f215, %f214; sub.f32 %f217, %f292, %f1; sub.f32 %f218, %f293, %f5; sub.f32 %f219, %f294, %f6; fma.rn.f32 %f277, %f216, %f217, %f277; fma.rn.f32 %f220, %f216, %f218, %f278; fma.rn.f32 %f221, %f216, %f219, %f279; div.rn.f32 %f222, %f102, %f138; fma.rn.f32 %f278, %f222, %f294, %f220; mul.f32 %f223, %f222, %f293; sub.f32 %f279, %f221, %f223; $L__BB0_46: setp.eq.s32 %p43, %r36, 1; @%p43 bra $L__BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p44, %rs12, 0; add.s32 %r26, %r3, -1; @%p44 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r84, %r26, %r36; add.s32 %r85, %r84, %r36; rem.s32 %r109, %r85, %r36; $L__BB0_50: mad.lo.s32 %r86, %r109, %r35, %r2; mad.lo.s32 %r87, %r86, %r34, %r1; cvt.s64.s32 %rd59, %r87; mul.wide.s32 %rd60, %r87, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f224, [%rd63]; ld.global.nc.f32 %f225, [%rd61]; ld.global.nc.f32 %f226, [%rd62]; mul.f32 %f227, %f226, %f226; fma.rn.f32 %f228, %f225, %f225, %f227; fma.rn.f32 %f229, %f224, %f224, %f228; setp.eq.f32 %p45, %f229, 0f00000000; selp.f32 %f230, %f6, %f224, %p45; selp.f32 %f231, %f5, %f226, %p45; selp.f32 %f232, %f1, %f225, %p45; add.s64 %rd64, 
%rd3, %rd59; ld.global.nc.u8 %rs31, [%rd64]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r88, %rs35; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs34; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd65, %r93, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f233, [%rd66]; add.f32 %f234, %f233, %f233; mul.f32 %f122, %f139, %f139; div.rn.f32 %f235, %f234, %f122; sub.f32 %f236, %f232, %f1; sub.f32 %f237, %f231, %f5; sub.f32 %f238, %f230, %f6; fma.rn.f32 %f123, %f236, %f235, %f277; fma.rn.f32 %f124, %f237, %f235, %f278; fma.rn.f32 %f125, %f238, %f235, %f279; add.s32 %r30, %r3, 1; @%p44 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r96, %r36, -1; min.s32 %r110, %r30, %r96; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r94, %r30, %r36; add.s32 %r95, %r94, %r36; rem.s32 %r110, %r95, %r36; $L__BB0_53: mad.lo.s32 %r97, %r110, %r35, %r2; mad.lo.s32 %r98, %r97, %r34, %r1; cvt.s64.s32 %rd67, %r98; mul.wide.s32 %rd68, %r98, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f239, [%rd71]; ld.global.nc.f32 %f240, [%rd69]; ld.global.nc.f32 %f241, [%rd70]; mul.f32 %f242, %f241, %f241; fma.rn.f32 %f243, %f240, %f240, %f242; fma.rn.f32 %f244, %f239, %f239, %f243; setp.eq.f32 %p47, %f244, 0f00000000; selp.f32 %f245, %f6, %f239, %p47; selp.f32 %f246, %f5, %f241, %p47; selp.f32 %f247, %f1, %f240, %p47; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs36, [%rd72]; min.u16 %rs39, %rs36, %rs1; max.u16 %rs40, %rs36, %rs1; cvt.u32.u16 %r99, %rs40; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs39; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd73, %r104, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f248, [%rd74]; add.f32 %f249, %f248, %f248; div.rn.f32 %f250, %f249, %f122; sub.f32 %f251, %f247, %f1; sub.f32 %f252, %f246, %f5; sub.f32 %f253, %f245, %f6; fma.rn.f32 %f277, %f251, %f250, %f123; fma.rn.f32 %f278, 
%f252, %f250, %f124; fma.rn.f32 %f279, %f253, %f250, %f125; $L__BB0_54: setp.eq.s64 %p48, %rd11, 0; @%p48 bra $L__BB0_56; cvta.to.global.u64 %rd75, %rd11; shl.b64 %rd76, %rd7, 2; add.s64 %rd77, %rd75, %rd76; ld.global.nc.f32 %f254, [%rd77]; mul.f32 %f301, %f254, %f301; $L__BB0_56: setp.eq.f32 %p49, %f301, 0f00000000; mov.f32 %f302, 0f00000000; @%p49 bra $L__BB0_58; rcp.rn.f32 %f302, %f301; $L__BB0_58: cvta.to.global.u64 %rd78, %rd8; shl.b64 %rd79, %rd7, 2; add.s64 %rd80, %rd78, %rd79; ld.global.f32 %f256, [%rd80]; fma.rn.f32 %f257, %f277, %f302, %f256; st.global.f32 [%rd80], %f257; cvta.to.global.u64 %rd81, %rd9; add.s64 %rd82, %rd81, %rd79; ld.global.f32 %f258, [%rd82]; fma.rn.f32 %f259, %f278, %f302, %f258; st.global.f32 [%rd82], %f259; cvta.to.global.u64 %rd83, %rd10; add.s64 %rd84, %rd83, %rd79; ld.global.f32 %f260, [%rd84]; fma.rn.f32 %f261, %f279, %f302, %f260; st.global.f32 [%rd84], %f261; $L__BB0_59: ret; } ` adddmi_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<50>; .reg .b16 %rs<45>; .reg .f32 %f<303>; .reg .b32 %r<111>; .reg .b64 %rd<85>; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; ld.param.u64 %rd8, [adddmi_param_0]; ld.param.u64 %rd9, [adddmi_param_1]; ld.param.u64 %rd10, [adddmi_param_2]; ld.param.u64 %rd12, [adddmi_param_3]; ld.param.u64 %rd13, [adddmi_param_4]; ld.param.u64 %rd14, [adddmi_param_5]; ld.param.u64 %rd11, 
[adddmi_param_6]; ld.param.f32 %f301, [adddmi_param_7]; ld.param.u64 %rd15, [adddmi_param_8]; ld.param.u64 %rd16, [adddmi_param_9]; ld.param.u64 %rd17, [adddmi_param_10]; ld.param.f32 %f137, [adddmi_param_11]; ld.param.f32 %f138, [adddmi_param_12]; ld.param.f32 %f139, [adddmi_param_13]; ld.param.u32 %r34, [adddmi_param_14]; ld.param.u32 %r35, [adddmi_param_15]; ld.param.u32 %r36, [adddmi_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r37, %ntid.x; mov.u32 %r38, %ctaid.x; mov.u32 %r39, %tid.x; mad.lo.s32 %r1, %r38, %r37, %r39; mov.u32 %r40, %ntid.y; mov.u32 %r41, %ctaid.y; mov.u32 %r42, %tid.y; mad.lo.s32 %r2, %r41, %r40, %r42; mov.u32 %r43, %ntid.z; mov.u32 %r44, %ctaid.z; mov.u32 %r45, %tid.z; mad.lo.s32 %r3, %r44, %r43, %r45; setp.ge.s32 %p1, %r1, %r34; setp.ge.s32 %p2, %r2, %r35; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r36; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_59; mul.lo.s32 %r4, %r3, %r35; add.s32 %r46, %r4, %r2; mul.lo.s32 %r5, %r46, %r34; add.s32 %r47, %r5, %r1; cvt.s64.s32 %rd7, %r47; mul.wide.s32 %rd18, %r47, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; ld.global.nc.f32 %f6, [%rd21]; mul.f32 %f142, %f5, %f5; fma.rn.f32 %f143, %f1, %f1, %f142; fma.rn.f32 %f144, %f6, %f6, %f143; setp.eq.f32 %p6, %f144, 0f00000000; @%p6 bra $L__BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r48, %r6, %r34; add.s32 %r49, %r48, %r34; rem.s32 %r105, %r49, %r34; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f267, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f266, %f267; mov.f32 
%f265, %f267; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f265, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f266, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f267, [%rd26]; $L__BB0_7: mul.f32 %f148, %f265, %f265; fma.rn.f32 %f149, %f266, %f266, %f148; fma.rn.f32 %f16, %f267, %f267, %f149; setp.eq.f32 %p11, %f16, 0f00000000; mov.u16 %rs41, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs41, [%rd28]; $L__BB0_9: min.u16 %rs17, %rs41, %rs1; cvt.u32.u16 %r50, %rs17; max.u16 %rs18, %rs41, %rs1; cvt.u32.u16 %r51, %rs18; add.s32 %r52, %r51, 1; mul.lo.s32 %r53, %r52, %r51; shr.u32 %r54, %r53, 1; add.s32 %r55, %r54, %r50; mul.wide.s32 %rd29, %r55, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f17, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f18, [%rd31]; setp.ne.s16 %p12, %rs14, 0; mov.f32 %f277, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f278, %f277; mov.f32 %f279, %f277; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f16, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f153, %f18, 0f3F000000; div.rn.f32 %f154, %f153, %f17; mul.f32 %f155, %f154, %f137; fma.rn.f32 %f265, %f6, %f155, %f1; mul.f32 %f156, %f1, %f155; sub.f32 %f267, %f6, %f156; mov.f32 %f266, %f5; $L__BB0_12: mul.f32 %f157, %f137, %f137; add.f32 %f158, %f17, %f17; div.rn.f32 %f159, %f158, %f157; sub.f32 %f160, %f265, %f1; sub.f32 %f161, %f266, %f5; sub.f32 %f162, %f267, %f6; fma.rn.f32 %f163, %f159, %f160, 0f00000000; fma.rn.f32 %f278, %f159, %f161, 0f00000000; fma.rn.f32 %f164, %f159, %f162, 0f00000000; div.rn.f32 %f165, %f18, %f137; mul.f32 %f166, %f165, %f267; sub.f32 %f277, %f163, %f166; fma.rn.f32 %f279, %f165, %f265, %f164; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r58, %r34, -1; min.s32 %r106, %r11, %r58; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r56, %r11, %r34; add.s32 %r57, %r56, %r34; rem.s32 %r106, %r57, %r34; 
$L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r34; mov.f32 %f276, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f275, %f276; mov.f32 %f274, %f276; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f274, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f275, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f276, [%rd35]; $L__BB0_18: mul.f32 %f170, %f274, %f274; fma.rn.f32 %f171, %f275, %f275, %f170; fma.rn.f32 %f44, %f276, %f276, %f171; setp.eq.f32 %p20, %f44, 0f00000000; mov.u16 %rs42, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs42, [%rd37]; $L__BB0_20: min.u16 %rs21, %rs42, %rs1; cvt.u32.u16 %r59, %rs21; max.u16 %rs22, %rs42, %rs1; cvt.u32.u16 %r60, %rs22; add.s32 %r61, %r60, 1; mul.lo.s32 %r62, %r61, %r60; shr.u32 %r63, %r62, 1; add.s32 %r64, %r63, %r59; mul.wide.s32 %rd38, %r64, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f45, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f46, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f44, 0f00000000; @%p24 bra $L__BB0_23; mul.f32 %f172, %f46, 0f3F000000; div.rn.f32 %f173, %f172, %f45; mul.f32 %f174, %f173, %f137; mul.f32 %f175, %f6, %f174; sub.f32 %f274, %f1, %f175; fma.rn.f32 %f276, %f1, %f174, %f6; mov.f32 %f275, %f5; $L__BB0_23: mul.f32 %f176, %f137, %f137; add.f32 %f177, %f45, %f45; div.rn.f32 %f178, %f177, %f176; sub.f32 %f179, %f274, %f1; sub.f32 %f180, %f275, %f5; sub.f32 %f181, %f276, %f6; fma.rn.f32 %f182, %f178, %f179, %f277; fma.rn.f32 %f278, %f178, %f180, %f278; fma.rn.f32 %f183, %f178, %f181, %f279; div.rn.f32 %f184, %f46, %f137; fma.rn.f32 %f277, %f184, %f276, %f182; mul.f32 %f185, %f184, %f274; sub.f32 %f279, %f183, %f185; $L__BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r65, 
%r16, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r107, %r66, %r35; $L__BB0_27: add.s32 %r67, %r107, %r4; mad.lo.s32 %r20, %r67, %r34, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f285, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f284, %f285; mov.f32 %f283, %f285; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f283, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f284, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f285, [%rd44]; $L__BB0_29: mul.f32 %f189, %f283, %f283; fma.rn.f32 %f190, %f284, %f284, %f189; fma.rn.f32 %f72, %f285, %f285, %f190; setp.eq.f32 %p29, %f72, 0f00000000; mov.u16 %rs43, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs43, [%rd46]; $L__BB0_31: min.u16 %rs25, %rs43, %rs1; cvt.u32.u16 %r68, %rs25; max.u16 %rs26, %rs43, %rs1; cvt.u32.u16 %r69, %rs26; add.s32 %r70, %r69, 1; mul.lo.s32 %r71, %r70, %r69; shr.u32 %r72, %r71, 1; add.s32 %r73, %r72, %r68; mul.wide.s32 %rd47, %r73, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f73, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f74, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f72, 0f00000000; @%p33 bra $L__BB0_34; mul.f32 %f191, %f74, 0f3F000000; div.rn.f32 %f192, %f191, %f73; mul.f32 %f193, %f192, %f138; fma.rn.f32 %f284, %f6, %f193, %f5; mul.f32 %f194, %f5, %f193; sub.f32 %f285, %f6, %f194; mov.f32 %f283, %f1; $L__BB0_34: mul.f32 %f195, %f138, %f138; add.f32 %f196, %f73, %f73; div.rn.f32 %f197, %f196, %f195; sub.f32 %f198, %f283, %f1; sub.f32 %f199, %f284, %f5; sub.f32 %f200, %f285, %f6; fma.rn.f32 %f277, %f197, %f198, %f277; fma.rn.f32 %f201, %f197, %f199, %f278; fma.rn.f32 %f202, %f197, %f200, %f279; div.rn.f32 %f203, %f74, %f138; mul.f32 %f204, %f203, %f285; sub.f32 %f278, %f201, %f204; fma.rn.f32 %f279, %f203, %f284, %f202; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r76, %r35, -1; 
min.s32 %r108, %r21, %r76; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r74, %r21, %r35; add.s32 %r75, %r74, %r35; rem.s32 %r108, %r75, %r35; $L__BB0_38: add.s32 %r77, %r108, %r4; mad.lo.s32 %r25, %r77, %r34, %r1; setp.ge.s32 %p35, %r21, %r35; mov.f32 %f294, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f293, %f294; mov.f32 %f292, %f294; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f292, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f293, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f294, [%rd53]; $L__BB0_40: mul.f32 %f208, %f292, %f292; fma.rn.f32 %f209, %f293, %f293, %f208; fma.rn.f32 %f100, %f294, %f294, %f209; setp.eq.f32 %p38, %f100, 0f00000000; mov.u16 %rs44, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs44, [%rd55]; $L__BB0_42: min.u16 %rs29, %rs44, %rs1; cvt.u32.u16 %r78, %rs29; max.u16 %rs30, %rs44, %rs1; cvt.u32.u16 %r79, %rs30; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd56, %r83, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f101, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f102, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f100, 0f00000000; @%p42 bra $L__BB0_45; mul.f32 %f210, %f102, 0f3F000000; div.rn.f32 %f211, %f210, %f101; mul.f32 %f212, %f211, %f138; mul.f32 %f213, %f6, %f212; sub.f32 %f293, %f5, %f213; fma.rn.f32 %f294, %f5, %f212, %f6; mov.f32 %f292, %f1; $L__BB0_45: mul.f32 %f214, %f138, %f138; add.f32 %f215, %f101, %f101; div.rn.f32 %f216, %f215, %f214; sub.f32 %f217, %f292, %f1; sub.f32 %f218, %f293, %f5; sub.f32 %f219, %f294, %f6; fma.rn.f32 %f277, %f216, %f217, %f277; fma.rn.f32 %f220, %f216, %f218, %f278; fma.rn.f32 %f221, %f216, %f219, %f279; div.rn.f32 %f222, %f102, %f138; fma.rn.f32 %f278, %f222, %f294, %f220; mul.f32 %f223, %f222, %f293; sub.f32 %f279, %f221, %f223; $L__BB0_46: setp.eq.s32 %p43, 
%r36, 1; @%p43 bra $L__BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p44, %rs12, 0; add.s32 %r26, %r3, -1; @%p44 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r84, %r26, %r36; add.s32 %r85, %r84, %r36; rem.s32 %r109, %r85, %r36; $L__BB0_50: mad.lo.s32 %r86, %r109, %r35, %r2; mad.lo.s32 %r87, %r86, %r34, %r1; cvt.s64.s32 %rd59, %r87; mul.wide.s32 %rd60, %r87, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f224, [%rd63]; ld.global.nc.f32 %f225, [%rd61]; ld.global.nc.f32 %f226, [%rd62]; mul.f32 %f227, %f226, %f226; fma.rn.f32 %f228, %f225, %f225, %f227; fma.rn.f32 %f229, %f224, %f224, %f228; setp.eq.f32 %p45, %f229, 0f00000000; selp.f32 %f230, %f6, %f224, %p45; selp.f32 %f231, %f5, %f226, %p45; selp.f32 %f232, %f1, %f225, %p45; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs31, [%rd64]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r88, %rs35; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs34; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd65, %r93, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f233, [%rd66]; add.f32 %f234, %f233, %f233; mul.f32 %f122, %f139, %f139; div.rn.f32 %f235, %f234, %f122; sub.f32 %f236, %f232, %f1; sub.f32 %f237, %f231, %f5; sub.f32 %f238, %f230, %f6; fma.rn.f32 %f123, %f236, %f235, %f277; fma.rn.f32 %f124, %f237, %f235, %f278; fma.rn.f32 %f125, %f238, %f235, %f279; add.s32 %r30, %r3, 1; @%p44 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r96, %r36, -1; min.s32 %r110, %r30, %r96; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r94, %r30, %r36; add.s32 %r95, %r94, %r36; rem.s32 %r110, %r95, %r36; $L__BB0_53: mad.lo.s32 %r97, %r110, %r35, %r2; mad.lo.s32 %r98, %r97, %r34, %r1; cvt.s64.s32 %rd67, %r98; mul.wide.s32 %rd68, %r98, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f239, [%rd71]; 
ld.global.nc.f32 %f240, [%rd69]; ld.global.nc.f32 %f241, [%rd70]; mul.f32 %f242, %f241, %f241; fma.rn.f32 %f243, %f240, %f240, %f242; fma.rn.f32 %f244, %f239, %f239, %f243; setp.eq.f32 %p47, %f244, 0f00000000; selp.f32 %f245, %f6, %f239, %p47; selp.f32 %f246, %f5, %f241, %p47; selp.f32 %f247, %f1, %f240, %p47; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs36, [%rd72]; min.u16 %rs39, %rs36, %rs1; max.u16 %rs40, %rs36, %rs1; cvt.u32.u16 %r99, %rs40; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs39; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd73, %r104, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f248, [%rd74]; add.f32 %f249, %f248, %f248; div.rn.f32 %f250, %f249, %f122; sub.f32 %f251, %f247, %f1; sub.f32 %f252, %f246, %f5; sub.f32 %f253, %f245, %f6; fma.rn.f32 %f277, %f251, %f250, %f123; fma.rn.f32 %f278, %f252, %f250, %f124; fma.rn.f32 %f279, %f253, %f250, %f125; $L__BB0_54: setp.eq.s64 %p48, %rd11, 0; @%p48 bra $L__BB0_56; cvta.to.global.u64 %rd75, %rd11; shl.b64 %rd76, %rd7, 2; add.s64 %rd77, %rd75, %rd76; ld.global.nc.f32 %f254, [%rd77]; mul.f32 %f301, %f254, %f301; $L__BB0_56: setp.eq.f32 %p49, %f301, 0f00000000; mov.f32 %f302, 0f00000000; @%p49 bra $L__BB0_58; rcp.rn.f32 %f302, %f301; $L__BB0_58: cvta.to.global.u64 %rd78, %rd8; shl.b64 %rd79, %rd7, 2; add.s64 %rd80, %rd78, %rd79; ld.global.f32 %f256, [%rd80]; fma.rn.f32 %f257, %f277, %f302, %f256; st.global.f32 [%rd80], %f257; cvta.to.global.u64 %rd81, %rd9; add.s64 %rd82, %rd81, %rd79; ld.global.f32 %f258, [%rd82]; fma.rn.f32 %f259, %f278, %f302, %f258; st.global.f32 [%rd82], %f259; cvta.to.global.u64 %rd83, %rd10; add.s64 %rd84, %rd83, %rd79; ld.global.f32 %f260, [%rd84]; fma.rn.f32 %f261, %f279, %f302, %f260; st.global.f32 [%rd84], %f261; $L__BB0_59: ret; } ` adddmi_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, 
.param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<50>; .reg .b16 %rs<45>; .reg .f32 %f<303>; .reg .b32 %r<111>; .reg .b64 %rd<85>; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; ld.param.u64 %rd8, [adddmi_param_0]; ld.param.u64 %rd9, [adddmi_param_1]; ld.param.u64 %rd10, [adddmi_param_2]; ld.param.u64 %rd12, [adddmi_param_3]; ld.param.u64 %rd13, [adddmi_param_4]; ld.param.u64 %rd14, [adddmi_param_5]; ld.param.u64 %rd11, [adddmi_param_6]; ld.param.f32 %f301, [adddmi_param_7]; ld.param.u64 %rd15, [adddmi_param_8]; ld.param.u64 %rd16, [adddmi_param_9]; ld.param.u64 %rd17, [adddmi_param_10]; ld.param.f32 %f137, [adddmi_param_11]; ld.param.f32 %f138, [adddmi_param_12]; ld.param.f32 %f139, [adddmi_param_13]; ld.param.u32 %r34, [adddmi_param_14]; ld.param.u32 %r35, [adddmi_param_15]; ld.param.u32 %r36, [adddmi_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r37, %ntid.x; mov.u32 %r38, %ctaid.x; mov.u32 %r39, %tid.x; mad.lo.s32 %r1, %r38, %r37, %r39; mov.u32 %r40, %ntid.y; mov.u32 %r41, %ctaid.y; mov.u32 %r42, %tid.y; mad.lo.s32 %r2, %r41, %r40, %r42; mov.u32 %r43, %ntid.z; mov.u32 %r44, %ctaid.z; mov.u32 %r45, %tid.z; mad.lo.s32 %r3, %r44, %r43, %r45; setp.ge.s32 %p1, %r1, %r34; setp.ge.s32 %p2, %r2, %r35; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r36; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_59; mul.lo.s32 %r4, %r3, %r35; add.s32 %r46, %r4, %r2; mul.lo.s32 %r5, 
%r46, %r34; add.s32 %r47, %r5, %r1; cvt.s64.s32 %rd7, %r47; mul.wide.s32 %rd18, %r47, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; ld.global.nc.f32 %f6, [%rd21]; mul.f32 %f142, %f5, %f5; fma.rn.f32 %f143, %f1, %f1, %f142; fma.rn.f32 %f144, %f6, %f6, %f143; setp.eq.f32 %p6, %f144, 0f00000000; @%p6 bra $L__BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r48, %r6, %r34; add.s32 %r49, %r48, %r34; rem.s32 %r105, %r49, %r34; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f267, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f266, %f267; mov.f32 %f265, %f267; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f265, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f266, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f267, [%rd26]; $L__BB0_7: mul.f32 %f148, %f265, %f265; fma.rn.f32 %f149, %f266, %f266, %f148; fma.rn.f32 %f16, %f267, %f267, %f149; setp.eq.f32 %p11, %f16, 0f00000000; mov.u16 %rs41, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs41, [%rd28]; $L__BB0_9: min.u16 %rs17, %rs41, %rs1; cvt.u32.u16 %r50, %rs17; max.u16 %rs18, %rs41, %rs1; cvt.u32.u16 %r51, %rs18; add.s32 %r52, %r51, 1; mul.lo.s32 %r53, %r52, %r51; shr.u32 %r54, %r53, 1; add.s32 %r55, %r54, %r50; mul.wide.s32 %rd29, %r55, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f17, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f18, [%rd31]; setp.ne.s16 %p12, %rs14, 0; mov.f32 %f277, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f278, %f277; mov.f32 %f279, %f277; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f16, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f153, %f18, 
0f3F000000; div.rn.f32 %f154, %f153, %f17; mul.f32 %f155, %f154, %f137; fma.rn.f32 %f265, %f6, %f155, %f1; mul.f32 %f156, %f1, %f155; sub.f32 %f267, %f6, %f156; mov.f32 %f266, %f5; $L__BB0_12: mul.f32 %f157, %f137, %f137; add.f32 %f158, %f17, %f17; div.rn.f32 %f159, %f158, %f157; sub.f32 %f160, %f265, %f1; sub.f32 %f161, %f266, %f5; sub.f32 %f162, %f267, %f6; fma.rn.f32 %f163, %f159, %f160, 0f00000000; fma.rn.f32 %f278, %f159, %f161, 0f00000000; fma.rn.f32 %f164, %f159, %f162, 0f00000000; div.rn.f32 %f165, %f18, %f137; mul.f32 %f166, %f165, %f267; sub.f32 %f277, %f163, %f166; fma.rn.f32 %f279, %f165, %f265, %f164; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r58, %r34, -1; min.s32 %r106, %r11, %r58; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r56, %r11, %r34; add.s32 %r57, %r56, %r34; rem.s32 %r106, %r57, %r34; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r34; mov.f32 %f276, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f275, %f276; mov.f32 %f274, %f276; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f274, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f275, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f276, [%rd35]; $L__BB0_18: mul.f32 %f170, %f274, %f274; fma.rn.f32 %f171, %f275, %f275, %f170; fma.rn.f32 %f44, %f276, %f276, %f171; setp.eq.f32 %p20, %f44, 0f00000000; mov.u16 %rs42, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs42, [%rd37]; $L__BB0_20: min.u16 %rs21, %rs42, %rs1; cvt.u32.u16 %r59, %rs21; max.u16 %rs22, %rs42, %rs1; cvt.u32.u16 %r60, %rs22; add.s32 %r61, %r60, 1; mul.lo.s32 %r62, %r61, %r60; shr.u32 %r63, %r62, 1; add.s32 %r64, %r63, %r59; mul.wide.s32 %rd38, %r64, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f45, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f46, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f44, 
0f00000000; @%p24 bra $L__BB0_23; mul.f32 %f172, %f46, 0f3F000000; div.rn.f32 %f173, %f172, %f45; mul.f32 %f174, %f173, %f137; mul.f32 %f175, %f6, %f174; sub.f32 %f274, %f1, %f175; fma.rn.f32 %f276, %f1, %f174, %f6; mov.f32 %f275, %f5; $L__BB0_23: mul.f32 %f176, %f137, %f137; add.f32 %f177, %f45, %f45; div.rn.f32 %f178, %f177, %f176; sub.f32 %f179, %f274, %f1; sub.f32 %f180, %f275, %f5; sub.f32 %f181, %f276, %f6; fma.rn.f32 %f182, %f178, %f179, %f277; fma.rn.f32 %f278, %f178, %f180, %f278; fma.rn.f32 %f183, %f178, %f181, %f279; div.rn.f32 %f184, %f46, %f137; fma.rn.f32 %f277, %f184, %f276, %f182; mul.f32 %f185, %f184, %f274; sub.f32 %f279, %f183, %f185; $L__BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r65, %r16, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r107, %r66, %r35; $L__BB0_27: add.s32 %r67, %r107, %r4; mad.lo.s32 %r20, %r67, %r34, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f285, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f284, %f285; mov.f32 %f283, %f285; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f283, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f284, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f285, [%rd44]; $L__BB0_29: mul.f32 %f189, %f283, %f283; fma.rn.f32 %f190, %f284, %f284, %f189; fma.rn.f32 %f72, %f285, %f285, %f190; setp.eq.f32 %p29, %f72, 0f00000000; mov.u16 %rs43, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs43, [%rd46]; $L__BB0_31: min.u16 %rs25, %rs43, %rs1; cvt.u32.u16 %r68, %rs25; max.u16 %rs26, %rs43, %rs1; cvt.u32.u16 %r69, %rs26; add.s32 %r70, %r69, 1; mul.lo.s32 %r71, %r70, %r69; shr.u32 %r72, %r71, 1; add.s32 %r73, %r72, %r68; mul.wide.s32 %rd47, %r73, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f73, [%rd48]; add.s64 %rd49, %rd1, %rd47; 
ld.global.nc.f32 %f74, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f72, 0f00000000; @%p33 bra $L__BB0_34; mul.f32 %f191, %f74, 0f3F000000; div.rn.f32 %f192, %f191, %f73; mul.f32 %f193, %f192, %f138; fma.rn.f32 %f284, %f6, %f193, %f5; mul.f32 %f194, %f5, %f193; sub.f32 %f285, %f6, %f194; mov.f32 %f283, %f1; $L__BB0_34: mul.f32 %f195, %f138, %f138; add.f32 %f196, %f73, %f73; div.rn.f32 %f197, %f196, %f195; sub.f32 %f198, %f283, %f1; sub.f32 %f199, %f284, %f5; sub.f32 %f200, %f285, %f6; fma.rn.f32 %f277, %f197, %f198, %f277; fma.rn.f32 %f201, %f197, %f199, %f278; fma.rn.f32 %f202, %f197, %f200, %f279; div.rn.f32 %f203, %f74, %f138; mul.f32 %f204, %f203, %f285; sub.f32 %f278, %f201, %f204; fma.rn.f32 %f279, %f203, %f284, %f202; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r76, %r35, -1; min.s32 %r108, %r21, %r76; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r74, %r21, %r35; add.s32 %r75, %r74, %r35; rem.s32 %r108, %r75, %r35; $L__BB0_38: add.s32 %r77, %r108, %r4; mad.lo.s32 %r25, %r77, %r34, %r1; setp.ge.s32 %p35, %r21, %r35; mov.f32 %f294, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f293, %f294; mov.f32 %f292, %f294; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f292, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f293, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f294, [%rd53]; $L__BB0_40: mul.f32 %f208, %f292, %f292; fma.rn.f32 %f209, %f293, %f293, %f208; fma.rn.f32 %f100, %f294, %f294, %f209; setp.eq.f32 %p38, %f100, 0f00000000; mov.u16 %rs44, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs44, [%rd55]; $L__BB0_42: min.u16 %rs29, %rs44, %rs1; cvt.u32.u16 %r78, %rs29; max.u16 %rs30, %rs44, %rs1; cvt.u32.u16 %r79, %rs30; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd56, %r83, 4; add.s64 %rd57, 
%rd2, %rd56; ld.global.nc.f32 %f101, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f102, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f100, 0f00000000; @%p42 bra $L__BB0_45; mul.f32 %f210, %f102, 0f3F000000; div.rn.f32 %f211, %f210, %f101; mul.f32 %f212, %f211, %f138; mul.f32 %f213, %f6, %f212; sub.f32 %f293, %f5, %f213; fma.rn.f32 %f294, %f5, %f212, %f6; mov.f32 %f292, %f1; $L__BB0_45: mul.f32 %f214, %f138, %f138; add.f32 %f215, %f101, %f101; div.rn.f32 %f216, %f215, %f214; sub.f32 %f217, %f292, %f1; sub.f32 %f218, %f293, %f5; sub.f32 %f219, %f294, %f6; fma.rn.f32 %f277, %f216, %f217, %f277; fma.rn.f32 %f220, %f216, %f218, %f278; fma.rn.f32 %f221, %f216, %f219, %f279; div.rn.f32 %f222, %f102, %f138; fma.rn.f32 %f278, %f222, %f294, %f220; mul.f32 %f223, %f222, %f293; sub.f32 %f279, %f221, %f223; $L__BB0_46: setp.eq.s32 %p43, %r36, 1; @%p43 bra $L__BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p44, %rs12, 0; add.s32 %r26, %r3, -1; @%p44 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r84, %r26, %r36; add.s32 %r85, %r84, %r36; rem.s32 %r109, %r85, %r36; $L__BB0_50: mad.lo.s32 %r86, %r109, %r35, %r2; mad.lo.s32 %r87, %r86, %r34, %r1; cvt.s64.s32 %rd59, %r87; mul.wide.s32 %rd60, %r87, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f224, [%rd63]; ld.global.nc.f32 %f225, [%rd61]; ld.global.nc.f32 %f226, [%rd62]; mul.f32 %f227, %f226, %f226; fma.rn.f32 %f228, %f225, %f225, %f227; fma.rn.f32 %f229, %f224, %f224, %f228; setp.eq.f32 %p45, %f229, 0f00000000; selp.f32 %f230, %f6, %f224, %p45; selp.f32 %f231, %f5, %f226, %p45; selp.f32 %f232, %f1, %f225, %p45; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs31, [%rd64]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r88, %rs35; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs34; add.s32 %r93, %r91, 
%r92; mul.wide.s32 %rd65, %r93, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f233, [%rd66]; add.f32 %f234, %f233, %f233; mul.f32 %f122, %f139, %f139; div.rn.f32 %f235, %f234, %f122; sub.f32 %f236, %f232, %f1; sub.f32 %f237, %f231, %f5; sub.f32 %f238, %f230, %f6; fma.rn.f32 %f123, %f236, %f235, %f277; fma.rn.f32 %f124, %f237, %f235, %f278; fma.rn.f32 %f125, %f238, %f235, %f279; add.s32 %r30, %r3, 1; @%p44 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r96, %r36, -1; min.s32 %r110, %r30, %r96; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r94, %r30, %r36; add.s32 %r95, %r94, %r36; rem.s32 %r110, %r95, %r36; $L__BB0_53: mad.lo.s32 %r97, %r110, %r35, %r2; mad.lo.s32 %r98, %r97, %r34, %r1; cvt.s64.s32 %rd67, %r98; mul.wide.s32 %rd68, %r98, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f239, [%rd71]; ld.global.nc.f32 %f240, [%rd69]; ld.global.nc.f32 %f241, [%rd70]; mul.f32 %f242, %f241, %f241; fma.rn.f32 %f243, %f240, %f240, %f242; fma.rn.f32 %f244, %f239, %f239, %f243; setp.eq.f32 %p47, %f244, 0f00000000; selp.f32 %f245, %f6, %f239, %p47; selp.f32 %f246, %f5, %f241, %p47; selp.f32 %f247, %f1, %f240, %p47; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs36, [%rd72]; min.u16 %rs39, %rs36, %rs1; max.u16 %rs40, %rs36, %rs1; cvt.u32.u16 %r99, %rs40; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs39; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd73, %r104, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f248, [%rd74]; add.f32 %f249, %f248, %f248; div.rn.f32 %f250, %f249, %f122; sub.f32 %f251, %f247, %f1; sub.f32 %f252, %f246, %f5; sub.f32 %f253, %f245, %f6; fma.rn.f32 %f277, %f251, %f250, %f123; fma.rn.f32 %f278, %f252, %f250, %f124; fma.rn.f32 %f279, %f253, %f250, %f125; $L__BB0_54: setp.eq.s64 %p48, %rd11, 0; @%p48 bra $L__BB0_56; cvta.to.global.u64 %rd75, %rd11; shl.b64 %rd76, %rd7, 2; add.s64 %rd77, %rd75, %rd76; ld.global.nc.f32 %f254, [%rd77]; 
mul.f32 %f301, %f254, %f301; $L__BB0_56: setp.eq.f32 %p49, %f301, 0f00000000; mov.f32 %f302, 0f00000000; @%p49 bra $L__BB0_58; rcp.rn.f32 %f302, %f301; $L__BB0_58: cvta.to.global.u64 %rd78, %rd8; shl.b64 %rd79, %rd7, 2; add.s64 %rd80, %rd78, %rd79; ld.global.f32 %f256, [%rd80]; fma.rn.f32 %f257, %f277, %f302, %f256; st.global.f32 [%rd80], %f257; cvta.to.global.u64 %rd81, %rd9; add.s64 %rd82, %rd81, %rd79; ld.global.f32 %f258, [%rd82]; fma.rn.f32 %f259, %f278, %f302, %f258; st.global.f32 [%rd82], %f259; cvta.to.global.u64 %rd83, %rd10; add.s64 %rd84, %rd83, %rd79; ld.global.f32 %f260, [%rd84]; fma.rn.f32 %f261, %f279, %f302, %f260; st.global.f32 [%rd84], %f261; $L__BB0_59: ret; } ` adddmi_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<50>; .reg .b16 %rs<45>; .reg .f32 %f<303>; .reg .b32 %r<111>; .reg .b64 %rd<85>; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; ld.param.u64 %rd8, [adddmi_param_0]; ld.param.u64 %rd9, [adddmi_param_1]; ld.param.u64 %rd10, [adddmi_param_2]; ld.param.u64 %rd12, [adddmi_param_3]; ld.param.u64 %rd13, [adddmi_param_4]; ld.param.u64 %rd14, [adddmi_param_5]; ld.param.u64 %rd11, [adddmi_param_6]; ld.param.f32 %f301, [adddmi_param_7]; ld.param.u64 %rd15, [adddmi_param_8]; ld.param.u64 %rd16, [adddmi_param_9]; ld.param.u64 %rd17, [adddmi_param_10]; ld.param.f32 %f137, [adddmi_param_11]; ld.param.f32 %f138, 
[adddmi_param_12]; ld.param.f32 %f139, [adddmi_param_13]; ld.param.u32 %r34, [adddmi_param_14]; ld.param.u32 %r35, [adddmi_param_15]; ld.param.u32 %r36, [adddmi_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r37, %ntid.x; mov.u32 %r38, %ctaid.x; mov.u32 %r39, %tid.x; mad.lo.s32 %r1, %r38, %r37, %r39; mov.u32 %r40, %ntid.y; mov.u32 %r41, %ctaid.y; mov.u32 %r42, %tid.y; mad.lo.s32 %r2, %r41, %r40, %r42; mov.u32 %r43, %ntid.z; mov.u32 %r44, %ctaid.z; mov.u32 %r45, %tid.z; mad.lo.s32 %r3, %r44, %r43, %r45; setp.ge.s32 %p1, %r1, %r34; setp.ge.s32 %p2, %r2, %r35; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r36; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_59; mul.lo.s32 %r4, %r3, %r35; add.s32 %r46, %r4, %r2; mul.lo.s32 %r5, %r46, %r34; add.s32 %r47, %r5, %r1; cvt.s64.s32 %rd7, %r47; mul.wide.s32 %rd18, %r47, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; ld.global.nc.f32 %f6, [%rd21]; mul.f32 %f142, %f5, %f5; fma.rn.f32 %f143, %f1, %f1, %f142; fma.rn.f32 %f144, %f6, %f6, %f143; setp.eq.f32 %p6, %f144, 0f00000000; @%p6 bra $L__BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r48, %r6, %r34; add.s32 %r49, %r48, %r34; rem.s32 %r105, %r49, %r34; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f267, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f266, %f267; mov.f32 %f265, %f267; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f265, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f266, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 
%f267, [%rd26]; $L__BB0_7: mul.f32 %f148, %f265, %f265; fma.rn.f32 %f149, %f266, %f266, %f148; fma.rn.f32 %f16, %f267, %f267, %f149; setp.eq.f32 %p11, %f16, 0f00000000; mov.u16 %rs41, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs41, [%rd28]; $L__BB0_9: min.u16 %rs17, %rs41, %rs1; cvt.u32.u16 %r50, %rs17; max.u16 %rs18, %rs41, %rs1; cvt.u32.u16 %r51, %rs18; add.s32 %r52, %r51, 1; mul.lo.s32 %r53, %r52, %r51; shr.u32 %r54, %r53, 1; add.s32 %r55, %r54, %r50; mul.wide.s32 %rd29, %r55, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f17, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f18, [%rd31]; setp.ne.s16 %p12, %rs14, 0; mov.f32 %f277, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f278, %f277; mov.f32 %f279, %f277; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f16, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f153, %f18, 0f3F000000; div.rn.f32 %f154, %f153, %f17; mul.f32 %f155, %f154, %f137; fma.rn.f32 %f265, %f6, %f155, %f1; mul.f32 %f156, %f1, %f155; sub.f32 %f267, %f6, %f156; mov.f32 %f266, %f5; $L__BB0_12: mul.f32 %f157, %f137, %f137; add.f32 %f158, %f17, %f17; div.rn.f32 %f159, %f158, %f157; sub.f32 %f160, %f265, %f1; sub.f32 %f161, %f266, %f5; sub.f32 %f162, %f267, %f6; fma.rn.f32 %f163, %f159, %f160, 0f00000000; fma.rn.f32 %f278, %f159, %f161, 0f00000000; fma.rn.f32 %f164, %f159, %f162, 0f00000000; div.rn.f32 %f165, %f18, %f137; mul.f32 %f166, %f165, %f267; sub.f32 %f277, %f163, %f166; fma.rn.f32 %f279, %f165, %f265, %f164; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r58, %r34, -1; min.s32 %r106, %r11, %r58; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r56, %r11, %r34; add.s32 %r57, %r56, %r34; rem.s32 %r106, %r57, %r34; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r34; mov.f32 %f276, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f275, %f276; mov.f32 %f274, %f276; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, 
%rd32; ld.global.nc.f32 %f274, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f275, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f276, [%rd35]; $L__BB0_18: mul.f32 %f170, %f274, %f274; fma.rn.f32 %f171, %f275, %f275, %f170; fma.rn.f32 %f44, %f276, %f276, %f171; setp.eq.f32 %p20, %f44, 0f00000000; mov.u16 %rs42, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs42, [%rd37]; $L__BB0_20: min.u16 %rs21, %rs42, %rs1; cvt.u32.u16 %r59, %rs21; max.u16 %rs22, %rs42, %rs1; cvt.u32.u16 %r60, %rs22; add.s32 %r61, %r60, 1; mul.lo.s32 %r62, %r61, %r60; shr.u32 %r63, %r62, 1; add.s32 %r64, %r63, %r59; mul.wide.s32 %rd38, %r64, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f45, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f46, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f44, 0f00000000; @%p24 bra $L__BB0_23; mul.f32 %f172, %f46, 0f3F000000; div.rn.f32 %f173, %f172, %f45; mul.f32 %f174, %f173, %f137; mul.f32 %f175, %f6, %f174; sub.f32 %f274, %f1, %f175; fma.rn.f32 %f276, %f1, %f174, %f6; mov.f32 %f275, %f5; $L__BB0_23: mul.f32 %f176, %f137, %f137; add.f32 %f177, %f45, %f45; div.rn.f32 %f178, %f177, %f176; sub.f32 %f179, %f274, %f1; sub.f32 %f180, %f275, %f5; sub.f32 %f181, %f276, %f6; fma.rn.f32 %f182, %f178, %f179, %f277; fma.rn.f32 %f278, %f178, %f180, %f278; fma.rn.f32 %f183, %f178, %f181, %f279; div.rn.f32 %f184, %f46, %f137; fma.rn.f32 %f277, %f184, %f276, %f182; mul.f32 %f185, %f184, %f274; sub.f32 %f279, %f183, %f185; $L__BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r65, %r16, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r107, %r66, %r35; $L__BB0_27: add.s32 %r67, %r107, %r4; mad.lo.s32 %r20, %r67, %r34, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f285, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f284, %f285; 
mov.f32 %f283, %f285; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f283, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f284, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f285, [%rd44]; $L__BB0_29: mul.f32 %f189, %f283, %f283; fma.rn.f32 %f190, %f284, %f284, %f189; fma.rn.f32 %f72, %f285, %f285, %f190; setp.eq.f32 %p29, %f72, 0f00000000; mov.u16 %rs43, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs43, [%rd46]; $L__BB0_31: min.u16 %rs25, %rs43, %rs1; cvt.u32.u16 %r68, %rs25; max.u16 %rs26, %rs43, %rs1; cvt.u32.u16 %r69, %rs26; add.s32 %r70, %r69, 1; mul.lo.s32 %r71, %r70, %r69; shr.u32 %r72, %r71, 1; add.s32 %r73, %r72, %r68; mul.wide.s32 %rd47, %r73, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f73, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f74, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f72, 0f00000000; @%p33 bra $L__BB0_34; mul.f32 %f191, %f74, 0f3F000000; div.rn.f32 %f192, %f191, %f73; mul.f32 %f193, %f192, %f138; fma.rn.f32 %f284, %f6, %f193, %f5; mul.f32 %f194, %f5, %f193; sub.f32 %f285, %f6, %f194; mov.f32 %f283, %f1; $L__BB0_34: mul.f32 %f195, %f138, %f138; add.f32 %f196, %f73, %f73; div.rn.f32 %f197, %f196, %f195; sub.f32 %f198, %f283, %f1; sub.f32 %f199, %f284, %f5; sub.f32 %f200, %f285, %f6; fma.rn.f32 %f277, %f197, %f198, %f277; fma.rn.f32 %f201, %f197, %f199, %f278; fma.rn.f32 %f202, %f197, %f200, %f279; div.rn.f32 %f203, %f74, %f138; mul.f32 %f204, %f203, %f285; sub.f32 %f278, %f201, %f204; fma.rn.f32 %f279, %f203, %f284, %f202; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r76, %r35, -1; min.s32 %r108, %r21, %r76; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r74, %r21, %r35; add.s32 %r75, %r74, %r35; rem.s32 %r108, %r75, %r35; $L__BB0_38: add.s32 %r77, %r108, %r4; mad.lo.s32 %r25, %r77, %r34, %r1; setp.ge.s32 %p35, %r21, %r35; 
mov.f32 %f294, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f293, %f294; mov.f32 %f292, %f294; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f292, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f293, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f294, [%rd53]; $L__BB0_40: mul.f32 %f208, %f292, %f292; fma.rn.f32 %f209, %f293, %f293, %f208; fma.rn.f32 %f100, %f294, %f294, %f209; setp.eq.f32 %p38, %f100, 0f00000000; mov.u16 %rs44, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs44, [%rd55]; $L__BB0_42: min.u16 %rs29, %rs44, %rs1; cvt.u32.u16 %r78, %rs29; max.u16 %rs30, %rs44, %rs1; cvt.u32.u16 %r79, %rs30; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd56, %r83, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f101, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f102, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f100, 0f00000000; @%p42 bra $L__BB0_45; mul.f32 %f210, %f102, 0f3F000000; div.rn.f32 %f211, %f210, %f101; mul.f32 %f212, %f211, %f138; mul.f32 %f213, %f6, %f212; sub.f32 %f293, %f5, %f213; fma.rn.f32 %f294, %f5, %f212, %f6; mov.f32 %f292, %f1; $L__BB0_45: mul.f32 %f214, %f138, %f138; add.f32 %f215, %f101, %f101; div.rn.f32 %f216, %f215, %f214; sub.f32 %f217, %f292, %f1; sub.f32 %f218, %f293, %f5; sub.f32 %f219, %f294, %f6; fma.rn.f32 %f277, %f216, %f217, %f277; fma.rn.f32 %f220, %f216, %f218, %f278; fma.rn.f32 %f221, %f216, %f219, %f279; div.rn.f32 %f222, %f102, %f138; fma.rn.f32 %f278, %f222, %f294, %f220; mul.f32 %f223, %f222, %f293; sub.f32 %f279, %f221, %f223; $L__BB0_46: setp.eq.s32 %p43, %r36, 1; @%p43 bra $L__BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p44, %rs12, 0; add.s32 %r26, %r3, -1; @%p44 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r84, %r26, 
%r36; add.s32 %r85, %r84, %r36; rem.s32 %r109, %r85, %r36; $L__BB0_50: mad.lo.s32 %r86, %r109, %r35, %r2; mad.lo.s32 %r87, %r86, %r34, %r1; cvt.s64.s32 %rd59, %r87; mul.wide.s32 %rd60, %r87, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f224, [%rd63]; ld.global.nc.f32 %f225, [%rd61]; ld.global.nc.f32 %f226, [%rd62]; mul.f32 %f227, %f226, %f226; fma.rn.f32 %f228, %f225, %f225, %f227; fma.rn.f32 %f229, %f224, %f224, %f228; setp.eq.f32 %p45, %f229, 0f00000000; selp.f32 %f230, %f6, %f224, %p45; selp.f32 %f231, %f5, %f226, %p45; selp.f32 %f232, %f1, %f225, %p45; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs31, [%rd64]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r88, %rs35; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs34; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd65, %r93, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f233, [%rd66]; add.f32 %f234, %f233, %f233; mul.f32 %f122, %f139, %f139; div.rn.f32 %f235, %f234, %f122; sub.f32 %f236, %f232, %f1; sub.f32 %f237, %f231, %f5; sub.f32 %f238, %f230, %f6; fma.rn.f32 %f123, %f236, %f235, %f277; fma.rn.f32 %f124, %f237, %f235, %f278; fma.rn.f32 %f125, %f238, %f235, %f279; add.s32 %r30, %r3, 1; @%p44 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r96, %r36, -1; min.s32 %r110, %r30, %r96; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r94, %r30, %r36; add.s32 %r95, %r94, %r36; rem.s32 %r110, %r95, %r36; $L__BB0_53: mad.lo.s32 %r97, %r110, %r35, %r2; mad.lo.s32 %r98, %r97, %r34, %r1; cvt.s64.s32 %rd67, %r98; mul.wide.s32 %rd68, %r98, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f239, [%rd71]; ld.global.nc.f32 %f240, [%rd69]; ld.global.nc.f32 %f241, [%rd70]; mul.f32 %f242, %f241, %f241; fma.rn.f32 %f243, %f240, %f240, %f242; fma.rn.f32 %f244, %f239, %f239, %f243; setp.eq.f32 %p47, %f244, 0f00000000; selp.f32 %f245, %f6, %f239, 
%p47; selp.f32 %f246, %f5, %f241, %p47; selp.f32 %f247, %f1, %f240, %p47; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs36, [%rd72]; min.u16 %rs39, %rs36, %rs1; max.u16 %rs40, %rs36, %rs1; cvt.u32.u16 %r99, %rs40; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs39; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd73, %r104, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f248, [%rd74]; add.f32 %f249, %f248, %f248; div.rn.f32 %f250, %f249, %f122; sub.f32 %f251, %f247, %f1; sub.f32 %f252, %f246, %f5; sub.f32 %f253, %f245, %f6; fma.rn.f32 %f277, %f251, %f250, %f123; fma.rn.f32 %f278, %f252, %f250, %f124; fma.rn.f32 %f279, %f253, %f250, %f125; $L__BB0_54: setp.eq.s64 %p48, %rd11, 0; @%p48 bra $L__BB0_56; cvta.to.global.u64 %rd75, %rd11; shl.b64 %rd76, %rd7, 2; add.s64 %rd77, %rd75, %rd76; ld.global.nc.f32 %f254, [%rd77]; mul.f32 %f301, %f254, %f301; $L__BB0_56: setp.eq.f32 %p49, %f301, 0f00000000; mov.f32 %f302, 0f00000000; @%p49 bra $L__BB0_58; rcp.rn.f32 %f302, %f301; $L__BB0_58: cvta.to.global.u64 %rd78, %rd8; shl.b64 %rd79, %rd7, 2; add.s64 %rd80, %rd78, %rd79; ld.global.f32 %f256, [%rd80]; fma.rn.f32 %f257, %f277, %f302, %f256; st.global.f32 [%rd80], %f257; cvta.to.global.u64 %rd81, %rd9; add.s64 %rd82, %rd81, %rd79; ld.global.f32 %f258, [%rd82]; fma.rn.f32 %f259, %f278, %f302, %f258; st.global.f32 [%rd82], %f259; cvta.to.global.u64 %rd83, %rd10; add.s64 %rd84, %rd83, %rd79; ld.global.f32 %f260, [%rd84]; fma.rn.f32 %f261, %f279, %f302, %f260; st.global.f32 [%rd84], %f261; $L__BB0_59: ret; } ` adddmi_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 
adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<50>; .reg .b16 %rs<45>; .reg .f32 %f<303>; .reg .b32 %r<111>; .reg .b64 %rd<85>; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; ld.param.u64 %rd8, [adddmi_param_0]; ld.param.u64 %rd9, [adddmi_param_1]; ld.param.u64 %rd10, [adddmi_param_2]; ld.param.u64 %rd12, [adddmi_param_3]; ld.param.u64 %rd13, [adddmi_param_4]; ld.param.u64 %rd14, [adddmi_param_5]; ld.param.u64 %rd11, [adddmi_param_6]; ld.param.f32 %f301, [adddmi_param_7]; ld.param.u64 %rd15, [adddmi_param_8]; ld.param.u64 %rd16, [adddmi_param_9]; ld.param.u64 %rd17, [adddmi_param_10]; ld.param.f32 %f137, [adddmi_param_11]; ld.param.f32 %f138, [adddmi_param_12]; ld.param.f32 %f139, [adddmi_param_13]; ld.param.u32 %r34, [adddmi_param_14]; ld.param.u32 %r35, [adddmi_param_15]; ld.param.u32 %r36, [adddmi_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r37, %ntid.x; mov.u32 %r38, %ctaid.x; mov.u32 %r39, %tid.x; mad.lo.s32 %r1, %r38, %r37, %r39; mov.u32 %r40, %ntid.y; mov.u32 %r41, %ctaid.y; mov.u32 %r42, %tid.y; mad.lo.s32 %r2, %r41, %r40, %r42; mov.u32 %r43, %ntid.z; mov.u32 %r44, %ctaid.z; mov.u32 %r45, %tid.z; mad.lo.s32 %r3, %r44, %r43, %r45; setp.ge.s32 %p1, %r1, %r34; setp.ge.s32 %p2, %r2, %r35; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r36; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_59; mul.lo.s32 %r4, %r3, %r35; add.s32 %r46, %r4, %r2; mul.lo.s32 %r5, %r46, %r34; add.s32 %r47, %r5, %r1; cvt.s64.s32 %rd7, %r47; mul.wide.s32 %rd18, %r47, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; 
ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; ld.global.nc.f32 %f6, [%rd21]; mul.f32 %f142, %f5, %f5; fma.rn.f32 %f143, %f1, %f1, %f142; fma.rn.f32 %f144, %f6, %f6, %f143; setp.eq.f32 %p6, %f144, 0f00000000; @%p6 bra $L__BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r48, %r6, %r34; add.s32 %r49, %r48, %r34; rem.s32 %r105, %r49, %r34; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f267, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f266, %f267; mov.f32 %f265, %f267; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f265, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f266, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f267, [%rd26]; $L__BB0_7: mul.f32 %f148, %f265, %f265; fma.rn.f32 %f149, %f266, %f266, %f148; fma.rn.f32 %f16, %f267, %f267, %f149; setp.eq.f32 %p11, %f16, 0f00000000; mov.u16 %rs41, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs41, [%rd28]; $L__BB0_9: min.u16 %rs17, %rs41, %rs1; cvt.u32.u16 %r50, %rs17; max.u16 %rs18, %rs41, %rs1; cvt.u32.u16 %r51, %rs18; add.s32 %r52, %r51, 1; mul.lo.s32 %r53, %r52, %r51; shr.u32 %r54, %r53, 1; add.s32 %r55, %r54, %r50; mul.wide.s32 %rd29, %r55, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f17, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f18, [%rd31]; setp.ne.s16 %p12, %rs14, 0; mov.f32 %f277, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f278, %f277; mov.f32 %f279, %f277; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f16, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f153, %f18, 0f3F000000; div.rn.f32 %f154, %f153, %f17; mul.f32 %f155, %f154, %f137; fma.rn.f32 %f265, %f6, %f155, %f1; mul.f32 %f156, %f1, %f155; sub.f32 %f267, %f6, %f156; mov.f32 %f266, %f5; $L__BB0_12: mul.f32 %f157, %f137, %f137; add.f32 
%f158, %f17, %f17; div.rn.f32 %f159, %f158, %f157; sub.f32 %f160, %f265, %f1; sub.f32 %f161, %f266, %f5; sub.f32 %f162, %f267, %f6; fma.rn.f32 %f163, %f159, %f160, 0f00000000; fma.rn.f32 %f278, %f159, %f161, 0f00000000; fma.rn.f32 %f164, %f159, %f162, 0f00000000; div.rn.f32 %f165, %f18, %f137; mul.f32 %f166, %f165, %f267; sub.f32 %f277, %f163, %f166; fma.rn.f32 %f279, %f165, %f265, %f164; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r58, %r34, -1; min.s32 %r106, %r11, %r58; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r56, %r11, %r34; add.s32 %r57, %r56, %r34; rem.s32 %r106, %r57, %r34; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r34; mov.f32 %f276, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f275, %f276; mov.f32 %f274, %f276; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f274, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f275, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f276, [%rd35]; $L__BB0_18: mul.f32 %f170, %f274, %f274; fma.rn.f32 %f171, %f275, %f275, %f170; fma.rn.f32 %f44, %f276, %f276, %f171; setp.eq.f32 %p20, %f44, 0f00000000; mov.u16 %rs42, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs42, [%rd37]; $L__BB0_20: min.u16 %rs21, %rs42, %rs1; cvt.u32.u16 %r59, %rs21; max.u16 %rs22, %rs42, %rs1; cvt.u32.u16 %r60, %rs22; add.s32 %r61, %r60, 1; mul.lo.s32 %r62, %r61, %r60; shr.u32 %r63, %r62, 1; add.s32 %r64, %r63, %r59; mul.wide.s32 %rd38, %r64, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f45, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f46, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f44, 0f00000000; @%p24 bra $L__BB0_23; mul.f32 %f172, %f46, 0f3F000000; div.rn.f32 %f173, %f172, %f45; mul.f32 %f174, %f173, %f137; mul.f32 %f175, %f6, %f174; sub.f32 %f274, %f1, %f175; fma.rn.f32 %f276, %f1, %f174, %f6; mov.f32 %f275, %f5; 
$L__BB0_23: mul.f32 %f176, %f137, %f137; add.f32 %f177, %f45, %f45; div.rn.f32 %f178, %f177, %f176; sub.f32 %f179, %f274, %f1; sub.f32 %f180, %f275, %f5; sub.f32 %f181, %f276, %f6; fma.rn.f32 %f182, %f178, %f179, %f277; fma.rn.f32 %f278, %f178, %f180, %f278; fma.rn.f32 %f183, %f178, %f181, %f279; div.rn.f32 %f184, %f46, %f137; fma.rn.f32 %f277, %f184, %f276, %f182; mul.f32 %f185, %f184, %f274; sub.f32 %f279, %f183, %f185; $L__BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r65, %r16, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r107, %r66, %r35; $L__BB0_27: add.s32 %r67, %r107, %r4; mad.lo.s32 %r20, %r67, %r34, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f285, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f284, %f285; mov.f32 %f283, %f285; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f283, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f284, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f285, [%rd44]; $L__BB0_29: mul.f32 %f189, %f283, %f283; fma.rn.f32 %f190, %f284, %f284, %f189; fma.rn.f32 %f72, %f285, %f285, %f190; setp.eq.f32 %p29, %f72, 0f00000000; mov.u16 %rs43, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs43, [%rd46]; $L__BB0_31: min.u16 %rs25, %rs43, %rs1; cvt.u32.u16 %r68, %rs25; max.u16 %rs26, %rs43, %rs1; cvt.u32.u16 %r69, %rs26; add.s32 %r70, %r69, 1; mul.lo.s32 %r71, %r70, %r69; shr.u32 %r72, %r71, 1; add.s32 %r73, %r72, %r68; mul.wide.s32 %rd47, %r73, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f73, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f74, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f72, 0f00000000; @%p33 bra $L__BB0_34; mul.f32 %f191, %f74, 0f3F000000; div.rn.f32 %f192, %f191, %f73; mul.f32 %f193, %f192, %f138; fma.rn.f32 %f284, 
%f6, %f193, %f5; mul.f32 %f194, %f5, %f193; sub.f32 %f285, %f6, %f194; mov.f32 %f283, %f1; $L__BB0_34: mul.f32 %f195, %f138, %f138; add.f32 %f196, %f73, %f73; div.rn.f32 %f197, %f196, %f195; sub.f32 %f198, %f283, %f1; sub.f32 %f199, %f284, %f5; sub.f32 %f200, %f285, %f6; fma.rn.f32 %f277, %f197, %f198, %f277; fma.rn.f32 %f201, %f197, %f199, %f278; fma.rn.f32 %f202, %f197, %f200, %f279; div.rn.f32 %f203, %f74, %f138; mul.f32 %f204, %f203, %f285; sub.f32 %f278, %f201, %f204; fma.rn.f32 %f279, %f203, %f284, %f202; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r76, %r35, -1; min.s32 %r108, %r21, %r76; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r74, %r21, %r35; add.s32 %r75, %r74, %r35; rem.s32 %r108, %r75, %r35; $L__BB0_38: add.s32 %r77, %r108, %r4; mad.lo.s32 %r25, %r77, %r34, %r1; setp.ge.s32 %p35, %r21, %r35; mov.f32 %f294, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f293, %f294; mov.f32 %f292, %f294; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f292, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f293, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f294, [%rd53]; $L__BB0_40: mul.f32 %f208, %f292, %f292; fma.rn.f32 %f209, %f293, %f293, %f208; fma.rn.f32 %f100, %f294, %f294, %f209; setp.eq.f32 %p38, %f100, 0f00000000; mov.u16 %rs44, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs44, [%rd55]; $L__BB0_42: min.u16 %rs29, %rs44, %rs1; cvt.u32.u16 %r78, %rs29; max.u16 %rs30, %rs44, %rs1; cvt.u32.u16 %r79, %rs30; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd56, %r83, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f101, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f102, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f100, 0f00000000; @%p42 bra $L__BB0_45; mul.f32 %f210, %f102, 0f3F000000; 
div.rn.f32 %f211, %f210, %f101; mul.f32 %f212, %f211, %f138; mul.f32 %f213, %f6, %f212; sub.f32 %f293, %f5, %f213; fma.rn.f32 %f294, %f5, %f212, %f6; mov.f32 %f292, %f1; $L__BB0_45: mul.f32 %f214, %f138, %f138; add.f32 %f215, %f101, %f101; div.rn.f32 %f216, %f215, %f214; sub.f32 %f217, %f292, %f1; sub.f32 %f218, %f293, %f5; sub.f32 %f219, %f294, %f6; fma.rn.f32 %f277, %f216, %f217, %f277; fma.rn.f32 %f220, %f216, %f218, %f278; fma.rn.f32 %f221, %f216, %f219, %f279; div.rn.f32 %f222, %f102, %f138; fma.rn.f32 %f278, %f222, %f294, %f220; mul.f32 %f223, %f222, %f293; sub.f32 %f279, %f221, %f223; $L__BB0_46: setp.eq.s32 %p43, %r36, 1; @%p43 bra $L__BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p44, %rs12, 0; add.s32 %r26, %r3, -1; @%p44 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r84, %r26, %r36; add.s32 %r85, %r84, %r36; rem.s32 %r109, %r85, %r36; $L__BB0_50: mad.lo.s32 %r86, %r109, %r35, %r2; mad.lo.s32 %r87, %r86, %r34, %r1; cvt.s64.s32 %rd59, %r87; mul.wide.s32 %rd60, %r87, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f224, [%rd63]; ld.global.nc.f32 %f225, [%rd61]; ld.global.nc.f32 %f226, [%rd62]; mul.f32 %f227, %f226, %f226; fma.rn.f32 %f228, %f225, %f225, %f227; fma.rn.f32 %f229, %f224, %f224, %f228; setp.eq.f32 %p45, %f229, 0f00000000; selp.f32 %f230, %f6, %f224, %p45; selp.f32 %f231, %f5, %f226, %p45; selp.f32 %f232, %f1, %f225, %p45; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs31, [%rd64]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r88, %rs35; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs34; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd65, %r93, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f233, [%rd66]; add.f32 %f234, %f233, %f233; mul.f32 %f122, %f139, %f139; div.rn.f32 %f235, %f234, %f122; sub.f32 %f236, %f232, %f1; sub.f32 %f237, %f231, %f5; sub.f32 
%f238, %f230, %f6; fma.rn.f32 %f123, %f236, %f235, %f277; fma.rn.f32 %f124, %f237, %f235, %f278; fma.rn.f32 %f125, %f238, %f235, %f279; add.s32 %r30, %r3, 1; @%p44 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r96, %r36, -1; min.s32 %r110, %r30, %r96; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r94, %r30, %r36; add.s32 %r95, %r94, %r36; rem.s32 %r110, %r95, %r36; $L__BB0_53: mad.lo.s32 %r97, %r110, %r35, %r2; mad.lo.s32 %r98, %r97, %r34, %r1; cvt.s64.s32 %rd67, %r98; mul.wide.s32 %rd68, %r98, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f239, [%rd71]; ld.global.nc.f32 %f240, [%rd69]; ld.global.nc.f32 %f241, [%rd70]; mul.f32 %f242, %f241, %f241; fma.rn.f32 %f243, %f240, %f240, %f242; fma.rn.f32 %f244, %f239, %f239, %f243; setp.eq.f32 %p47, %f244, 0f00000000; selp.f32 %f245, %f6, %f239, %p47; selp.f32 %f246, %f5, %f241, %p47; selp.f32 %f247, %f1, %f240, %p47; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs36, [%rd72]; min.u16 %rs39, %rs36, %rs1; max.u16 %rs40, %rs36, %rs1; cvt.u32.u16 %r99, %rs40; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs39; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd73, %r104, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f248, [%rd74]; add.f32 %f249, %f248, %f248; div.rn.f32 %f250, %f249, %f122; sub.f32 %f251, %f247, %f1; sub.f32 %f252, %f246, %f5; sub.f32 %f253, %f245, %f6; fma.rn.f32 %f277, %f251, %f250, %f123; fma.rn.f32 %f278, %f252, %f250, %f124; fma.rn.f32 %f279, %f253, %f250, %f125; $L__BB0_54: setp.eq.s64 %p48, %rd11, 0; @%p48 bra $L__BB0_56; cvta.to.global.u64 %rd75, %rd11; shl.b64 %rd76, %rd7, 2; add.s64 %rd77, %rd75, %rd76; ld.global.nc.f32 %f254, [%rd77]; mul.f32 %f301, %f254, %f301; $L__BB0_56: setp.eq.f32 %p49, %f301, 0f00000000; mov.f32 %f302, 0f00000000; @%p49 bra $L__BB0_58; rcp.rn.f32 %f302, %f301; $L__BB0_58: cvta.to.global.u64 %rd78, %rd8; shl.b64 %rd79, %rd7, 2; add.s64 %rd80, %rd78, %rd79; 
ld.global.f32 %f256, [%rd80]; fma.rn.f32 %f257, %f277, %f302, %f256; st.global.f32 [%rd80], %f257; cvta.to.global.u64 %rd81, %rd9; add.s64 %rd82, %rd81, %rd79; ld.global.f32 %f258, [%rd82]; fma.rn.f32 %f259, %f278, %f302, %f258; st.global.f32 [%rd82], %f259; cvta.to.global.u64 %rd83, %rd10; add.s64 %rd84, %rd83, %rd79; ld.global.f32 %f260, [%rd84]; fma.rn.f32 %f261, %f279, %f302, %f260; st.global.f32 [%rd84], %f261; $L__BB0_59: ret; } ` adddmi_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<50>; .reg .b16 %rs<45>; .reg .f32 %f<303>; .reg .b32 %r<111>; .reg .b64 %rd<85>; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; ld.param.u64 %rd8, [adddmi_param_0]; ld.param.u64 %rd9, [adddmi_param_1]; ld.param.u64 %rd10, [adddmi_param_2]; ld.param.u64 %rd12, [adddmi_param_3]; ld.param.u64 %rd13, [adddmi_param_4]; ld.param.u64 %rd14, [adddmi_param_5]; ld.param.u64 %rd11, [adddmi_param_6]; ld.param.f32 %f301, [adddmi_param_7]; ld.param.u64 %rd15, [adddmi_param_8]; ld.param.u64 %rd16, [adddmi_param_9]; ld.param.u64 %rd17, [adddmi_param_10]; ld.param.f32 %f137, [adddmi_param_11]; ld.param.f32 %f138, [adddmi_param_12]; ld.param.f32 %f139, [adddmi_param_13]; ld.param.u32 %r34, [adddmi_param_14]; ld.param.u32 %r35, [adddmi_param_15]; ld.param.u32 %r36, [adddmi_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, 
%rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r37, %ntid.x; mov.u32 %r38, %ctaid.x; mov.u32 %r39, %tid.x; mad.lo.s32 %r1, %r38, %r37, %r39; mov.u32 %r40, %ntid.y; mov.u32 %r41, %ctaid.y; mov.u32 %r42, %tid.y; mad.lo.s32 %r2, %r41, %r40, %r42; mov.u32 %r43, %ntid.z; mov.u32 %r44, %ctaid.z; mov.u32 %r45, %tid.z; mad.lo.s32 %r3, %r44, %r43, %r45; setp.ge.s32 %p1, %r1, %r34; setp.ge.s32 %p2, %r2, %r35; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r36; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_59; mul.lo.s32 %r4, %r3, %r35; add.s32 %r46, %r4, %r2; mul.lo.s32 %r5, %r46, %r34; add.s32 %r47, %r5, %r1; cvt.s64.s32 %rd7, %r47; mul.wide.s32 %rd18, %r47, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; ld.global.nc.f32 %f6, [%rd21]; mul.f32 %f142, %f5, %f5; fma.rn.f32 %f143, %f1, %f1, %f142; fma.rn.f32 %f144, %f6, %f6, %f143; setp.eq.f32 %p6, %f144, 0f00000000; @%p6 bra $L__BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r48, %r6, %r34; add.s32 %r49, %r48, %r34; rem.s32 %r105, %r49, %r34; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f267, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f266, %f267; mov.f32 %f265, %f267; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f265, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f266, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f267, [%rd26]; $L__BB0_7: mul.f32 %f148, %f265, %f265; fma.rn.f32 %f149, %f266, %f266, %f148; fma.rn.f32 %f16, %f267, %f267, %f149; setp.eq.f32 %p11, %f16, 0f00000000; mov.u16 %rs41, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; 
ld.global.nc.u8 %rs41, [%rd28]; $L__BB0_9: min.u16 %rs17, %rs41, %rs1; cvt.u32.u16 %r50, %rs17; max.u16 %rs18, %rs41, %rs1; cvt.u32.u16 %r51, %rs18; add.s32 %r52, %r51, 1; mul.lo.s32 %r53, %r52, %r51; shr.u32 %r54, %r53, 1; add.s32 %r55, %r54, %r50; mul.wide.s32 %rd29, %r55, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f17, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f18, [%rd31]; setp.ne.s16 %p12, %rs14, 0; mov.f32 %f277, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f278, %f277; mov.f32 %f279, %f277; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f16, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f153, %f18, 0f3F000000; div.rn.f32 %f154, %f153, %f17; mul.f32 %f155, %f154, %f137; fma.rn.f32 %f265, %f6, %f155, %f1; mul.f32 %f156, %f1, %f155; sub.f32 %f267, %f6, %f156; mov.f32 %f266, %f5; $L__BB0_12: mul.f32 %f157, %f137, %f137; add.f32 %f158, %f17, %f17; div.rn.f32 %f159, %f158, %f157; sub.f32 %f160, %f265, %f1; sub.f32 %f161, %f266, %f5; sub.f32 %f162, %f267, %f6; fma.rn.f32 %f163, %f159, %f160, 0f00000000; fma.rn.f32 %f278, %f159, %f161, 0f00000000; fma.rn.f32 %f164, %f159, %f162, 0f00000000; div.rn.f32 %f165, %f18, %f137; mul.f32 %f166, %f165, %f267; sub.f32 %f277, %f163, %f166; fma.rn.f32 %f279, %f165, %f265, %f164; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r58, %r34, -1; min.s32 %r106, %r11, %r58; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r56, %r11, %r34; add.s32 %r57, %r56, %r34; rem.s32 %r106, %r57, %r34; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r34; mov.f32 %f276, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f275, %f276; mov.f32 %f274, %f276; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f274, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f275, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f276, [%rd35]; $L__BB0_18: mul.f32 %f170, %f274, %f274; fma.rn.f32 %f171, %f275, %f275, %f170; fma.rn.f32 %f44, 
%f276, %f276, %f171; setp.eq.f32 %p20, %f44, 0f00000000; mov.u16 %rs42, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs42, [%rd37]; $L__BB0_20: min.u16 %rs21, %rs42, %rs1; cvt.u32.u16 %r59, %rs21; max.u16 %rs22, %rs42, %rs1; cvt.u32.u16 %r60, %rs22; add.s32 %r61, %r60, 1; mul.lo.s32 %r62, %r61, %r60; shr.u32 %r63, %r62, 1; add.s32 %r64, %r63, %r59; mul.wide.s32 %rd38, %r64, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f45, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f46, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f44, 0f00000000; @%p24 bra $L__BB0_23; mul.f32 %f172, %f46, 0f3F000000; div.rn.f32 %f173, %f172, %f45; mul.f32 %f174, %f173, %f137; mul.f32 %f175, %f6, %f174; sub.f32 %f274, %f1, %f175; fma.rn.f32 %f276, %f1, %f174, %f6; mov.f32 %f275, %f5; $L__BB0_23: mul.f32 %f176, %f137, %f137; add.f32 %f177, %f45, %f45; div.rn.f32 %f178, %f177, %f176; sub.f32 %f179, %f274, %f1; sub.f32 %f180, %f275, %f5; sub.f32 %f181, %f276, %f6; fma.rn.f32 %f182, %f178, %f179, %f277; fma.rn.f32 %f278, %f178, %f180, %f278; fma.rn.f32 %f183, %f178, %f181, %f279; div.rn.f32 %f184, %f46, %f137; fma.rn.f32 %f277, %f184, %f276, %f182; mul.f32 %f185, %f184, %f274; sub.f32 %f279, %f183, %f185; $L__BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r65, %r16, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r107, %r66, %r35; $L__BB0_27: add.s32 %r67, %r107, %r4; mad.lo.s32 %r20, %r67, %r34, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f285, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f284, %f285; mov.f32 %f283, %f285; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f283, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f284, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f285, [%rd44]; 
$L__BB0_29: mul.f32 %f189, %f283, %f283; fma.rn.f32 %f190, %f284, %f284, %f189; fma.rn.f32 %f72, %f285, %f285, %f190; setp.eq.f32 %p29, %f72, 0f00000000; mov.u16 %rs43, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs43, [%rd46]; $L__BB0_31: min.u16 %rs25, %rs43, %rs1; cvt.u32.u16 %r68, %rs25; max.u16 %rs26, %rs43, %rs1; cvt.u32.u16 %r69, %rs26; add.s32 %r70, %r69, 1; mul.lo.s32 %r71, %r70, %r69; shr.u32 %r72, %r71, 1; add.s32 %r73, %r72, %r68; mul.wide.s32 %rd47, %r73, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f73, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f74, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f72, 0f00000000; @%p33 bra $L__BB0_34; mul.f32 %f191, %f74, 0f3F000000; div.rn.f32 %f192, %f191, %f73; mul.f32 %f193, %f192, %f138; fma.rn.f32 %f284, %f6, %f193, %f5; mul.f32 %f194, %f5, %f193; sub.f32 %f285, %f6, %f194; mov.f32 %f283, %f1; $L__BB0_34: mul.f32 %f195, %f138, %f138; add.f32 %f196, %f73, %f73; div.rn.f32 %f197, %f196, %f195; sub.f32 %f198, %f283, %f1; sub.f32 %f199, %f284, %f5; sub.f32 %f200, %f285, %f6; fma.rn.f32 %f277, %f197, %f198, %f277; fma.rn.f32 %f201, %f197, %f199, %f278; fma.rn.f32 %f202, %f197, %f200, %f279; div.rn.f32 %f203, %f74, %f138; mul.f32 %f204, %f203, %f285; sub.f32 %f278, %f201, %f204; fma.rn.f32 %f279, %f203, %f284, %f202; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r76, %r35, -1; min.s32 %r108, %r21, %r76; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r74, %r21, %r35; add.s32 %r75, %r74, %r35; rem.s32 %r108, %r75, %r35; $L__BB0_38: add.s32 %r77, %r108, %r4; mad.lo.s32 %r25, %r77, %r34, %r1; setp.ge.s32 %p35, %r21, %r35; mov.f32 %f294, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f293, %f294; mov.f32 %f292, %f294; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f292, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f293, 
[%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f294, [%rd53]; $L__BB0_40: mul.f32 %f208, %f292, %f292; fma.rn.f32 %f209, %f293, %f293, %f208; fma.rn.f32 %f100, %f294, %f294, %f209; setp.eq.f32 %p38, %f100, 0f00000000; mov.u16 %rs44, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs44, [%rd55]; $L__BB0_42: min.u16 %rs29, %rs44, %rs1; cvt.u32.u16 %r78, %rs29; max.u16 %rs30, %rs44, %rs1; cvt.u32.u16 %r79, %rs30; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd56, %r83, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f101, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f102, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f100, 0f00000000; @%p42 bra $L__BB0_45; mul.f32 %f210, %f102, 0f3F000000; div.rn.f32 %f211, %f210, %f101; mul.f32 %f212, %f211, %f138; mul.f32 %f213, %f6, %f212; sub.f32 %f293, %f5, %f213; fma.rn.f32 %f294, %f5, %f212, %f6; mov.f32 %f292, %f1; $L__BB0_45: mul.f32 %f214, %f138, %f138; add.f32 %f215, %f101, %f101; div.rn.f32 %f216, %f215, %f214; sub.f32 %f217, %f292, %f1; sub.f32 %f218, %f293, %f5; sub.f32 %f219, %f294, %f6; fma.rn.f32 %f277, %f216, %f217, %f277; fma.rn.f32 %f220, %f216, %f218, %f278; fma.rn.f32 %f221, %f216, %f219, %f279; div.rn.f32 %f222, %f102, %f138; fma.rn.f32 %f278, %f222, %f294, %f220; mul.f32 %f223, %f222, %f293; sub.f32 %f279, %f221, %f223; $L__BB0_46: setp.eq.s32 %p43, %r36, 1; @%p43 bra $L__BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p44, %rs12, 0; add.s32 %r26, %r3, -1; @%p44 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r84, %r26, %r36; add.s32 %r85, %r84, %r36; rem.s32 %r109, %r85, %r36; $L__BB0_50: mad.lo.s32 %r86, %r109, %r35, %r2; mad.lo.s32 %r87, %r86, %r34, %r1; cvt.s64.s32 %rd59, %r87; mul.wide.s32 %rd60, %r87, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, 
%rd4, %rd60; ld.global.nc.f32 %f224, [%rd63]; ld.global.nc.f32 %f225, [%rd61]; ld.global.nc.f32 %f226, [%rd62]; mul.f32 %f227, %f226, %f226; fma.rn.f32 %f228, %f225, %f225, %f227; fma.rn.f32 %f229, %f224, %f224, %f228; setp.eq.f32 %p45, %f229, 0f00000000; selp.f32 %f230, %f6, %f224, %p45; selp.f32 %f231, %f5, %f226, %p45; selp.f32 %f232, %f1, %f225, %p45; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs31, [%rd64]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r88, %rs35; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs34; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd65, %r93, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f233, [%rd66]; add.f32 %f234, %f233, %f233; mul.f32 %f122, %f139, %f139; div.rn.f32 %f235, %f234, %f122; sub.f32 %f236, %f232, %f1; sub.f32 %f237, %f231, %f5; sub.f32 %f238, %f230, %f6; fma.rn.f32 %f123, %f236, %f235, %f277; fma.rn.f32 %f124, %f237, %f235, %f278; fma.rn.f32 %f125, %f238, %f235, %f279; add.s32 %r30, %r3, 1; @%p44 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r96, %r36, -1; min.s32 %r110, %r30, %r96; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r94, %r30, %r36; add.s32 %r95, %r94, %r36; rem.s32 %r110, %r95, %r36; $L__BB0_53: mad.lo.s32 %r97, %r110, %r35, %r2; mad.lo.s32 %r98, %r97, %r34, %r1; cvt.s64.s32 %rd67, %r98; mul.wide.s32 %rd68, %r98, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f239, [%rd71]; ld.global.nc.f32 %f240, [%rd69]; ld.global.nc.f32 %f241, [%rd70]; mul.f32 %f242, %f241, %f241; fma.rn.f32 %f243, %f240, %f240, %f242; fma.rn.f32 %f244, %f239, %f239, %f243; setp.eq.f32 %p47, %f244, 0f00000000; selp.f32 %f245, %f6, %f239, %p47; selp.f32 %f246, %f5, %f241, %p47; selp.f32 %f247, %f1, %f240, %p47; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs36, [%rd72]; min.u16 %rs39, %rs36, %rs1; max.u16 %rs40, %rs36, %rs1; cvt.u32.u16 %r99, %rs40; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, 
%r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs39; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd73, %r104, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f248, [%rd74]; add.f32 %f249, %f248, %f248; div.rn.f32 %f250, %f249, %f122; sub.f32 %f251, %f247, %f1; sub.f32 %f252, %f246, %f5; sub.f32 %f253, %f245, %f6; fma.rn.f32 %f277, %f251, %f250, %f123; fma.rn.f32 %f278, %f252, %f250, %f124; fma.rn.f32 %f279, %f253, %f250, %f125; $L__BB0_54: setp.eq.s64 %p48, %rd11, 0; @%p48 bra $L__BB0_56; cvta.to.global.u64 %rd75, %rd11; shl.b64 %rd76, %rd7, 2; add.s64 %rd77, %rd75, %rd76; ld.global.nc.f32 %f254, [%rd77]; mul.f32 %f301, %f254, %f301; $L__BB0_56: setp.eq.f32 %p49, %f301, 0f00000000; mov.f32 %f302, 0f00000000; @%p49 bra $L__BB0_58; rcp.rn.f32 %f302, %f301; $L__BB0_58: cvta.to.global.u64 %rd78, %rd8; shl.b64 %rd79, %rd7, 2; add.s64 %rd80, %rd78, %rd79; ld.global.f32 %f256, [%rd80]; fma.rn.f32 %f257, %f277, %f302, %f256; st.global.f32 [%rd80], %f257; cvta.to.global.u64 %rd81, %rd9; add.s64 %rd82, %rd81, %rd79; ld.global.f32 %f258, [%rd82]; fma.rn.f32 %f259, %f278, %f302, %f258; st.global.f32 [%rd82], %f259; cvta.to.global.u64 %rd83, %rd10; add.s64 %rd84, %rd83, %rd79; ld.global.f32 %f260, [%rd84]; fma.rn.f32 %f261, %f279, %f302, %f260; st.global.f32 [%rd84], %f261; $L__BB0_59: ret; } ` adddmi_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<50>; .reg .b16 %rs<45>; .reg .f32 
%f<303>; .reg .b32 %r<111>; .reg .b64 %rd<85>; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; ld.param.u64 %rd8, [adddmi_param_0]; ld.param.u64 %rd9, [adddmi_param_1]; ld.param.u64 %rd10, [adddmi_param_2]; ld.param.u64 %rd12, [adddmi_param_3]; ld.param.u64 %rd13, [adddmi_param_4]; ld.param.u64 %rd14, [adddmi_param_5]; ld.param.u64 %rd11, [adddmi_param_6]; ld.param.f32 %f301, [adddmi_param_7]; ld.param.u64 %rd15, [adddmi_param_8]; ld.param.u64 %rd16, [adddmi_param_9]; ld.param.u64 %rd17, [adddmi_param_10]; ld.param.f32 %f137, [adddmi_param_11]; ld.param.f32 %f138, [adddmi_param_12]; ld.param.f32 %f139, [adddmi_param_13]; ld.param.u32 %r34, [adddmi_param_14]; ld.param.u32 %r35, [adddmi_param_15]; ld.param.u32 %r36, [adddmi_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r37, %ntid.x; mov.u32 %r38, %ctaid.x; mov.u32 %r39, %tid.x; mad.lo.s32 %r1, %r38, %r37, %r39; mov.u32 %r40, %ntid.y; mov.u32 %r41, %ctaid.y; mov.u32 %r42, %tid.y; mad.lo.s32 %r2, %r41, %r40, %r42; mov.u32 %r43, %ntid.z; mov.u32 %r44, %ctaid.z; mov.u32 %r45, %tid.z; mad.lo.s32 %r3, %r44, %r43, %r45; setp.ge.s32 %p1, %r1, %r34; setp.ge.s32 %p2, %r2, %r35; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r36; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_59; mul.lo.s32 %r4, %r3, %r35; add.s32 %r46, %r4, %r2; mul.lo.s32 %r5, %r46, %r34; add.s32 %r47, %r5, %r1; cvt.s64.s32 %rd7, %r47; mul.wide.s32 %rd18, %r47, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; ld.global.nc.f32 %f6, [%rd21]; mul.f32 %f142, %f5, %f5; fma.rn.f32 %f143, %f1, %f1, %f142; fma.rn.f32 %f144, %f6, %f6, %f143; setp.eq.f32 %p6, %f144, 0f00000000; @%p6 bra $L__BB0_59; and.b16 %rs2, %rs13, 1; 
setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r48, %r6, %r34; add.s32 %r49, %r48, %r34; rem.s32 %r105, %r49, %r34; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f267, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f266, %f267; mov.f32 %f265, %f267; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f265, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f266, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f267, [%rd26]; $L__BB0_7: mul.f32 %f148, %f265, %f265; fma.rn.f32 %f149, %f266, %f266, %f148; fma.rn.f32 %f16, %f267, %f267, %f149; setp.eq.f32 %p11, %f16, 0f00000000; mov.u16 %rs41, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs41, [%rd28]; $L__BB0_9: min.u16 %rs17, %rs41, %rs1; cvt.u32.u16 %r50, %rs17; max.u16 %rs18, %rs41, %rs1; cvt.u32.u16 %r51, %rs18; add.s32 %r52, %r51, 1; mul.lo.s32 %r53, %r52, %r51; shr.u32 %r54, %r53, 1; add.s32 %r55, %r54, %r50; mul.wide.s32 %rd29, %r55, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f17, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f18, [%rd31]; setp.ne.s16 %p12, %rs14, 0; mov.f32 %f277, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f278, %f277; mov.f32 %f279, %f277; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f16, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f153, %f18, 0f3F000000; div.rn.f32 %f154, %f153, %f17; mul.f32 %f155, %f154, %f137; fma.rn.f32 %f265, %f6, %f155, %f1; mul.f32 %f156, %f1, %f155; sub.f32 %f267, %f6, %f156; mov.f32 %f266, %f5; $L__BB0_12: mul.f32 %f157, %f137, %f137; add.f32 %f158, %f17, %f17; div.rn.f32 %f159, %f158, %f157; sub.f32 %f160, %f265, %f1; sub.f32 %f161, %f266, %f5; sub.f32 %f162, %f267, %f6; fma.rn.f32 %f163, %f159, %f160, 0f00000000; fma.rn.f32 %f278, %f159, %f161, 0f00000000; fma.rn.f32 %f164, %f159, %f162, 0f00000000; div.rn.f32 
%f165, %f18, %f137; mul.f32 %f166, %f165, %f267; sub.f32 %f277, %f163, %f166; fma.rn.f32 %f279, %f165, %f265, %f164; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r58, %r34, -1; min.s32 %r106, %r11, %r58; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r56, %r11, %r34; add.s32 %r57, %r56, %r34; rem.s32 %r106, %r57, %r34; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r34; mov.f32 %f276, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f275, %f276; mov.f32 %f274, %f276; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f274, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f275, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f276, [%rd35]; $L__BB0_18: mul.f32 %f170, %f274, %f274; fma.rn.f32 %f171, %f275, %f275, %f170; fma.rn.f32 %f44, %f276, %f276, %f171; setp.eq.f32 %p20, %f44, 0f00000000; mov.u16 %rs42, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs42, [%rd37]; $L__BB0_20: min.u16 %rs21, %rs42, %rs1; cvt.u32.u16 %r59, %rs21; max.u16 %rs22, %rs42, %rs1; cvt.u32.u16 %r60, %rs22; add.s32 %r61, %r60, 1; mul.lo.s32 %r62, %r61, %r60; shr.u32 %r63, %r62, 1; add.s32 %r64, %r63, %r59; mul.wide.s32 %rd38, %r64, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f45, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f46, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f44, 0f00000000; @%p24 bra $L__BB0_23; mul.f32 %f172, %f46, 0f3F000000; div.rn.f32 %f173, %f172, %f45; mul.f32 %f174, %f173, %f137; mul.f32 %f175, %f6, %f174; sub.f32 %f274, %f1, %f175; fma.rn.f32 %f276, %f1, %f174, %f6; mov.f32 %f275, %f5; $L__BB0_23: mul.f32 %f176, %f137, %f137; add.f32 %f177, %f45, %f45; div.rn.f32 %f178, %f177, %f176; sub.f32 %f179, %f274, %f1; sub.f32 %f180, %f275, %f5; sub.f32 %f181, %f276, %f6; fma.rn.f32 %f182, %f178, %f179, %f277; fma.rn.f32 %f278, %f178, %f180, %f278; fma.rn.f32 
%f183, %f178, %f181, %f279; div.rn.f32 %f184, %f46, %f137; fma.rn.f32 %f277, %f184, %f276, %f182; mul.f32 %f185, %f184, %f274; sub.f32 %f279, %f183, %f185; $L__BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r65, %r16, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r107, %r66, %r35; $L__BB0_27: add.s32 %r67, %r107, %r4; mad.lo.s32 %r20, %r67, %r34, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f285, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f284, %f285; mov.f32 %f283, %f285; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f283, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f284, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f285, [%rd44]; $L__BB0_29: mul.f32 %f189, %f283, %f283; fma.rn.f32 %f190, %f284, %f284, %f189; fma.rn.f32 %f72, %f285, %f285, %f190; setp.eq.f32 %p29, %f72, 0f00000000; mov.u16 %rs43, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs43, [%rd46]; $L__BB0_31: min.u16 %rs25, %rs43, %rs1; cvt.u32.u16 %r68, %rs25; max.u16 %rs26, %rs43, %rs1; cvt.u32.u16 %r69, %rs26; add.s32 %r70, %r69, 1; mul.lo.s32 %r71, %r70, %r69; shr.u32 %r72, %r71, 1; add.s32 %r73, %r72, %r68; mul.wide.s32 %rd47, %r73, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f73, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f74, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f72, 0f00000000; @%p33 bra $L__BB0_34; mul.f32 %f191, %f74, 0f3F000000; div.rn.f32 %f192, %f191, %f73; mul.f32 %f193, %f192, %f138; fma.rn.f32 %f284, %f6, %f193, %f5; mul.f32 %f194, %f5, %f193; sub.f32 %f285, %f6, %f194; mov.f32 %f283, %f1; $L__BB0_34: mul.f32 %f195, %f138, %f138; add.f32 %f196, %f73, %f73; div.rn.f32 %f197, %f196, %f195; sub.f32 %f198, %f283, %f1; sub.f32 %f199, %f284, %f5; sub.f32 %f200, %f285, 
%f6; fma.rn.f32 %f277, %f197, %f198, %f277; fma.rn.f32 %f201, %f197, %f199, %f278; fma.rn.f32 %f202, %f197, %f200, %f279; div.rn.f32 %f203, %f74, %f138; mul.f32 %f204, %f203, %f285; sub.f32 %f278, %f201, %f204; fma.rn.f32 %f279, %f203, %f284, %f202; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r76, %r35, -1; min.s32 %r108, %r21, %r76; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r74, %r21, %r35; add.s32 %r75, %r74, %r35; rem.s32 %r108, %r75, %r35; $L__BB0_38: add.s32 %r77, %r108, %r4; mad.lo.s32 %r25, %r77, %r34, %r1; setp.ge.s32 %p35, %r21, %r35; mov.f32 %f294, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f293, %f294; mov.f32 %f292, %f294; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f292, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f293, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f294, [%rd53]; $L__BB0_40: mul.f32 %f208, %f292, %f292; fma.rn.f32 %f209, %f293, %f293, %f208; fma.rn.f32 %f100, %f294, %f294, %f209; setp.eq.f32 %p38, %f100, 0f00000000; mov.u16 %rs44, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs44, [%rd55]; $L__BB0_42: min.u16 %rs29, %rs44, %rs1; cvt.u32.u16 %r78, %rs29; max.u16 %rs30, %rs44, %rs1; cvt.u32.u16 %r79, %rs30; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd56, %r83, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f101, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f102, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f100, 0f00000000; @%p42 bra $L__BB0_45; mul.f32 %f210, %f102, 0f3F000000; div.rn.f32 %f211, %f210, %f101; mul.f32 %f212, %f211, %f138; mul.f32 %f213, %f6, %f212; sub.f32 %f293, %f5, %f213; fma.rn.f32 %f294, %f5, %f212, %f6; mov.f32 %f292, %f1; $L__BB0_45: mul.f32 %f214, %f138, %f138; add.f32 %f215, %f101, %f101; div.rn.f32 %f216, %f215, 
%f214; sub.f32 %f217, %f292, %f1; sub.f32 %f218, %f293, %f5; sub.f32 %f219, %f294, %f6; fma.rn.f32 %f277, %f216, %f217, %f277; fma.rn.f32 %f220, %f216, %f218, %f278; fma.rn.f32 %f221, %f216, %f219, %f279; div.rn.f32 %f222, %f102, %f138; fma.rn.f32 %f278, %f222, %f294, %f220; mul.f32 %f223, %f222, %f293; sub.f32 %f279, %f221, %f223; $L__BB0_46: setp.eq.s32 %p43, %r36, 1; @%p43 bra $L__BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p44, %rs12, 0; add.s32 %r26, %r3, -1; @%p44 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r84, %r26, %r36; add.s32 %r85, %r84, %r36; rem.s32 %r109, %r85, %r36; $L__BB0_50: mad.lo.s32 %r86, %r109, %r35, %r2; mad.lo.s32 %r87, %r86, %r34, %r1; cvt.s64.s32 %rd59, %r87; mul.wide.s32 %rd60, %r87, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f224, [%rd63]; ld.global.nc.f32 %f225, [%rd61]; ld.global.nc.f32 %f226, [%rd62]; mul.f32 %f227, %f226, %f226; fma.rn.f32 %f228, %f225, %f225, %f227; fma.rn.f32 %f229, %f224, %f224, %f228; setp.eq.f32 %p45, %f229, 0f00000000; selp.f32 %f230, %f6, %f224, %p45; selp.f32 %f231, %f5, %f226, %p45; selp.f32 %f232, %f1, %f225, %p45; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs31, [%rd64]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r88, %rs35; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs34; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd65, %r93, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f233, [%rd66]; add.f32 %f234, %f233, %f233; mul.f32 %f122, %f139, %f139; div.rn.f32 %f235, %f234, %f122; sub.f32 %f236, %f232, %f1; sub.f32 %f237, %f231, %f5; sub.f32 %f238, %f230, %f6; fma.rn.f32 %f123, %f236, %f235, %f277; fma.rn.f32 %f124, %f237, %f235, %f278; fma.rn.f32 %f125, %f238, %f235, %f279; add.s32 %r30, %r3, 1; @%p44 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r96, %r36, -1; min.s32 %r110, %r30, %r96; 
bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r94, %r30, %r36; add.s32 %r95, %r94, %r36; rem.s32 %r110, %r95, %r36; $L__BB0_53: mad.lo.s32 %r97, %r110, %r35, %r2; mad.lo.s32 %r98, %r97, %r34, %r1; cvt.s64.s32 %rd67, %r98; mul.wide.s32 %rd68, %r98, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f239, [%rd71]; ld.global.nc.f32 %f240, [%rd69]; ld.global.nc.f32 %f241, [%rd70]; mul.f32 %f242, %f241, %f241; fma.rn.f32 %f243, %f240, %f240, %f242; fma.rn.f32 %f244, %f239, %f239, %f243; setp.eq.f32 %p47, %f244, 0f00000000; selp.f32 %f245, %f6, %f239, %p47; selp.f32 %f246, %f5, %f241, %p47; selp.f32 %f247, %f1, %f240, %p47; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs36, [%rd72]; min.u16 %rs39, %rs36, %rs1; max.u16 %rs40, %rs36, %rs1; cvt.u32.u16 %r99, %rs40; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs39; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd73, %r104, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f248, [%rd74]; add.f32 %f249, %f248, %f248; div.rn.f32 %f250, %f249, %f122; sub.f32 %f251, %f247, %f1; sub.f32 %f252, %f246, %f5; sub.f32 %f253, %f245, %f6; fma.rn.f32 %f277, %f251, %f250, %f123; fma.rn.f32 %f278, %f252, %f250, %f124; fma.rn.f32 %f279, %f253, %f250, %f125; $L__BB0_54: setp.eq.s64 %p48, %rd11, 0; @%p48 bra $L__BB0_56; cvta.to.global.u64 %rd75, %rd11; shl.b64 %rd76, %rd7, 2; add.s64 %rd77, %rd75, %rd76; ld.global.nc.f32 %f254, [%rd77]; mul.f32 %f301, %f254, %f301; $L__BB0_56: setp.eq.f32 %p49, %f301, 0f00000000; mov.f32 %f302, 0f00000000; @%p49 bra $L__BB0_58; rcp.rn.f32 %f302, %f301; $L__BB0_58: cvta.to.global.u64 %rd78, %rd8; shl.b64 %rd79, %rd7, 2; add.s64 %rd80, %rd78, %rd79; ld.global.f32 %f256, [%rd80]; fma.rn.f32 %f257, %f277, %f302, %f256; st.global.f32 [%rd80], %f257; cvta.to.global.u64 %rd81, %rd9; add.s64 %rd82, %rd81, %rd79; ld.global.f32 %f258, [%rd82]; fma.rn.f32 %f259, %f278, %f302, %f258; st.global.f32 [%rd82], %f259; 
cvta.to.global.u64 %rd83, %rd10; add.s64 %rd84, %rd83, %rd79; ld.global.f32 %f260, [%rd84]; fma.rn.f32 %f261, %f279, %f302, %f260; st.global.f32 [%rd84], %f261; $L__BB0_59: ret; } ` ) 3-3.11.1/cuda/dmibulk.cu000066400000000000000000000212161503346766200146510ustar00rootroot00000000000000#include #include "exchange.h" #include "float3.h" #include "stencil.h" #include "amul.h" // Exchange + Dzyaloshinskii-Moriya interaction for bulk material. // Energy: // // E = D M . rot(M) // // Effective field: // // Hx = 2A/Bs nabla²Mx + 2D/Bs dzMy - 2D/Bs dyMz // Hy = 2A/Bs nabla²My + 2D/Bs dxMz - 2D/Bs dzMx // Hz = 2A/Bs nabla²Mz + 2D/Bs dyMx - 2D/Bs dxMy // // Boundary conditions: // // 2A dxMx = 0 // D Mz + 2A dxMy = 0 // -D My + 2A dxMz = 0 // // -D Mz + 2A dyMx = 0 // 2A dyMy = 0 // D Mx + 2A dyMz = 0 // // D My + 2A dzMx = 0 // -D Mx + 2A dzMy = 0 // 2A dzMz = 0 // extern "C" __global__ void adddmibulk(float* __restrict__ Hx, float* __restrict__ Hy, float* __restrict__ Hz, float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float* __restrict__ Ms_, float Ms_mul, float* __restrict__ aLUT2d, float* __restrict__ DLUT2d, uint8_t* __restrict__ regions, float cx, float cy, float cz, int Nx, int Ny, int Nz, uint8_t PBC, uint8_t OpenBC) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix >= Nx || iy >= Ny || iz >= Nz) { return; } int I = idx(ix, iy, iz); // central cell index float3 h = make_float3(0.0,0.0,0.0); // add to H float3 m0 = make_float3(mx[I], my[I], mz[I]); // central m uint8_t r0 = regions[I]; int i_; // neighbor index if(is0(m0)) { return; } // x derivatives (along length) { float3 m1 = make_float3(0.0f, 0.0f, 0.0f); // left neighbor i_ = idx(lclampx(ix-1), iy, iz); // load neighbor m if inside grid, keep 0 otherwise if (ix-1 >= 0 || PBCx) { m1 = make_float3(mx[i_], my[i_], mz[i_]); } int r1 = is0(m1)? 
r0 : regions[i_]; float A = aLUT2d[symidx(r0, r1)]; float D = DLUT2d[symidx(r0, r1)]; float D_2A = D/(2.0f*A); if (!is0(m1) || !OpenBC){ // do nothing at an open boundary if (is0(m1)) { // neighbor missing m1.x = m0.x; m1.y = m0.y - (-cx * D_2A * m0.z); m1.z = m0.z + (-cx * D_2A * m0.y); } h += (2.0f*A/(cx*cx)) * (m1 - m0); // exchange h.y += (D/cx)*(-m1.z); h.z -= (D/cx)*(-m1.y); } } { float3 m2 = make_float3(0.0f, 0.0f, 0.0f); // right neighbor i_ = idx(hclampx(ix+1), iy, iz); if (ix+1 < Nx || PBCx) { m2 = make_float3(mx[i_], my[i_], mz[i_]); } int r1 = is0(m2)? r0 : regions[i_]; float A = aLUT2d[symidx(r0, r1)]; float D = DLUT2d[symidx(r0, r1)]; float D_2A = D/(2.0f*A); if (!is0(m2) || !OpenBC){ if (is0(m2)) { m2.x = m0.x; m2.y = m0.y - (+cx * D_2A * m0.z); m2.z = m0.z + (+cx * D_2A * m0.y); } h += (2.0f*A/(cx*cx)) * (m2 - m0); h.y += (D/cx)*(m2.z); h.z -= (D/cx)*(m2.y); } } // y derivatives (along height) { float3 m1 = make_float3(0.0f, 0.0f, 0.0f); i_ = idx(ix, lclampy(iy-1), iz); if (iy-1 >= 0 || PBCy) { m1 = make_float3(mx[i_], my[i_], mz[i_]); } int r1 = is0(m1)? r0 : regions[i_]; float A = aLUT2d[symidx(r0, r1)]; float D = DLUT2d[symidx(r0, r1)]; float D_2A = D/(2.0f*A); if (!is0(m1) || !OpenBC){ if (is0(m1)) { m1.x = m0.x + (-cy * D_2A * m0.z); m1.y = m0.y; m1.z = m0.z - (-cy * D_2A * m0.x); } h += (2.0f*A/(cy*cy)) * (m1 - m0); h.x -= (D/cy)*(-m1.z); h.z += (D/cy)*(-m1.x); } } { float3 m2 = make_float3(0.0f, 0.0f, 0.0f); i_ = idx(ix, hclampy(iy+1), iz); if (iy+1 < Ny || PBCy) { m2 = make_float3(mx[i_], my[i_], mz[i_]); } int r1 = is0(m2)? 
r0 : regions[i_]; float A = aLUT2d[symidx(r0, r1)]; float D = DLUT2d[symidx(r0, r1)]; float D_2A = D/(2.0f*A); if (!is0(m2) || !OpenBC){ if (is0(m2)) { m2.x = m0.x + (+cy * D_2A * m0.z); m2.y = m0.y; m2.z = m0.z - (+cy * D_2A * m0.x); } h += (2.0f*A/(cy*cy)) * (m2 - m0); h.x -= (D/cy)*(m2.z); h.z += (D/cy)*(m2.x); } } // only take vertical derivative for 3D sim or for Neumann BC if ((Nz != 1) || (!OpenBC)) { // bottom neighbor { float3 m1 = make_float3(0.0f, 0.0f, 0.0f); i_ = idx(ix, iy, lclampz(iz-1)); if (iz-1 >= 0 || PBCz) { m1 = make_float3(mx[i_], my[i_], mz[i_]); } int r1 = is0(m1)? r0 : regions[i_]; float A = aLUT2d[symidx(r0, r1)]; float D = DLUT2d[symidx(r0, r1)]; float D_2A = D/(2.0f*A); if (!is0(m1) || !OpenBC){ if (is0(m1)) { m1.x = m0.x - (-cz * D_2A * m0.y); m1.y = m0.y + (-cz * D_2A * m0.x); m1.z = m0.z; } h += (2.0f*A/(cz*cz)) * (m1 - m0); h.x += (D/cz)*(- m1.y); h.y -= (D/cz)*(- m1.x); } } // top neighbor { float3 m2 = make_float3(0.0f, 0.0f, 0.0f); i_ = idx(ix, iy, hclampz(iz+1)); if (iz+1 < Nz || PBCz) { m2 = make_float3(mx[i_], my[i_], mz[i_]); } int r1 = is0(m2)? r0 : regions[i_]; float A = aLUT2d[symidx(r0, r1)]; float D = DLUT2d[symidx(r0, r1)]; float D_2A = D/(2.0f*A); if (!is0(m2) || !OpenBC){ if (is0(m2)) { m2.x = m0.x - (+cz * D_2A * m0.y); m2.y = m0.y + (+cz * D_2A * m0.x); m2.z = m0.z; } h += (2.0f*A/(cz*cz)) * (m2 - m0); h.x += (D/cz)*(m2.y ); h.y -= (D/cz)*(m2.x ); } } } // write back, result is H + Hdmi + Hex float invMs = inv_Msat(Ms_, Ms_mul, I); Hx[I] += h.x*invMs; Hy[I] += h.y*invMs; Hz[I] += h.z*invMs; } // Note on boundary conditions. // // We need the derivative and laplacian of m in point A, but e.g. C lies out of the boundaries. // We use the boundary condition in B (derivative of the magnetization) to extrapolate m to point C: // m_C = m_A + (dm/dx)|_B * cellsize // // When point C is inside the boundary, we just use its actual value. 
// // Then we can take the central derivative in A: // (dm/dx)|_A = (m_C - m_D) / (2*cellsize) // And the laplacian: // lapl(m)|_A = (m_C + m_D - 2*m_A) / (cellsize^2) // // All these operations should be second order as they involve only central derivatives. // // ------------------------------------------------------------------ * // | | C | // | | ** | // | | *** | // | | *** | // | | *** | // | | *** | // | B | // | *** | | // | *** | | // | **** | | // | **** | | // | **** | | // | ** A | | // | ***** | | // | ****** | | // | ********* | | // |D ******** | | // | | | // +----------------+----------------+-----------------+---------------+ // -1 -0.5 0 0.5 1 // x 3-3.11.1/cuda/dmibulk.go000066400000000000000000000014041503346766200146440ustar00rootroot00000000000000package cuda import ( "unsafe" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Add effective field due to bulk Dzyaloshinskii-Moriya interaction to Beff. // See dmibulk.cu func AddDMIBulk(Beff *data.Slice, m *data.Slice, Aex, D SymmLUT, Msat MSlice, regions *Bytes, mesh *data.Mesh, OpenBC bool) { cellsize := mesh.CellSize() N := Beff.Size() util.Argument(m.Size() == N) cfg := make3DConf(N) var openBC byte if OpenBC { openBC = 1 } k_adddmibulk_async(Beff.DevPtr(X), Beff.DevPtr(Y), Beff.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), Msat.DevPtr(0), Msat.Mul(0), unsafe.Pointer(Aex), unsafe.Pointer(D), regions.Ptr, float32(cellsize[X]), float32(cellsize[Y]), float32(cellsize[Z]), N[X], N[Y], N[Z], mesh.PBC_code(), openBC, cfg) } 3-3.11.1/cuda/dmibulk_wrapper.go000066400000000000000000007405111503346766200164150ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for adddmibulk kernel var adddmibulk_code cu.Function // Stores the arguments for adddmibulk kernel invocation type adddmibulk_args_t struct { arg_Hx unsafe.Pointer arg_Hy unsafe.Pointer arg_Hz unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_Ms_ unsafe.Pointer arg_Ms_mul float32 arg_aLUT2d unsafe.Pointer arg_DLUT2d unsafe.Pointer arg_regions unsafe.Pointer arg_cx float32 arg_cy float32 arg_cz float32 arg_Nx int arg_Ny int arg_Nz int arg_PBC byte arg_OpenBC byte argptr [19]unsafe.Pointer sync.Mutex } // Stores the arguments for adddmibulk kernel invocation var adddmibulk_args adddmibulk_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. adddmibulk_args.argptr[0] = unsafe.Pointer(&adddmibulk_args.arg_Hx) adddmibulk_args.argptr[1] = unsafe.Pointer(&adddmibulk_args.arg_Hy) adddmibulk_args.argptr[2] = unsafe.Pointer(&adddmibulk_args.arg_Hz) adddmibulk_args.argptr[3] = unsafe.Pointer(&adddmibulk_args.arg_mx) adddmibulk_args.argptr[4] = unsafe.Pointer(&adddmibulk_args.arg_my) adddmibulk_args.argptr[5] = unsafe.Pointer(&adddmibulk_args.arg_mz) adddmibulk_args.argptr[6] = unsafe.Pointer(&adddmibulk_args.arg_Ms_) adddmibulk_args.argptr[7] = unsafe.Pointer(&adddmibulk_args.arg_Ms_mul) adddmibulk_args.argptr[8] = unsafe.Pointer(&adddmibulk_args.arg_aLUT2d) adddmibulk_args.argptr[9] = unsafe.Pointer(&adddmibulk_args.arg_DLUT2d) adddmibulk_args.argptr[10] = unsafe.Pointer(&adddmibulk_args.arg_regions) adddmibulk_args.argptr[11] = unsafe.Pointer(&adddmibulk_args.arg_cx) adddmibulk_args.argptr[12] = unsafe.Pointer(&adddmibulk_args.arg_cy) adddmibulk_args.argptr[13] = unsafe.Pointer(&adddmibulk_args.arg_cz) adddmibulk_args.argptr[14] = unsafe.Pointer(&adddmibulk_args.arg_Nx) adddmibulk_args.argptr[15] = unsafe.Pointer(&adddmibulk_args.arg_Ny) adddmibulk_args.argptr[16] = 
unsafe.Pointer(&adddmibulk_args.arg_Nz) adddmibulk_args.argptr[17] = unsafe.Pointer(&adddmibulk_args.arg_PBC) adddmibulk_args.argptr[18] = unsafe.Pointer(&adddmibulk_args.arg_OpenBC) } // Wrapper for adddmibulk CUDA kernel, asynchronous. func k_adddmibulk_async(Hx unsafe.Pointer, Hy unsafe.Pointer, Hz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, aLUT2d unsafe.Pointer, DLUT2d unsafe.Pointer, regions unsafe.Pointer, cx float32, cy float32, cz float32, Nx int, Ny int, Nz int, PBC byte, OpenBC byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("adddmibulk") } adddmibulk_args.Lock() defer adddmibulk_args.Unlock() if adddmibulk_code == 0 { adddmibulk_code = fatbinLoad(adddmibulk_map, "adddmibulk") } adddmibulk_args.arg_Hx = Hx adddmibulk_args.arg_Hy = Hy adddmibulk_args.arg_Hz = Hz adddmibulk_args.arg_mx = mx adddmibulk_args.arg_my = my adddmibulk_args.arg_mz = mz adddmibulk_args.arg_Ms_ = Ms_ adddmibulk_args.arg_Ms_mul = Ms_mul adddmibulk_args.arg_aLUT2d = aLUT2d adddmibulk_args.arg_DLUT2d = DLUT2d adddmibulk_args.arg_regions = regions adddmibulk_args.arg_cx = cx adddmibulk_args.arg_cy = cy adddmibulk_args.arg_cz = cz adddmibulk_args.arg_Nx = Nx adddmibulk_args.arg_Ny = Ny adddmibulk_args.arg_Nz = Nz adddmibulk_args.arg_PBC = PBC adddmibulk_args.arg_OpenBC = OpenBC args := adddmibulk_args.argptr[:] cu.LaunchKernel(adddmibulk_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("adddmibulk") } } // maps compute capability on PTX code for adddmibulk kernel. 
var adddmibulk_map = map[int]string{0: "", 50: adddmibulk_ptx_50, 52: adddmibulk_ptx_52, 53: adddmibulk_ptx_53, 60: adddmibulk_ptx_60, 61: adddmibulk_ptx_61, 62: adddmibulk_ptx_62, 70: adddmibulk_ptx_70, 72: adddmibulk_ptx_72, 75: adddmibulk_ptx_75, 80: adddmibulk_ptx_80, 86: adddmibulk_ptx_86, 87: adddmibulk_ptx_87, 89: adddmibulk_ptx_89, 90: adddmibulk_ptx_90} // adddmibulk PTX code for various compute capabilities. const ( adddmibulk_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<66>; .reg .b16 %rs<49>; .reg .f32 %f<363>; .reg .b32 %r<111>; .reg .b64 %rd<87>; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; ld.param.u64 %rd8, [adddmibulk_param_0]; ld.param.u64 %rd9, [adddmibulk_param_1]; ld.param.u64 %rd10, [adddmibulk_param_2]; ld.param.u64 %rd12, [adddmibulk_param_3]; ld.param.u64 %rd13, [adddmibulk_param_4]; ld.param.u64 %rd14, [adddmibulk_param_5]; ld.param.u64 %rd11, [adddmibulk_param_6]; ld.param.f32 %f361, [adddmibulk_param_7]; ld.param.u64 %rd15, [adddmibulk_param_8]; ld.param.u64 %rd16, [adddmibulk_param_9]; ld.param.u64 %rd17, [adddmibulk_param_10]; ld.param.f32 %f180, [adddmibulk_param_11]; ld.param.f32 %f181, [adddmibulk_param_12]; ld.param.f32 %f182, [adddmibulk_param_13]; ld.param.u32 %r36, [adddmibulk_param_14]; ld.param.u32 %r37, 
[adddmibulk_param_15]; ld.param.u32 %r38, [adddmibulk_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r40, %r39, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r43, %r42, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r46, %r45, %r47; setp.ge.s32 %p1, %r1, %r36; setp.ge.s32 %p2, %r2, %r37; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_74; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r49, %r5, %r1; cvt.s64.s32 %rd7, %r49; mul.wide.s32 %rd18, %r49, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; mul.f32 %f185, %f5, %f5; fma.rn.f32 %f186, %f1, %f1, %f185; ld.global.nc.f32 %f6, [%rd21]; fma.rn.f32 %f187, %f6, %f6, %f186; setp.eq.f32 %p6, %f187, 0f00000000; @%p6 bra $L__BB0_74; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r50, %r6, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r105, %r51, %r36; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f312, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f311, %f312; mov.f32 %f310, %f312; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f310, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f311, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f312, [%rd26]; $L__BB0_7: mul.f32 %f191, %f310, %f310; fma.rn.f32 %f192, %f311, %f311, %f191; fma.rn.f32 %f13, 
%f312, %f312, %f192; setp.eq.f32 %p11, %f13, 0f00000000; mov.u16 %rs43, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs43, [%rd28]; $L__BB0_9: min.u16 %rs21, %rs43, %rs1; cvt.u32.u16 %r52, %rs21; max.u16 %rs22, %rs43, %rs1; cvt.u32.u16 %r53, %rs22; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd29, %r57, 4; add.s64 %rd30, %rd2, %rd29; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f196, [%rd30]; add.f32 %f14, %f196, %f196; ld.global.nc.f32 %f15, [%rd31]; setp.ne.s16 %p12, %rs18, 0; mov.f32 %f322, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f323, %f322; mov.f32 %f324, %f322; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f13, 0f00000000; @%p15 bra $L__BB0_12; div.rn.f32 %f197, %f15, %f14; mul.f32 %f198, %f197, %f180; fma.rn.f32 %f311, %f6, %f198, %f5; mul.f32 %f199, %f5, %f198; sub.f32 %f312, %f6, %f199; mov.f32 %f310, %f1; $L__BB0_12: mul.f32 %f200, %f180, %f180; div.rn.f32 %f201, %f14, %f200; sub.f32 %f202, %f310, %f1; sub.f32 %f203, %f311, %f5; sub.f32 %f204, %f312, %f6; fma.rn.f32 %f322, %f201, %f202, 0f00000000; fma.rn.f32 %f205, %f201, %f203, 0f00000000; fma.rn.f32 %f206, %f201, %f204, 0f00000000; div.rn.f32 %f207, %f15, %f180; mul.f32 %f208, %f207, %f312; sub.f32 %f323, %f205, %f208; fma.rn.f32 %f324, %f207, %f311, %f206; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r60, %r36, -1; min.s32 %r106, %r11, %r60; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r58, %r11, %r36; add.s32 %r59, %r58, %r36; rem.s32 %r106, %r59, %r36; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r36; mov.f32 %f321, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f320, %f321; mov.f32 %f319, %f321; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f319, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f320, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 
%f321, [%rd35]; $L__BB0_18: mul.f32 %f212, %f319, %f319; fma.rn.f32 %f213, %f320, %f320, %f212; fma.rn.f32 %f41, %f321, %f321, %f213; setp.eq.f32 %p20, %f41, 0f00000000; mov.u16 %rs44, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs44, [%rd37]; $L__BB0_20: min.u16 %rs25, %rs44, %rs1; cvt.u32.u16 %r61, %rs25; max.u16 %rs26, %rs44, %rs1; cvt.u32.u16 %r62, %rs26; add.s32 %r63, %r62, 1; mul.lo.s32 %r64, %r63, %r62; shr.u32 %r65, %r64, 1; add.s32 %r66, %r65, %r61; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd2, %rd38; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f214, [%rd39]; add.f32 %f42, %f214, %f214; ld.global.nc.f32 %f43, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f41, 0f00000000; @%p24 bra $L__BB0_23; div.rn.f32 %f215, %f43, %f42; mul.f32 %f216, %f215, %f180; mul.f32 %f217, %f6, %f216; sub.f32 %f320, %f5, %f217; fma.rn.f32 %f321, %f5, %f216, %f6; mov.f32 %f319, %f1; $L__BB0_23: mul.f32 %f218, %f180, %f180; div.rn.f32 %f219, %f42, %f218; sub.f32 %f220, %f319, %f1; sub.f32 %f221, %f320, %f5; sub.f32 %f222, %f321, %f6; fma.rn.f32 %f322, %f219, %f220, %f322; fma.rn.f32 %f223, %f219, %f221, %f323; fma.rn.f32 %f224, %f219, %f222, %f324; div.rn.f32 %f225, %f43, %f180; fma.rn.f32 %f323, %f225, %f321, %f223; mul.f32 %f226, %f225, %f320; sub.f32 %f324, %f224, %f226; $L__BB0_24: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r67, %r16, %r37; add.s32 %r68, %r67, %r37; rem.s32 %r107, %r68, %r37; $L__BB0_27: add.s32 %r69, %r107, %r4; mad.lo.s32 %r20, %r69, %r36, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f330, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f329, %f330; mov.f32 %f328, %f330; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f328, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 
%f329, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f330, [%rd44]; $L__BB0_29: mul.f32 %f230, %f328, %f328; fma.rn.f32 %f231, %f329, %f329, %f230; fma.rn.f32 %f69, %f330, %f330, %f231; setp.eq.f32 %p29, %f69, 0f00000000; mov.u16 %rs45, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs45, [%rd46]; $L__BB0_31: min.u16 %rs29, %rs45, %rs1; cvt.u32.u16 %r70, %rs29; max.u16 %rs30, %rs45, %rs1; cvt.u32.u16 %r71, %rs30; add.s32 %r72, %r71, 1; mul.lo.s32 %r73, %r72, %r71; shr.u32 %r74, %r73, 1; add.s32 %r75, %r74, %r70; mul.wide.s32 %rd47, %r75, 4; add.s64 %rd48, %rd2, %rd47; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f232, [%rd48]; add.f32 %f70, %f232, %f232; ld.global.nc.f32 %f71, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f69, 0f00000000; @%p33 bra $L__BB0_34; div.rn.f32 %f233, %f71, %f70; mul.f32 %f234, %f233, %f181; mul.f32 %f235, %f6, %f234; sub.f32 %f328, %f1, %f235; fma.rn.f32 %f330, %f1, %f234, %f6; mov.f32 %f329, %f5; $L__BB0_34: mul.f32 %f236, %f181, %f181; div.rn.f32 %f237, %f70, %f236; sub.f32 %f238, %f328, %f1; sub.f32 %f239, %f329, %f5; sub.f32 %f240, %f330, %f6; fma.rn.f32 %f241, %f237, %f238, %f322; fma.rn.f32 %f323, %f237, %f239, %f323; fma.rn.f32 %f242, %f237, %f240, %f324; div.rn.f32 %f243, %f71, %f181; fma.rn.f32 %f322, %f243, %f330, %f241; mul.f32 %f244, %f243, %f328; sub.f32 %f324, %f242, %f244; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r78, %r37, -1; min.s32 %r108, %r21, %r78; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r76, %r21, %r37; add.s32 %r77, %r76, %r37; rem.s32 %r108, %r77, %r37; $L__BB0_38: add.s32 %r79, %r108, %r4; mad.lo.s32 %r25, %r79, %r36, %r1; setp.ge.s32 %p35, %r21, %r37; mov.f32 %f339, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f338, %f339; mov.f32 %f337, %f339; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f337, [%rd51]; add.s64 
%rd52, %rd5, %rd50; ld.global.nc.f32 %f338, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f339, [%rd53]; $L__BB0_40: mul.f32 %f248, %f337, %f337; fma.rn.f32 %f249, %f338, %f338, %f248; fma.rn.f32 %f97, %f339, %f339, %f249; setp.eq.f32 %p38, %f97, 0f00000000; mov.u16 %rs46, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs46, [%rd55]; $L__BB0_42: min.u16 %rs33, %rs46, %rs1; cvt.u32.u16 %r80, %rs33; max.u16 %rs34, %rs46, %rs1; cvt.u32.u16 %r81, %rs34; add.s32 %r82, %r81, 1; mul.lo.s32 %r83, %r82, %r81; shr.u32 %r84, %r83, 1; add.s32 %r85, %r84, %r80; mul.wide.s32 %rd56, %r85, 4; add.s64 %rd57, %rd2, %rd56; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f250, [%rd57]; add.f32 %f98, %f250, %f250; ld.global.nc.f32 %f99, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f97, 0f00000000; @%p42 bra $L__BB0_45; div.rn.f32 %f251, %f99, %f98; mul.f32 %f252, %f251, %f181; fma.rn.f32 %f337, %f6, %f252, %f1; mul.f32 %f253, %f1, %f252; sub.f32 %f339, %f6, %f253; mov.f32 %f338, %f5; $L__BB0_45: mul.f32 %f254, %f181, %f181; div.rn.f32 %f255, %f98, %f254; sub.f32 %f256, %f337, %f1; sub.f32 %f257, %f338, %f5; sub.f32 %f258, %f339, %f6; fma.rn.f32 %f259, %f255, %f256, %f322; fma.rn.f32 %f323, %f255, %f257, %f323; fma.rn.f32 %f260, %f255, %f258, %f324; div.rn.f32 %f261, %f99, %f181; mul.f32 %f262, %f261, %f339; sub.f32 %f322, %f259, %f262; fma.rn.f32 %f324, %f261, %f337, %f260; $L__BB0_46: setp.eq.s32 %p43, %r38, 1; and.pred %p45, %p43, %p12; @%p45 bra $L__BB0_69; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p46, %rs12, 0; add.s32 %r26, %r3, -1; @%p46 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r86, %r26, %r38; add.s32 %r87, %r86, %r38; rem.s32 %r109, %r87, %r38; $L__BB0_50: mad.lo.s32 %r88, %r109, %r37, %r2; mad.lo.s32 %r30, %r88, %r36, %r1; setp.lt.s32 %p48, %r3, 1; mov.f32 %f348, 0f00000000; and.pred %p49, %p48, %p46; mov.f32 %f347, 
%f348; mov.f32 %f346, %f348; @%p49 bra $L__BB0_52; mul.wide.s32 %rd59, %r30, 4; add.s64 %rd60, %rd6, %rd59; ld.global.nc.f32 %f346, [%rd60]; add.s64 %rd61, %rd5, %rd59; ld.global.nc.f32 %f347, [%rd61]; add.s64 %rd62, %rd4, %rd59; ld.global.nc.f32 %f348, [%rd62]; $L__BB0_52: mul.f32 %f266, %f346, %f346; fma.rn.f32 %f267, %f347, %f347, %f266; fma.rn.f32 %f125, %f348, %f348, %f267; setp.eq.f32 %p50, %f125, 0f00000000; mov.u16 %rs47, %rs1; @%p50 bra $L__BB0_54; cvt.s64.s32 %rd63, %r30; add.s64 %rd64, %rd3, %rd63; ld.global.nc.u8 %rs47, [%rd64]; $L__BB0_54: min.u16 %rs37, %rs47, %rs1; cvt.u32.u16 %r89, %rs37; max.u16 %rs38, %rs47, %rs1; cvt.u32.u16 %r90, %rs38; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; add.s32 %r94, %r93, %r89; mul.wide.s32 %rd65, %r94, 4; add.s64 %rd66, %rd2, %rd65; add.s64 %rd67, %rd1, %rd65; ld.global.nc.f32 %f268, [%rd66]; add.f32 %f126, %f268, %f268; ld.global.nc.f32 %f127, [%rd67]; and.pred %p53, %p12, %p50; @%p53 bra $L__BB0_58; setp.neu.f32 %p54, %f125, 0f00000000; @%p54 bra $L__BB0_57; div.rn.f32 %f269, %f127, %f126; mul.f32 %f270, %f269, %f182; fma.rn.f32 %f346, %f5, %f270, %f1; mul.f32 %f271, %f1, %f270; sub.f32 %f347, %f5, %f271; mov.f32 %f348, %f6; $L__BB0_57: mul.f32 %f272, %f182, %f182; div.rn.f32 %f273, %f126, %f272; sub.f32 %f274, %f346, %f1; sub.f32 %f275, %f347, %f5; sub.f32 %f276, %f348, %f6; fma.rn.f32 %f277, %f273, %f274, %f322; fma.rn.f32 %f278, %f273, %f275, %f323; fma.rn.f32 %f324, %f273, %f276, %f324; div.rn.f32 %f279, %f127, %f182; mul.f32 %f280, %f279, %f347; sub.f32 %f322, %f277, %f280; fma.rn.f32 %f323, %f279, %f346, %f278; $L__BB0_58: add.s32 %r31, %r3, 1; @%p46 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r97, %r38, -1; min.s32 %r110, %r31, %r97; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r95, %r31, %r38; add.s32 %r96, %r95, %r38; rem.s32 %r110, %r96, %r38; $L__BB0_61: mad.lo.s32 %r98, %r110, %r37, %r2; mad.lo.s32 %r35, %r98, %r36, %r1; setp.ge.s32 %p56, %r31, %r38; mov.f32 
%f357, 0f00000000; and.pred %p58, %p56, %p46; mov.f32 %f356, %f357; mov.f32 %f355, %f357; @%p58 bra $L__BB0_63; mul.wide.s32 %rd68, %r35, 4; add.s64 %rd69, %rd6, %rd68; ld.global.nc.f32 %f355, [%rd69]; add.s64 %rd70, %rd5, %rd68; ld.global.nc.f32 %f356, [%rd70]; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f357, [%rd71]; $L__BB0_63: mul.f32 %f284, %f355, %f355; fma.rn.f32 %f285, %f356, %f356, %f284; fma.rn.f32 %f153, %f357, %f357, %f285; setp.eq.f32 %p59, %f153, 0f00000000; mov.u16 %rs48, %rs1; @%p59 bra $L__BB0_65; cvt.s64.s32 %rd72, %r35; add.s64 %rd73, %rd3, %rd72; ld.global.nc.u8 %rs48, [%rd73]; $L__BB0_65: min.u16 %rs41, %rs48, %rs1; cvt.u32.u16 %r99, %rs41; max.u16 %rs42, %rs48, %rs1; cvt.u32.u16 %r100, %rs42; add.s32 %r101, %r100, 1; mul.lo.s32 %r102, %r101, %r100; shr.u32 %r103, %r102, 1; add.s32 %r104, %r103, %r99; mul.wide.s32 %rd74, %r104, 4; add.s64 %rd75, %rd2, %rd74; add.s64 %rd76, %rd1, %rd74; ld.global.nc.f32 %f286, [%rd75]; add.f32 %f154, %f286, %f286; ld.global.nc.f32 %f155, [%rd76]; and.pred %p62, %p12, %p59; @%p62 bra $L__BB0_69; setp.neu.f32 %p63, %f153, 0f00000000; @%p63 bra $L__BB0_68; div.rn.f32 %f287, %f155, %f154; mul.f32 %f288, %f287, %f182; mul.f32 %f289, %f5, %f288; sub.f32 %f355, %f1, %f289; fma.rn.f32 %f356, %f1, %f288, %f5; mov.f32 %f357, %f6; $L__BB0_68: mul.f32 %f290, %f182, %f182; div.rn.f32 %f291, %f154, %f290; sub.f32 %f292, %f355, %f1; sub.f32 %f293, %f356, %f5; sub.f32 %f294, %f357, %f6; fma.rn.f32 %f295, %f291, %f292, %f322; fma.rn.f32 %f296, %f291, %f293, %f323; fma.rn.f32 %f324, %f291, %f294, %f324; div.rn.f32 %f297, %f155, %f182; fma.rn.f32 %f322, %f297, %f356, %f295; mul.f32 %f298, %f297, %f355; sub.f32 %f323, %f296, %f298; $L__BB0_69: setp.eq.s64 %p64, %rd11, 0; @%p64 bra $L__BB0_71; cvta.to.global.u64 %rd77, %rd11; shl.b64 %rd78, %rd7, 2; add.s64 %rd79, %rd77, %rd78; ld.global.nc.f32 %f299, [%rd79]; mul.f32 %f361, %f299, %f361; $L__BB0_71: setp.eq.f32 %p65, %f361, 0f00000000; mov.f32 %f362, 0f00000000; @%p65 bra 
$L__BB0_73; rcp.rn.f32 %f362, %f361; $L__BB0_73: cvta.to.global.u64 %rd80, %rd8; shl.b64 %rd81, %rd7, 2; add.s64 %rd82, %rd80, %rd81; ld.global.f32 %f301, [%rd82]; fma.rn.f32 %f302, %f322, %f362, %f301; st.global.f32 [%rd82], %f302; cvta.to.global.u64 %rd83, %rd9; add.s64 %rd84, %rd83, %rd81; ld.global.f32 %f303, [%rd84]; fma.rn.f32 %f304, %f323, %f362, %f303; st.global.f32 [%rd84], %f304; cvta.to.global.u64 %rd85, %rd10; add.s64 %rd86, %rd85, %rd81; ld.global.f32 %f305, [%rd86]; fma.rn.f32 %f306, %f324, %f362, %f305; st.global.f32 [%rd86], %f306; $L__BB0_74: ret; } ` adddmibulk_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<66>; .reg .b16 %rs<49>; .reg .f32 %f<363>; .reg .b32 %r<111>; .reg .b64 %rd<87>; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; ld.param.u64 %rd8, [adddmibulk_param_0]; ld.param.u64 %rd9, [adddmibulk_param_1]; ld.param.u64 %rd10, [adddmibulk_param_2]; ld.param.u64 %rd12, [adddmibulk_param_3]; ld.param.u64 %rd13, [adddmibulk_param_4]; ld.param.u64 %rd14, [adddmibulk_param_5]; ld.param.u64 %rd11, [adddmibulk_param_6]; ld.param.f32 %f361, [adddmibulk_param_7]; ld.param.u64 %rd15, [adddmibulk_param_8]; ld.param.u64 %rd16, [adddmibulk_param_9]; ld.param.u64 %rd17, [adddmibulk_param_10]; ld.param.f32 %f180, [adddmibulk_param_11]; 
ld.param.f32 %f181, [adddmibulk_param_12]; ld.param.f32 %f182, [adddmibulk_param_13]; ld.param.u32 %r36, [adddmibulk_param_14]; ld.param.u32 %r37, [adddmibulk_param_15]; ld.param.u32 %r38, [adddmibulk_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r40, %r39, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r43, %r42, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r46, %r45, %r47; setp.ge.s32 %p1, %r1, %r36; setp.ge.s32 %p2, %r2, %r37; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_74; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r49, %r5, %r1; cvt.s64.s32 %rd7, %r49; mul.wide.s32 %rd18, %r49, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; mul.f32 %f185, %f5, %f5; fma.rn.f32 %f186, %f1, %f1, %f185; ld.global.nc.f32 %f6, [%rd21]; fma.rn.f32 %f187, %f6, %f6, %f186; setp.eq.f32 %p6, %f187, 0f00000000; @%p6 bra $L__BB0_74; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r50, %r6, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r105, %r51, %r36; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f312, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f311, %f312; mov.f32 %f310, %f312; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f310, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f311, [%rd25]; add.s64 
%rd26, %rd4, %rd23; ld.global.nc.f32 %f312, [%rd26]; $L__BB0_7: mul.f32 %f191, %f310, %f310; fma.rn.f32 %f192, %f311, %f311, %f191; fma.rn.f32 %f13, %f312, %f312, %f192; setp.eq.f32 %p11, %f13, 0f00000000; mov.u16 %rs43, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs43, [%rd28]; $L__BB0_9: min.u16 %rs21, %rs43, %rs1; cvt.u32.u16 %r52, %rs21; max.u16 %rs22, %rs43, %rs1; cvt.u32.u16 %r53, %rs22; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd29, %r57, 4; add.s64 %rd30, %rd2, %rd29; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f196, [%rd30]; add.f32 %f14, %f196, %f196; ld.global.nc.f32 %f15, [%rd31]; setp.ne.s16 %p12, %rs18, 0; mov.f32 %f322, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f323, %f322; mov.f32 %f324, %f322; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f13, 0f00000000; @%p15 bra $L__BB0_12; div.rn.f32 %f197, %f15, %f14; mul.f32 %f198, %f197, %f180; fma.rn.f32 %f311, %f6, %f198, %f5; mul.f32 %f199, %f5, %f198; sub.f32 %f312, %f6, %f199; mov.f32 %f310, %f1; $L__BB0_12: mul.f32 %f200, %f180, %f180; div.rn.f32 %f201, %f14, %f200; sub.f32 %f202, %f310, %f1; sub.f32 %f203, %f311, %f5; sub.f32 %f204, %f312, %f6; fma.rn.f32 %f322, %f201, %f202, 0f00000000; fma.rn.f32 %f205, %f201, %f203, 0f00000000; fma.rn.f32 %f206, %f201, %f204, 0f00000000; div.rn.f32 %f207, %f15, %f180; mul.f32 %f208, %f207, %f312; sub.f32 %f323, %f205, %f208; fma.rn.f32 %f324, %f207, %f311, %f206; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r60, %r36, -1; min.s32 %r106, %r11, %r60; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r58, %r11, %r36; add.s32 %r59, %r58, %r36; rem.s32 %r106, %r59, %r36; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r36; mov.f32 %f321, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f320, %f321; mov.f32 %f319, %f321; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, 
%rd6, %rd32; ld.global.nc.f32 %f319, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f320, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f321, [%rd35]; $L__BB0_18: mul.f32 %f212, %f319, %f319; fma.rn.f32 %f213, %f320, %f320, %f212; fma.rn.f32 %f41, %f321, %f321, %f213; setp.eq.f32 %p20, %f41, 0f00000000; mov.u16 %rs44, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs44, [%rd37]; $L__BB0_20: min.u16 %rs25, %rs44, %rs1; cvt.u32.u16 %r61, %rs25; max.u16 %rs26, %rs44, %rs1; cvt.u32.u16 %r62, %rs26; add.s32 %r63, %r62, 1; mul.lo.s32 %r64, %r63, %r62; shr.u32 %r65, %r64, 1; add.s32 %r66, %r65, %r61; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd2, %rd38; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f214, [%rd39]; add.f32 %f42, %f214, %f214; ld.global.nc.f32 %f43, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f41, 0f00000000; @%p24 bra $L__BB0_23; div.rn.f32 %f215, %f43, %f42; mul.f32 %f216, %f215, %f180; mul.f32 %f217, %f6, %f216; sub.f32 %f320, %f5, %f217; fma.rn.f32 %f321, %f5, %f216, %f6; mov.f32 %f319, %f1; $L__BB0_23: mul.f32 %f218, %f180, %f180; div.rn.f32 %f219, %f42, %f218; sub.f32 %f220, %f319, %f1; sub.f32 %f221, %f320, %f5; sub.f32 %f222, %f321, %f6; fma.rn.f32 %f322, %f219, %f220, %f322; fma.rn.f32 %f223, %f219, %f221, %f323; fma.rn.f32 %f224, %f219, %f222, %f324; div.rn.f32 %f225, %f43, %f180; fma.rn.f32 %f323, %f225, %f321, %f223; mul.f32 %f226, %f225, %f320; sub.f32 %f324, %f224, %f226; $L__BB0_24: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r67, %r16, %r37; add.s32 %r68, %r67, %r37; rem.s32 %r107, %r68, %r37; $L__BB0_27: add.s32 %r69, %r107, %r4; mad.lo.s32 %r20, %r69, %r36, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f330, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f329, %f330; mov.f32 %f328, %f330; @%p28 
bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f328, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f329, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f330, [%rd44]; $L__BB0_29: mul.f32 %f230, %f328, %f328; fma.rn.f32 %f231, %f329, %f329, %f230; fma.rn.f32 %f69, %f330, %f330, %f231; setp.eq.f32 %p29, %f69, 0f00000000; mov.u16 %rs45, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs45, [%rd46]; $L__BB0_31: min.u16 %rs29, %rs45, %rs1; cvt.u32.u16 %r70, %rs29; max.u16 %rs30, %rs45, %rs1; cvt.u32.u16 %r71, %rs30; add.s32 %r72, %r71, 1; mul.lo.s32 %r73, %r72, %r71; shr.u32 %r74, %r73, 1; add.s32 %r75, %r74, %r70; mul.wide.s32 %rd47, %r75, 4; add.s64 %rd48, %rd2, %rd47; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f232, [%rd48]; add.f32 %f70, %f232, %f232; ld.global.nc.f32 %f71, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f69, 0f00000000; @%p33 bra $L__BB0_34; div.rn.f32 %f233, %f71, %f70; mul.f32 %f234, %f233, %f181; mul.f32 %f235, %f6, %f234; sub.f32 %f328, %f1, %f235; fma.rn.f32 %f330, %f1, %f234, %f6; mov.f32 %f329, %f5; $L__BB0_34: mul.f32 %f236, %f181, %f181; div.rn.f32 %f237, %f70, %f236; sub.f32 %f238, %f328, %f1; sub.f32 %f239, %f329, %f5; sub.f32 %f240, %f330, %f6; fma.rn.f32 %f241, %f237, %f238, %f322; fma.rn.f32 %f323, %f237, %f239, %f323; fma.rn.f32 %f242, %f237, %f240, %f324; div.rn.f32 %f243, %f71, %f181; fma.rn.f32 %f322, %f243, %f330, %f241; mul.f32 %f244, %f243, %f328; sub.f32 %f324, %f242, %f244; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r78, %r37, -1; min.s32 %r108, %r21, %r78; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r76, %r21, %r37; add.s32 %r77, %r76, %r37; rem.s32 %r108, %r77, %r37; $L__BB0_38: add.s32 %r79, %r108, %r4; mad.lo.s32 %r25, %r79, %r36, %r1; setp.ge.s32 %p35, %r21, %r37; mov.f32 %f339, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 
%f338, %f339; mov.f32 %f337, %f339; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f337, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f338, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f339, [%rd53]; $L__BB0_40: mul.f32 %f248, %f337, %f337; fma.rn.f32 %f249, %f338, %f338, %f248; fma.rn.f32 %f97, %f339, %f339, %f249; setp.eq.f32 %p38, %f97, 0f00000000; mov.u16 %rs46, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs46, [%rd55]; $L__BB0_42: min.u16 %rs33, %rs46, %rs1; cvt.u32.u16 %r80, %rs33; max.u16 %rs34, %rs46, %rs1; cvt.u32.u16 %r81, %rs34; add.s32 %r82, %r81, 1; mul.lo.s32 %r83, %r82, %r81; shr.u32 %r84, %r83, 1; add.s32 %r85, %r84, %r80; mul.wide.s32 %rd56, %r85, 4; add.s64 %rd57, %rd2, %rd56; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f250, [%rd57]; add.f32 %f98, %f250, %f250; ld.global.nc.f32 %f99, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f97, 0f00000000; @%p42 bra $L__BB0_45; div.rn.f32 %f251, %f99, %f98; mul.f32 %f252, %f251, %f181; fma.rn.f32 %f337, %f6, %f252, %f1; mul.f32 %f253, %f1, %f252; sub.f32 %f339, %f6, %f253; mov.f32 %f338, %f5; $L__BB0_45: mul.f32 %f254, %f181, %f181; div.rn.f32 %f255, %f98, %f254; sub.f32 %f256, %f337, %f1; sub.f32 %f257, %f338, %f5; sub.f32 %f258, %f339, %f6; fma.rn.f32 %f259, %f255, %f256, %f322; fma.rn.f32 %f323, %f255, %f257, %f323; fma.rn.f32 %f260, %f255, %f258, %f324; div.rn.f32 %f261, %f99, %f181; mul.f32 %f262, %f261, %f339; sub.f32 %f322, %f259, %f262; fma.rn.f32 %f324, %f261, %f337, %f260; $L__BB0_46: setp.eq.s32 %p43, %r38, 1; and.pred %p45, %p43, %p12; @%p45 bra $L__BB0_69; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p46, %rs12, 0; add.s32 %r26, %r3, -1; @%p46 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r86, %r26, %r38; add.s32 %r87, %r86, %r38; rem.s32 %r109, %r87, %r38; $L__BB0_50: mad.lo.s32 
%r88, %r109, %r37, %r2; mad.lo.s32 %r30, %r88, %r36, %r1; setp.lt.s32 %p48, %r3, 1; mov.f32 %f348, 0f00000000; and.pred %p49, %p48, %p46; mov.f32 %f347, %f348; mov.f32 %f346, %f348; @%p49 bra $L__BB0_52; mul.wide.s32 %rd59, %r30, 4; add.s64 %rd60, %rd6, %rd59; ld.global.nc.f32 %f346, [%rd60]; add.s64 %rd61, %rd5, %rd59; ld.global.nc.f32 %f347, [%rd61]; add.s64 %rd62, %rd4, %rd59; ld.global.nc.f32 %f348, [%rd62]; $L__BB0_52: mul.f32 %f266, %f346, %f346; fma.rn.f32 %f267, %f347, %f347, %f266; fma.rn.f32 %f125, %f348, %f348, %f267; setp.eq.f32 %p50, %f125, 0f00000000; mov.u16 %rs47, %rs1; @%p50 bra $L__BB0_54; cvt.s64.s32 %rd63, %r30; add.s64 %rd64, %rd3, %rd63; ld.global.nc.u8 %rs47, [%rd64]; $L__BB0_54: min.u16 %rs37, %rs47, %rs1; cvt.u32.u16 %r89, %rs37; max.u16 %rs38, %rs47, %rs1; cvt.u32.u16 %r90, %rs38; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; add.s32 %r94, %r93, %r89; mul.wide.s32 %rd65, %r94, 4; add.s64 %rd66, %rd2, %rd65; add.s64 %rd67, %rd1, %rd65; ld.global.nc.f32 %f268, [%rd66]; add.f32 %f126, %f268, %f268; ld.global.nc.f32 %f127, [%rd67]; and.pred %p53, %p12, %p50; @%p53 bra $L__BB0_58; setp.neu.f32 %p54, %f125, 0f00000000; @%p54 bra $L__BB0_57; div.rn.f32 %f269, %f127, %f126; mul.f32 %f270, %f269, %f182; fma.rn.f32 %f346, %f5, %f270, %f1; mul.f32 %f271, %f1, %f270; sub.f32 %f347, %f5, %f271; mov.f32 %f348, %f6; $L__BB0_57: mul.f32 %f272, %f182, %f182; div.rn.f32 %f273, %f126, %f272; sub.f32 %f274, %f346, %f1; sub.f32 %f275, %f347, %f5; sub.f32 %f276, %f348, %f6; fma.rn.f32 %f277, %f273, %f274, %f322; fma.rn.f32 %f278, %f273, %f275, %f323; fma.rn.f32 %f324, %f273, %f276, %f324; div.rn.f32 %f279, %f127, %f182; mul.f32 %f280, %f279, %f347; sub.f32 %f322, %f277, %f280; fma.rn.f32 %f323, %f279, %f346, %f278; $L__BB0_58: add.s32 %r31, %r3, 1; @%p46 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r97, %r38, -1; min.s32 %r110, %r31, %r97; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r95, %r31, %r38; add.s32 %r96, %r95, 
%r38; rem.s32 %r110, %r96, %r38; $L__BB0_61: mad.lo.s32 %r98, %r110, %r37, %r2; mad.lo.s32 %r35, %r98, %r36, %r1; setp.ge.s32 %p56, %r31, %r38; mov.f32 %f357, 0f00000000; and.pred %p58, %p56, %p46; mov.f32 %f356, %f357; mov.f32 %f355, %f357; @%p58 bra $L__BB0_63; mul.wide.s32 %rd68, %r35, 4; add.s64 %rd69, %rd6, %rd68; ld.global.nc.f32 %f355, [%rd69]; add.s64 %rd70, %rd5, %rd68; ld.global.nc.f32 %f356, [%rd70]; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f357, [%rd71]; $L__BB0_63: mul.f32 %f284, %f355, %f355; fma.rn.f32 %f285, %f356, %f356, %f284; fma.rn.f32 %f153, %f357, %f357, %f285; setp.eq.f32 %p59, %f153, 0f00000000; mov.u16 %rs48, %rs1; @%p59 bra $L__BB0_65; cvt.s64.s32 %rd72, %r35; add.s64 %rd73, %rd3, %rd72; ld.global.nc.u8 %rs48, [%rd73]; $L__BB0_65: min.u16 %rs41, %rs48, %rs1; cvt.u32.u16 %r99, %rs41; max.u16 %rs42, %rs48, %rs1; cvt.u32.u16 %r100, %rs42; add.s32 %r101, %r100, 1; mul.lo.s32 %r102, %r101, %r100; shr.u32 %r103, %r102, 1; add.s32 %r104, %r103, %r99; mul.wide.s32 %rd74, %r104, 4; add.s64 %rd75, %rd2, %rd74; add.s64 %rd76, %rd1, %rd74; ld.global.nc.f32 %f286, [%rd75]; add.f32 %f154, %f286, %f286; ld.global.nc.f32 %f155, [%rd76]; and.pred %p62, %p12, %p59; @%p62 bra $L__BB0_69; setp.neu.f32 %p63, %f153, 0f00000000; @%p63 bra $L__BB0_68; div.rn.f32 %f287, %f155, %f154; mul.f32 %f288, %f287, %f182; mul.f32 %f289, %f5, %f288; sub.f32 %f355, %f1, %f289; fma.rn.f32 %f356, %f1, %f288, %f5; mov.f32 %f357, %f6; $L__BB0_68: mul.f32 %f290, %f182, %f182; div.rn.f32 %f291, %f154, %f290; sub.f32 %f292, %f355, %f1; sub.f32 %f293, %f356, %f5; sub.f32 %f294, %f357, %f6; fma.rn.f32 %f295, %f291, %f292, %f322; fma.rn.f32 %f296, %f291, %f293, %f323; fma.rn.f32 %f324, %f291, %f294, %f324; div.rn.f32 %f297, %f155, %f182; fma.rn.f32 %f322, %f297, %f356, %f295; mul.f32 %f298, %f297, %f355; sub.f32 %f323, %f296, %f298; $L__BB0_69: setp.eq.s64 %p64, %rd11, 0; @%p64 bra $L__BB0_71; cvta.to.global.u64 %rd77, %rd11; shl.b64 %rd78, %rd7, 2; add.s64 %rd79, %rd77, %rd78; 
ld.global.nc.f32 %f299, [%rd79]; mul.f32 %f361, %f299, %f361; $L__BB0_71: setp.eq.f32 %p65, %f361, 0f00000000; mov.f32 %f362, 0f00000000; @%p65 bra $L__BB0_73; rcp.rn.f32 %f362, %f361; $L__BB0_73: cvta.to.global.u64 %rd80, %rd8; shl.b64 %rd81, %rd7, 2; add.s64 %rd82, %rd80, %rd81; ld.global.f32 %f301, [%rd82]; fma.rn.f32 %f302, %f322, %f362, %f301; st.global.f32 [%rd82], %f302; cvta.to.global.u64 %rd83, %rd9; add.s64 %rd84, %rd83, %rd81; ld.global.f32 %f303, [%rd84]; fma.rn.f32 %f304, %f323, %f362, %f303; st.global.f32 [%rd84], %f304; cvta.to.global.u64 %rd85, %rd10; add.s64 %rd86, %rd85, %rd81; ld.global.f32 %f305, [%rd86]; fma.rn.f32 %f306, %f324, %f362, %f305; st.global.f32 [%rd86], %f306; $L__BB0_74: ret; } ` adddmibulk_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<66>; .reg .b16 %rs<49>; .reg .f32 %f<363>; .reg .b32 %r<111>; .reg .b64 %rd<87>; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; ld.param.u64 %rd8, [adddmibulk_param_0]; ld.param.u64 %rd9, [adddmibulk_param_1]; ld.param.u64 %rd10, [adddmibulk_param_2]; ld.param.u64 %rd12, [adddmibulk_param_3]; ld.param.u64 %rd13, [adddmibulk_param_4]; ld.param.u64 %rd14, [adddmibulk_param_5]; ld.param.u64 %rd11, [adddmibulk_param_6]; ld.param.f32 %f361, [adddmibulk_param_7]; ld.param.u64 %rd15, 
[adddmibulk_param_8]; ld.param.u64 %rd16, [adddmibulk_param_9]; ld.param.u64 %rd17, [adddmibulk_param_10]; ld.param.f32 %f180, [adddmibulk_param_11]; ld.param.f32 %f181, [adddmibulk_param_12]; ld.param.f32 %f182, [adddmibulk_param_13]; ld.param.u32 %r36, [adddmibulk_param_14]; ld.param.u32 %r37, [adddmibulk_param_15]; ld.param.u32 %r38, [adddmibulk_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r40, %r39, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r43, %r42, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r46, %r45, %r47; setp.ge.s32 %p1, %r1, %r36; setp.ge.s32 %p2, %r2, %r37; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_74; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r49, %r5, %r1; cvt.s64.s32 %rd7, %r49; mul.wide.s32 %rd18, %r49, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; mul.f32 %f185, %f5, %f5; fma.rn.f32 %f186, %f1, %f1, %f185; ld.global.nc.f32 %f6, [%rd21]; fma.rn.f32 %f187, %f6, %f6, %f186; setp.eq.f32 %p6, %f187, 0f00000000; @%p6 bra $L__BB0_74; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r50, %r6, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r105, %r51, %r36; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f312, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f311, %f312; mov.f32 %f310, %f312; @%p10 bra $L__BB0_7; 
mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f310, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f311, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f312, [%rd26]; $L__BB0_7: mul.f32 %f191, %f310, %f310; fma.rn.f32 %f192, %f311, %f311, %f191; fma.rn.f32 %f13, %f312, %f312, %f192; setp.eq.f32 %p11, %f13, 0f00000000; mov.u16 %rs43, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs43, [%rd28]; $L__BB0_9: min.u16 %rs21, %rs43, %rs1; cvt.u32.u16 %r52, %rs21; max.u16 %rs22, %rs43, %rs1; cvt.u32.u16 %r53, %rs22; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd29, %r57, 4; add.s64 %rd30, %rd2, %rd29; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f196, [%rd30]; add.f32 %f14, %f196, %f196; ld.global.nc.f32 %f15, [%rd31]; setp.ne.s16 %p12, %rs18, 0; mov.f32 %f322, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f323, %f322; mov.f32 %f324, %f322; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f13, 0f00000000; @%p15 bra $L__BB0_12; div.rn.f32 %f197, %f15, %f14; mul.f32 %f198, %f197, %f180; fma.rn.f32 %f311, %f6, %f198, %f5; mul.f32 %f199, %f5, %f198; sub.f32 %f312, %f6, %f199; mov.f32 %f310, %f1; $L__BB0_12: mul.f32 %f200, %f180, %f180; div.rn.f32 %f201, %f14, %f200; sub.f32 %f202, %f310, %f1; sub.f32 %f203, %f311, %f5; sub.f32 %f204, %f312, %f6; fma.rn.f32 %f322, %f201, %f202, 0f00000000; fma.rn.f32 %f205, %f201, %f203, 0f00000000; fma.rn.f32 %f206, %f201, %f204, 0f00000000; div.rn.f32 %f207, %f15, %f180; mul.f32 %f208, %f207, %f312; sub.f32 %f323, %f205, %f208; fma.rn.f32 %f324, %f207, %f311, %f206; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r60, %r36, -1; min.s32 %r106, %r11, %r60; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r58, %r11, %r36; add.s32 %r59, %r58, %r36; rem.s32 %r106, %r59, %r36; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r36; 
mov.f32 %f321, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f320, %f321; mov.f32 %f319, %f321; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f319, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f320, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f321, [%rd35]; $L__BB0_18: mul.f32 %f212, %f319, %f319; fma.rn.f32 %f213, %f320, %f320, %f212; fma.rn.f32 %f41, %f321, %f321, %f213; setp.eq.f32 %p20, %f41, 0f00000000; mov.u16 %rs44, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs44, [%rd37]; $L__BB0_20: min.u16 %rs25, %rs44, %rs1; cvt.u32.u16 %r61, %rs25; max.u16 %rs26, %rs44, %rs1; cvt.u32.u16 %r62, %rs26; add.s32 %r63, %r62, 1; mul.lo.s32 %r64, %r63, %r62; shr.u32 %r65, %r64, 1; add.s32 %r66, %r65, %r61; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd2, %rd38; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f214, [%rd39]; add.f32 %f42, %f214, %f214; ld.global.nc.f32 %f43, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f41, 0f00000000; @%p24 bra $L__BB0_23; div.rn.f32 %f215, %f43, %f42; mul.f32 %f216, %f215, %f180; mul.f32 %f217, %f6, %f216; sub.f32 %f320, %f5, %f217; fma.rn.f32 %f321, %f5, %f216, %f6; mov.f32 %f319, %f1; $L__BB0_23: mul.f32 %f218, %f180, %f180; div.rn.f32 %f219, %f42, %f218; sub.f32 %f220, %f319, %f1; sub.f32 %f221, %f320, %f5; sub.f32 %f222, %f321, %f6; fma.rn.f32 %f322, %f219, %f220, %f322; fma.rn.f32 %f223, %f219, %f221, %f323; fma.rn.f32 %f224, %f219, %f222, %f324; div.rn.f32 %f225, %f43, %f180; fma.rn.f32 %f323, %f225, %f321, %f223; mul.f32 %f226, %f225, %f320; sub.f32 %f324, %f224, %f226; $L__BB0_24: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r67, %r16, %r37; add.s32 %r68, %r67, %r37; rem.s32 %r107, %r68, %r37; $L__BB0_27: add.s32 %r69, %r107, %r4; 
mad.lo.s32 %r20, %r69, %r36, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f330, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f329, %f330; mov.f32 %f328, %f330; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f328, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f329, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f330, [%rd44]; $L__BB0_29: mul.f32 %f230, %f328, %f328; fma.rn.f32 %f231, %f329, %f329, %f230; fma.rn.f32 %f69, %f330, %f330, %f231; setp.eq.f32 %p29, %f69, 0f00000000; mov.u16 %rs45, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs45, [%rd46]; $L__BB0_31: min.u16 %rs29, %rs45, %rs1; cvt.u32.u16 %r70, %rs29; max.u16 %rs30, %rs45, %rs1; cvt.u32.u16 %r71, %rs30; add.s32 %r72, %r71, 1; mul.lo.s32 %r73, %r72, %r71; shr.u32 %r74, %r73, 1; add.s32 %r75, %r74, %r70; mul.wide.s32 %rd47, %r75, 4; add.s64 %rd48, %rd2, %rd47; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f232, [%rd48]; add.f32 %f70, %f232, %f232; ld.global.nc.f32 %f71, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f69, 0f00000000; @%p33 bra $L__BB0_34; div.rn.f32 %f233, %f71, %f70; mul.f32 %f234, %f233, %f181; mul.f32 %f235, %f6, %f234; sub.f32 %f328, %f1, %f235; fma.rn.f32 %f330, %f1, %f234, %f6; mov.f32 %f329, %f5; $L__BB0_34: mul.f32 %f236, %f181, %f181; div.rn.f32 %f237, %f70, %f236; sub.f32 %f238, %f328, %f1; sub.f32 %f239, %f329, %f5; sub.f32 %f240, %f330, %f6; fma.rn.f32 %f241, %f237, %f238, %f322; fma.rn.f32 %f323, %f237, %f239, %f323; fma.rn.f32 %f242, %f237, %f240, %f324; div.rn.f32 %f243, %f71, %f181; fma.rn.f32 %f322, %f243, %f330, %f241; mul.f32 %f244, %f243, %f328; sub.f32 %f324, %f242, %f244; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r78, %r37, -1; min.s32 %r108, %r21, %r78; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r76, %r21, %r37; add.s32 %r77, %r76, %r37; rem.s32 %r108, %r77, %r37; 
$L__BB0_38: add.s32 %r79, %r108, %r4; mad.lo.s32 %r25, %r79, %r36, %r1; setp.ge.s32 %p35, %r21, %r37; mov.f32 %f339, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f338, %f339; mov.f32 %f337, %f339; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f337, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f338, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f339, [%rd53]; $L__BB0_40: mul.f32 %f248, %f337, %f337; fma.rn.f32 %f249, %f338, %f338, %f248; fma.rn.f32 %f97, %f339, %f339, %f249; setp.eq.f32 %p38, %f97, 0f00000000; mov.u16 %rs46, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs46, [%rd55]; $L__BB0_42: min.u16 %rs33, %rs46, %rs1; cvt.u32.u16 %r80, %rs33; max.u16 %rs34, %rs46, %rs1; cvt.u32.u16 %r81, %rs34; add.s32 %r82, %r81, 1; mul.lo.s32 %r83, %r82, %r81; shr.u32 %r84, %r83, 1; add.s32 %r85, %r84, %r80; mul.wide.s32 %rd56, %r85, 4; add.s64 %rd57, %rd2, %rd56; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f250, [%rd57]; add.f32 %f98, %f250, %f250; ld.global.nc.f32 %f99, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f97, 0f00000000; @%p42 bra $L__BB0_45; div.rn.f32 %f251, %f99, %f98; mul.f32 %f252, %f251, %f181; fma.rn.f32 %f337, %f6, %f252, %f1; mul.f32 %f253, %f1, %f252; sub.f32 %f339, %f6, %f253; mov.f32 %f338, %f5; $L__BB0_45: mul.f32 %f254, %f181, %f181; div.rn.f32 %f255, %f98, %f254; sub.f32 %f256, %f337, %f1; sub.f32 %f257, %f338, %f5; sub.f32 %f258, %f339, %f6; fma.rn.f32 %f259, %f255, %f256, %f322; fma.rn.f32 %f323, %f255, %f257, %f323; fma.rn.f32 %f260, %f255, %f258, %f324; div.rn.f32 %f261, %f99, %f181; mul.f32 %f262, %f261, %f339; sub.f32 %f322, %f259, %f262; fma.rn.f32 %f324, %f261, %f337, %f260; $L__BB0_46: setp.eq.s32 %p43, %r38, 1; and.pred %p45, %p43, %p12; @%p45 bra $L__BB0_69; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p46, %rs12, 0; add.s32 %r26, %r3, -1; @%p46 bra $L__BB0_49; bra.uni $L__BB0_48; 
$L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r86, %r26, %r38; add.s32 %r87, %r86, %r38; rem.s32 %r109, %r87, %r38; $L__BB0_50: mad.lo.s32 %r88, %r109, %r37, %r2; mad.lo.s32 %r30, %r88, %r36, %r1; setp.lt.s32 %p48, %r3, 1; mov.f32 %f348, 0f00000000; and.pred %p49, %p48, %p46; mov.f32 %f347, %f348; mov.f32 %f346, %f348; @%p49 bra $L__BB0_52; mul.wide.s32 %rd59, %r30, 4; add.s64 %rd60, %rd6, %rd59; ld.global.nc.f32 %f346, [%rd60]; add.s64 %rd61, %rd5, %rd59; ld.global.nc.f32 %f347, [%rd61]; add.s64 %rd62, %rd4, %rd59; ld.global.nc.f32 %f348, [%rd62]; $L__BB0_52: mul.f32 %f266, %f346, %f346; fma.rn.f32 %f267, %f347, %f347, %f266; fma.rn.f32 %f125, %f348, %f348, %f267; setp.eq.f32 %p50, %f125, 0f00000000; mov.u16 %rs47, %rs1; @%p50 bra $L__BB0_54; cvt.s64.s32 %rd63, %r30; add.s64 %rd64, %rd3, %rd63; ld.global.nc.u8 %rs47, [%rd64]; $L__BB0_54: min.u16 %rs37, %rs47, %rs1; cvt.u32.u16 %r89, %rs37; max.u16 %rs38, %rs47, %rs1; cvt.u32.u16 %r90, %rs38; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; add.s32 %r94, %r93, %r89; mul.wide.s32 %rd65, %r94, 4; add.s64 %rd66, %rd2, %rd65; add.s64 %rd67, %rd1, %rd65; ld.global.nc.f32 %f268, [%rd66]; add.f32 %f126, %f268, %f268; ld.global.nc.f32 %f127, [%rd67]; and.pred %p53, %p12, %p50; @%p53 bra $L__BB0_58; setp.neu.f32 %p54, %f125, 0f00000000; @%p54 bra $L__BB0_57; div.rn.f32 %f269, %f127, %f126; mul.f32 %f270, %f269, %f182; fma.rn.f32 %f346, %f5, %f270, %f1; mul.f32 %f271, %f1, %f270; sub.f32 %f347, %f5, %f271; mov.f32 %f348, %f6; $L__BB0_57: mul.f32 %f272, %f182, %f182; div.rn.f32 %f273, %f126, %f272; sub.f32 %f274, %f346, %f1; sub.f32 %f275, %f347, %f5; sub.f32 %f276, %f348, %f6; fma.rn.f32 %f277, %f273, %f274, %f322; fma.rn.f32 %f278, %f273, %f275, %f323; fma.rn.f32 %f324, %f273, %f276, %f324; div.rn.f32 %f279, %f127, %f182; mul.f32 %f280, %f279, %f347; sub.f32 %f322, %f277, %f280; fma.rn.f32 %f323, %f279, %f346, %f278; $L__BB0_58: add.s32 %r31, %r3, 1; @%p46 bra 
$L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r97, %r38, -1; min.s32 %r110, %r31, %r97; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r95, %r31, %r38; add.s32 %r96, %r95, %r38; rem.s32 %r110, %r96, %r38; $L__BB0_61: mad.lo.s32 %r98, %r110, %r37, %r2; mad.lo.s32 %r35, %r98, %r36, %r1; setp.ge.s32 %p56, %r31, %r38; mov.f32 %f357, 0f00000000; and.pred %p58, %p56, %p46; mov.f32 %f356, %f357; mov.f32 %f355, %f357; @%p58 bra $L__BB0_63; mul.wide.s32 %rd68, %r35, 4; add.s64 %rd69, %rd6, %rd68; ld.global.nc.f32 %f355, [%rd69]; add.s64 %rd70, %rd5, %rd68; ld.global.nc.f32 %f356, [%rd70]; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f357, [%rd71]; $L__BB0_63: mul.f32 %f284, %f355, %f355; fma.rn.f32 %f285, %f356, %f356, %f284; fma.rn.f32 %f153, %f357, %f357, %f285; setp.eq.f32 %p59, %f153, 0f00000000; mov.u16 %rs48, %rs1; @%p59 bra $L__BB0_65; cvt.s64.s32 %rd72, %r35; add.s64 %rd73, %rd3, %rd72; ld.global.nc.u8 %rs48, [%rd73]; $L__BB0_65: min.u16 %rs41, %rs48, %rs1; cvt.u32.u16 %r99, %rs41; max.u16 %rs42, %rs48, %rs1; cvt.u32.u16 %r100, %rs42; add.s32 %r101, %r100, 1; mul.lo.s32 %r102, %r101, %r100; shr.u32 %r103, %r102, 1; add.s32 %r104, %r103, %r99; mul.wide.s32 %rd74, %r104, 4; add.s64 %rd75, %rd2, %rd74; add.s64 %rd76, %rd1, %rd74; ld.global.nc.f32 %f286, [%rd75]; add.f32 %f154, %f286, %f286; ld.global.nc.f32 %f155, [%rd76]; and.pred %p62, %p12, %p59; @%p62 bra $L__BB0_69; setp.neu.f32 %p63, %f153, 0f00000000; @%p63 bra $L__BB0_68; div.rn.f32 %f287, %f155, %f154; mul.f32 %f288, %f287, %f182; mul.f32 %f289, %f5, %f288; sub.f32 %f355, %f1, %f289; fma.rn.f32 %f356, %f1, %f288, %f5; mov.f32 %f357, %f6; $L__BB0_68: mul.f32 %f290, %f182, %f182; div.rn.f32 %f291, %f154, %f290; sub.f32 %f292, %f355, %f1; sub.f32 %f293, %f356, %f5; sub.f32 %f294, %f357, %f6; fma.rn.f32 %f295, %f291, %f292, %f322; fma.rn.f32 %f296, %f291, %f293, %f323; fma.rn.f32 %f324, %f291, %f294, %f324; div.rn.f32 %f297, %f155, %f182; fma.rn.f32 %f322, %f297, %f356, %f295; mul.f32 %f298, %f297, %f355; 
sub.f32 %f323, %f296, %f298; $L__BB0_69: setp.eq.s64 %p64, %rd11, 0; @%p64 bra $L__BB0_71; cvta.to.global.u64 %rd77, %rd11; shl.b64 %rd78, %rd7, 2; add.s64 %rd79, %rd77, %rd78; ld.global.nc.f32 %f299, [%rd79]; mul.f32 %f361, %f299, %f361; $L__BB0_71: setp.eq.f32 %p65, %f361, 0f00000000; mov.f32 %f362, 0f00000000; @%p65 bra $L__BB0_73; rcp.rn.f32 %f362, %f361; $L__BB0_73: cvta.to.global.u64 %rd80, %rd8; shl.b64 %rd81, %rd7, 2; add.s64 %rd82, %rd80, %rd81; ld.global.f32 %f301, [%rd82]; fma.rn.f32 %f302, %f322, %f362, %f301; st.global.f32 [%rd82], %f302; cvta.to.global.u64 %rd83, %rd9; add.s64 %rd84, %rd83, %rd81; ld.global.f32 %f303, [%rd84]; fma.rn.f32 %f304, %f323, %f362, %f303; st.global.f32 [%rd84], %f304; cvta.to.global.u64 %rd85, %rd10; add.s64 %rd86, %rd85, %rd81; ld.global.f32 %f305, [%rd86]; fma.rn.f32 %f306, %f324, %f362, %f305; st.global.f32 [%rd86], %f306; $L__BB0_74: ret; } ` adddmibulk_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<66>; .reg .b16 %rs<49>; .reg .f32 %f<363>; .reg .b32 %r<111>; .reg .b64 %rd<87>; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; ld.param.u64 %rd8, [adddmibulk_param_0]; ld.param.u64 %rd9, [adddmibulk_param_1]; ld.param.u64 %rd10, [adddmibulk_param_2]; ld.param.u64 %rd12, [adddmibulk_param_3]; ld.param.u64 %rd13, 
[adddmibulk_param_4]; ld.param.u64 %rd14, [adddmibulk_param_5]; ld.param.u64 %rd11, [adddmibulk_param_6]; ld.param.f32 %f361, [adddmibulk_param_7]; ld.param.u64 %rd15, [adddmibulk_param_8]; ld.param.u64 %rd16, [adddmibulk_param_9]; ld.param.u64 %rd17, [adddmibulk_param_10]; ld.param.f32 %f180, [adddmibulk_param_11]; ld.param.f32 %f181, [adddmibulk_param_12]; ld.param.f32 %f182, [adddmibulk_param_13]; ld.param.u32 %r36, [adddmibulk_param_14]; ld.param.u32 %r37, [adddmibulk_param_15]; ld.param.u32 %r38, [adddmibulk_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r40, %r39, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r43, %r42, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r46, %r45, %r47; setp.ge.s32 %p1, %r1, %r36; setp.ge.s32 %p2, %r2, %r37; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_74; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r49, %r5, %r1; cvt.s64.s32 %rd7, %r49; mul.wide.s32 %rd18, %r49, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; mul.f32 %f185, %f5, %f5; fma.rn.f32 %f186, %f1, %f1, %f185; ld.global.nc.f32 %f6, [%rd21]; fma.rn.f32 %f187, %f6, %f6, %f186; setp.eq.f32 %p6, %f187, 0f00000000; @%p6 bra $L__BB0_74; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r50, %r6, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r105, %r51, %r36; $L__BB0_5: add.s32 
%r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f312, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f311, %f312; mov.f32 %f310, %f312; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f310, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f311, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f312, [%rd26]; $L__BB0_7: mul.f32 %f191, %f310, %f310; fma.rn.f32 %f192, %f311, %f311, %f191; fma.rn.f32 %f13, %f312, %f312, %f192; setp.eq.f32 %p11, %f13, 0f00000000; mov.u16 %rs43, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs43, [%rd28]; $L__BB0_9: min.u16 %rs21, %rs43, %rs1; cvt.u32.u16 %r52, %rs21; max.u16 %rs22, %rs43, %rs1; cvt.u32.u16 %r53, %rs22; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd29, %r57, 4; add.s64 %rd30, %rd2, %rd29; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f196, [%rd30]; add.f32 %f14, %f196, %f196; ld.global.nc.f32 %f15, [%rd31]; setp.ne.s16 %p12, %rs18, 0; mov.f32 %f322, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f323, %f322; mov.f32 %f324, %f322; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f13, 0f00000000; @%p15 bra $L__BB0_12; div.rn.f32 %f197, %f15, %f14; mul.f32 %f198, %f197, %f180; fma.rn.f32 %f311, %f6, %f198, %f5; mul.f32 %f199, %f5, %f198; sub.f32 %f312, %f6, %f199; mov.f32 %f310, %f1; $L__BB0_12: mul.f32 %f200, %f180, %f180; div.rn.f32 %f201, %f14, %f200; sub.f32 %f202, %f310, %f1; sub.f32 %f203, %f311, %f5; sub.f32 %f204, %f312, %f6; fma.rn.f32 %f322, %f201, %f202, 0f00000000; fma.rn.f32 %f205, %f201, %f203, 0f00000000; fma.rn.f32 %f206, %f201, %f204, 0f00000000; div.rn.f32 %f207, %f15, %f180; mul.f32 %f208, %f207, %f312; sub.f32 %f323, %f205, %f208; fma.rn.f32 %f324, %f207, %f311, %f206; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r60, %r36, -1; min.s32 %r106, %r11, %r60; bra.uni $L__BB0_16; 
$L__BB0_14: rem.s32 %r58, %r11, %r36; add.s32 %r59, %r58, %r36; rem.s32 %r106, %r59, %r36; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r36; mov.f32 %f321, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f320, %f321; mov.f32 %f319, %f321; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f319, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f320, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f321, [%rd35]; $L__BB0_18: mul.f32 %f212, %f319, %f319; fma.rn.f32 %f213, %f320, %f320, %f212; fma.rn.f32 %f41, %f321, %f321, %f213; setp.eq.f32 %p20, %f41, 0f00000000; mov.u16 %rs44, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs44, [%rd37]; $L__BB0_20: min.u16 %rs25, %rs44, %rs1; cvt.u32.u16 %r61, %rs25; max.u16 %rs26, %rs44, %rs1; cvt.u32.u16 %r62, %rs26; add.s32 %r63, %r62, 1; mul.lo.s32 %r64, %r63, %r62; shr.u32 %r65, %r64, 1; add.s32 %r66, %r65, %r61; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd2, %rd38; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f214, [%rd39]; add.f32 %f42, %f214, %f214; ld.global.nc.f32 %f43, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f41, 0f00000000; @%p24 bra $L__BB0_23; div.rn.f32 %f215, %f43, %f42; mul.f32 %f216, %f215, %f180; mul.f32 %f217, %f6, %f216; sub.f32 %f320, %f5, %f217; fma.rn.f32 %f321, %f5, %f216, %f6; mov.f32 %f319, %f1; $L__BB0_23: mul.f32 %f218, %f180, %f180; div.rn.f32 %f219, %f42, %f218; sub.f32 %f220, %f319, %f1; sub.f32 %f221, %f320, %f5; sub.f32 %f222, %f321, %f6; fma.rn.f32 %f322, %f219, %f220, %f322; fma.rn.f32 %f223, %f219, %f221, %f323; fma.rn.f32 %f224, %f219, %f222, %f324; div.rn.f32 %f225, %f43, %f180; fma.rn.f32 %f323, %f225, %f321, %f223; mul.f32 %f226, %f225, %f320; sub.f32 %f324, %f224, %f226; $L__BB0_24: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, 
%r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r67, %r16, %r37; add.s32 %r68, %r67, %r37; rem.s32 %r107, %r68, %r37; $L__BB0_27: add.s32 %r69, %r107, %r4; mad.lo.s32 %r20, %r69, %r36, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f330, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f329, %f330; mov.f32 %f328, %f330; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f328, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f329, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f330, [%rd44]; $L__BB0_29: mul.f32 %f230, %f328, %f328; fma.rn.f32 %f231, %f329, %f329, %f230; fma.rn.f32 %f69, %f330, %f330, %f231; setp.eq.f32 %p29, %f69, 0f00000000; mov.u16 %rs45, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs45, [%rd46]; $L__BB0_31: min.u16 %rs29, %rs45, %rs1; cvt.u32.u16 %r70, %rs29; max.u16 %rs30, %rs45, %rs1; cvt.u32.u16 %r71, %rs30; add.s32 %r72, %r71, 1; mul.lo.s32 %r73, %r72, %r71; shr.u32 %r74, %r73, 1; add.s32 %r75, %r74, %r70; mul.wide.s32 %rd47, %r75, 4; add.s64 %rd48, %rd2, %rd47; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f232, [%rd48]; add.f32 %f70, %f232, %f232; ld.global.nc.f32 %f71, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f69, 0f00000000; @%p33 bra $L__BB0_34; div.rn.f32 %f233, %f71, %f70; mul.f32 %f234, %f233, %f181; mul.f32 %f235, %f6, %f234; sub.f32 %f328, %f1, %f235; fma.rn.f32 %f330, %f1, %f234, %f6; mov.f32 %f329, %f5; $L__BB0_34: mul.f32 %f236, %f181, %f181; div.rn.f32 %f237, %f70, %f236; sub.f32 %f238, %f328, %f1; sub.f32 %f239, %f329, %f5; sub.f32 %f240, %f330, %f6; fma.rn.f32 %f241, %f237, %f238, %f322; fma.rn.f32 %f323, %f237, %f239, %f323; fma.rn.f32 %f242, %f237, %f240, %f324; div.rn.f32 %f243, %f71, %f181; fma.rn.f32 %f322, %f243, %f330, %f241; mul.f32 %f244, %f243, %f328; sub.f32 %f324, %f242, %f244; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 
%r78, %r37, -1; min.s32 %r108, %r21, %r78; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r76, %r21, %r37; add.s32 %r77, %r76, %r37; rem.s32 %r108, %r77, %r37; $L__BB0_38: add.s32 %r79, %r108, %r4; mad.lo.s32 %r25, %r79, %r36, %r1; setp.ge.s32 %p35, %r21, %r37; mov.f32 %f339, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f338, %f339; mov.f32 %f337, %f339; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f337, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f338, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f339, [%rd53]; $L__BB0_40: mul.f32 %f248, %f337, %f337; fma.rn.f32 %f249, %f338, %f338, %f248; fma.rn.f32 %f97, %f339, %f339, %f249; setp.eq.f32 %p38, %f97, 0f00000000; mov.u16 %rs46, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs46, [%rd55]; $L__BB0_42: min.u16 %rs33, %rs46, %rs1; cvt.u32.u16 %r80, %rs33; max.u16 %rs34, %rs46, %rs1; cvt.u32.u16 %r81, %rs34; add.s32 %r82, %r81, 1; mul.lo.s32 %r83, %r82, %r81; shr.u32 %r84, %r83, 1; add.s32 %r85, %r84, %r80; mul.wide.s32 %rd56, %r85, 4; add.s64 %rd57, %rd2, %rd56; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f250, [%rd57]; add.f32 %f98, %f250, %f250; ld.global.nc.f32 %f99, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f97, 0f00000000; @%p42 bra $L__BB0_45; div.rn.f32 %f251, %f99, %f98; mul.f32 %f252, %f251, %f181; fma.rn.f32 %f337, %f6, %f252, %f1; mul.f32 %f253, %f1, %f252; sub.f32 %f339, %f6, %f253; mov.f32 %f338, %f5; $L__BB0_45: mul.f32 %f254, %f181, %f181; div.rn.f32 %f255, %f98, %f254; sub.f32 %f256, %f337, %f1; sub.f32 %f257, %f338, %f5; sub.f32 %f258, %f339, %f6; fma.rn.f32 %f259, %f255, %f256, %f322; fma.rn.f32 %f323, %f255, %f257, %f323; fma.rn.f32 %f260, %f255, %f258, %f324; div.rn.f32 %f261, %f99, %f181; mul.f32 %f262, %f261, %f339; sub.f32 %f322, %f259, %f262; fma.rn.f32 %f324, %f261, %f337, %f260; $L__BB0_46: setp.eq.s32 %p43, %r38, 1; and.pred %p45, 
%p43, %p12; @%p45 bra $L__BB0_69; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p46, %rs12, 0; add.s32 %r26, %r3, -1; @%p46 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r86, %r26, %r38; add.s32 %r87, %r86, %r38; rem.s32 %r109, %r87, %r38; $L__BB0_50: mad.lo.s32 %r88, %r109, %r37, %r2; mad.lo.s32 %r30, %r88, %r36, %r1; setp.lt.s32 %p48, %r3, 1; mov.f32 %f348, 0f00000000; and.pred %p49, %p48, %p46; mov.f32 %f347, %f348; mov.f32 %f346, %f348; @%p49 bra $L__BB0_52; mul.wide.s32 %rd59, %r30, 4; add.s64 %rd60, %rd6, %rd59; ld.global.nc.f32 %f346, [%rd60]; add.s64 %rd61, %rd5, %rd59; ld.global.nc.f32 %f347, [%rd61]; add.s64 %rd62, %rd4, %rd59; ld.global.nc.f32 %f348, [%rd62]; $L__BB0_52: mul.f32 %f266, %f346, %f346; fma.rn.f32 %f267, %f347, %f347, %f266; fma.rn.f32 %f125, %f348, %f348, %f267; setp.eq.f32 %p50, %f125, 0f00000000; mov.u16 %rs47, %rs1; @%p50 bra $L__BB0_54; cvt.s64.s32 %rd63, %r30; add.s64 %rd64, %rd3, %rd63; ld.global.nc.u8 %rs47, [%rd64]; $L__BB0_54: min.u16 %rs37, %rs47, %rs1; cvt.u32.u16 %r89, %rs37; max.u16 %rs38, %rs47, %rs1; cvt.u32.u16 %r90, %rs38; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; add.s32 %r94, %r93, %r89; mul.wide.s32 %rd65, %r94, 4; add.s64 %rd66, %rd2, %rd65; add.s64 %rd67, %rd1, %rd65; ld.global.nc.f32 %f268, [%rd66]; add.f32 %f126, %f268, %f268; ld.global.nc.f32 %f127, [%rd67]; and.pred %p53, %p12, %p50; @%p53 bra $L__BB0_58; setp.neu.f32 %p54, %f125, 0f00000000; @%p54 bra $L__BB0_57; div.rn.f32 %f269, %f127, %f126; mul.f32 %f270, %f269, %f182; fma.rn.f32 %f346, %f5, %f270, %f1; mul.f32 %f271, %f1, %f270; sub.f32 %f347, %f5, %f271; mov.f32 %f348, %f6; $L__BB0_57: mul.f32 %f272, %f182, %f182; div.rn.f32 %f273, %f126, %f272; sub.f32 %f274, %f346, %f1; sub.f32 %f275, %f347, %f5; sub.f32 %f276, %f348, %f6; fma.rn.f32 %f277, %f273, %f274, %f322; fma.rn.f32 %f278, %f273, %f275, %f323; fma.rn.f32 %f324, %f273, %f276, %f324; div.rn.f32 %f279, %f127, 
%f182; mul.f32 %f280, %f279, %f347; sub.f32 %f322, %f277, %f280; fma.rn.f32 %f323, %f279, %f346, %f278; $L__BB0_58: add.s32 %r31, %r3, 1; @%p46 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r97, %r38, -1; min.s32 %r110, %r31, %r97; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r95, %r31, %r38; add.s32 %r96, %r95, %r38; rem.s32 %r110, %r96, %r38; $L__BB0_61: mad.lo.s32 %r98, %r110, %r37, %r2; mad.lo.s32 %r35, %r98, %r36, %r1; setp.ge.s32 %p56, %r31, %r38; mov.f32 %f357, 0f00000000; and.pred %p58, %p56, %p46; mov.f32 %f356, %f357; mov.f32 %f355, %f357; @%p58 bra $L__BB0_63; mul.wide.s32 %rd68, %r35, 4; add.s64 %rd69, %rd6, %rd68; ld.global.nc.f32 %f355, [%rd69]; add.s64 %rd70, %rd5, %rd68; ld.global.nc.f32 %f356, [%rd70]; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f357, [%rd71]; $L__BB0_63: mul.f32 %f284, %f355, %f355; fma.rn.f32 %f285, %f356, %f356, %f284; fma.rn.f32 %f153, %f357, %f357, %f285; setp.eq.f32 %p59, %f153, 0f00000000; mov.u16 %rs48, %rs1; @%p59 bra $L__BB0_65; cvt.s64.s32 %rd72, %r35; add.s64 %rd73, %rd3, %rd72; ld.global.nc.u8 %rs48, [%rd73]; $L__BB0_65: min.u16 %rs41, %rs48, %rs1; cvt.u32.u16 %r99, %rs41; max.u16 %rs42, %rs48, %rs1; cvt.u32.u16 %r100, %rs42; add.s32 %r101, %r100, 1; mul.lo.s32 %r102, %r101, %r100; shr.u32 %r103, %r102, 1; add.s32 %r104, %r103, %r99; mul.wide.s32 %rd74, %r104, 4; add.s64 %rd75, %rd2, %rd74; add.s64 %rd76, %rd1, %rd74; ld.global.nc.f32 %f286, [%rd75]; add.f32 %f154, %f286, %f286; ld.global.nc.f32 %f155, [%rd76]; and.pred %p62, %p12, %p59; @%p62 bra $L__BB0_69; setp.neu.f32 %p63, %f153, 0f00000000; @%p63 bra $L__BB0_68; div.rn.f32 %f287, %f155, %f154; mul.f32 %f288, %f287, %f182; mul.f32 %f289, %f5, %f288; sub.f32 %f355, %f1, %f289; fma.rn.f32 %f356, %f1, %f288, %f5; mov.f32 %f357, %f6; $L__BB0_68: mul.f32 %f290, %f182, %f182; div.rn.f32 %f291, %f154, %f290; sub.f32 %f292, %f355, %f1; sub.f32 %f293, %f356, %f5; sub.f32 %f294, %f357, %f6; fma.rn.f32 %f295, %f291, %f292, %f322; fma.rn.f32 %f296, %f291, %f293, 
%f323; fma.rn.f32 %f324, %f291, %f294, %f324; div.rn.f32 %f297, %f155, %f182; fma.rn.f32 %f322, %f297, %f356, %f295; mul.f32 %f298, %f297, %f355; sub.f32 %f323, %f296, %f298; $L__BB0_69: setp.eq.s64 %p64, %rd11, 0; @%p64 bra $L__BB0_71; cvta.to.global.u64 %rd77, %rd11; shl.b64 %rd78, %rd7, 2; add.s64 %rd79, %rd77, %rd78; ld.global.nc.f32 %f299, [%rd79]; mul.f32 %f361, %f299, %f361; $L__BB0_71: setp.eq.f32 %p65, %f361, 0f00000000; mov.f32 %f362, 0f00000000; @%p65 bra $L__BB0_73; rcp.rn.f32 %f362, %f361; $L__BB0_73: cvta.to.global.u64 %rd80, %rd8; shl.b64 %rd81, %rd7, 2; add.s64 %rd82, %rd80, %rd81; ld.global.f32 %f301, [%rd82]; fma.rn.f32 %f302, %f322, %f362, %f301; st.global.f32 [%rd82], %f302; cvta.to.global.u64 %rd83, %rd9; add.s64 %rd84, %rd83, %rd81; ld.global.f32 %f303, [%rd84]; fma.rn.f32 %f304, %f323, %f362, %f303; st.global.f32 [%rd84], %f304; cvta.to.global.u64 %rd85, %rd10; add.s64 %rd86, %rd85, %rd81; ld.global.f32 %f305, [%rd86]; fma.rn.f32 %f306, %f324, %f362, %f305; st.global.f32 [%rd86], %f306; $L__BB0_74: ret; } ` adddmibulk_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<66>; .reg .b16 %rs<49>; .reg .f32 %f<363>; .reg .b32 %r<111>; .reg .b64 %rd<87>; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; ld.param.u64 %rd8, [adddmibulk_param_0]; 
ld.param.u64 %rd9, [adddmibulk_param_1]; ld.param.u64 %rd10, [adddmibulk_param_2]; ld.param.u64 %rd12, [adddmibulk_param_3]; ld.param.u64 %rd13, [adddmibulk_param_4]; ld.param.u64 %rd14, [adddmibulk_param_5]; ld.param.u64 %rd11, [adddmibulk_param_6]; ld.param.f32 %f361, [adddmibulk_param_7]; ld.param.u64 %rd15, [adddmibulk_param_8]; ld.param.u64 %rd16, [adddmibulk_param_9]; ld.param.u64 %rd17, [adddmibulk_param_10]; ld.param.f32 %f180, [adddmibulk_param_11]; ld.param.f32 %f181, [adddmibulk_param_12]; ld.param.f32 %f182, [adddmibulk_param_13]; ld.param.u32 %r36, [adddmibulk_param_14]; ld.param.u32 %r37, [adddmibulk_param_15]; ld.param.u32 %r38, [adddmibulk_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r40, %r39, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r43, %r42, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r46, %r45, %r47; setp.ge.s32 %p1, %r1, %r36; setp.ge.s32 %p2, %r2, %r37; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_74; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r49, %r5, %r1; cvt.s64.s32 %rd7, %r49; mul.wide.s32 %rd18, %r49, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; mul.f32 %f185, %f5, %f5; fma.rn.f32 %f186, %f1, %f1, %f185; ld.global.nc.f32 %f6, [%rd21]; fma.rn.f32 %f187, %f6, %f6, %f186; setp.eq.f32 %p6, %f187, 0f00000000; @%p6 bra $L__BB0_74; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 
%r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r50, %r6, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r105, %r51, %r36; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f312, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f311, %f312; mov.f32 %f310, %f312; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f310, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f311, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f312, [%rd26]; $L__BB0_7: mul.f32 %f191, %f310, %f310; fma.rn.f32 %f192, %f311, %f311, %f191; fma.rn.f32 %f13, %f312, %f312, %f192; setp.eq.f32 %p11, %f13, 0f00000000; mov.u16 %rs43, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs43, [%rd28]; $L__BB0_9: min.u16 %rs21, %rs43, %rs1; cvt.u32.u16 %r52, %rs21; max.u16 %rs22, %rs43, %rs1; cvt.u32.u16 %r53, %rs22; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd29, %r57, 4; add.s64 %rd30, %rd2, %rd29; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f196, [%rd30]; add.f32 %f14, %f196, %f196; ld.global.nc.f32 %f15, [%rd31]; setp.ne.s16 %p12, %rs18, 0; mov.f32 %f322, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f323, %f322; mov.f32 %f324, %f322; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f13, 0f00000000; @%p15 bra $L__BB0_12; div.rn.f32 %f197, %f15, %f14; mul.f32 %f198, %f197, %f180; fma.rn.f32 %f311, %f6, %f198, %f5; mul.f32 %f199, %f5, %f198; sub.f32 %f312, %f6, %f199; mov.f32 %f310, %f1; $L__BB0_12: mul.f32 %f200, %f180, %f180; div.rn.f32 %f201, %f14, %f200; sub.f32 %f202, %f310, %f1; sub.f32 %f203, %f311, %f5; sub.f32 %f204, %f312, %f6; fma.rn.f32 %f322, %f201, %f202, 0f00000000; fma.rn.f32 %f205, %f201, %f203, 0f00000000; fma.rn.f32 %f206, %f201, %f204, 0f00000000; div.rn.f32 %f207, %f15, %f180; mul.f32 %f208, %f207, %f312; sub.f32 %f323, %f205, %f208; fma.rn.f32 %f324, %f207, %f311, %f206; $L__BB0_13: add.s32 
%r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r60, %r36, -1; min.s32 %r106, %r11, %r60; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r58, %r11, %r36; add.s32 %r59, %r58, %r36; rem.s32 %r106, %r59, %r36; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r36; mov.f32 %f321, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f320, %f321; mov.f32 %f319, %f321; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f319, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f320, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f321, [%rd35]; $L__BB0_18: mul.f32 %f212, %f319, %f319; fma.rn.f32 %f213, %f320, %f320, %f212; fma.rn.f32 %f41, %f321, %f321, %f213; setp.eq.f32 %p20, %f41, 0f00000000; mov.u16 %rs44, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs44, [%rd37]; $L__BB0_20: min.u16 %rs25, %rs44, %rs1; cvt.u32.u16 %r61, %rs25; max.u16 %rs26, %rs44, %rs1; cvt.u32.u16 %r62, %rs26; add.s32 %r63, %r62, 1; mul.lo.s32 %r64, %r63, %r62; shr.u32 %r65, %r64, 1; add.s32 %r66, %r65, %r61; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd2, %rd38; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f214, [%rd39]; add.f32 %f42, %f214, %f214; ld.global.nc.f32 %f43, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f41, 0f00000000; @%p24 bra $L__BB0_23; div.rn.f32 %f215, %f43, %f42; mul.f32 %f216, %f215, %f180; mul.f32 %f217, %f6, %f216; sub.f32 %f320, %f5, %f217; fma.rn.f32 %f321, %f5, %f216, %f6; mov.f32 %f319, %f1; $L__BB0_23: mul.f32 %f218, %f180, %f180; div.rn.f32 %f219, %f42, %f218; sub.f32 %f220, %f319, %f1; sub.f32 %f221, %f320, %f5; sub.f32 %f222, %f321, %f6; fma.rn.f32 %f322, %f219, %f220, %f322; fma.rn.f32 %f223, %f219, %f221, %f323; fma.rn.f32 %f224, %f219, %f222, %f324; div.rn.f32 %f225, %f43, %f180; fma.rn.f32 %f323, %f225, %f321, %f223; mul.f32 %f226, %f225, %f320; sub.f32 %f324, %f224, %f226; $L__BB0_24: 
and.b16 %rs7, %rs17, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r67, %r16, %r37; add.s32 %r68, %r67, %r37; rem.s32 %r107, %r68, %r37; $L__BB0_27: add.s32 %r69, %r107, %r4; mad.lo.s32 %r20, %r69, %r36, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f330, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f329, %f330; mov.f32 %f328, %f330; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f328, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f329, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f330, [%rd44]; $L__BB0_29: mul.f32 %f230, %f328, %f328; fma.rn.f32 %f231, %f329, %f329, %f230; fma.rn.f32 %f69, %f330, %f330, %f231; setp.eq.f32 %p29, %f69, 0f00000000; mov.u16 %rs45, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs45, [%rd46]; $L__BB0_31: min.u16 %rs29, %rs45, %rs1; cvt.u32.u16 %r70, %rs29; max.u16 %rs30, %rs45, %rs1; cvt.u32.u16 %r71, %rs30; add.s32 %r72, %r71, 1; mul.lo.s32 %r73, %r72, %r71; shr.u32 %r74, %r73, 1; add.s32 %r75, %r74, %r70; mul.wide.s32 %rd47, %r75, 4; add.s64 %rd48, %rd2, %rd47; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f232, [%rd48]; add.f32 %f70, %f232, %f232; ld.global.nc.f32 %f71, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f69, 0f00000000; @%p33 bra $L__BB0_34; div.rn.f32 %f233, %f71, %f70; mul.f32 %f234, %f233, %f181; mul.f32 %f235, %f6, %f234; sub.f32 %f328, %f1, %f235; fma.rn.f32 %f330, %f1, %f234, %f6; mov.f32 %f329, %f5; $L__BB0_34: mul.f32 %f236, %f181, %f181; div.rn.f32 %f237, %f70, %f236; sub.f32 %f238, %f328, %f1; sub.f32 %f239, %f329, %f5; sub.f32 %f240, %f330, %f6; fma.rn.f32 %f241, %f237, %f238, %f322; fma.rn.f32 %f323, %f237, %f239, %f323; fma.rn.f32 %f242, %f237, %f240, %f324; div.rn.f32 %f243, %f71, %f181; fma.rn.f32 %f322, %f243, %f330, %f241; mul.f32 
%f244, %f243, %f328; sub.f32 %f324, %f242, %f244; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r78, %r37, -1; min.s32 %r108, %r21, %r78; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r76, %r21, %r37; add.s32 %r77, %r76, %r37; rem.s32 %r108, %r77, %r37; $L__BB0_38: add.s32 %r79, %r108, %r4; mad.lo.s32 %r25, %r79, %r36, %r1; setp.ge.s32 %p35, %r21, %r37; mov.f32 %f339, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f338, %f339; mov.f32 %f337, %f339; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f337, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f338, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f339, [%rd53]; $L__BB0_40: mul.f32 %f248, %f337, %f337; fma.rn.f32 %f249, %f338, %f338, %f248; fma.rn.f32 %f97, %f339, %f339, %f249; setp.eq.f32 %p38, %f97, 0f00000000; mov.u16 %rs46, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs46, [%rd55]; $L__BB0_42: min.u16 %rs33, %rs46, %rs1; cvt.u32.u16 %r80, %rs33; max.u16 %rs34, %rs46, %rs1; cvt.u32.u16 %r81, %rs34; add.s32 %r82, %r81, 1; mul.lo.s32 %r83, %r82, %r81; shr.u32 %r84, %r83, 1; add.s32 %r85, %r84, %r80; mul.wide.s32 %rd56, %r85, 4; add.s64 %rd57, %rd2, %rd56; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f250, [%rd57]; add.f32 %f98, %f250, %f250; ld.global.nc.f32 %f99, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f97, 0f00000000; @%p42 bra $L__BB0_45; div.rn.f32 %f251, %f99, %f98; mul.f32 %f252, %f251, %f181; fma.rn.f32 %f337, %f6, %f252, %f1; mul.f32 %f253, %f1, %f252; sub.f32 %f339, %f6, %f253; mov.f32 %f338, %f5; $L__BB0_45: mul.f32 %f254, %f181, %f181; div.rn.f32 %f255, %f98, %f254; sub.f32 %f256, %f337, %f1; sub.f32 %f257, %f338, %f5; sub.f32 %f258, %f339, %f6; fma.rn.f32 %f259, %f255, %f256, %f322; fma.rn.f32 %f323, %f255, %f257, %f323; fma.rn.f32 %f260, %f255, %f258, %f324; div.rn.f32 %f261, %f99, %f181; mul.f32 
%f262, %f261, %f339; sub.f32 %f322, %f259, %f262; fma.rn.f32 %f324, %f261, %f337, %f260; $L__BB0_46: setp.eq.s32 %p43, %r38, 1; and.pred %p45, %p43, %p12; @%p45 bra $L__BB0_69; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p46, %rs12, 0; add.s32 %r26, %r3, -1; @%p46 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r86, %r26, %r38; add.s32 %r87, %r86, %r38; rem.s32 %r109, %r87, %r38; $L__BB0_50: mad.lo.s32 %r88, %r109, %r37, %r2; mad.lo.s32 %r30, %r88, %r36, %r1; setp.lt.s32 %p48, %r3, 1; mov.f32 %f348, 0f00000000; and.pred %p49, %p48, %p46; mov.f32 %f347, %f348; mov.f32 %f346, %f348; @%p49 bra $L__BB0_52; mul.wide.s32 %rd59, %r30, 4; add.s64 %rd60, %rd6, %rd59; ld.global.nc.f32 %f346, [%rd60]; add.s64 %rd61, %rd5, %rd59; ld.global.nc.f32 %f347, [%rd61]; add.s64 %rd62, %rd4, %rd59; ld.global.nc.f32 %f348, [%rd62]; $L__BB0_52: mul.f32 %f266, %f346, %f346; fma.rn.f32 %f267, %f347, %f347, %f266; fma.rn.f32 %f125, %f348, %f348, %f267; setp.eq.f32 %p50, %f125, 0f00000000; mov.u16 %rs47, %rs1; @%p50 bra $L__BB0_54; cvt.s64.s32 %rd63, %r30; add.s64 %rd64, %rd3, %rd63; ld.global.nc.u8 %rs47, [%rd64]; $L__BB0_54: min.u16 %rs37, %rs47, %rs1; cvt.u32.u16 %r89, %rs37; max.u16 %rs38, %rs47, %rs1; cvt.u32.u16 %r90, %rs38; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; add.s32 %r94, %r93, %r89; mul.wide.s32 %rd65, %r94, 4; add.s64 %rd66, %rd2, %rd65; add.s64 %rd67, %rd1, %rd65; ld.global.nc.f32 %f268, [%rd66]; add.f32 %f126, %f268, %f268; ld.global.nc.f32 %f127, [%rd67]; and.pred %p53, %p12, %p50; @%p53 bra $L__BB0_58; setp.neu.f32 %p54, %f125, 0f00000000; @%p54 bra $L__BB0_57; div.rn.f32 %f269, %f127, %f126; mul.f32 %f270, %f269, %f182; fma.rn.f32 %f346, %f5, %f270, %f1; mul.f32 %f271, %f1, %f270; sub.f32 %f347, %f5, %f271; mov.f32 %f348, %f6; $L__BB0_57: mul.f32 %f272, %f182, %f182; div.rn.f32 %f273, %f126, %f272; sub.f32 %f274, %f346, %f1; sub.f32 %f275, %f347, %f5; sub.f32 %f276, %f348, %f6; 
fma.rn.f32 %f277, %f273, %f274, %f322; fma.rn.f32 %f278, %f273, %f275, %f323; fma.rn.f32 %f324, %f273, %f276, %f324; div.rn.f32 %f279, %f127, %f182; mul.f32 %f280, %f279, %f347; sub.f32 %f322, %f277, %f280; fma.rn.f32 %f323, %f279, %f346, %f278; $L__BB0_58: add.s32 %r31, %r3, 1; @%p46 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r97, %r38, -1; min.s32 %r110, %r31, %r97; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r95, %r31, %r38; add.s32 %r96, %r95, %r38; rem.s32 %r110, %r96, %r38; $L__BB0_61: mad.lo.s32 %r98, %r110, %r37, %r2; mad.lo.s32 %r35, %r98, %r36, %r1; setp.ge.s32 %p56, %r31, %r38; mov.f32 %f357, 0f00000000; and.pred %p58, %p56, %p46; mov.f32 %f356, %f357; mov.f32 %f355, %f357; @%p58 bra $L__BB0_63; mul.wide.s32 %rd68, %r35, 4; add.s64 %rd69, %rd6, %rd68; ld.global.nc.f32 %f355, [%rd69]; add.s64 %rd70, %rd5, %rd68; ld.global.nc.f32 %f356, [%rd70]; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f357, [%rd71]; $L__BB0_63: mul.f32 %f284, %f355, %f355; fma.rn.f32 %f285, %f356, %f356, %f284; fma.rn.f32 %f153, %f357, %f357, %f285; setp.eq.f32 %p59, %f153, 0f00000000; mov.u16 %rs48, %rs1; @%p59 bra $L__BB0_65; cvt.s64.s32 %rd72, %r35; add.s64 %rd73, %rd3, %rd72; ld.global.nc.u8 %rs48, [%rd73]; $L__BB0_65: min.u16 %rs41, %rs48, %rs1; cvt.u32.u16 %r99, %rs41; max.u16 %rs42, %rs48, %rs1; cvt.u32.u16 %r100, %rs42; add.s32 %r101, %r100, 1; mul.lo.s32 %r102, %r101, %r100; shr.u32 %r103, %r102, 1; add.s32 %r104, %r103, %r99; mul.wide.s32 %rd74, %r104, 4; add.s64 %rd75, %rd2, %rd74; add.s64 %rd76, %rd1, %rd74; ld.global.nc.f32 %f286, [%rd75]; add.f32 %f154, %f286, %f286; ld.global.nc.f32 %f155, [%rd76]; and.pred %p62, %p12, %p59; @%p62 bra $L__BB0_69; setp.neu.f32 %p63, %f153, 0f00000000; @%p63 bra $L__BB0_68; div.rn.f32 %f287, %f155, %f154; mul.f32 %f288, %f287, %f182; mul.f32 %f289, %f5, %f288; sub.f32 %f355, %f1, %f289; fma.rn.f32 %f356, %f1, %f288, %f5; mov.f32 %f357, %f6; $L__BB0_68: mul.f32 %f290, %f182, %f182; div.rn.f32 %f291, %f154, %f290; sub.f32 %f292, 
%f355, %f1; sub.f32 %f293, %f356, %f5; sub.f32 %f294, %f357, %f6; fma.rn.f32 %f295, %f291, %f292, %f322; fma.rn.f32 %f296, %f291, %f293, %f323; fma.rn.f32 %f324, %f291, %f294, %f324; div.rn.f32 %f297, %f155, %f182; fma.rn.f32 %f322, %f297, %f356, %f295; mul.f32 %f298, %f297, %f355; sub.f32 %f323, %f296, %f298; $L__BB0_69: setp.eq.s64 %p64, %rd11, 0; @%p64 bra $L__BB0_71; cvta.to.global.u64 %rd77, %rd11; shl.b64 %rd78, %rd7, 2; add.s64 %rd79, %rd77, %rd78; ld.global.nc.f32 %f299, [%rd79]; mul.f32 %f361, %f299, %f361; $L__BB0_71: setp.eq.f32 %p65, %f361, 0f00000000; mov.f32 %f362, 0f00000000; @%p65 bra $L__BB0_73; rcp.rn.f32 %f362, %f361; $L__BB0_73: cvta.to.global.u64 %rd80, %rd8; shl.b64 %rd81, %rd7, 2; add.s64 %rd82, %rd80, %rd81; ld.global.f32 %f301, [%rd82]; fma.rn.f32 %f302, %f322, %f362, %f301; st.global.f32 [%rd82], %f302; cvta.to.global.u64 %rd83, %rd9; add.s64 %rd84, %rd83, %rd81; ld.global.f32 %f303, [%rd84]; fma.rn.f32 %f304, %f323, %f362, %f303; st.global.f32 [%rd84], %f304; cvta.to.global.u64 %rd85, %rd10; add.s64 %rd86, %rd85, %rd81; ld.global.f32 %f305, [%rd86]; fma.rn.f32 %f306, %f324, %f362, %f305; st.global.f32 [%rd86], %f306; $L__BB0_74: ret; } ` adddmibulk_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<66>; .reg .b16 %rs<49>; .reg .f32 %f<363>; .reg .b32 %r<111>; .reg 
.b64 %rd<87>; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; ld.param.u64 %rd8, [adddmibulk_param_0]; ld.param.u64 %rd9, [adddmibulk_param_1]; ld.param.u64 %rd10, [adddmibulk_param_2]; ld.param.u64 %rd12, [adddmibulk_param_3]; ld.param.u64 %rd13, [adddmibulk_param_4]; ld.param.u64 %rd14, [adddmibulk_param_5]; ld.param.u64 %rd11, [adddmibulk_param_6]; ld.param.f32 %f361, [adddmibulk_param_7]; ld.param.u64 %rd15, [adddmibulk_param_8]; ld.param.u64 %rd16, [adddmibulk_param_9]; ld.param.u64 %rd17, [adddmibulk_param_10]; ld.param.f32 %f180, [adddmibulk_param_11]; ld.param.f32 %f181, [adddmibulk_param_12]; ld.param.f32 %f182, [adddmibulk_param_13]; ld.param.u32 %r36, [adddmibulk_param_14]; ld.param.u32 %r37, [adddmibulk_param_15]; ld.param.u32 %r38, [adddmibulk_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r40, %r39, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r43, %r42, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r46, %r45, %r47; setp.ge.s32 %p1, %r1, %r36; setp.ge.s32 %p2, %r2, %r37; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_74; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r49, %r5, %r1; cvt.s64.s32 %rd7, %r49; mul.wide.s32 %rd18, %r49, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; mul.f32 %f185, %f5, %f5; fma.rn.f32 %f186, %f1, %f1, %f185; ld.global.nc.f32 %f6, [%rd21]; fma.rn.f32 %f187, %f6, %f6, %f186; setp.eq.f32 %p6, %f187, 0f00000000; @%p6 bra 
$L__BB0_74; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r50, %r6, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r105, %r51, %r36; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f312, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f311, %f312; mov.f32 %f310, %f312; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f310, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f311, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f312, [%rd26]; $L__BB0_7: mul.f32 %f191, %f310, %f310; fma.rn.f32 %f192, %f311, %f311, %f191; fma.rn.f32 %f13, %f312, %f312, %f192; setp.eq.f32 %p11, %f13, 0f00000000; mov.u16 %rs43, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs43, [%rd28]; $L__BB0_9: min.u16 %rs21, %rs43, %rs1; cvt.u32.u16 %r52, %rs21; max.u16 %rs22, %rs43, %rs1; cvt.u32.u16 %r53, %rs22; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd29, %r57, 4; add.s64 %rd30, %rd2, %rd29; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f196, [%rd30]; add.f32 %f14, %f196, %f196; ld.global.nc.f32 %f15, [%rd31]; setp.ne.s16 %p12, %rs18, 0; mov.f32 %f322, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f323, %f322; mov.f32 %f324, %f322; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f13, 0f00000000; @%p15 bra $L__BB0_12; div.rn.f32 %f197, %f15, %f14; mul.f32 %f198, %f197, %f180; fma.rn.f32 %f311, %f6, %f198, %f5; mul.f32 %f199, %f5, %f198; sub.f32 %f312, %f6, %f199; mov.f32 %f310, %f1; $L__BB0_12: mul.f32 %f200, %f180, %f180; div.rn.f32 %f201, %f14, %f200; sub.f32 %f202, %f310, %f1; sub.f32 %f203, %f311, %f5; sub.f32 %f204, %f312, %f6; fma.rn.f32 %f322, %f201, %f202, 0f00000000; fma.rn.f32 %f205, %f201, %f203, 0f00000000; fma.rn.f32 %f206, %f201, %f204, 0f00000000; 
div.rn.f32 %f207, %f15, %f180; mul.f32 %f208, %f207, %f312; sub.f32 %f323, %f205, %f208; fma.rn.f32 %f324, %f207, %f311, %f206; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r60, %r36, -1; min.s32 %r106, %r11, %r60; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r58, %r11, %r36; add.s32 %r59, %r58, %r36; rem.s32 %r106, %r59, %r36; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r36; mov.f32 %f321, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f320, %f321; mov.f32 %f319, %f321; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f319, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f320, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f321, [%rd35]; $L__BB0_18: mul.f32 %f212, %f319, %f319; fma.rn.f32 %f213, %f320, %f320, %f212; fma.rn.f32 %f41, %f321, %f321, %f213; setp.eq.f32 %p20, %f41, 0f00000000; mov.u16 %rs44, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs44, [%rd37]; $L__BB0_20: min.u16 %rs25, %rs44, %rs1; cvt.u32.u16 %r61, %rs25; max.u16 %rs26, %rs44, %rs1; cvt.u32.u16 %r62, %rs26; add.s32 %r63, %r62, 1; mul.lo.s32 %r64, %r63, %r62; shr.u32 %r65, %r64, 1; add.s32 %r66, %r65, %r61; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd2, %rd38; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f214, [%rd39]; add.f32 %f42, %f214, %f214; ld.global.nc.f32 %f43, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f41, 0f00000000; @%p24 bra $L__BB0_23; div.rn.f32 %f215, %f43, %f42; mul.f32 %f216, %f215, %f180; mul.f32 %f217, %f6, %f216; sub.f32 %f320, %f5, %f217; fma.rn.f32 %f321, %f5, %f216, %f6; mov.f32 %f319, %f1; $L__BB0_23: mul.f32 %f218, %f180, %f180; div.rn.f32 %f219, %f42, %f218; sub.f32 %f220, %f319, %f1; sub.f32 %f221, %f320, %f5; sub.f32 %f222, %f321, %f6; fma.rn.f32 %f322, %f219, %f220, %f322; fma.rn.f32 %f223, %f219, %f221, %f323; fma.rn.f32 %f224, %f219, %f222, 
%f324; div.rn.f32 %f225, %f43, %f180; fma.rn.f32 %f323, %f225, %f321, %f223; mul.f32 %f226, %f225, %f320; sub.f32 %f324, %f224, %f226; $L__BB0_24: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r67, %r16, %r37; add.s32 %r68, %r67, %r37; rem.s32 %r107, %r68, %r37; $L__BB0_27: add.s32 %r69, %r107, %r4; mad.lo.s32 %r20, %r69, %r36, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f330, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f329, %f330; mov.f32 %f328, %f330; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f328, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f329, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f330, [%rd44]; $L__BB0_29: mul.f32 %f230, %f328, %f328; fma.rn.f32 %f231, %f329, %f329, %f230; fma.rn.f32 %f69, %f330, %f330, %f231; setp.eq.f32 %p29, %f69, 0f00000000; mov.u16 %rs45, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs45, [%rd46]; $L__BB0_31: min.u16 %rs29, %rs45, %rs1; cvt.u32.u16 %r70, %rs29; max.u16 %rs30, %rs45, %rs1; cvt.u32.u16 %r71, %rs30; add.s32 %r72, %r71, 1; mul.lo.s32 %r73, %r72, %r71; shr.u32 %r74, %r73, 1; add.s32 %r75, %r74, %r70; mul.wide.s32 %rd47, %r75, 4; add.s64 %rd48, %rd2, %rd47; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f232, [%rd48]; add.f32 %f70, %f232, %f232; ld.global.nc.f32 %f71, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f69, 0f00000000; @%p33 bra $L__BB0_34; div.rn.f32 %f233, %f71, %f70; mul.f32 %f234, %f233, %f181; mul.f32 %f235, %f6, %f234; sub.f32 %f328, %f1, %f235; fma.rn.f32 %f330, %f1, %f234, %f6; mov.f32 %f329, %f5; $L__BB0_34: mul.f32 %f236, %f181, %f181; div.rn.f32 %f237, %f70, %f236; sub.f32 %f238, %f328, %f1; sub.f32 %f239, %f329, %f5; sub.f32 %f240, %f330, %f6; fma.rn.f32 %f241, %f237, %f238, %f322; fma.rn.f32 
%f323, %f237, %f239, %f323; fma.rn.f32 %f242, %f237, %f240, %f324; div.rn.f32 %f243, %f71, %f181; fma.rn.f32 %f322, %f243, %f330, %f241; mul.f32 %f244, %f243, %f328; sub.f32 %f324, %f242, %f244; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r78, %r37, -1; min.s32 %r108, %r21, %r78; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r76, %r21, %r37; add.s32 %r77, %r76, %r37; rem.s32 %r108, %r77, %r37; $L__BB0_38: add.s32 %r79, %r108, %r4; mad.lo.s32 %r25, %r79, %r36, %r1; setp.ge.s32 %p35, %r21, %r37; mov.f32 %f339, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f338, %f339; mov.f32 %f337, %f339; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f337, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f338, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f339, [%rd53]; $L__BB0_40: mul.f32 %f248, %f337, %f337; fma.rn.f32 %f249, %f338, %f338, %f248; fma.rn.f32 %f97, %f339, %f339, %f249; setp.eq.f32 %p38, %f97, 0f00000000; mov.u16 %rs46, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs46, [%rd55]; $L__BB0_42: min.u16 %rs33, %rs46, %rs1; cvt.u32.u16 %r80, %rs33; max.u16 %rs34, %rs46, %rs1; cvt.u32.u16 %r81, %rs34; add.s32 %r82, %r81, 1; mul.lo.s32 %r83, %r82, %r81; shr.u32 %r84, %r83, 1; add.s32 %r85, %r84, %r80; mul.wide.s32 %rd56, %r85, 4; add.s64 %rd57, %rd2, %rd56; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f250, [%rd57]; add.f32 %f98, %f250, %f250; ld.global.nc.f32 %f99, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f97, 0f00000000; @%p42 bra $L__BB0_45; div.rn.f32 %f251, %f99, %f98; mul.f32 %f252, %f251, %f181; fma.rn.f32 %f337, %f6, %f252, %f1; mul.f32 %f253, %f1, %f252; sub.f32 %f339, %f6, %f253; mov.f32 %f338, %f5; $L__BB0_45: mul.f32 %f254, %f181, %f181; div.rn.f32 %f255, %f98, %f254; sub.f32 %f256, %f337, %f1; sub.f32 %f257, %f338, %f5; sub.f32 %f258, %f339, %f6; fma.rn.f32 
%f259, %f255, %f256, %f322; fma.rn.f32 %f323, %f255, %f257, %f323; fma.rn.f32 %f260, %f255, %f258, %f324; div.rn.f32 %f261, %f99, %f181; mul.f32 %f262, %f261, %f339; sub.f32 %f322, %f259, %f262; fma.rn.f32 %f324, %f261, %f337, %f260; $L__BB0_46: setp.eq.s32 %p43, %r38, 1; and.pred %p45, %p43, %p12; @%p45 bra $L__BB0_69; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p46, %rs12, 0; add.s32 %r26, %r3, -1; @%p46 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r86, %r26, %r38; add.s32 %r87, %r86, %r38; rem.s32 %r109, %r87, %r38; $L__BB0_50: mad.lo.s32 %r88, %r109, %r37, %r2; mad.lo.s32 %r30, %r88, %r36, %r1; setp.lt.s32 %p48, %r3, 1; mov.f32 %f348, 0f00000000; and.pred %p49, %p48, %p46; mov.f32 %f347, %f348; mov.f32 %f346, %f348; @%p49 bra $L__BB0_52; mul.wide.s32 %rd59, %r30, 4; add.s64 %rd60, %rd6, %rd59; ld.global.nc.f32 %f346, [%rd60]; add.s64 %rd61, %rd5, %rd59; ld.global.nc.f32 %f347, [%rd61]; add.s64 %rd62, %rd4, %rd59; ld.global.nc.f32 %f348, [%rd62]; $L__BB0_52: mul.f32 %f266, %f346, %f346; fma.rn.f32 %f267, %f347, %f347, %f266; fma.rn.f32 %f125, %f348, %f348, %f267; setp.eq.f32 %p50, %f125, 0f00000000; mov.u16 %rs47, %rs1; @%p50 bra $L__BB0_54; cvt.s64.s32 %rd63, %r30; add.s64 %rd64, %rd3, %rd63; ld.global.nc.u8 %rs47, [%rd64]; $L__BB0_54: min.u16 %rs37, %rs47, %rs1; cvt.u32.u16 %r89, %rs37; max.u16 %rs38, %rs47, %rs1; cvt.u32.u16 %r90, %rs38; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; add.s32 %r94, %r93, %r89; mul.wide.s32 %rd65, %r94, 4; add.s64 %rd66, %rd2, %rd65; add.s64 %rd67, %rd1, %rd65; ld.global.nc.f32 %f268, [%rd66]; add.f32 %f126, %f268, %f268; ld.global.nc.f32 %f127, [%rd67]; and.pred %p53, %p12, %p50; @%p53 bra $L__BB0_58; setp.neu.f32 %p54, %f125, 0f00000000; @%p54 bra $L__BB0_57; div.rn.f32 %f269, %f127, %f126; mul.f32 %f270, %f269, %f182; fma.rn.f32 %f346, %f5, %f270, %f1; mul.f32 %f271, %f1, %f270; sub.f32 %f347, %f5, %f271; mov.f32 %f348, %f6; $L__BB0_57: 
mul.f32 %f272, %f182, %f182; div.rn.f32 %f273, %f126, %f272; sub.f32 %f274, %f346, %f1; sub.f32 %f275, %f347, %f5; sub.f32 %f276, %f348, %f6; fma.rn.f32 %f277, %f273, %f274, %f322; fma.rn.f32 %f278, %f273, %f275, %f323; fma.rn.f32 %f324, %f273, %f276, %f324; div.rn.f32 %f279, %f127, %f182; mul.f32 %f280, %f279, %f347; sub.f32 %f322, %f277, %f280; fma.rn.f32 %f323, %f279, %f346, %f278; $L__BB0_58: add.s32 %r31, %r3, 1; @%p46 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r97, %r38, -1; min.s32 %r110, %r31, %r97; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r95, %r31, %r38; add.s32 %r96, %r95, %r38; rem.s32 %r110, %r96, %r38; $L__BB0_61: mad.lo.s32 %r98, %r110, %r37, %r2; mad.lo.s32 %r35, %r98, %r36, %r1; setp.ge.s32 %p56, %r31, %r38; mov.f32 %f357, 0f00000000; and.pred %p58, %p56, %p46; mov.f32 %f356, %f357; mov.f32 %f355, %f357; @%p58 bra $L__BB0_63; mul.wide.s32 %rd68, %r35, 4; add.s64 %rd69, %rd6, %rd68; ld.global.nc.f32 %f355, [%rd69]; add.s64 %rd70, %rd5, %rd68; ld.global.nc.f32 %f356, [%rd70]; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f357, [%rd71]; $L__BB0_63: mul.f32 %f284, %f355, %f355; fma.rn.f32 %f285, %f356, %f356, %f284; fma.rn.f32 %f153, %f357, %f357, %f285; setp.eq.f32 %p59, %f153, 0f00000000; mov.u16 %rs48, %rs1; @%p59 bra $L__BB0_65; cvt.s64.s32 %rd72, %r35; add.s64 %rd73, %rd3, %rd72; ld.global.nc.u8 %rs48, [%rd73]; $L__BB0_65: min.u16 %rs41, %rs48, %rs1; cvt.u32.u16 %r99, %rs41; max.u16 %rs42, %rs48, %rs1; cvt.u32.u16 %r100, %rs42; add.s32 %r101, %r100, 1; mul.lo.s32 %r102, %r101, %r100; shr.u32 %r103, %r102, 1; add.s32 %r104, %r103, %r99; mul.wide.s32 %rd74, %r104, 4; add.s64 %rd75, %rd2, %rd74; add.s64 %rd76, %rd1, %rd74; ld.global.nc.f32 %f286, [%rd75]; add.f32 %f154, %f286, %f286; ld.global.nc.f32 %f155, [%rd76]; and.pred %p62, %p12, %p59; @%p62 bra $L__BB0_69; setp.neu.f32 %p63, %f153, 0f00000000; @%p63 bra $L__BB0_68; div.rn.f32 %f287, %f155, %f154; mul.f32 %f288, %f287, %f182; mul.f32 %f289, %f5, %f288; sub.f32 %f355, %f1, %f289; 
fma.rn.f32 %f356, %f1, %f288, %f5; mov.f32 %f357, %f6; $L__BB0_68: mul.f32 %f290, %f182, %f182; div.rn.f32 %f291, %f154, %f290; sub.f32 %f292, %f355, %f1; sub.f32 %f293, %f356, %f5; sub.f32 %f294, %f357, %f6; fma.rn.f32 %f295, %f291, %f292, %f322; fma.rn.f32 %f296, %f291, %f293, %f323; fma.rn.f32 %f324, %f291, %f294, %f324; div.rn.f32 %f297, %f155, %f182; fma.rn.f32 %f322, %f297, %f356, %f295; mul.f32 %f298, %f297, %f355; sub.f32 %f323, %f296, %f298; $L__BB0_69: setp.eq.s64 %p64, %rd11, 0; @%p64 bra $L__BB0_71; cvta.to.global.u64 %rd77, %rd11; shl.b64 %rd78, %rd7, 2; add.s64 %rd79, %rd77, %rd78; ld.global.nc.f32 %f299, [%rd79]; mul.f32 %f361, %f299, %f361; $L__BB0_71: setp.eq.f32 %p65, %f361, 0f00000000; mov.f32 %f362, 0f00000000; @%p65 bra $L__BB0_73; rcp.rn.f32 %f362, %f361; $L__BB0_73: cvta.to.global.u64 %rd80, %rd8; shl.b64 %rd81, %rd7, 2; add.s64 %rd82, %rd80, %rd81; ld.global.f32 %f301, [%rd82]; fma.rn.f32 %f302, %f322, %f362, %f301; st.global.f32 [%rd82], %f302; cvta.to.global.u64 %rd83, %rd9; add.s64 %rd84, %rd83, %rd81; ld.global.f32 %f303, [%rd84]; fma.rn.f32 %f304, %f323, %f362, %f303; st.global.f32 [%rd84], %f304; cvta.to.global.u64 %rd85, %rd10; add.s64 %rd86, %rd85, %rd81; ld.global.f32 %f305, [%rd86]; fma.rn.f32 %f306, %f324, %f362, %f305; st.global.f32 [%rd86], %f306; $L__BB0_74: ret; } ` adddmibulk_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param 
.u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<66>; .reg .b16 %rs<49>; .reg .f32 %f<363>; .reg .b32 %r<111>; .reg .b64 %rd<87>; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; ld.param.u64 %rd8, [adddmibulk_param_0]; ld.param.u64 %rd9, [adddmibulk_param_1]; ld.param.u64 %rd10, [adddmibulk_param_2]; ld.param.u64 %rd12, [adddmibulk_param_3]; ld.param.u64 %rd13, [adddmibulk_param_4]; ld.param.u64 %rd14, [adddmibulk_param_5]; ld.param.u64 %rd11, [adddmibulk_param_6]; ld.param.f32 %f361, [adddmibulk_param_7]; ld.param.u64 %rd15, [adddmibulk_param_8]; ld.param.u64 %rd16, [adddmibulk_param_9]; ld.param.u64 %rd17, [adddmibulk_param_10]; ld.param.f32 %f180, [adddmibulk_param_11]; ld.param.f32 %f181, [adddmibulk_param_12]; ld.param.f32 %f182, [adddmibulk_param_13]; ld.param.u32 %r36, [adddmibulk_param_14]; ld.param.u32 %r37, [adddmibulk_param_15]; ld.param.u32 %r38, [adddmibulk_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r40, %r39, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r43, %r42, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r46, %r45, %r47; setp.ge.s32 %p1, %r1, %r36; setp.ge.s32 %p2, %r2, %r37; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_74; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r49, %r5, %r1; cvt.s64.s32 %rd7, %r49; mul.wide.s32 %rd18, %r49, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; mul.f32 %f185, %f5, %f5; 
fma.rn.f32 %f186, %f1, %f1, %f185; ld.global.nc.f32 %f6, [%rd21]; fma.rn.f32 %f187, %f6, %f6, %f186; setp.eq.f32 %p6, %f187, 0f00000000; @%p6 bra $L__BB0_74; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r50, %r6, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r105, %r51, %r36; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f312, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f311, %f312; mov.f32 %f310, %f312; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f310, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f311, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f312, [%rd26]; $L__BB0_7: mul.f32 %f191, %f310, %f310; fma.rn.f32 %f192, %f311, %f311, %f191; fma.rn.f32 %f13, %f312, %f312, %f192; setp.eq.f32 %p11, %f13, 0f00000000; mov.u16 %rs43, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs43, [%rd28]; $L__BB0_9: min.u16 %rs21, %rs43, %rs1; cvt.u32.u16 %r52, %rs21; max.u16 %rs22, %rs43, %rs1; cvt.u32.u16 %r53, %rs22; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd29, %r57, 4; add.s64 %rd30, %rd2, %rd29; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f196, [%rd30]; add.f32 %f14, %f196, %f196; ld.global.nc.f32 %f15, [%rd31]; setp.ne.s16 %p12, %rs18, 0; mov.f32 %f322, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f323, %f322; mov.f32 %f324, %f322; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f13, 0f00000000; @%p15 bra $L__BB0_12; div.rn.f32 %f197, %f15, %f14; mul.f32 %f198, %f197, %f180; fma.rn.f32 %f311, %f6, %f198, %f5; mul.f32 %f199, %f5, %f198; sub.f32 %f312, %f6, %f199; mov.f32 %f310, %f1; $L__BB0_12: mul.f32 %f200, %f180, %f180; div.rn.f32 %f201, %f14, %f200; sub.f32 %f202, %f310, %f1; sub.f32 %f203, %f311, %f5; sub.f32 %f204, 
%f312, %f6; fma.rn.f32 %f322, %f201, %f202, 0f00000000; fma.rn.f32 %f205, %f201, %f203, 0f00000000; fma.rn.f32 %f206, %f201, %f204, 0f00000000; div.rn.f32 %f207, %f15, %f180; mul.f32 %f208, %f207, %f312; sub.f32 %f323, %f205, %f208; fma.rn.f32 %f324, %f207, %f311, %f206; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r60, %r36, -1; min.s32 %r106, %r11, %r60; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r58, %r11, %r36; add.s32 %r59, %r58, %r36; rem.s32 %r106, %r59, %r36; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r36; mov.f32 %f321, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f320, %f321; mov.f32 %f319, %f321; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f319, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f320, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f321, [%rd35]; $L__BB0_18: mul.f32 %f212, %f319, %f319; fma.rn.f32 %f213, %f320, %f320, %f212; fma.rn.f32 %f41, %f321, %f321, %f213; setp.eq.f32 %p20, %f41, 0f00000000; mov.u16 %rs44, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs44, [%rd37]; $L__BB0_20: min.u16 %rs25, %rs44, %rs1; cvt.u32.u16 %r61, %rs25; max.u16 %rs26, %rs44, %rs1; cvt.u32.u16 %r62, %rs26; add.s32 %r63, %r62, 1; mul.lo.s32 %r64, %r63, %r62; shr.u32 %r65, %r64, 1; add.s32 %r66, %r65, %r61; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd2, %rd38; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f214, [%rd39]; add.f32 %f42, %f214, %f214; ld.global.nc.f32 %f43, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f41, 0f00000000; @%p24 bra $L__BB0_23; div.rn.f32 %f215, %f43, %f42; mul.f32 %f216, %f215, %f180; mul.f32 %f217, %f6, %f216; sub.f32 %f320, %f5, %f217; fma.rn.f32 %f321, %f5, %f216, %f6; mov.f32 %f319, %f1; $L__BB0_23: mul.f32 %f218, %f180, %f180; div.rn.f32 %f219, %f42, %f218; sub.f32 %f220, %f319, %f1; sub.f32 %f221, %f320, 
%f5; sub.f32 %f222, %f321, %f6; fma.rn.f32 %f322, %f219, %f220, %f322; fma.rn.f32 %f223, %f219, %f221, %f323; fma.rn.f32 %f224, %f219, %f222, %f324; div.rn.f32 %f225, %f43, %f180; fma.rn.f32 %f323, %f225, %f321, %f223; mul.f32 %f226, %f225, %f320; sub.f32 %f324, %f224, %f226; $L__BB0_24: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r67, %r16, %r37; add.s32 %r68, %r67, %r37; rem.s32 %r107, %r68, %r37; $L__BB0_27: add.s32 %r69, %r107, %r4; mad.lo.s32 %r20, %r69, %r36, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f330, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f329, %f330; mov.f32 %f328, %f330; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f328, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f329, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f330, [%rd44]; $L__BB0_29: mul.f32 %f230, %f328, %f328; fma.rn.f32 %f231, %f329, %f329, %f230; fma.rn.f32 %f69, %f330, %f330, %f231; setp.eq.f32 %p29, %f69, 0f00000000; mov.u16 %rs45, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs45, [%rd46]; $L__BB0_31: min.u16 %rs29, %rs45, %rs1; cvt.u32.u16 %r70, %rs29; max.u16 %rs30, %rs45, %rs1; cvt.u32.u16 %r71, %rs30; add.s32 %r72, %r71, 1; mul.lo.s32 %r73, %r72, %r71; shr.u32 %r74, %r73, 1; add.s32 %r75, %r74, %r70; mul.wide.s32 %rd47, %r75, 4; add.s64 %rd48, %rd2, %rd47; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f232, [%rd48]; add.f32 %f70, %f232, %f232; ld.global.nc.f32 %f71, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f69, 0f00000000; @%p33 bra $L__BB0_34; div.rn.f32 %f233, %f71, %f70; mul.f32 %f234, %f233, %f181; mul.f32 %f235, %f6, %f234; sub.f32 %f328, %f1, %f235; fma.rn.f32 %f330, %f1, %f234, %f6; mov.f32 %f329, %f5; $L__BB0_34: mul.f32 %f236, %f181, %f181; div.rn.f32 %f237, 
%f70, %f236; sub.f32 %f238, %f328, %f1; sub.f32 %f239, %f329, %f5; sub.f32 %f240, %f330, %f6; fma.rn.f32 %f241, %f237, %f238, %f322; fma.rn.f32 %f323, %f237, %f239, %f323; fma.rn.f32 %f242, %f237, %f240, %f324; div.rn.f32 %f243, %f71, %f181; fma.rn.f32 %f322, %f243, %f330, %f241; mul.f32 %f244, %f243, %f328; sub.f32 %f324, %f242, %f244; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r78, %r37, -1; min.s32 %r108, %r21, %r78; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r76, %r21, %r37; add.s32 %r77, %r76, %r37; rem.s32 %r108, %r77, %r37; $L__BB0_38: add.s32 %r79, %r108, %r4; mad.lo.s32 %r25, %r79, %r36, %r1; setp.ge.s32 %p35, %r21, %r37; mov.f32 %f339, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f338, %f339; mov.f32 %f337, %f339; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f337, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f338, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f339, [%rd53]; $L__BB0_40: mul.f32 %f248, %f337, %f337; fma.rn.f32 %f249, %f338, %f338, %f248; fma.rn.f32 %f97, %f339, %f339, %f249; setp.eq.f32 %p38, %f97, 0f00000000; mov.u16 %rs46, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs46, [%rd55]; $L__BB0_42: min.u16 %rs33, %rs46, %rs1; cvt.u32.u16 %r80, %rs33; max.u16 %rs34, %rs46, %rs1; cvt.u32.u16 %r81, %rs34; add.s32 %r82, %r81, 1; mul.lo.s32 %r83, %r82, %r81; shr.u32 %r84, %r83, 1; add.s32 %r85, %r84, %r80; mul.wide.s32 %rd56, %r85, 4; add.s64 %rd57, %rd2, %rd56; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f250, [%rd57]; add.f32 %f98, %f250, %f250; ld.global.nc.f32 %f99, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f97, 0f00000000; @%p42 bra $L__BB0_45; div.rn.f32 %f251, %f99, %f98; mul.f32 %f252, %f251, %f181; fma.rn.f32 %f337, %f6, %f252, %f1; mul.f32 %f253, %f1, %f252; sub.f32 %f339, %f6, %f253; mov.f32 %f338, %f5; $L__BB0_45: mul.f32 
%f254, %f181, %f181; div.rn.f32 %f255, %f98, %f254; sub.f32 %f256, %f337, %f1; sub.f32 %f257, %f338, %f5; sub.f32 %f258, %f339, %f6; fma.rn.f32 %f259, %f255, %f256, %f322; fma.rn.f32 %f323, %f255, %f257, %f323; fma.rn.f32 %f260, %f255, %f258, %f324; div.rn.f32 %f261, %f99, %f181; mul.f32 %f262, %f261, %f339; sub.f32 %f322, %f259, %f262; fma.rn.f32 %f324, %f261, %f337, %f260; $L__BB0_46: setp.eq.s32 %p43, %r38, 1; and.pred %p45, %p43, %p12; @%p45 bra $L__BB0_69; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p46, %rs12, 0; add.s32 %r26, %r3, -1; @%p46 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r86, %r26, %r38; add.s32 %r87, %r86, %r38; rem.s32 %r109, %r87, %r38; $L__BB0_50: mad.lo.s32 %r88, %r109, %r37, %r2; mad.lo.s32 %r30, %r88, %r36, %r1; setp.lt.s32 %p48, %r3, 1; mov.f32 %f348, 0f00000000; and.pred %p49, %p48, %p46; mov.f32 %f347, %f348; mov.f32 %f346, %f348; @%p49 bra $L__BB0_52; mul.wide.s32 %rd59, %r30, 4; add.s64 %rd60, %rd6, %rd59; ld.global.nc.f32 %f346, [%rd60]; add.s64 %rd61, %rd5, %rd59; ld.global.nc.f32 %f347, [%rd61]; add.s64 %rd62, %rd4, %rd59; ld.global.nc.f32 %f348, [%rd62]; $L__BB0_52: mul.f32 %f266, %f346, %f346; fma.rn.f32 %f267, %f347, %f347, %f266; fma.rn.f32 %f125, %f348, %f348, %f267; setp.eq.f32 %p50, %f125, 0f00000000; mov.u16 %rs47, %rs1; @%p50 bra $L__BB0_54; cvt.s64.s32 %rd63, %r30; add.s64 %rd64, %rd3, %rd63; ld.global.nc.u8 %rs47, [%rd64]; $L__BB0_54: min.u16 %rs37, %rs47, %rs1; cvt.u32.u16 %r89, %rs37; max.u16 %rs38, %rs47, %rs1; cvt.u32.u16 %r90, %rs38; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; add.s32 %r94, %r93, %r89; mul.wide.s32 %rd65, %r94, 4; add.s64 %rd66, %rd2, %rd65; add.s64 %rd67, %rd1, %rd65; ld.global.nc.f32 %f268, [%rd66]; add.f32 %f126, %f268, %f268; ld.global.nc.f32 %f127, [%rd67]; and.pred %p53, %p12, %p50; @%p53 bra $L__BB0_58; setp.neu.f32 %p54, %f125, 0f00000000; @%p54 bra $L__BB0_57; div.rn.f32 %f269, %f127, %f126; 
mul.f32 %f270, %f269, %f182; fma.rn.f32 %f346, %f5, %f270, %f1; mul.f32 %f271, %f1, %f270; sub.f32 %f347, %f5, %f271; mov.f32 %f348, %f6; $L__BB0_57: mul.f32 %f272, %f182, %f182; div.rn.f32 %f273, %f126, %f272; sub.f32 %f274, %f346, %f1; sub.f32 %f275, %f347, %f5; sub.f32 %f276, %f348, %f6; fma.rn.f32 %f277, %f273, %f274, %f322; fma.rn.f32 %f278, %f273, %f275, %f323; fma.rn.f32 %f324, %f273, %f276, %f324; div.rn.f32 %f279, %f127, %f182; mul.f32 %f280, %f279, %f347; sub.f32 %f322, %f277, %f280; fma.rn.f32 %f323, %f279, %f346, %f278; $L__BB0_58: add.s32 %r31, %r3, 1; @%p46 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r97, %r38, -1; min.s32 %r110, %r31, %r97; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r95, %r31, %r38; add.s32 %r96, %r95, %r38; rem.s32 %r110, %r96, %r38; $L__BB0_61: mad.lo.s32 %r98, %r110, %r37, %r2; mad.lo.s32 %r35, %r98, %r36, %r1; setp.ge.s32 %p56, %r31, %r38; mov.f32 %f357, 0f00000000; and.pred %p58, %p56, %p46; mov.f32 %f356, %f357; mov.f32 %f355, %f357; @%p58 bra $L__BB0_63; mul.wide.s32 %rd68, %r35, 4; add.s64 %rd69, %rd6, %rd68; ld.global.nc.f32 %f355, [%rd69]; add.s64 %rd70, %rd5, %rd68; ld.global.nc.f32 %f356, [%rd70]; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f357, [%rd71]; $L__BB0_63: mul.f32 %f284, %f355, %f355; fma.rn.f32 %f285, %f356, %f356, %f284; fma.rn.f32 %f153, %f357, %f357, %f285; setp.eq.f32 %p59, %f153, 0f00000000; mov.u16 %rs48, %rs1; @%p59 bra $L__BB0_65; cvt.s64.s32 %rd72, %r35; add.s64 %rd73, %rd3, %rd72; ld.global.nc.u8 %rs48, [%rd73]; $L__BB0_65: min.u16 %rs41, %rs48, %rs1; cvt.u32.u16 %r99, %rs41; max.u16 %rs42, %rs48, %rs1; cvt.u32.u16 %r100, %rs42; add.s32 %r101, %r100, 1; mul.lo.s32 %r102, %r101, %r100; shr.u32 %r103, %r102, 1; add.s32 %r104, %r103, %r99; mul.wide.s32 %rd74, %r104, 4; add.s64 %rd75, %rd2, %rd74; add.s64 %rd76, %rd1, %rd74; ld.global.nc.f32 %f286, [%rd75]; add.f32 %f154, %f286, %f286; ld.global.nc.f32 %f155, [%rd76]; and.pred %p62, %p12, %p59; @%p62 bra $L__BB0_69; setp.neu.f32 %p63, %f153, 
0f00000000; @%p63 bra $L__BB0_68; div.rn.f32 %f287, %f155, %f154; mul.f32 %f288, %f287, %f182; mul.f32 %f289, %f5, %f288; sub.f32 %f355, %f1, %f289; fma.rn.f32 %f356, %f1, %f288, %f5; mov.f32 %f357, %f6; $L__BB0_68: mul.f32 %f290, %f182, %f182; div.rn.f32 %f291, %f154, %f290; sub.f32 %f292, %f355, %f1; sub.f32 %f293, %f356, %f5; sub.f32 %f294, %f357, %f6; fma.rn.f32 %f295, %f291, %f292, %f322; fma.rn.f32 %f296, %f291, %f293, %f323; fma.rn.f32 %f324, %f291, %f294, %f324; div.rn.f32 %f297, %f155, %f182; fma.rn.f32 %f322, %f297, %f356, %f295; mul.f32 %f298, %f297, %f355; sub.f32 %f323, %f296, %f298; $L__BB0_69: setp.eq.s64 %p64, %rd11, 0; @%p64 bra $L__BB0_71; cvta.to.global.u64 %rd77, %rd11; shl.b64 %rd78, %rd7, 2; add.s64 %rd79, %rd77, %rd78; ld.global.nc.f32 %f299, [%rd79]; mul.f32 %f361, %f299, %f361; $L__BB0_71: setp.eq.f32 %p65, %f361, 0f00000000; mov.f32 %f362, 0f00000000; @%p65 bra $L__BB0_73; rcp.rn.f32 %f362, %f361; $L__BB0_73: cvta.to.global.u64 %rd80, %rd8; shl.b64 %rd81, %rd7, 2; add.s64 %rd82, %rd80, %rd81; ld.global.f32 %f301, [%rd82]; fma.rn.f32 %f302, %f322, %f362, %f301; st.global.f32 [%rd82], %f302; cvta.to.global.u64 %rd83, %rd9; add.s64 %rd84, %rd83, %rd81; ld.global.f32 %f303, [%rd84]; fma.rn.f32 %f304, %f323, %f362, %f303; st.global.f32 [%rd84], %f304; cvta.to.global.u64 %rd85, %rd10; add.s64 %rd86, %rd85, %rd81; ld.global.f32 %f305, [%rd86]; fma.rn.f32 %f306, %f324, %f362, %f305; st.global.f32 [%rd86], %f306; $L__BB0_74: ret; } ` adddmibulk_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 
adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<66>; .reg .b16 %rs<49>; .reg .f32 %f<363>; .reg .b32 %r<111>; .reg .b64 %rd<87>; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; ld.param.u64 %rd8, [adddmibulk_param_0]; ld.param.u64 %rd9, [adddmibulk_param_1]; ld.param.u64 %rd10, [adddmibulk_param_2]; ld.param.u64 %rd12, [adddmibulk_param_3]; ld.param.u64 %rd13, [adddmibulk_param_4]; ld.param.u64 %rd14, [adddmibulk_param_5]; ld.param.u64 %rd11, [adddmibulk_param_6]; ld.param.f32 %f361, [adddmibulk_param_7]; ld.param.u64 %rd15, [adddmibulk_param_8]; ld.param.u64 %rd16, [adddmibulk_param_9]; ld.param.u64 %rd17, [adddmibulk_param_10]; ld.param.f32 %f180, [adddmibulk_param_11]; ld.param.f32 %f181, [adddmibulk_param_12]; ld.param.f32 %f182, [adddmibulk_param_13]; ld.param.u32 %r36, [adddmibulk_param_14]; ld.param.u32 %r37, [adddmibulk_param_15]; ld.param.u32 %r38, [adddmibulk_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r40, %r39, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r43, %r42, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r46, %r45, %r47; setp.ge.s32 %p1, %r1, %r36; setp.ge.s32 %p2, %r2, %r37; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_74; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r49, %r5, %r1; cvt.s64.s32 %rd7, %r49; mul.wide.s32 %rd18, %r49, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, 
%rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; mul.f32 %f185, %f5, %f5; fma.rn.f32 %f186, %f1, %f1, %f185; ld.global.nc.f32 %f6, [%rd21]; fma.rn.f32 %f187, %f6, %f6, %f186; setp.eq.f32 %p6, %f187, 0f00000000; @%p6 bra $L__BB0_74; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r50, %r6, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r105, %r51, %r36; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f312, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f311, %f312; mov.f32 %f310, %f312; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f310, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f311, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f312, [%rd26]; $L__BB0_7: mul.f32 %f191, %f310, %f310; fma.rn.f32 %f192, %f311, %f311, %f191; fma.rn.f32 %f13, %f312, %f312, %f192; setp.eq.f32 %p11, %f13, 0f00000000; mov.u16 %rs43, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs43, [%rd28]; $L__BB0_9: min.u16 %rs21, %rs43, %rs1; cvt.u32.u16 %r52, %rs21; max.u16 %rs22, %rs43, %rs1; cvt.u32.u16 %r53, %rs22; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd29, %r57, 4; add.s64 %rd30, %rd2, %rd29; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f196, [%rd30]; add.f32 %f14, %f196, %f196; ld.global.nc.f32 %f15, [%rd31]; setp.ne.s16 %p12, %rs18, 0; mov.f32 %f322, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f323, %f322; mov.f32 %f324, %f322; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f13, 0f00000000; @%p15 bra $L__BB0_12; div.rn.f32 %f197, %f15, %f14; mul.f32 %f198, %f197, %f180; fma.rn.f32 %f311, %f6, %f198, %f5; mul.f32 %f199, %f5, %f198; sub.f32 %f312, %f6, %f199; mov.f32 %f310, 
%f1; $L__BB0_12: mul.f32 %f200, %f180, %f180; div.rn.f32 %f201, %f14, %f200; sub.f32 %f202, %f310, %f1; sub.f32 %f203, %f311, %f5; sub.f32 %f204, %f312, %f6; fma.rn.f32 %f322, %f201, %f202, 0f00000000; fma.rn.f32 %f205, %f201, %f203, 0f00000000; fma.rn.f32 %f206, %f201, %f204, 0f00000000; div.rn.f32 %f207, %f15, %f180; mul.f32 %f208, %f207, %f312; sub.f32 %f323, %f205, %f208; fma.rn.f32 %f324, %f207, %f311, %f206; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r60, %r36, -1; min.s32 %r106, %r11, %r60; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r58, %r11, %r36; add.s32 %r59, %r58, %r36; rem.s32 %r106, %r59, %r36; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r36; mov.f32 %f321, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f320, %f321; mov.f32 %f319, %f321; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f319, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f320, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f321, [%rd35]; $L__BB0_18: mul.f32 %f212, %f319, %f319; fma.rn.f32 %f213, %f320, %f320, %f212; fma.rn.f32 %f41, %f321, %f321, %f213; setp.eq.f32 %p20, %f41, 0f00000000; mov.u16 %rs44, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs44, [%rd37]; $L__BB0_20: min.u16 %rs25, %rs44, %rs1; cvt.u32.u16 %r61, %rs25; max.u16 %rs26, %rs44, %rs1; cvt.u32.u16 %r62, %rs26; add.s32 %r63, %r62, 1; mul.lo.s32 %r64, %r63, %r62; shr.u32 %r65, %r64, 1; add.s32 %r66, %r65, %r61; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd2, %rd38; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f214, [%rd39]; add.f32 %f42, %f214, %f214; ld.global.nc.f32 %f43, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f41, 0f00000000; @%p24 bra $L__BB0_23; div.rn.f32 %f215, %f43, %f42; mul.f32 %f216, %f215, %f180; mul.f32 %f217, %f6, %f216; sub.f32 %f320, %f5, %f217; fma.rn.f32 %f321, %f5, %f216, 
%f6; mov.f32 %f319, %f1; $L__BB0_23: mul.f32 %f218, %f180, %f180; div.rn.f32 %f219, %f42, %f218; sub.f32 %f220, %f319, %f1; sub.f32 %f221, %f320, %f5; sub.f32 %f222, %f321, %f6; fma.rn.f32 %f322, %f219, %f220, %f322; fma.rn.f32 %f223, %f219, %f221, %f323; fma.rn.f32 %f224, %f219, %f222, %f324; div.rn.f32 %f225, %f43, %f180; fma.rn.f32 %f323, %f225, %f321, %f223; mul.f32 %f226, %f225, %f320; sub.f32 %f324, %f224, %f226; $L__BB0_24: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r67, %r16, %r37; add.s32 %r68, %r67, %r37; rem.s32 %r107, %r68, %r37; $L__BB0_27: add.s32 %r69, %r107, %r4; mad.lo.s32 %r20, %r69, %r36, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f330, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f329, %f330; mov.f32 %f328, %f330; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f328, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f329, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f330, [%rd44]; $L__BB0_29: mul.f32 %f230, %f328, %f328; fma.rn.f32 %f231, %f329, %f329, %f230; fma.rn.f32 %f69, %f330, %f330, %f231; setp.eq.f32 %p29, %f69, 0f00000000; mov.u16 %rs45, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs45, [%rd46]; $L__BB0_31: min.u16 %rs29, %rs45, %rs1; cvt.u32.u16 %r70, %rs29; max.u16 %rs30, %rs45, %rs1; cvt.u32.u16 %r71, %rs30; add.s32 %r72, %r71, 1; mul.lo.s32 %r73, %r72, %r71; shr.u32 %r74, %r73, 1; add.s32 %r75, %r74, %r70; mul.wide.s32 %rd47, %r75, 4; add.s64 %rd48, %rd2, %rd47; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f232, [%rd48]; add.f32 %f70, %f232, %f232; ld.global.nc.f32 %f71, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f69, 0f00000000; @%p33 bra $L__BB0_34; div.rn.f32 %f233, %f71, %f70; mul.f32 %f234, %f233, %f181; mul.f32 %f235, %f6, 
%f234; sub.f32 %f328, %f1, %f235; fma.rn.f32 %f330, %f1, %f234, %f6; mov.f32 %f329, %f5; $L__BB0_34: mul.f32 %f236, %f181, %f181; div.rn.f32 %f237, %f70, %f236; sub.f32 %f238, %f328, %f1; sub.f32 %f239, %f329, %f5; sub.f32 %f240, %f330, %f6; fma.rn.f32 %f241, %f237, %f238, %f322; fma.rn.f32 %f323, %f237, %f239, %f323; fma.rn.f32 %f242, %f237, %f240, %f324; div.rn.f32 %f243, %f71, %f181; fma.rn.f32 %f322, %f243, %f330, %f241; mul.f32 %f244, %f243, %f328; sub.f32 %f324, %f242, %f244; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r78, %r37, -1; min.s32 %r108, %r21, %r78; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r76, %r21, %r37; add.s32 %r77, %r76, %r37; rem.s32 %r108, %r77, %r37; $L__BB0_38: add.s32 %r79, %r108, %r4; mad.lo.s32 %r25, %r79, %r36, %r1; setp.ge.s32 %p35, %r21, %r37; mov.f32 %f339, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f338, %f339; mov.f32 %f337, %f339; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f337, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f338, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f339, [%rd53]; $L__BB0_40: mul.f32 %f248, %f337, %f337; fma.rn.f32 %f249, %f338, %f338, %f248; fma.rn.f32 %f97, %f339, %f339, %f249; setp.eq.f32 %p38, %f97, 0f00000000; mov.u16 %rs46, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs46, [%rd55]; $L__BB0_42: min.u16 %rs33, %rs46, %rs1; cvt.u32.u16 %r80, %rs33; max.u16 %rs34, %rs46, %rs1; cvt.u32.u16 %r81, %rs34; add.s32 %r82, %r81, 1; mul.lo.s32 %r83, %r82, %r81; shr.u32 %r84, %r83, 1; add.s32 %r85, %r84, %r80; mul.wide.s32 %rd56, %r85, 4; add.s64 %rd57, %rd2, %rd56; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f250, [%rd57]; add.f32 %f98, %f250, %f250; ld.global.nc.f32 %f99, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f97, 0f00000000; @%p42 bra $L__BB0_45; div.rn.f32 %f251, %f99, %f98; mul.f32 
%f252, %f251, %f181; fma.rn.f32 %f337, %f6, %f252, %f1; mul.f32 %f253, %f1, %f252; sub.f32 %f339, %f6, %f253; mov.f32 %f338, %f5; $L__BB0_45: mul.f32 %f254, %f181, %f181; div.rn.f32 %f255, %f98, %f254; sub.f32 %f256, %f337, %f1; sub.f32 %f257, %f338, %f5; sub.f32 %f258, %f339, %f6; fma.rn.f32 %f259, %f255, %f256, %f322; fma.rn.f32 %f323, %f255, %f257, %f323; fma.rn.f32 %f260, %f255, %f258, %f324; div.rn.f32 %f261, %f99, %f181; mul.f32 %f262, %f261, %f339; sub.f32 %f322, %f259, %f262; fma.rn.f32 %f324, %f261, %f337, %f260; $L__BB0_46: setp.eq.s32 %p43, %r38, 1; and.pred %p45, %p43, %p12; @%p45 bra $L__BB0_69; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p46, %rs12, 0; add.s32 %r26, %r3, -1; @%p46 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r86, %r26, %r38; add.s32 %r87, %r86, %r38; rem.s32 %r109, %r87, %r38; $L__BB0_50: mad.lo.s32 %r88, %r109, %r37, %r2; mad.lo.s32 %r30, %r88, %r36, %r1; setp.lt.s32 %p48, %r3, 1; mov.f32 %f348, 0f00000000; and.pred %p49, %p48, %p46; mov.f32 %f347, %f348; mov.f32 %f346, %f348; @%p49 bra $L__BB0_52; mul.wide.s32 %rd59, %r30, 4; add.s64 %rd60, %rd6, %rd59; ld.global.nc.f32 %f346, [%rd60]; add.s64 %rd61, %rd5, %rd59; ld.global.nc.f32 %f347, [%rd61]; add.s64 %rd62, %rd4, %rd59; ld.global.nc.f32 %f348, [%rd62]; $L__BB0_52: mul.f32 %f266, %f346, %f346; fma.rn.f32 %f267, %f347, %f347, %f266; fma.rn.f32 %f125, %f348, %f348, %f267; setp.eq.f32 %p50, %f125, 0f00000000; mov.u16 %rs47, %rs1; @%p50 bra $L__BB0_54; cvt.s64.s32 %rd63, %r30; add.s64 %rd64, %rd3, %rd63; ld.global.nc.u8 %rs47, [%rd64]; $L__BB0_54: min.u16 %rs37, %rs47, %rs1; cvt.u32.u16 %r89, %rs37; max.u16 %rs38, %rs47, %rs1; cvt.u32.u16 %r90, %rs38; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; add.s32 %r94, %r93, %r89; mul.wide.s32 %rd65, %r94, 4; add.s64 %rd66, %rd2, %rd65; add.s64 %rd67, %rd1, %rd65; ld.global.nc.f32 %f268, [%rd66]; add.f32 %f126, %f268, %f268; ld.global.nc.f32 %f127, 
[%rd67]; and.pred %p53, %p12, %p50; @%p53 bra $L__BB0_58; setp.neu.f32 %p54, %f125, 0f00000000; @%p54 bra $L__BB0_57; div.rn.f32 %f269, %f127, %f126; mul.f32 %f270, %f269, %f182; fma.rn.f32 %f346, %f5, %f270, %f1; mul.f32 %f271, %f1, %f270; sub.f32 %f347, %f5, %f271; mov.f32 %f348, %f6; $L__BB0_57: mul.f32 %f272, %f182, %f182; div.rn.f32 %f273, %f126, %f272; sub.f32 %f274, %f346, %f1; sub.f32 %f275, %f347, %f5; sub.f32 %f276, %f348, %f6; fma.rn.f32 %f277, %f273, %f274, %f322; fma.rn.f32 %f278, %f273, %f275, %f323; fma.rn.f32 %f324, %f273, %f276, %f324; div.rn.f32 %f279, %f127, %f182; mul.f32 %f280, %f279, %f347; sub.f32 %f322, %f277, %f280; fma.rn.f32 %f323, %f279, %f346, %f278; $L__BB0_58: add.s32 %r31, %r3, 1; @%p46 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r97, %r38, -1; min.s32 %r110, %r31, %r97; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r95, %r31, %r38; add.s32 %r96, %r95, %r38; rem.s32 %r110, %r96, %r38; $L__BB0_61: mad.lo.s32 %r98, %r110, %r37, %r2; mad.lo.s32 %r35, %r98, %r36, %r1; setp.ge.s32 %p56, %r31, %r38; mov.f32 %f357, 0f00000000; and.pred %p58, %p56, %p46; mov.f32 %f356, %f357; mov.f32 %f355, %f357; @%p58 bra $L__BB0_63; mul.wide.s32 %rd68, %r35, 4; add.s64 %rd69, %rd6, %rd68; ld.global.nc.f32 %f355, [%rd69]; add.s64 %rd70, %rd5, %rd68; ld.global.nc.f32 %f356, [%rd70]; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f357, [%rd71]; $L__BB0_63: mul.f32 %f284, %f355, %f355; fma.rn.f32 %f285, %f356, %f356, %f284; fma.rn.f32 %f153, %f357, %f357, %f285; setp.eq.f32 %p59, %f153, 0f00000000; mov.u16 %rs48, %rs1; @%p59 bra $L__BB0_65; cvt.s64.s32 %rd72, %r35; add.s64 %rd73, %rd3, %rd72; ld.global.nc.u8 %rs48, [%rd73]; $L__BB0_65: min.u16 %rs41, %rs48, %rs1; cvt.u32.u16 %r99, %rs41; max.u16 %rs42, %rs48, %rs1; cvt.u32.u16 %r100, %rs42; add.s32 %r101, %r100, 1; mul.lo.s32 %r102, %r101, %r100; shr.u32 %r103, %r102, 1; add.s32 %r104, %r103, %r99; mul.wide.s32 %rd74, %r104, 4; add.s64 %rd75, %rd2, %rd74; add.s64 %rd76, %rd1, %rd74; ld.global.nc.f32 
%f286, [%rd75]; add.f32 %f154, %f286, %f286; ld.global.nc.f32 %f155, [%rd76]; and.pred %p62, %p12, %p59; @%p62 bra $L__BB0_69; setp.neu.f32 %p63, %f153, 0f00000000; @%p63 bra $L__BB0_68; div.rn.f32 %f287, %f155, %f154; mul.f32 %f288, %f287, %f182; mul.f32 %f289, %f5, %f288; sub.f32 %f355, %f1, %f289; fma.rn.f32 %f356, %f1, %f288, %f5; mov.f32 %f357, %f6; $L__BB0_68: mul.f32 %f290, %f182, %f182; div.rn.f32 %f291, %f154, %f290; sub.f32 %f292, %f355, %f1; sub.f32 %f293, %f356, %f5; sub.f32 %f294, %f357, %f6; fma.rn.f32 %f295, %f291, %f292, %f322; fma.rn.f32 %f296, %f291, %f293, %f323; fma.rn.f32 %f324, %f291, %f294, %f324; div.rn.f32 %f297, %f155, %f182; fma.rn.f32 %f322, %f297, %f356, %f295; mul.f32 %f298, %f297, %f355; sub.f32 %f323, %f296, %f298; $L__BB0_69: setp.eq.s64 %p64, %rd11, 0; @%p64 bra $L__BB0_71; cvta.to.global.u64 %rd77, %rd11; shl.b64 %rd78, %rd7, 2; add.s64 %rd79, %rd77, %rd78; ld.global.nc.f32 %f299, [%rd79]; mul.f32 %f361, %f299, %f361; $L__BB0_71: setp.eq.f32 %p65, %f361, 0f00000000; mov.f32 %f362, 0f00000000; @%p65 bra $L__BB0_73; rcp.rn.f32 %f362, %f361; $L__BB0_73: cvta.to.global.u64 %rd80, %rd8; shl.b64 %rd81, %rd7, 2; add.s64 %rd82, %rd80, %rd81; ld.global.f32 %f301, [%rd82]; fma.rn.f32 %f302, %f322, %f362, %f301; st.global.f32 [%rd82], %f302; cvta.to.global.u64 %rd83, %rd9; add.s64 %rd84, %rd83, %rd81; ld.global.f32 %f303, [%rd84]; fma.rn.f32 %f304, %f323, %f362, %f303; st.global.f32 [%rd84], %f304; cvta.to.global.u64 %rd85, %rd10; add.s64 %rd86, %rd85, %rd81; ld.global.f32 %f305, [%rd86]; fma.rn.f32 %f306, %f324, %f362, %f305; st.global.f32 [%rd86], %f306; $L__BB0_74: ret; } ` adddmibulk_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, 
.param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<66>; .reg .b16 %rs<49>; .reg .f32 %f<363>; .reg .b32 %r<111>; .reg .b64 %rd<87>; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; ld.param.u64 %rd8, [adddmibulk_param_0]; ld.param.u64 %rd9, [adddmibulk_param_1]; ld.param.u64 %rd10, [adddmibulk_param_2]; ld.param.u64 %rd12, [adddmibulk_param_3]; ld.param.u64 %rd13, [adddmibulk_param_4]; ld.param.u64 %rd14, [adddmibulk_param_5]; ld.param.u64 %rd11, [adddmibulk_param_6]; ld.param.f32 %f361, [adddmibulk_param_7]; ld.param.u64 %rd15, [adddmibulk_param_8]; ld.param.u64 %rd16, [adddmibulk_param_9]; ld.param.u64 %rd17, [adddmibulk_param_10]; ld.param.f32 %f180, [adddmibulk_param_11]; ld.param.f32 %f181, [adddmibulk_param_12]; ld.param.f32 %f182, [adddmibulk_param_13]; ld.param.u32 %r36, [adddmibulk_param_14]; ld.param.u32 %r37, [adddmibulk_param_15]; ld.param.u32 %r38, [adddmibulk_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r40, %r39, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r43, %r42, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r46, %r45, %r47; setp.ge.s32 %p1, %r1, %r36; setp.ge.s32 %p2, %r2, %r37; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_74; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 
%r49, %r5, %r1; cvt.s64.s32 %rd7, %r49; mul.wide.s32 %rd18, %r49, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; mul.f32 %f185, %f5, %f5; fma.rn.f32 %f186, %f1, %f1, %f185; ld.global.nc.f32 %f6, [%rd21]; fma.rn.f32 %f187, %f6, %f6, %f186; setp.eq.f32 %p6, %f187, 0f00000000; @%p6 bra $L__BB0_74; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r50, %r6, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r105, %r51, %r36; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f312, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f311, %f312; mov.f32 %f310, %f312; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f310, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f311, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f312, [%rd26]; $L__BB0_7: mul.f32 %f191, %f310, %f310; fma.rn.f32 %f192, %f311, %f311, %f191; fma.rn.f32 %f13, %f312, %f312, %f192; setp.eq.f32 %p11, %f13, 0f00000000; mov.u16 %rs43, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs43, [%rd28]; $L__BB0_9: min.u16 %rs21, %rs43, %rs1; cvt.u32.u16 %r52, %rs21; max.u16 %rs22, %rs43, %rs1; cvt.u32.u16 %r53, %rs22; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd29, %r57, 4; add.s64 %rd30, %rd2, %rd29; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f196, [%rd30]; add.f32 %f14, %f196, %f196; ld.global.nc.f32 %f15, [%rd31]; setp.ne.s16 %p12, %rs18, 0; mov.f32 %f322, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f323, %f322; mov.f32 %f324, %f322; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f13, 0f00000000; @%p15 bra $L__BB0_12; div.rn.f32 
%f197, %f15, %f14; mul.f32 %f198, %f197, %f180; fma.rn.f32 %f311, %f6, %f198, %f5; mul.f32 %f199, %f5, %f198; sub.f32 %f312, %f6, %f199; mov.f32 %f310, %f1; $L__BB0_12: mul.f32 %f200, %f180, %f180; div.rn.f32 %f201, %f14, %f200; sub.f32 %f202, %f310, %f1; sub.f32 %f203, %f311, %f5; sub.f32 %f204, %f312, %f6; fma.rn.f32 %f322, %f201, %f202, 0f00000000; fma.rn.f32 %f205, %f201, %f203, 0f00000000; fma.rn.f32 %f206, %f201, %f204, 0f00000000; div.rn.f32 %f207, %f15, %f180; mul.f32 %f208, %f207, %f312; sub.f32 %f323, %f205, %f208; fma.rn.f32 %f324, %f207, %f311, %f206; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r60, %r36, -1; min.s32 %r106, %r11, %r60; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r58, %r11, %r36; add.s32 %r59, %r58, %r36; rem.s32 %r106, %r59, %r36; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r36; mov.f32 %f321, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f320, %f321; mov.f32 %f319, %f321; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f319, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f320, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f321, [%rd35]; $L__BB0_18: mul.f32 %f212, %f319, %f319; fma.rn.f32 %f213, %f320, %f320, %f212; fma.rn.f32 %f41, %f321, %f321, %f213; setp.eq.f32 %p20, %f41, 0f00000000; mov.u16 %rs44, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs44, [%rd37]; $L__BB0_20: min.u16 %rs25, %rs44, %rs1; cvt.u32.u16 %r61, %rs25; max.u16 %rs26, %rs44, %rs1; cvt.u32.u16 %r62, %rs26; add.s32 %r63, %r62, 1; mul.lo.s32 %r64, %r63, %r62; shr.u32 %r65, %r64, 1; add.s32 %r66, %r65, %r61; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd2, %rd38; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f214, [%rd39]; add.f32 %f42, %f214, %f214; ld.global.nc.f32 %f43, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f41, 0f00000000; @%p24 bra 
$L__BB0_23; div.rn.f32 %f215, %f43, %f42; mul.f32 %f216, %f215, %f180; mul.f32 %f217, %f6, %f216; sub.f32 %f320, %f5, %f217; fma.rn.f32 %f321, %f5, %f216, %f6; mov.f32 %f319, %f1; $L__BB0_23: mul.f32 %f218, %f180, %f180; div.rn.f32 %f219, %f42, %f218; sub.f32 %f220, %f319, %f1; sub.f32 %f221, %f320, %f5; sub.f32 %f222, %f321, %f6; fma.rn.f32 %f322, %f219, %f220, %f322; fma.rn.f32 %f223, %f219, %f221, %f323; fma.rn.f32 %f224, %f219, %f222, %f324; div.rn.f32 %f225, %f43, %f180; fma.rn.f32 %f323, %f225, %f321, %f223; mul.f32 %f226, %f225, %f320; sub.f32 %f324, %f224, %f226; $L__BB0_24: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r67, %r16, %r37; add.s32 %r68, %r67, %r37; rem.s32 %r107, %r68, %r37; $L__BB0_27: add.s32 %r69, %r107, %r4; mad.lo.s32 %r20, %r69, %r36, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f330, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f329, %f330; mov.f32 %f328, %f330; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f328, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f329, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f330, [%rd44]; $L__BB0_29: mul.f32 %f230, %f328, %f328; fma.rn.f32 %f231, %f329, %f329, %f230; fma.rn.f32 %f69, %f330, %f330, %f231; setp.eq.f32 %p29, %f69, 0f00000000; mov.u16 %rs45, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs45, [%rd46]; $L__BB0_31: min.u16 %rs29, %rs45, %rs1; cvt.u32.u16 %r70, %rs29; max.u16 %rs30, %rs45, %rs1; cvt.u32.u16 %r71, %rs30; add.s32 %r72, %r71, 1; mul.lo.s32 %r73, %r72, %r71; shr.u32 %r74, %r73, 1; add.s32 %r75, %r74, %r70; mul.wide.s32 %rd47, %r75, 4; add.s64 %rd48, %rd2, %rd47; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f232, [%rd48]; add.f32 %f70, %f232, %f232; ld.global.nc.f32 %f71, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra 
$L__BB0_35; setp.neu.f32 %p33, %f69, 0f00000000; @%p33 bra $L__BB0_34; div.rn.f32 %f233, %f71, %f70; mul.f32 %f234, %f233, %f181; mul.f32 %f235, %f6, %f234; sub.f32 %f328, %f1, %f235; fma.rn.f32 %f330, %f1, %f234, %f6; mov.f32 %f329, %f5; $L__BB0_34: mul.f32 %f236, %f181, %f181; div.rn.f32 %f237, %f70, %f236; sub.f32 %f238, %f328, %f1; sub.f32 %f239, %f329, %f5; sub.f32 %f240, %f330, %f6; fma.rn.f32 %f241, %f237, %f238, %f322; fma.rn.f32 %f323, %f237, %f239, %f323; fma.rn.f32 %f242, %f237, %f240, %f324; div.rn.f32 %f243, %f71, %f181; fma.rn.f32 %f322, %f243, %f330, %f241; mul.f32 %f244, %f243, %f328; sub.f32 %f324, %f242, %f244; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r78, %r37, -1; min.s32 %r108, %r21, %r78; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r76, %r21, %r37; add.s32 %r77, %r76, %r37; rem.s32 %r108, %r77, %r37; $L__BB0_38: add.s32 %r79, %r108, %r4; mad.lo.s32 %r25, %r79, %r36, %r1; setp.ge.s32 %p35, %r21, %r37; mov.f32 %f339, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f338, %f339; mov.f32 %f337, %f339; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f337, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f338, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f339, [%rd53]; $L__BB0_40: mul.f32 %f248, %f337, %f337; fma.rn.f32 %f249, %f338, %f338, %f248; fma.rn.f32 %f97, %f339, %f339, %f249; setp.eq.f32 %p38, %f97, 0f00000000; mov.u16 %rs46, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs46, [%rd55]; $L__BB0_42: min.u16 %rs33, %rs46, %rs1; cvt.u32.u16 %r80, %rs33; max.u16 %rs34, %rs46, %rs1; cvt.u32.u16 %r81, %rs34; add.s32 %r82, %r81, 1; mul.lo.s32 %r83, %r82, %r81; shr.u32 %r84, %r83, 1; add.s32 %r85, %r84, %r80; mul.wide.s32 %rd56, %r85, 4; add.s64 %rd57, %rd2, %rd56; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f250, [%rd57]; add.f32 %f98, %f250, %f250; ld.global.nc.f32 %f99, [%rd58]; 
and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f97, 0f00000000; @%p42 bra $L__BB0_45; div.rn.f32 %f251, %f99, %f98; mul.f32 %f252, %f251, %f181; fma.rn.f32 %f337, %f6, %f252, %f1; mul.f32 %f253, %f1, %f252; sub.f32 %f339, %f6, %f253; mov.f32 %f338, %f5; $L__BB0_45: mul.f32 %f254, %f181, %f181; div.rn.f32 %f255, %f98, %f254; sub.f32 %f256, %f337, %f1; sub.f32 %f257, %f338, %f5; sub.f32 %f258, %f339, %f6; fma.rn.f32 %f259, %f255, %f256, %f322; fma.rn.f32 %f323, %f255, %f257, %f323; fma.rn.f32 %f260, %f255, %f258, %f324; div.rn.f32 %f261, %f99, %f181; mul.f32 %f262, %f261, %f339; sub.f32 %f322, %f259, %f262; fma.rn.f32 %f324, %f261, %f337, %f260; $L__BB0_46: setp.eq.s32 %p43, %r38, 1; and.pred %p45, %p43, %p12; @%p45 bra $L__BB0_69; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p46, %rs12, 0; add.s32 %r26, %r3, -1; @%p46 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r86, %r26, %r38; add.s32 %r87, %r86, %r38; rem.s32 %r109, %r87, %r38; $L__BB0_50: mad.lo.s32 %r88, %r109, %r37, %r2; mad.lo.s32 %r30, %r88, %r36, %r1; setp.lt.s32 %p48, %r3, 1; mov.f32 %f348, 0f00000000; and.pred %p49, %p48, %p46; mov.f32 %f347, %f348; mov.f32 %f346, %f348; @%p49 bra $L__BB0_52; mul.wide.s32 %rd59, %r30, 4; add.s64 %rd60, %rd6, %rd59; ld.global.nc.f32 %f346, [%rd60]; add.s64 %rd61, %rd5, %rd59; ld.global.nc.f32 %f347, [%rd61]; add.s64 %rd62, %rd4, %rd59; ld.global.nc.f32 %f348, [%rd62]; $L__BB0_52: mul.f32 %f266, %f346, %f346; fma.rn.f32 %f267, %f347, %f347, %f266; fma.rn.f32 %f125, %f348, %f348, %f267; setp.eq.f32 %p50, %f125, 0f00000000; mov.u16 %rs47, %rs1; @%p50 bra $L__BB0_54; cvt.s64.s32 %rd63, %r30; add.s64 %rd64, %rd3, %rd63; ld.global.nc.u8 %rs47, [%rd64]; $L__BB0_54: min.u16 %rs37, %rs47, %rs1; cvt.u32.u16 %r89, %rs37; max.u16 %rs38, %rs47, %rs1; cvt.u32.u16 %r90, %rs38; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; add.s32 %r94, %r93, %r89; mul.wide.s32 %rd65, %r94, 4; 
add.s64 %rd66, %rd2, %rd65; add.s64 %rd67, %rd1, %rd65; ld.global.nc.f32 %f268, [%rd66]; add.f32 %f126, %f268, %f268; ld.global.nc.f32 %f127, [%rd67]; and.pred %p53, %p12, %p50; @%p53 bra $L__BB0_58; setp.neu.f32 %p54, %f125, 0f00000000; @%p54 bra $L__BB0_57; div.rn.f32 %f269, %f127, %f126; mul.f32 %f270, %f269, %f182; fma.rn.f32 %f346, %f5, %f270, %f1; mul.f32 %f271, %f1, %f270; sub.f32 %f347, %f5, %f271; mov.f32 %f348, %f6; $L__BB0_57: mul.f32 %f272, %f182, %f182; div.rn.f32 %f273, %f126, %f272; sub.f32 %f274, %f346, %f1; sub.f32 %f275, %f347, %f5; sub.f32 %f276, %f348, %f6; fma.rn.f32 %f277, %f273, %f274, %f322; fma.rn.f32 %f278, %f273, %f275, %f323; fma.rn.f32 %f324, %f273, %f276, %f324; div.rn.f32 %f279, %f127, %f182; mul.f32 %f280, %f279, %f347; sub.f32 %f322, %f277, %f280; fma.rn.f32 %f323, %f279, %f346, %f278; $L__BB0_58: add.s32 %r31, %r3, 1; @%p46 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r97, %r38, -1; min.s32 %r110, %r31, %r97; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r95, %r31, %r38; add.s32 %r96, %r95, %r38; rem.s32 %r110, %r96, %r38; $L__BB0_61: mad.lo.s32 %r98, %r110, %r37, %r2; mad.lo.s32 %r35, %r98, %r36, %r1; setp.ge.s32 %p56, %r31, %r38; mov.f32 %f357, 0f00000000; and.pred %p58, %p56, %p46; mov.f32 %f356, %f357; mov.f32 %f355, %f357; @%p58 bra $L__BB0_63; mul.wide.s32 %rd68, %r35, 4; add.s64 %rd69, %rd6, %rd68; ld.global.nc.f32 %f355, [%rd69]; add.s64 %rd70, %rd5, %rd68; ld.global.nc.f32 %f356, [%rd70]; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f357, [%rd71]; $L__BB0_63: mul.f32 %f284, %f355, %f355; fma.rn.f32 %f285, %f356, %f356, %f284; fma.rn.f32 %f153, %f357, %f357, %f285; setp.eq.f32 %p59, %f153, 0f00000000; mov.u16 %rs48, %rs1; @%p59 bra $L__BB0_65; cvt.s64.s32 %rd72, %r35; add.s64 %rd73, %rd3, %rd72; ld.global.nc.u8 %rs48, [%rd73]; $L__BB0_65: min.u16 %rs41, %rs48, %rs1; cvt.u32.u16 %r99, %rs41; max.u16 %rs42, %rs48, %rs1; cvt.u32.u16 %r100, %rs42; add.s32 %r101, %r100, 1; mul.lo.s32 %r102, %r101, %r100; shr.u32 %r103, 
%r102, 1; add.s32 %r104, %r103, %r99; mul.wide.s32 %rd74, %r104, 4; add.s64 %rd75, %rd2, %rd74; add.s64 %rd76, %rd1, %rd74; ld.global.nc.f32 %f286, [%rd75]; add.f32 %f154, %f286, %f286; ld.global.nc.f32 %f155, [%rd76]; and.pred %p62, %p12, %p59; @%p62 bra $L__BB0_69; setp.neu.f32 %p63, %f153, 0f00000000; @%p63 bra $L__BB0_68; div.rn.f32 %f287, %f155, %f154; mul.f32 %f288, %f287, %f182; mul.f32 %f289, %f5, %f288; sub.f32 %f355, %f1, %f289; fma.rn.f32 %f356, %f1, %f288, %f5; mov.f32 %f357, %f6; $L__BB0_68: mul.f32 %f290, %f182, %f182; div.rn.f32 %f291, %f154, %f290; sub.f32 %f292, %f355, %f1; sub.f32 %f293, %f356, %f5; sub.f32 %f294, %f357, %f6; fma.rn.f32 %f295, %f291, %f292, %f322; fma.rn.f32 %f296, %f291, %f293, %f323; fma.rn.f32 %f324, %f291, %f294, %f324; div.rn.f32 %f297, %f155, %f182; fma.rn.f32 %f322, %f297, %f356, %f295; mul.f32 %f298, %f297, %f355; sub.f32 %f323, %f296, %f298; $L__BB0_69: setp.eq.s64 %p64, %rd11, 0; @%p64 bra $L__BB0_71; cvta.to.global.u64 %rd77, %rd11; shl.b64 %rd78, %rd7, 2; add.s64 %rd79, %rd77, %rd78; ld.global.nc.f32 %f299, [%rd79]; mul.f32 %f361, %f299, %f361; $L__BB0_71: setp.eq.f32 %p65, %f361, 0f00000000; mov.f32 %f362, 0f00000000; @%p65 bra $L__BB0_73; rcp.rn.f32 %f362, %f361; $L__BB0_73: cvta.to.global.u64 %rd80, %rd8; shl.b64 %rd81, %rd7, 2; add.s64 %rd82, %rd80, %rd81; ld.global.f32 %f301, [%rd82]; fma.rn.f32 %f302, %f322, %f362, %f301; st.global.f32 [%rd82], %f302; cvta.to.global.u64 %rd83, %rd9; add.s64 %rd84, %rd83, %rd81; ld.global.f32 %f303, [%rd84]; fma.rn.f32 %f304, %f323, %f362, %f303; st.global.f32 [%rd84], %f304; cvta.to.global.u64 %rd85, %rd10; add.s64 %rd86, %rd85, %rd81; ld.global.f32 %f305, [%rd86]; fma.rn.f32 %f306, %f324, %f362, %f305; st.global.f32 [%rd86], %f306; $L__BB0_74: ret; } ` adddmibulk_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 
adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<66>; .reg .b16 %rs<49>; .reg .f32 %f<363>; .reg .b32 %r<111>; .reg .b64 %rd<87>; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; ld.param.u64 %rd8, [adddmibulk_param_0]; ld.param.u64 %rd9, [adddmibulk_param_1]; ld.param.u64 %rd10, [adddmibulk_param_2]; ld.param.u64 %rd12, [adddmibulk_param_3]; ld.param.u64 %rd13, [adddmibulk_param_4]; ld.param.u64 %rd14, [adddmibulk_param_5]; ld.param.u64 %rd11, [adddmibulk_param_6]; ld.param.f32 %f361, [adddmibulk_param_7]; ld.param.u64 %rd15, [adddmibulk_param_8]; ld.param.u64 %rd16, [adddmibulk_param_9]; ld.param.u64 %rd17, [adddmibulk_param_10]; ld.param.f32 %f180, [adddmibulk_param_11]; ld.param.f32 %f181, [adddmibulk_param_12]; ld.param.f32 %f182, [adddmibulk_param_13]; ld.param.u32 %r36, [adddmibulk_param_14]; ld.param.u32 %r37, [adddmibulk_param_15]; ld.param.u32 %r38, [adddmibulk_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r40, %r39, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r43, %r42, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r46, %r45, %r47; setp.ge.s32 %p1, %r1, %r36; setp.ge.s32 %p2, %r2, %r37; or.pred %p3, %p1, %p2; setp.ge.s32 
%p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_74; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r49, %r5, %r1; cvt.s64.s32 %rd7, %r49; mul.wide.s32 %rd18, %r49, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; mul.f32 %f185, %f5, %f5; fma.rn.f32 %f186, %f1, %f1, %f185; ld.global.nc.f32 %f6, [%rd21]; fma.rn.f32 %f187, %f6, %f6, %f186; setp.eq.f32 %p6, %f187, 0f00000000; @%p6 bra $L__BB0_74; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r50, %r6, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r105, %r51, %r36; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f312, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f311, %f312; mov.f32 %f310, %f312; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f310, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f311, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f312, [%rd26]; $L__BB0_7: mul.f32 %f191, %f310, %f310; fma.rn.f32 %f192, %f311, %f311, %f191; fma.rn.f32 %f13, %f312, %f312, %f192; setp.eq.f32 %p11, %f13, 0f00000000; mov.u16 %rs43, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs43, [%rd28]; $L__BB0_9: min.u16 %rs21, %rs43, %rs1; cvt.u32.u16 %r52, %rs21; max.u16 %rs22, %rs43, %rs1; cvt.u32.u16 %r53, %rs22; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd29, %r57, 4; add.s64 %rd30, %rd2, %rd29; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f196, [%rd30]; add.f32 %f14, %f196, %f196; ld.global.nc.f32 %f15, [%rd31]; setp.ne.s16 %p12, %rs18, 0; mov.f32 %f322, 0f00000000; and.pred %p14, %p12, 
%p11; mov.f32 %f323, %f322; mov.f32 %f324, %f322; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f13, 0f00000000; @%p15 bra $L__BB0_12; div.rn.f32 %f197, %f15, %f14; mul.f32 %f198, %f197, %f180; fma.rn.f32 %f311, %f6, %f198, %f5; mul.f32 %f199, %f5, %f198; sub.f32 %f312, %f6, %f199; mov.f32 %f310, %f1; $L__BB0_12: mul.f32 %f200, %f180, %f180; div.rn.f32 %f201, %f14, %f200; sub.f32 %f202, %f310, %f1; sub.f32 %f203, %f311, %f5; sub.f32 %f204, %f312, %f6; fma.rn.f32 %f322, %f201, %f202, 0f00000000; fma.rn.f32 %f205, %f201, %f203, 0f00000000; fma.rn.f32 %f206, %f201, %f204, 0f00000000; div.rn.f32 %f207, %f15, %f180; mul.f32 %f208, %f207, %f312; sub.f32 %f323, %f205, %f208; fma.rn.f32 %f324, %f207, %f311, %f206; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r60, %r36, -1; min.s32 %r106, %r11, %r60; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r58, %r11, %r36; add.s32 %r59, %r58, %r36; rem.s32 %r106, %r59, %r36; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r36; mov.f32 %f321, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f320, %f321; mov.f32 %f319, %f321; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f319, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f320, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f321, [%rd35]; $L__BB0_18: mul.f32 %f212, %f319, %f319; fma.rn.f32 %f213, %f320, %f320, %f212; fma.rn.f32 %f41, %f321, %f321, %f213; setp.eq.f32 %p20, %f41, 0f00000000; mov.u16 %rs44, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs44, [%rd37]; $L__BB0_20: min.u16 %rs25, %rs44, %rs1; cvt.u32.u16 %r61, %rs25; max.u16 %rs26, %rs44, %rs1; cvt.u32.u16 %r62, %rs26; add.s32 %r63, %r62, 1; mul.lo.s32 %r64, %r63, %r62; shr.u32 %r65, %r64, 1; add.s32 %r66, %r65, %r61; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd2, %rd38; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f214, [%rd39]; add.f32 %f42, %f214, 
%f214; ld.global.nc.f32 %f43, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f41, 0f00000000; @%p24 bra $L__BB0_23; div.rn.f32 %f215, %f43, %f42; mul.f32 %f216, %f215, %f180; mul.f32 %f217, %f6, %f216; sub.f32 %f320, %f5, %f217; fma.rn.f32 %f321, %f5, %f216, %f6; mov.f32 %f319, %f1; $L__BB0_23: mul.f32 %f218, %f180, %f180; div.rn.f32 %f219, %f42, %f218; sub.f32 %f220, %f319, %f1; sub.f32 %f221, %f320, %f5; sub.f32 %f222, %f321, %f6; fma.rn.f32 %f322, %f219, %f220, %f322; fma.rn.f32 %f223, %f219, %f221, %f323; fma.rn.f32 %f224, %f219, %f222, %f324; div.rn.f32 %f225, %f43, %f180; fma.rn.f32 %f323, %f225, %f321, %f223; mul.f32 %f226, %f225, %f320; sub.f32 %f324, %f224, %f226; $L__BB0_24: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r67, %r16, %r37; add.s32 %r68, %r67, %r37; rem.s32 %r107, %r68, %r37; $L__BB0_27: add.s32 %r69, %r107, %r4; mad.lo.s32 %r20, %r69, %r36, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f330, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f329, %f330; mov.f32 %f328, %f330; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f328, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f329, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f330, [%rd44]; $L__BB0_29: mul.f32 %f230, %f328, %f328; fma.rn.f32 %f231, %f329, %f329, %f230; fma.rn.f32 %f69, %f330, %f330, %f231; setp.eq.f32 %p29, %f69, 0f00000000; mov.u16 %rs45, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs45, [%rd46]; $L__BB0_31: min.u16 %rs29, %rs45, %rs1; cvt.u32.u16 %r70, %rs29; max.u16 %rs30, %rs45, %rs1; cvt.u32.u16 %r71, %rs30; add.s32 %r72, %r71, 1; mul.lo.s32 %r73, %r72, %r71; shr.u32 %r74, %r73, 1; add.s32 %r75, %r74, %r70; mul.wide.s32 %rd47, %r75, 4; add.s64 %rd48, %rd2, %rd47; add.s64 %rd49, %rd1, 
%rd47; ld.global.nc.f32 %f232, [%rd48]; add.f32 %f70, %f232, %f232; ld.global.nc.f32 %f71, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f69, 0f00000000; @%p33 bra $L__BB0_34; div.rn.f32 %f233, %f71, %f70; mul.f32 %f234, %f233, %f181; mul.f32 %f235, %f6, %f234; sub.f32 %f328, %f1, %f235; fma.rn.f32 %f330, %f1, %f234, %f6; mov.f32 %f329, %f5; $L__BB0_34: mul.f32 %f236, %f181, %f181; div.rn.f32 %f237, %f70, %f236; sub.f32 %f238, %f328, %f1; sub.f32 %f239, %f329, %f5; sub.f32 %f240, %f330, %f6; fma.rn.f32 %f241, %f237, %f238, %f322; fma.rn.f32 %f323, %f237, %f239, %f323; fma.rn.f32 %f242, %f237, %f240, %f324; div.rn.f32 %f243, %f71, %f181; fma.rn.f32 %f322, %f243, %f330, %f241; mul.f32 %f244, %f243, %f328; sub.f32 %f324, %f242, %f244; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r78, %r37, -1; min.s32 %r108, %r21, %r78; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r76, %r21, %r37; add.s32 %r77, %r76, %r37; rem.s32 %r108, %r77, %r37; $L__BB0_38: add.s32 %r79, %r108, %r4; mad.lo.s32 %r25, %r79, %r36, %r1; setp.ge.s32 %p35, %r21, %r37; mov.f32 %f339, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f338, %f339; mov.f32 %f337, %f339; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f337, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f338, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f339, [%rd53]; $L__BB0_40: mul.f32 %f248, %f337, %f337; fma.rn.f32 %f249, %f338, %f338, %f248; fma.rn.f32 %f97, %f339, %f339, %f249; setp.eq.f32 %p38, %f97, 0f00000000; mov.u16 %rs46, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs46, [%rd55]; $L__BB0_42: min.u16 %rs33, %rs46, %rs1; cvt.u32.u16 %r80, %rs33; max.u16 %rs34, %rs46, %rs1; cvt.u32.u16 %r81, %rs34; add.s32 %r82, %r81, 1; mul.lo.s32 %r83, %r82, %r81; shr.u32 %r84, %r83, 1; add.s32 %r85, %r84, %r80; mul.wide.s32 %rd56, %r85, 4; add.s64 
%rd57, %rd2, %rd56; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f250, [%rd57]; add.f32 %f98, %f250, %f250; ld.global.nc.f32 %f99, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f97, 0f00000000; @%p42 bra $L__BB0_45; div.rn.f32 %f251, %f99, %f98; mul.f32 %f252, %f251, %f181; fma.rn.f32 %f337, %f6, %f252, %f1; mul.f32 %f253, %f1, %f252; sub.f32 %f339, %f6, %f253; mov.f32 %f338, %f5; $L__BB0_45: mul.f32 %f254, %f181, %f181; div.rn.f32 %f255, %f98, %f254; sub.f32 %f256, %f337, %f1; sub.f32 %f257, %f338, %f5; sub.f32 %f258, %f339, %f6; fma.rn.f32 %f259, %f255, %f256, %f322; fma.rn.f32 %f323, %f255, %f257, %f323; fma.rn.f32 %f260, %f255, %f258, %f324; div.rn.f32 %f261, %f99, %f181; mul.f32 %f262, %f261, %f339; sub.f32 %f322, %f259, %f262; fma.rn.f32 %f324, %f261, %f337, %f260; $L__BB0_46: setp.eq.s32 %p43, %r38, 1; and.pred %p45, %p43, %p12; @%p45 bra $L__BB0_69; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p46, %rs12, 0; add.s32 %r26, %r3, -1; @%p46 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r86, %r26, %r38; add.s32 %r87, %r86, %r38; rem.s32 %r109, %r87, %r38; $L__BB0_50: mad.lo.s32 %r88, %r109, %r37, %r2; mad.lo.s32 %r30, %r88, %r36, %r1; setp.lt.s32 %p48, %r3, 1; mov.f32 %f348, 0f00000000; and.pred %p49, %p48, %p46; mov.f32 %f347, %f348; mov.f32 %f346, %f348; @%p49 bra $L__BB0_52; mul.wide.s32 %rd59, %r30, 4; add.s64 %rd60, %rd6, %rd59; ld.global.nc.f32 %f346, [%rd60]; add.s64 %rd61, %rd5, %rd59; ld.global.nc.f32 %f347, [%rd61]; add.s64 %rd62, %rd4, %rd59; ld.global.nc.f32 %f348, [%rd62]; $L__BB0_52: mul.f32 %f266, %f346, %f346; fma.rn.f32 %f267, %f347, %f347, %f266; fma.rn.f32 %f125, %f348, %f348, %f267; setp.eq.f32 %p50, %f125, 0f00000000; mov.u16 %rs47, %rs1; @%p50 bra $L__BB0_54; cvt.s64.s32 %rd63, %r30; add.s64 %rd64, %rd3, %rd63; ld.global.nc.u8 %rs47, [%rd64]; $L__BB0_54: min.u16 %rs37, %rs47, %rs1; cvt.u32.u16 %r89, %rs37; max.u16 %rs38, %rs47, %rs1; cvt.u32.u16 
%r90, %rs38; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; add.s32 %r94, %r93, %r89; mul.wide.s32 %rd65, %r94, 4; add.s64 %rd66, %rd2, %rd65; add.s64 %rd67, %rd1, %rd65; ld.global.nc.f32 %f268, [%rd66]; add.f32 %f126, %f268, %f268; ld.global.nc.f32 %f127, [%rd67]; and.pred %p53, %p12, %p50; @%p53 bra $L__BB0_58; setp.neu.f32 %p54, %f125, 0f00000000; @%p54 bra $L__BB0_57; div.rn.f32 %f269, %f127, %f126; mul.f32 %f270, %f269, %f182; fma.rn.f32 %f346, %f5, %f270, %f1; mul.f32 %f271, %f1, %f270; sub.f32 %f347, %f5, %f271; mov.f32 %f348, %f6; $L__BB0_57: mul.f32 %f272, %f182, %f182; div.rn.f32 %f273, %f126, %f272; sub.f32 %f274, %f346, %f1; sub.f32 %f275, %f347, %f5; sub.f32 %f276, %f348, %f6; fma.rn.f32 %f277, %f273, %f274, %f322; fma.rn.f32 %f278, %f273, %f275, %f323; fma.rn.f32 %f324, %f273, %f276, %f324; div.rn.f32 %f279, %f127, %f182; mul.f32 %f280, %f279, %f347; sub.f32 %f322, %f277, %f280; fma.rn.f32 %f323, %f279, %f346, %f278; $L__BB0_58: add.s32 %r31, %r3, 1; @%p46 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r97, %r38, -1; min.s32 %r110, %r31, %r97; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r95, %r31, %r38; add.s32 %r96, %r95, %r38; rem.s32 %r110, %r96, %r38; $L__BB0_61: mad.lo.s32 %r98, %r110, %r37, %r2; mad.lo.s32 %r35, %r98, %r36, %r1; setp.ge.s32 %p56, %r31, %r38; mov.f32 %f357, 0f00000000; and.pred %p58, %p56, %p46; mov.f32 %f356, %f357; mov.f32 %f355, %f357; @%p58 bra $L__BB0_63; mul.wide.s32 %rd68, %r35, 4; add.s64 %rd69, %rd6, %rd68; ld.global.nc.f32 %f355, [%rd69]; add.s64 %rd70, %rd5, %rd68; ld.global.nc.f32 %f356, [%rd70]; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f357, [%rd71]; $L__BB0_63: mul.f32 %f284, %f355, %f355; fma.rn.f32 %f285, %f356, %f356, %f284; fma.rn.f32 %f153, %f357, %f357, %f285; setp.eq.f32 %p59, %f153, 0f00000000; mov.u16 %rs48, %rs1; @%p59 bra $L__BB0_65; cvt.s64.s32 %rd72, %r35; add.s64 %rd73, %rd3, %rd72; ld.global.nc.u8 %rs48, [%rd73]; $L__BB0_65: min.u16 %rs41, %rs48, %rs1; 
cvt.u32.u16 %r99, %rs41; max.u16 %rs42, %rs48, %rs1; cvt.u32.u16 %r100, %rs42; add.s32 %r101, %r100, 1; mul.lo.s32 %r102, %r101, %r100; shr.u32 %r103, %r102, 1; add.s32 %r104, %r103, %r99; mul.wide.s32 %rd74, %r104, 4; add.s64 %rd75, %rd2, %rd74; add.s64 %rd76, %rd1, %rd74; ld.global.nc.f32 %f286, [%rd75]; add.f32 %f154, %f286, %f286; ld.global.nc.f32 %f155, [%rd76]; and.pred %p62, %p12, %p59; @%p62 bra $L__BB0_69; setp.neu.f32 %p63, %f153, 0f00000000; @%p63 bra $L__BB0_68; div.rn.f32 %f287, %f155, %f154; mul.f32 %f288, %f287, %f182; mul.f32 %f289, %f5, %f288; sub.f32 %f355, %f1, %f289; fma.rn.f32 %f356, %f1, %f288, %f5; mov.f32 %f357, %f6; $L__BB0_68: mul.f32 %f290, %f182, %f182; div.rn.f32 %f291, %f154, %f290; sub.f32 %f292, %f355, %f1; sub.f32 %f293, %f356, %f5; sub.f32 %f294, %f357, %f6; fma.rn.f32 %f295, %f291, %f292, %f322; fma.rn.f32 %f296, %f291, %f293, %f323; fma.rn.f32 %f324, %f291, %f294, %f324; div.rn.f32 %f297, %f155, %f182; fma.rn.f32 %f322, %f297, %f356, %f295; mul.f32 %f298, %f297, %f355; sub.f32 %f323, %f296, %f298; $L__BB0_69: setp.eq.s64 %p64, %rd11, 0; @%p64 bra $L__BB0_71; cvta.to.global.u64 %rd77, %rd11; shl.b64 %rd78, %rd7, 2; add.s64 %rd79, %rd77, %rd78; ld.global.nc.f32 %f299, [%rd79]; mul.f32 %f361, %f299, %f361; $L__BB0_71: setp.eq.f32 %p65, %f361, 0f00000000; mov.f32 %f362, 0f00000000; @%p65 bra $L__BB0_73; rcp.rn.f32 %f362, %f361; $L__BB0_73: cvta.to.global.u64 %rd80, %rd8; shl.b64 %rd81, %rd7, 2; add.s64 %rd82, %rd80, %rd81; ld.global.f32 %f301, [%rd82]; fma.rn.f32 %f302, %f322, %f362, %f301; st.global.f32 [%rd82], %f302; cvta.to.global.u64 %rd83, %rd9; add.s64 %rd84, %rd83, %rd81; ld.global.f32 %f303, [%rd84]; fma.rn.f32 %f304, %f323, %f362, %f303; st.global.f32 [%rd84], %f304; cvta.to.global.u64 %rd85, %rd10; add.s64 %rd86, %rd85, %rd81; ld.global.f32 %f305, [%rd86]; fma.rn.f32 %f306, %f324, %f362, %f305; st.global.f32 [%rd86], %f306; $L__BB0_74: ret; } ` adddmibulk_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl 
adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<66>; .reg .b16 %rs<49>; .reg .f32 %f<363>; .reg .b32 %r<111>; .reg .b64 %rd<87>; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; ld.param.u64 %rd8, [adddmibulk_param_0]; ld.param.u64 %rd9, [adddmibulk_param_1]; ld.param.u64 %rd10, [adddmibulk_param_2]; ld.param.u64 %rd12, [adddmibulk_param_3]; ld.param.u64 %rd13, [adddmibulk_param_4]; ld.param.u64 %rd14, [adddmibulk_param_5]; ld.param.u64 %rd11, [adddmibulk_param_6]; ld.param.f32 %f361, [adddmibulk_param_7]; ld.param.u64 %rd15, [adddmibulk_param_8]; ld.param.u64 %rd16, [adddmibulk_param_9]; ld.param.u64 %rd17, [adddmibulk_param_10]; ld.param.f32 %f180, [adddmibulk_param_11]; ld.param.f32 %f181, [adddmibulk_param_12]; ld.param.f32 %f182, [adddmibulk_param_13]; ld.param.u32 %r36, [adddmibulk_param_14]; ld.param.u32 %r37, [adddmibulk_param_15]; ld.param.u32 %r38, [adddmibulk_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r40, %r39, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r43, %r42, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; 
mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r46, %r45, %r47; setp.ge.s32 %p1, %r1, %r36; setp.ge.s32 %p2, %r2, %r37; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_74; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r49, %r5, %r1; cvt.s64.s32 %rd7, %r49; mul.wide.s32 %rd18, %r49, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; mul.f32 %f185, %f5, %f5; fma.rn.f32 %f186, %f1, %f1, %f185; ld.global.nc.f32 %f6, [%rd21]; fma.rn.f32 %f187, %f6, %f6, %f186; setp.eq.f32 %p6, %f187, 0f00000000; @%p6 bra $L__BB0_74; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r50, %r6, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r105, %r51, %r36; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f312, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f311, %f312; mov.f32 %f310, %f312; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f310, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f311, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f312, [%rd26]; $L__BB0_7: mul.f32 %f191, %f310, %f310; fma.rn.f32 %f192, %f311, %f311, %f191; fma.rn.f32 %f13, %f312, %f312, %f192; setp.eq.f32 %p11, %f13, 0f00000000; mov.u16 %rs43, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs43, [%rd28]; $L__BB0_9: min.u16 %rs21, %rs43, %rs1; cvt.u32.u16 %r52, %rs21; max.u16 %rs22, %rs43, %rs1; cvt.u32.u16 %r53, %rs22; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd29, %r57, 4; add.s64 %rd30, %rd2, %rd29; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 
%f196, [%rd30]; add.f32 %f14, %f196, %f196; ld.global.nc.f32 %f15, [%rd31]; setp.ne.s16 %p12, %rs18, 0; mov.f32 %f322, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f323, %f322; mov.f32 %f324, %f322; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f13, 0f00000000; @%p15 bra $L__BB0_12; div.rn.f32 %f197, %f15, %f14; mul.f32 %f198, %f197, %f180; fma.rn.f32 %f311, %f6, %f198, %f5; mul.f32 %f199, %f5, %f198; sub.f32 %f312, %f6, %f199; mov.f32 %f310, %f1; $L__BB0_12: mul.f32 %f200, %f180, %f180; div.rn.f32 %f201, %f14, %f200; sub.f32 %f202, %f310, %f1; sub.f32 %f203, %f311, %f5; sub.f32 %f204, %f312, %f6; fma.rn.f32 %f322, %f201, %f202, 0f00000000; fma.rn.f32 %f205, %f201, %f203, 0f00000000; fma.rn.f32 %f206, %f201, %f204, 0f00000000; div.rn.f32 %f207, %f15, %f180; mul.f32 %f208, %f207, %f312; sub.f32 %f323, %f205, %f208; fma.rn.f32 %f324, %f207, %f311, %f206; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r60, %r36, -1; min.s32 %r106, %r11, %r60; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r58, %r11, %r36; add.s32 %r59, %r58, %r36; rem.s32 %r106, %r59, %r36; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r36; mov.f32 %f321, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f320, %f321; mov.f32 %f319, %f321; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f319, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f320, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f321, [%rd35]; $L__BB0_18: mul.f32 %f212, %f319, %f319; fma.rn.f32 %f213, %f320, %f320, %f212; fma.rn.f32 %f41, %f321, %f321, %f213; setp.eq.f32 %p20, %f41, 0f00000000; mov.u16 %rs44, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs44, [%rd37]; $L__BB0_20: min.u16 %rs25, %rs44, %rs1; cvt.u32.u16 %r61, %rs25; max.u16 %rs26, %rs44, %rs1; cvt.u32.u16 %r62, %rs26; add.s32 %r63, %r62, 1; mul.lo.s32 %r64, %r63, %r62; shr.u32 %r65, %r64, 1; add.s32 
%r66, %r65, %r61; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd2, %rd38; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f214, [%rd39]; add.f32 %f42, %f214, %f214; ld.global.nc.f32 %f43, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f41, 0f00000000; @%p24 bra $L__BB0_23; div.rn.f32 %f215, %f43, %f42; mul.f32 %f216, %f215, %f180; mul.f32 %f217, %f6, %f216; sub.f32 %f320, %f5, %f217; fma.rn.f32 %f321, %f5, %f216, %f6; mov.f32 %f319, %f1; $L__BB0_23: mul.f32 %f218, %f180, %f180; div.rn.f32 %f219, %f42, %f218; sub.f32 %f220, %f319, %f1; sub.f32 %f221, %f320, %f5; sub.f32 %f222, %f321, %f6; fma.rn.f32 %f322, %f219, %f220, %f322; fma.rn.f32 %f223, %f219, %f221, %f323; fma.rn.f32 %f224, %f219, %f222, %f324; div.rn.f32 %f225, %f43, %f180; fma.rn.f32 %f323, %f225, %f321, %f223; mul.f32 %f226, %f225, %f320; sub.f32 %f324, %f224, %f226; $L__BB0_24: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r67, %r16, %r37; add.s32 %r68, %r67, %r37; rem.s32 %r107, %r68, %r37; $L__BB0_27: add.s32 %r69, %r107, %r4; mad.lo.s32 %r20, %r69, %r36, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f330, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f329, %f330; mov.f32 %f328, %f330; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f328, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f329, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f330, [%rd44]; $L__BB0_29: mul.f32 %f230, %f328, %f328; fma.rn.f32 %f231, %f329, %f329, %f230; fma.rn.f32 %f69, %f330, %f330, %f231; setp.eq.f32 %p29, %f69, 0f00000000; mov.u16 %rs45, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs45, [%rd46]; $L__BB0_31: min.u16 %rs29, %rs45, %rs1; cvt.u32.u16 %r70, %rs29; max.u16 %rs30, %rs45, %rs1; cvt.u32.u16 %r71, %rs30; add.s32 %r72, %r71, 1; 
mul.lo.s32 %r73, %r72, %r71; shr.u32 %r74, %r73, 1; add.s32 %r75, %r74, %r70; mul.wide.s32 %rd47, %r75, 4; add.s64 %rd48, %rd2, %rd47; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f232, [%rd48]; add.f32 %f70, %f232, %f232; ld.global.nc.f32 %f71, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f69, 0f00000000; @%p33 bra $L__BB0_34; div.rn.f32 %f233, %f71, %f70; mul.f32 %f234, %f233, %f181; mul.f32 %f235, %f6, %f234; sub.f32 %f328, %f1, %f235; fma.rn.f32 %f330, %f1, %f234, %f6; mov.f32 %f329, %f5; $L__BB0_34: mul.f32 %f236, %f181, %f181; div.rn.f32 %f237, %f70, %f236; sub.f32 %f238, %f328, %f1; sub.f32 %f239, %f329, %f5; sub.f32 %f240, %f330, %f6; fma.rn.f32 %f241, %f237, %f238, %f322; fma.rn.f32 %f323, %f237, %f239, %f323; fma.rn.f32 %f242, %f237, %f240, %f324; div.rn.f32 %f243, %f71, %f181; fma.rn.f32 %f322, %f243, %f330, %f241; mul.f32 %f244, %f243, %f328; sub.f32 %f324, %f242, %f244; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r78, %r37, -1; min.s32 %r108, %r21, %r78; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r76, %r21, %r37; add.s32 %r77, %r76, %r37; rem.s32 %r108, %r77, %r37; $L__BB0_38: add.s32 %r79, %r108, %r4; mad.lo.s32 %r25, %r79, %r36, %r1; setp.ge.s32 %p35, %r21, %r37; mov.f32 %f339, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f338, %f339; mov.f32 %f337, %f339; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f337, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f338, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f339, [%rd53]; $L__BB0_40: mul.f32 %f248, %f337, %f337; fma.rn.f32 %f249, %f338, %f338, %f248; fma.rn.f32 %f97, %f339, %f339, %f249; setp.eq.f32 %p38, %f97, 0f00000000; mov.u16 %rs46, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs46, [%rd55]; $L__BB0_42: min.u16 %rs33, %rs46, %rs1; cvt.u32.u16 %r80, %rs33; max.u16 %rs34, %rs46, %rs1; 
cvt.u32.u16 %r81, %rs34; add.s32 %r82, %r81, 1; mul.lo.s32 %r83, %r82, %r81; shr.u32 %r84, %r83, 1; add.s32 %r85, %r84, %r80; mul.wide.s32 %rd56, %r85, 4; add.s64 %rd57, %rd2, %rd56; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f250, [%rd57]; add.f32 %f98, %f250, %f250; ld.global.nc.f32 %f99, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f97, 0f00000000; @%p42 bra $L__BB0_45; div.rn.f32 %f251, %f99, %f98; mul.f32 %f252, %f251, %f181; fma.rn.f32 %f337, %f6, %f252, %f1; mul.f32 %f253, %f1, %f252; sub.f32 %f339, %f6, %f253; mov.f32 %f338, %f5; $L__BB0_45: mul.f32 %f254, %f181, %f181; div.rn.f32 %f255, %f98, %f254; sub.f32 %f256, %f337, %f1; sub.f32 %f257, %f338, %f5; sub.f32 %f258, %f339, %f6; fma.rn.f32 %f259, %f255, %f256, %f322; fma.rn.f32 %f323, %f255, %f257, %f323; fma.rn.f32 %f260, %f255, %f258, %f324; div.rn.f32 %f261, %f99, %f181; mul.f32 %f262, %f261, %f339; sub.f32 %f322, %f259, %f262; fma.rn.f32 %f324, %f261, %f337, %f260; $L__BB0_46: setp.eq.s32 %p43, %r38, 1; and.pred %p45, %p43, %p12; @%p45 bra $L__BB0_69; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p46, %rs12, 0; add.s32 %r26, %r3, -1; @%p46 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r86, %r26, %r38; add.s32 %r87, %r86, %r38; rem.s32 %r109, %r87, %r38; $L__BB0_50: mad.lo.s32 %r88, %r109, %r37, %r2; mad.lo.s32 %r30, %r88, %r36, %r1; setp.lt.s32 %p48, %r3, 1; mov.f32 %f348, 0f00000000; and.pred %p49, %p48, %p46; mov.f32 %f347, %f348; mov.f32 %f346, %f348; @%p49 bra $L__BB0_52; mul.wide.s32 %rd59, %r30, 4; add.s64 %rd60, %rd6, %rd59; ld.global.nc.f32 %f346, [%rd60]; add.s64 %rd61, %rd5, %rd59; ld.global.nc.f32 %f347, [%rd61]; add.s64 %rd62, %rd4, %rd59; ld.global.nc.f32 %f348, [%rd62]; $L__BB0_52: mul.f32 %f266, %f346, %f346; fma.rn.f32 %f267, %f347, %f347, %f266; fma.rn.f32 %f125, %f348, %f348, %f267; setp.eq.f32 %p50, %f125, 0f00000000; mov.u16 %rs47, %rs1; @%p50 bra $L__BB0_54; cvt.s64.s32 %rd63, %r30; 
add.s64 %rd64, %rd3, %rd63; ld.global.nc.u8 %rs47, [%rd64]; $L__BB0_54: min.u16 %rs37, %rs47, %rs1; cvt.u32.u16 %r89, %rs37; max.u16 %rs38, %rs47, %rs1; cvt.u32.u16 %r90, %rs38; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; add.s32 %r94, %r93, %r89; mul.wide.s32 %rd65, %r94, 4; add.s64 %rd66, %rd2, %rd65; add.s64 %rd67, %rd1, %rd65; ld.global.nc.f32 %f268, [%rd66]; add.f32 %f126, %f268, %f268; ld.global.nc.f32 %f127, [%rd67]; and.pred %p53, %p12, %p50; @%p53 bra $L__BB0_58; setp.neu.f32 %p54, %f125, 0f00000000; @%p54 bra $L__BB0_57; div.rn.f32 %f269, %f127, %f126; mul.f32 %f270, %f269, %f182; fma.rn.f32 %f346, %f5, %f270, %f1; mul.f32 %f271, %f1, %f270; sub.f32 %f347, %f5, %f271; mov.f32 %f348, %f6; $L__BB0_57: mul.f32 %f272, %f182, %f182; div.rn.f32 %f273, %f126, %f272; sub.f32 %f274, %f346, %f1; sub.f32 %f275, %f347, %f5; sub.f32 %f276, %f348, %f6; fma.rn.f32 %f277, %f273, %f274, %f322; fma.rn.f32 %f278, %f273, %f275, %f323; fma.rn.f32 %f324, %f273, %f276, %f324; div.rn.f32 %f279, %f127, %f182; mul.f32 %f280, %f279, %f347; sub.f32 %f322, %f277, %f280; fma.rn.f32 %f323, %f279, %f346, %f278; $L__BB0_58: add.s32 %r31, %r3, 1; @%p46 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r97, %r38, -1; min.s32 %r110, %r31, %r97; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r95, %r31, %r38; add.s32 %r96, %r95, %r38; rem.s32 %r110, %r96, %r38; $L__BB0_61: mad.lo.s32 %r98, %r110, %r37, %r2; mad.lo.s32 %r35, %r98, %r36, %r1; setp.ge.s32 %p56, %r31, %r38; mov.f32 %f357, 0f00000000; and.pred %p58, %p56, %p46; mov.f32 %f356, %f357; mov.f32 %f355, %f357; @%p58 bra $L__BB0_63; mul.wide.s32 %rd68, %r35, 4; add.s64 %rd69, %rd6, %rd68; ld.global.nc.f32 %f355, [%rd69]; add.s64 %rd70, %rd5, %rd68; ld.global.nc.f32 %f356, [%rd70]; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f357, [%rd71]; $L__BB0_63: mul.f32 %f284, %f355, %f355; fma.rn.f32 %f285, %f356, %f356, %f284; fma.rn.f32 %f153, %f357, %f357, %f285; setp.eq.f32 %p59, %f153, 0f00000000; mov.u16 
%rs48, %rs1; @%p59 bra $L__BB0_65; cvt.s64.s32 %rd72, %r35; add.s64 %rd73, %rd3, %rd72; ld.global.nc.u8 %rs48, [%rd73]; $L__BB0_65: min.u16 %rs41, %rs48, %rs1; cvt.u32.u16 %r99, %rs41; max.u16 %rs42, %rs48, %rs1; cvt.u32.u16 %r100, %rs42; add.s32 %r101, %r100, 1; mul.lo.s32 %r102, %r101, %r100; shr.u32 %r103, %r102, 1; add.s32 %r104, %r103, %r99; mul.wide.s32 %rd74, %r104, 4; add.s64 %rd75, %rd2, %rd74; add.s64 %rd76, %rd1, %rd74; ld.global.nc.f32 %f286, [%rd75]; add.f32 %f154, %f286, %f286; ld.global.nc.f32 %f155, [%rd76]; and.pred %p62, %p12, %p59; @%p62 bra $L__BB0_69; setp.neu.f32 %p63, %f153, 0f00000000; @%p63 bra $L__BB0_68; div.rn.f32 %f287, %f155, %f154; mul.f32 %f288, %f287, %f182; mul.f32 %f289, %f5, %f288; sub.f32 %f355, %f1, %f289; fma.rn.f32 %f356, %f1, %f288, %f5; mov.f32 %f357, %f6; $L__BB0_68: mul.f32 %f290, %f182, %f182; div.rn.f32 %f291, %f154, %f290; sub.f32 %f292, %f355, %f1; sub.f32 %f293, %f356, %f5; sub.f32 %f294, %f357, %f6; fma.rn.f32 %f295, %f291, %f292, %f322; fma.rn.f32 %f296, %f291, %f293, %f323; fma.rn.f32 %f324, %f291, %f294, %f324; div.rn.f32 %f297, %f155, %f182; fma.rn.f32 %f322, %f297, %f356, %f295; mul.f32 %f298, %f297, %f355; sub.f32 %f323, %f296, %f298; $L__BB0_69: setp.eq.s64 %p64, %rd11, 0; @%p64 bra $L__BB0_71; cvta.to.global.u64 %rd77, %rd11; shl.b64 %rd78, %rd7, 2; add.s64 %rd79, %rd77, %rd78; ld.global.nc.f32 %f299, [%rd79]; mul.f32 %f361, %f299, %f361; $L__BB0_71: setp.eq.f32 %p65, %f361, 0f00000000; mov.f32 %f362, 0f00000000; @%p65 bra $L__BB0_73; rcp.rn.f32 %f362, %f361; $L__BB0_73: cvta.to.global.u64 %rd80, %rd8; shl.b64 %rd81, %rd7, 2; add.s64 %rd82, %rd80, %rd81; ld.global.f32 %f301, [%rd82]; fma.rn.f32 %f302, %f322, %f362, %f301; st.global.f32 [%rd82], %f302; cvta.to.global.u64 %rd83, %rd9; add.s64 %rd84, %rd83, %rd81; ld.global.f32 %f303, [%rd84]; fma.rn.f32 %f304, %f323, %f362, %f303; st.global.f32 [%rd84], %f304; cvta.to.global.u64 %rd85, %rd10; add.s64 %rd86, %rd85, %rd81; ld.global.f32 %f305, [%rd86]; 
fma.rn.f32 %f306, %f324, %f362, %f305; st.global.f32 [%rd86], %f306; $L__BB0_74: ret; } ` adddmibulk_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<66>; .reg .b16 %rs<49>; .reg .f32 %f<363>; .reg .b32 %r<111>; .reg .b64 %rd<87>; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; ld.param.u64 %rd8, [adddmibulk_param_0]; ld.param.u64 %rd9, [adddmibulk_param_1]; ld.param.u64 %rd10, [adddmibulk_param_2]; ld.param.u64 %rd12, [adddmibulk_param_3]; ld.param.u64 %rd13, [adddmibulk_param_4]; ld.param.u64 %rd14, [adddmibulk_param_5]; ld.param.u64 %rd11, [adddmibulk_param_6]; ld.param.f32 %f361, [adddmibulk_param_7]; ld.param.u64 %rd15, [adddmibulk_param_8]; ld.param.u64 %rd16, [adddmibulk_param_9]; ld.param.u64 %rd17, [adddmibulk_param_10]; ld.param.f32 %f180, [adddmibulk_param_11]; ld.param.f32 %f181, [adddmibulk_param_12]; ld.param.f32 %f182, [adddmibulk_param_13]; ld.param.u32 %r36, [adddmibulk_param_14]; ld.param.u32 %r37, [adddmibulk_param_15]; ld.param.u32 %r38, [adddmibulk_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, 
%r40, %r39, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r43, %r42, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r46, %r45, %r47; setp.ge.s32 %p1, %r1, %r36; setp.ge.s32 %p2, %r2, %r37; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_74; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r49, %r5, %r1; cvt.s64.s32 %rd7, %r49; mul.wide.s32 %rd18, %r49, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; mul.f32 %f185, %f5, %f5; fma.rn.f32 %f186, %f1, %f1, %f185; ld.global.nc.f32 %f6, [%rd21]; fma.rn.f32 %f187, %f6, %f6, %f186; setp.eq.f32 %p6, %f187, 0f00000000; @%p6 bra $L__BB0_74; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r50, %r6, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r105, %r51, %r36; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f312, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f311, %f312; mov.f32 %f310, %f312; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f310, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f311, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f312, [%rd26]; $L__BB0_7: mul.f32 %f191, %f310, %f310; fma.rn.f32 %f192, %f311, %f311, %f191; fma.rn.f32 %f13, %f312, %f312, %f192; setp.eq.f32 %p11, %f13, 0f00000000; mov.u16 %rs43, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs43, [%rd28]; $L__BB0_9: min.u16 %rs21, %rs43, %rs1; cvt.u32.u16 %r52, %rs21; max.u16 %rs22, %rs43, %rs1; cvt.u32.u16 %r53, %rs22; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, 
%r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd29, %r57, 4; add.s64 %rd30, %rd2, %rd29; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f196, [%rd30]; add.f32 %f14, %f196, %f196; ld.global.nc.f32 %f15, [%rd31]; setp.ne.s16 %p12, %rs18, 0; mov.f32 %f322, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f323, %f322; mov.f32 %f324, %f322; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f13, 0f00000000; @%p15 bra $L__BB0_12; div.rn.f32 %f197, %f15, %f14; mul.f32 %f198, %f197, %f180; fma.rn.f32 %f311, %f6, %f198, %f5; mul.f32 %f199, %f5, %f198; sub.f32 %f312, %f6, %f199; mov.f32 %f310, %f1; $L__BB0_12: mul.f32 %f200, %f180, %f180; div.rn.f32 %f201, %f14, %f200; sub.f32 %f202, %f310, %f1; sub.f32 %f203, %f311, %f5; sub.f32 %f204, %f312, %f6; fma.rn.f32 %f322, %f201, %f202, 0f00000000; fma.rn.f32 %f205, %f201, %f203, 0f00000000; fma.rn.f32 %f206, %f201, %f204, 0f00000000; div.rn.f32 %f207, %f15, %f180; mul.f32 %f208, %f207, %f312; sub.f32 %f323, %f205, %f208; fma.rn.f32 %f324, %f207, %f311, %f206; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r60, %r36, -1; min.s32 %r106, %r11, %r60; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r58, %r11, %r36; add.s32 %r59, %r58, %r36; rem.s32 %r106, %r59, %r36; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r36; mov.f32 %f321, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f320, %f321; mov.f32 %f319, %f321; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f319, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f320, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f321, [%rd35]; $L__BB0_18: mul.f32 %f212, %f319, %f319; fma.rn.f32 %f213, %f320, %f320, %f212; fma.rn.f32 %f41, %f321, %f321, %f213; setp.eq.f32 %p20, %f41, 0f00000000; mov.u16 %rs44, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs44, [%rd37]; $L__BB0_20: min.u16 %rs25, %rs44, %rs1; 
cvt.u32.u16 %r61, %rs25; max.u16 %rs26, %rs44, %rs1; cvt.u32.u16 %r62, %rs26; add.s32 %r63, %r62, 1; mul.lo.s32 %r64, %r63, %r62; shr.u32 %r65, %r64, 1; add.s32 %r66, %r65, %r61; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd2, %rd38; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f214, [%rd39]; add.f32 %f42, %f214, %f214; ld.global.nc.f32 %f43, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f41, 0f00000000; @%p24 bra $L__BB0_23; div.rn.f32 %f215, %f43, %f42; mul.f32 %f216, %f215, %f180; mul.f32 %f217, %f6, %f216; sub.f32 %f320, %f5, %f217; fma.rn.f32 %f321, %f5, %f216, %f6; mov.f32 %f319, %f1; $L__BB0_23: mul.f32 %f218, %f180, %f180; div.rn.f32 %f219, %f42, %f218; sub.f32 %f220, %f319, %f1; sub.f32 %f221, %f320, %f5; sub.f32 %f222, %f321, %f6; fma.rn.f32 %f322, %f219, %f220, %f322; fma.rn.f32 %f223, %f219, %f221, %f323; fma.rn.f32 %f224, %f219, %f222, %f324; div.rn.f32 %f225, %f43, %f180; fma.rn.f32 %f323, %f225, %f321, %f223; mul.f32 %f226, %f225, %f320; sub.f32 %f324, %f224, %f226; $L__BB0_24: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r67, %r16, %r37; add.s32 %r68, %r67, %r37; rem.s32 %r107, %r68, %r37; $L__BB0_27: add.s32 %r69, %r107, %r4; mad.lo.s32 %r20, %r69, %r36, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f330, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f329, %f330; mov.f32 %f328, %f330; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f328, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f329, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f330, [%rd44]; $L__BB0_29: mul.f32 %f230, %f328, %f328; fma.rn.f32 %f231, %f329, %f329, %f230; fma.rn.f32 %f69, %f330, %f330, %f231; setp.eq.f32 %p29, %f69, 0f00000000; mov.u16 %rs45, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; 
ld.global.nc.u8 %rs45, [%rd46]; $L__BB0_31: min.u16 %rs29, %rs45, %rs1; cvt.u32.u16 %r70, %rs29; max.u16 %rs30, %rs45, %rs1; cvt.u32.u16 %r71, %rs30; add.s32 %r72, %r71, 1; mul.lo.s32 %r73, %r72, %r71; shr.u32 %r74, %r73, 1; add.s32 %r75, %r74, %r70; mul.wide.s32 %rd47, %r75, 4; add.s64 %rd48, %rd2, %rd47; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f232, [%rd48]; add.f32 %f70, %f232, %f232; ld.global.nc.f32 %f71, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f69, 0f00000000; @%p33 bra $L__BB0_34; div.rn.f32 %f233, %f71, %f70; mul.f32 %f234, %f233, %f181; mul.f32 %f235, %f6, %f234; sub.f32 %f328, %f1, %f235; fma.rn.f32 %f330, %f1, %f234, %f6; mov.f32 %f329, %f5; $L__BB0_34: mul.f32 %f236, %f181, %f181; div.rn.f32 %f237, %f70, %f236; sub.f32 %f238, %f328, %f1; sub.f32 %f239, %f329, %f5; sub.f32 %f240, %f330, %f6; fma.rn.f32 %f241, %f237, %f238, %f322; fma.rn.f32 %f323, %f237, %f239, %f323; fma.rn.f32 %f242, %f237, %f240, %f324; div.rn.f32 %f243, %f71, %f181; fma.rn.f32 %f322, %f243, %f330, %f241; mul.f32 %f244, %f243, %f328; sub.f32 %f324, %f242, %f244; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r78, %r37, -1; min.s32 %r108, %r21, %r78; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r76, %r21, %r37; add.s32 %r77, %r76, %r37; rem.s32 %r108, %r77, %r37; $L__BB0_38: add.s32 %r79, %r108, %r4; mad.lo.s32 %r25, %r79, %r36, %r1; setp.ge.s32 %p35, %r21, %r37; mov.f32 %f339, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f338, %f339; mov.f32 %f337, %f339; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f337, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f338, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f339, [%rd53]; $L__BB0_40: mul.f32 %f248, %f337, %f337; fma.rn.f32 %f249, %f338, %f338, %f248; fma.rn.f32 %f97, %f339, %f339, %f249; setp.eq.f32 %p38, %f97, 0f00000000; mov.u16 %rs46, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 
%rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs46, [%rd55]; $L__BB0_42: min.u16 %rs33, %rs46, %rs1; cvt.u32.u16 %r80, %rs33; max.u16 %rs34, %rs46, %rs1; cvt.u32.u16 %r81, %rs34; add.s32 %r82, %r81, 1; mul.lo.s32 %r83, %r82, %r81; shr.u32 %r84, %r83, 1; add.s32 %r85, %r84, %r80; mul.wide.s32 %rd56, %r85, 4; add.s64 %rd57, %rd2, %rd56; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f250, [%rd57]; add.f32 %f98, %f250, %f250; ld.global.nc.f32 %f99, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f97, 0f00000000; @%p42 bra $L__BB0_45; div.rn.f32 %f251, %f99, %f98; mul.f32 %f252, %f251, %f181; fma.rn.f32 %f337, %f6, %f252, %f1; mul.f32 %f253, %f1, %f252; sub.f32 %f339, %f6, %f253; mov.f32 %f338, %f5; $L__BB0_45: mul.f32 %f254, %f181, %f181; div.rn.f32 %f255, %f98, %f254; sub.f32 %f256, %f337, %f1; sub.f32 %f257, %f338, %f5; sub.f32 %f258, %f339, %f6; fma.rn.f32 %f259, %f255, %f256, %f322; fma.rn.f32 %f323, %f255, %f257, %f323; fma.rn.f32 %f260, %f255, %f258, %f324; div.rn.f32 %f261, %f99, %f181; mul.f32 %f262, %f261, %f339; sub.f32 %f322, %f259, %f262; fma.rn.f32 %f324, %f261, %f337, %f260; $L__BB0_46: setp.eq.s32 %p43, %r38, 1; and.pred %p45, %p43, %p12; @%p45 bra $L__BB0_69; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p46, %rs12, 0; add.s32 %r26, %r3, -1; @%p46 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r86, %r26, %r38; add.s32 %r87, %r86, %r38; rem.s32 %r109, %r87, %r38; $L__BB0_50: mad.lo.s32 %r88, %r109, %r37, %r2; mad.lo.s32 %r30, %r88, %r36, %r1; setp.lt.s32 %p48, %r3, 1; mov.f32 %f348, 0f00000000; and.pred %p49, %p48, %p46; mov.f32 %f347, %f348; mov.f32 %f346, %f348; @%p49 bra $L__BB0_52; mul.wide.s32 %rd59, %r30, 4; add.s64 %rd60, %rd6, %rd59; ld.global.nc.f32 %f346, [%rd60]; add.s64 %rd61, %rd5, %rd59; ld.global.nc.f32 %f347, [%rd61]; add.s64 %rd62, %rd4, %rd59; ld.global.nc.f32 %f348, [%rd62]; $L__BB0_52: mul.f32 %f266, %f346, %f346; fma.rn.f32 %f267, 
%f347, %f347, %f266; fma.rn.f32 %f125, %f348, %f348, %f267; setp.eq.f32 %p50, %f125, 0f00000000; mov.u16 %rs47, %rs1; @%p50 bra $L__BB0_54; cvt.s64.s32 %rd63, %r30; add.s64 %rd64, %rd3, %rd63; ld.global.nc.u8 %rs47, [%rd64]; $L__BB0_54: min.u16 %rs37, %rs47, %rs1; cvt.u32.u16 %r89, %rs37; max.u16 %rs38, %rs47, %rs1; cvt.u32.u16 %r90, %rs38; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; add.s32 %r94, %r93, %r89; mul.wide.s32 %rd65, %r94, 4; add.s64 %rd66, %rd2, %rd65; add.s64 %rd67, %rd1, %rd65; ld.global.nc.f32 %f268, [%rd66]; add.f32 %f126, %f268, %f268; ld.global.nc.f32 %f127, [%rd67]; and.pred %p53, %p12, %p50; @%p53 bra $L__BB0_58; setp.neu.f32 %p54, %f125, 0f00000000; @%p54 bra $L__BB0_57; div.rn.f32 %f269, %f127, %f126; mul.f32 %f270, %f269, %f182; fma.rn.f32 %f346, %f5, %f270, %f1; mul.f32 %f271, %f1, %f270; sub.f32 %f347, %f5, %f271; mov.f32 %f348, %f6; $L__BB0_57: mul.f32 %f272, %f182, %f182; div.rn.f32 %f273, %f126, %f272; sub.f32 %f274, %f346, %f1; sub.f32 %f275, %f347, %f5; sub.f32 %f276, %f348, %f6; fma.rn.f32 %f277, %f273, %f274, %f322; fma.rn.f32 %f278, %f273, %f275, %f323; fma.rn.f32 %f324, %f273, %f276, %f324; div.rn.f32 %f279, %f127, %f182; mul.f32 %f280, %f279, %f347; sub.f32 %f322, %f277, %f280; fma.rn.f32 %f323, %f279, %f346, %f278; $L__BB0_58: add.s32 %r31, %r3, 1; @%p46 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r97, %r38, -1; min.s32 %r110, %r31, %r97; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r95, %r31, %r38; add.s32 %r96, %r95, %r38; rem.s32 %r110, %r96, %r38; $L__BB0_61: mad.lo.s32 %r98, %r110, %r37, %r2; mad.lo.s32 %r35, %r98, %r36, %r1; setp.ge.s32 %p56, %r31, %r38; mov.f32 %f357, 0f00000000; and.pred %p58, %p56, %p46; mov.f32 %f356, %f357; mov.f32 %f355, %f357; @%p58 bra $L__BB0_63; mul.wide.s32 %rd68, %r35, 4; add.s64 %rd69, %rd6, %rd68; ld.global.nc.f32 %f355, [%rd69]; add.s64 %rd70, %rd5, %rd68; ld.global.nc.f32 %f356, [%rd70]; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f357, [%rd71]; 
$L__BB0_63: mul.f32 %f284, %f355, %f355; fma.rn.f32 %f285, %f356, %f356, %f284; fma.rn.f32 %f153, %f357, %f357, %f285; setp.eq.f32 %p59, %f153, 0f00000000; mov.u16 %rs48, %rs1; @%p59 bra $L__BB0_65; cvt.s64.s32 %rd72, %r35; add.s64 %rd73, %rd3, %rd72; ld.global.nc.u8 %rs48, [%rd73]; $L__BB0_65: min.u16 %rs41, %rs48, %rs1; cvt.u32.u16 %r99, %rs41; max.u16 %rs42, %rs48, %rs1; cvt.u32.u16 %r100, %rs42; add.s32 %r101, %r100, 1; mul.lo.s32 %r102, %r101, %r100; shr.u32 %r103, %r102, 1; add.s32 %r104, %r103, %r99; mul.wide.s32 %rd74, %r104, 4; add.s64 %rd75, %rd2, %rd74; add.s64 %rd76, %rd1, %rd74; ld.global.nc.f32 %f286, [%rd75]; add.f32 %f154, %f286, %f286; ld.global.nc.f32 %f155, [%rd76]; and.pred %p62, %p12, %p59; @%p62 bra $L__BB0_69; setp.neu.f32 %p63, %f153, 0f00000000; @%p63 bra $L__BB0_68; div.rn.f32 %f287, %f155, %f154; mul.f32 %f288, %f287, %f182; mul.f32 %f289, %f5, %f288; sub.f32 %f355, %f1, %f289; fma.rn.f32 %f356, %f1, %f288, %f5; mov.f32 %f357, %f6; $L__BB0_68: mul.f32 %f290, %f182, %f182; div.rn.f32 %f291, %f154, %f290; sub.f32 %f292, %f355, %f1; sub.f32 %f293, %f356, %f5; sub.f32 %f294, %f357, %f6; fma.rn.f32 %f295, %f291, %f292, %f322; fma.rn.f32 %f296, %f291, %f293, %f323; fma.rn.f32 %f324, %f291, %f294, %f324; div.rn.f32 %f297, %f155, %f182; fma.rn.f32 %f322, %f297, %f356, %f295; mul.f32 %f298, %f297, %f355; sub.f32 %f323, %f296, %f298; $L__BB0_69: setp.eq.s64 %p64, %rd11, 0; @%p64 bra $L__BB0_71; cvta.to.global.u64 %rd77, %rd11; shl.b64 %rd78, %rd7, 2; add.s64 %rd79, %rd77, %rd78; ld.global.nc.f32 %f299, [%rd79]; mul.f32 %f361, %f299, %f361; $L__BB0_71: setp.eq.f32 %p65, %f361, 0f00000000; mov.f32 %f362, 0f00000000; @%p65 bra $L__BB0_73; rcp.rn.f32 %f362, %f361; $L__BB0_73: cvta.to.global.u64 %rd80, %rd8; shl.b64 %rd81, %rd7, 2; add.s64 %rd82, %rd80, %rd81; ld.global.f32 %f301, [%rd82]; fma.rn.f32 %f302, %f322, %f362, %f301; st.global.f32 [%rd82], %f302; cvta.to.global.u64 %rd83, %rd9; add.s64 %rd84, %rd83, %rd81; ld.global.f32 %f303, [%rd84]; 
fma.rn.f32 %f304, %f323, %f362, %f303; st.global.f32 [%rd84], %f304; cvta.to.global.u64 %rd85, %rd10; add.s64 %rd86, %rd85, %rd81; ld.global.f32 %f305, [%rd86]; fma.rn.f32 %f306, %f324, %f362, %f305; st.global.f32 [%rd86], %f306; $L__BB0_74: ret; } ` adddmibulk_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<66>; .reg .b16 %rs<49>; .reg .f32 %f<363>; .reg .b32 %r<111>; .reg .b64 %rd<87>; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; ld.param.u64 %rd8, [adddmibulk_param_0]; ld.param.u64 %rd9, [adddmibulk_param_1]; ld.param.u64 %rd10, [adddmibulk_param_2]; ld.param.u64 %rd12, [adddmibulk_param_3]; ld.param.u64 %rd13, [adddmibulk_param_4]; ld.param.u64 %rd14, [adddmibulk_param_5]; ld.param.u64 %rd11, [adddmibulk_param_6]; ld.param.f32 %f361, [adddmibulk_param_7]; ld.param.u64 %rd15, [adddmibulk_param_8]; ld.param.u64 %rd16, [adddmibulk_param_9]; ld.param.u64 %rd17, [adddmibulk_param_10]; ld.param.f32 %f180, [adddmibulk_param_11]; ld.param.f32 %f181, [adddmibulk_param_12]; ld.param.f32 %f182, [adddmibulk_param_13]; ld.param.u32 %r36, [adddmibulk_param_14]; ld.param.u32 %r37, [adddmibulk_param_15]; ld.param.u32 %r38, [adddmibulk_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, 
%rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r40, %r39, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r43, %r42, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r46, %r45, %r47; setp.ge.s32 %p1, %r1, %r36; setp.ge.s32 %p2, %r2, %r37; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_74; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r49, %r5, %r1; cvt.s64.s32 %rd7, %r49; mul.wide.s32 %rd18, %r49, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; mul.f32 %f185, %f5, %f5; fma.rn.f32 %f186, %f1, %f1, %f185; ld.global.nc.f32 %f6, [%rd21]; fma.rn.f32 %f187, %f6, %f6, %f186; setp.eq.f32 %p6, %f187, 0f00000000; @%p6 bra $L__BB0_74; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r50, %r6, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r105, %r51, %r36; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f312, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f311, %f312; mov.f32 %f310, %f312; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f310, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f311, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f312, [%rd26]; $L__BB0_7: mul.f32 %f191, %f310, %f310; fma.rn.f32 %f192, %f311, %f311, %f191; fma.rn.f32 %f13, %f312, %f312, %f192; setp.eq.f32 %p11, %f13, 0f00000000; mov.u16 %rs43, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs43, [%rd28]; 
$L__BB0_9: min.u16 %rs21, %rs43, %rs1; cvt.u32.u16 %r52, %rs21; max.u16 %rs22, %rs43, %rs1; cvt.u32.u16 %r53, %rs22; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd29, %r57, 4; add.s64 %rd30, %rd2, %rd29; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f196, [%rd30]; add.f32 %f14, %f196, %f196; ld.global.nc.f32 %f15, [%rd31]; setp.ne.s16 %p12, %rs18, 0; mov.f32 %f322, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f323, %f322; mov.f32 %f324, %f322; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f13, 0f00000000; @%p15 bra $L__BB0_12; div.rn.f32 %f197, %f15, %f14; mul.f32 %f198, %f197, %f180; fma.rn.f32 %f311, %f6, %f198, %f5; mul.f32 %f199, %f5, %f198; sub.f32 %f312, %f6, %f199; mov.f32 %f310, %f1; $L__BB0_12: mul.f32 %f200, %f180, %f180; div.rn.f32 %f201, %f14, %f200; sub.f32 %f202, %f310, %f1; sub.f32 %f203, %f311, %f5; sub.f32 %f204, %f312, %f6; fma.rn.f32 %f322, %f201, %f202, 0f00000000; fma.rn.f32 %f205, %f201, %f203, 0f00000000; fma.rn.f32 %f206, %f201, %f204, 0f00000000; div.rn.f32 %f207, %f15, %f180; mul.f32 %f208, %f207, %f312; sub.f32 %f323, %f205, %f208; fma.rn.f32 %f324, %f207, %f311, %f206; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r60, %r36, -1; min.s32 %r106, %r11, %r60; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r58, %r11, %r36; add.s32 %r59, %r58, %r36; rem.s32 %r106, %r59, %r36; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r36; mov.f32 %f321, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f320, %f321; mov.f32 %f319, %f321; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f319, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f320, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f321, [%rd35]; $L__BB0_18: mul.f32 %f212, %f319, %f319; fma.rn.f32 %f213, %f320, %f320, %f212; fma.rn.f32 %f41, %f321, %f321, %f213; setp.eq.f32 %p20, %f41, 0f00000000; mov.u16 
%rs44, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs44, [%rd37]; $L__BB0_20: min.u16 %rs25, %rs44, %rs1; cvt.u32.u16 %r61, %rs25; max.u16 %rs26, %rs44, %rs1; cvt.u32.u16 %r62, %rs26; add.s32 %r63, %r62, 1; mul.lo.s32 %r64, %r63, %r62; shr.u32 %r65, %r64, 1; add.s32 %r66, %r65, %r61; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd2, %rd38; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f214, [%rd39]; add.f32 %f42, %f214, %f214; ld.global.nc.f32 %f43, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f41, 0f00000000; @%p24 bra $L__BB0_23; div.rn.f32 %f215, %f43, %f42; mul.f32 %f216, %f215, %f180; mul.f32 %f217, %f6, %f216; sub.f32 %f320, %f5, %f217; fma.rn.f32 %f321, %f5, %f216, %f6; mov.f32 %f319, %f1; $L__BB0_23: mul.f32 %f218, %f180, %f180; div.rn.f32 %f219, %f42, %f218; sub.f32 %f220, %f319, %f1; sub.f32 %f221, %f320, %f5; sub.f32 %f222, %f321, %f6; fma.rn.f32 %f322, %f219, %f220, %f322; fma.rn.f32 %f223, %f219, %f221, %f323; fma.rn.f32 %f224, %f219, %f222, %f324; div.rn.f32 %f225, %f43, %f180; fma.rn.f32 %f323, %f225, %f321, %f223; mul.f32 %f226, %f225, %f320; sub.f32 %f324, %f224, %f226; $L__BB0_24: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r67, %r16, %r37; add.s32 %r68, %r67, %r37; rem.s32 %r107, %r68, %r37; $L__BB0_27: add.s32 %r69, %r107, %r4; mad.lo.s32 %r20, %r69, %r36, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f330, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f329, %f330; mov.f32 %f328, %f330; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f328, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f329, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f330, [%rd44]; $L__BB0_29: mul.f32 %f230, %f328, %f328; fma.rn.f32 %f231, %f329, %f329, %f230; fma.rn.f32 %f69, %f330, 
%f330, %f231; setp.eq.f32 %p29, %f69, 0f00000000; mov.u16 %rs45, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs45, [%rd46]; $L__BB0_31: min.u16 %rs29, %rs45, %rs1; cvt.u32.u16 %r70, %rs29; max.u16 %rs30, %rs45, %rs1; cvt.u32.u16 %r71, %rs30; add.s32 %r72, %r71, 1; mul.lo.s32 %r73, %r72, %r71; shr.u32 %r74, %r73, 1; add.s32 %r75, %r74, %r70; mul.wide.s32 %rd47, %r75, 4; add.s64 %rd48, %rd2, %rd47; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f232, [%rd48]; add.f32 %f70, %f232, %f232; ld.global.nc.f32 %f71, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f69, 0f00000000; @%p33 bra $L__BB0_34; div.rn.f32 %f233, %f71, %f70; mul.f32 %f234, %f233, %f181; mul.f32 %f235, %f6, %f234; sub.f32 %f328, %f1, %f235; fma.rn.f32 %f330, %f1, %f234, %f6; mov.f32 %f329, %f5; $L__BB0_34: mul.f32 %f236, %f181, %f181; div.rn.f32 %f237, %f70, %f236; sub.f32 %f238, %f328, %f1; sub.f32 %f239, %f329, %f5; sub.f32 %f240, %f330, %f6; fma.rn.f32 %f241, %f237, %f238, %f322; fma.rn.f32 %f323, %f237, %f239, %f323; fma.rn.f32 %f242, %f237, %f240, %f324; div.rn.f32 %f243, %f71, %f181; fma.rn.f32 %f322, %f243, %f330, %f241; mul.f32 %f244, %f243, %f328; sub.f32 %f324, %f242, %f244; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r78, %r37, -1; min.s32 %r108, %r21, %r78; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r76, %r21, %r37; add.s32 %r77, %r76, %r37; rem.s32 %r108, %r77, %r37; $L__BB0_38: add.s32 %r79, %r108, %r4; mad.lo.s32 %r25, %r79, %r36, %r1; setp.ge.s32 %p35, %r21, %r37; mov.f32 %f339, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f338, %f339; mov.f32 %f337, %f339; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f337, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f338, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f339, [%rd53]; $L__BB0_40: mul.f32 %f248, %f337, %f337; fma.rn.f32 %f249, %f338, 
%f338, %f248; fma.rn.f32 %f97, %f339, %f339, %f249; setp.eq.f32 %p38, %f97, 0f00000000; mov.u16 %rs46, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs46, [%rd55]; $L__BB0_42: min.u16 %rs33, %rs46, %rs1; cvt.u32.u16 %r80, %rs33; max.u16 %rs34, %rs46, %rs1; cvt.u32.u16 %r81, %rs34; add.s32 %r82, %r81, 1; mul.lo.s32 %r83, %r82, %r81; shr.u32 %r84, %r83, 1; add.s32 %r85, %r84, %r80; mul.wide.s32 %rd56, %r85, 4; add.s64 %rd57, %rd2, %rd56; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f250, [%rd57]; add.f32 %f98, %f250, %f250; ld.global.nc.f32 %f99, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f97, 0f00000000; @%p42 bra $L__BB0_45; div.rn.f32 %f251, %f99, %f98; mul.f32 %f252, %f251, %f181; fma.rn.f32 %f337, %f6, %f252, %f1; mul.f32 %f253, %f1, %f252; sub.f32 %f339, %f6, %f253; mov.f32 %f338, %f5; $L__BB0_45: mul.f32 %f254, %f181, %f181; div.rn.f32 %f255, %f98, %f254; sub.f32 %f256, %f337, %f1; sub.f32 %f257, %f338, %f5; sub.f32 %f258, %f339, %f6; fma.rn.f32 %f259, %f255, %f256, %f322; fma.rn.f32 %f323, %f255, %f257, %f323; fma.rn.f32 %f260, %f255, %f258, %f324; div.rn.f32 %f261, %f99, %f181; mul.f32 %f262, %f261, %f339; sub.f32 %f322, %f259, %f262; fma.rn.f32 %f324, %f261, %f337, %f260; $L__BB0_46: setp.eq.s32 %p43, %r38, 1; and.pred %p45, %p43, %p12; @%p45 bra $L__BB0_69; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p46, %rs12, 0; add.s32 %r26, %r3, -1; @%p46 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r86, %r26, %r38; add.s32 %r87, %r86, %r38; rem.s32 %r109, %r87, %r38; $L__BB0_50: mad.lo.s32 %r88, %r109, %r37, %r2; mad.lo.s32 %r30, %r88, %r36, %r1; setp.lt.s32 %p48, %r3, 1; mov.f32 %f348, 0f00000000; and.pred %p49, %p48, %p46; mov.f32 %f347, %f348; mov.f32 %f346, %f348; @%p49 bra $L__BB0_52; mul.wide.s32 %rd59, %r30, 4; add.s64 %rd60, %rd6, %rd59; ld.global.nc.f32 %f346, [%rd60]; add.s64 %rd61, %rd5, %rd59; 
ld.global.nc.f32 %f347, [%rd61]; add.s64 %rd62, %rd4, %rd59; ld.global.nc.f32 %f348, [%rd62]; $L__BB0_52: mul.f32 %f266, %f346, %f346; fma.rn.f32 %f267, %f347, %f347, %f266; fma.rn.f32 %f125, %f348, %f348, %f267; setp.eq.f32 %p50, %f125, 0f00000000; mov.u16 %rs47, %rs1; @%p50 bra $L__BB0_54; cvt.s64.s32 %rd63, %r30; add.s64 %rd64, %rd3, %rd63; ld.global.nc.u8 %rs47, [%rd64]; $L__BB0_54: min.u16 %rs37, %rs47, %rs1; cvt.u32.u16 %r89, %rs37; max.u16 %rs38, %rs47, %rs1; cvt.u32.u16 %r90, %rs38; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; add.s32 %r94, %r93, %r89; mul.wide.s32 %rd65, %r94, 4; add.s64 %rd66, %rd2, %rd65; add.s64 %rd67, %rd1, %rd65; ld.global.nc.f32 %f268, [%rd66]; add.f32 %f126, %f268, %f268; ld.global.nc.f32 %f127, [%rd67]; and.pred %p53, %p12, %p50; @%p53 bra $L__BB0_58; setp.neu.f32 %p54, %f125, 0f00000000; @%p54 bra $L__BB0_57; div.rn.f32 %f269, %f127, %f126; mul.f32 %f270, %f269, %f182; fma.rn.f32 %f346, %f5, %f270, %f1; mul.f32 %f271, %f1, %f270; sub.f32 %f347, %f5, %f271; mov.f32 %f348, %f6; $L__BB0_57: mul.f32 %f272, %f182, %f182; div.rn.f32 %f273, %f126, %f272; sub.f32 %f274, %f346, %f1; sub.f32 %f275, %f347, %f5; sub.f32 %f276, %f348, %f6; fma.rn.f32 %f277, %f273, %f274, %f322; fma.rn.f32 %f278, %f273, %f275, %f323; fma.rn.f32 %f324, %f273, %f276, %f324; div.rn.f32 %f279, %f127, %f182; mul.f32 %f280, %f279, %f347; sub.f32 %f322, %f277, %f280; fma.rn.f32 %f323, %f279, %f346, %f278; $L__BB0_58: add.s32 %r31, %r3, 1; @%p46 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r97, %r38, -1; min.s32 %r110, %r31, %r97; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r95, %r31, %r38; add.s32 %r96, %r95, %r38; rem.s32 %r110, %r96, %r38; $L__BB0_61: mad.lo.s32 %r98, %r110, %r37, %r2; mad.lo.s32 %r35, %r98, %r36, %r1; setp.ge.s32 %p56, %r31, %r38; mov.f32 %f357, 0f00000000; and.pred %p58, %p56, %p46; mov.f32 %f356, %f357; mov.f32 %f355, %f357; @%p58 bra $L__BB0_63; mul.wide.s32 %rd68, %r35, 4; add.s64 %rd69, %rd6, %rd68; 
ld.global.nc.f32 %f355, [%rd69]; add.s64 %rd70, %rd5, %rd68; ld.global.nc.f32 %f356, [%rd70]; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f357, [%rd71]; $L__BB0_63: mul.f32 %f284, %f355, %f355; fma.rn.f32 %f285, %f356, %f356, %f284; fma.rn.f32 %f153, %f357, %f357, %f285; setp.eq.f32 %p59, %f153, 0f00000000; mov.u16 %rs48, %rs1; @%p59 bra $L__BB0_65; cvt.s64.s32 %rd72, %r35; add.s64 %rd73, %rd3, %rd72; ld.global.nc.u8 %rs48, [%rd73]; $L__BB0_65: min.u16 %rs41, %rs48, %rs1; cvt.u32.u16 %r99, %rs41; max.u16 %rs42, %rs48, %rs1; cvt.u32.u16 %r100, %rs42; add.s32 %r101, %r100, 1; mul.lo.s32 %r102, %r101, %r100; shr.u32 %r103, %r102, 1; add.s32 %r104, %r103, %r99; mul.wide.s32 %rd74, %r104, 4; add.s64 %rd75, %rd2, %rd74; add.s64 %rd76, %rd1, %rd74; ld.global.nc.f32 %f286, [%rd75]; add.f32 %f154, %f286, %f286; ld.global.nc.f32 %f155, [%rd76]; and.pred %p62, %p12, %p59; @%p62 bra $L__BB0_69; setp.neu.f32 %p63, %f153, 0f00000000; @%p63 bra $L__BB0_68; div.rn.f32 %f287, %f155, %f154; mul.f32 %f288, %f287, %f182; mul.f32 %f289, %f5, %f288; sub.f32 %f355, %f1, %f289; fma.rn.f32 %f356, %f1, %f288, %f5; mov.f32 %f357, %f6; $L__BB0_68: mul.f32 %f290, %f182, %f182; div.rn.f32 %f291, %f154, %f290; sub.f32 %f292, %f355, %f1; sub.f32 %f293, %f356, %f5; sub.f32 %f294, %f357, %f6; fma.rn.f32 %f295, %f291, %f292, %f322; fma.rn.f32 %f296, %f291, %f293, %f323; fma.rn.f32 %f324, %f291, %f294, %f324; div.rn.f32 %f297, %f155, %f182; fma.rn.f32 %f322, %f297, %f356, %f295; mul.f32 %f298, %f297, %f355; sub.f32 %f323, %f296, %f298; $L__BB0_69: setp.eq.s64 %p64, %rd11, 0; @%p64 bra $L__BB0_71; cvta.to.global.u64 %rd77, %rd11; shl.b64 %rd78, %rd7, 2; add.s64 %rd79, %rd77, %rd78; ld.global.nc.f32 %f299, [%rd79]; mul.f32 %f361, %f299, %f361; $L__BB0_71: setp.eq.f32 %p65, %f361, 0f00000000; mov.f32 %f362, 0f00000000; @%p65 bra $L__BB0_73; rcp.rn.f32 %f362, %f361; $L__BB0_73: cvta.to.global.u64 %rd80, %rd8; shl.b64 %rd81, %rd7, 2; add.s64 %rd82, %rd80, %rd81; ld.global.f32 %f301, [%rd82]; fma.rn.f32 
%f302, %f322, %f362, %f301; st.global.f32 [%rd82], %f302; cvta.to.global.u64 %rd83, %rd9; add.s64 %rd84, %rd83, %rd81; ld.global.f32 %f303, [%rd84]; fma.rn.f32 %f304, %f323, %f362, %f303; st.global.f32 [%rd84], %f304; cvta.to.global.u64 %rd85, %rd10; add.s64 %rd86, %rd85, %rd81; ld.global.f32 %f305, [%rd86]; fma.rn.f32 %f306, %f324, %f362, %f305; st.global.f32 [%rd86], %f306; $L__BB0_74: ret; } ` adddmibulk_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<66>; .reg .b16 %rs<49>; .reg .f32 %f<363>; .reg .b32 %r<111>; .reg .b64 %rd<87>; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; ld.param.u64 %rd8, [adddmibulk_param_0]; ld.param.u64 %rd9, [adddmibulk_param_1]; ld.param.u64 %rd10, [adddmibulk_param_2]; ld.param.u64 %rd12, [adddmibulk_param_3]; ld.param.u64 %rd13, [adddmibulk_param_4]; ld.param.u64 %rd14, [adddmibulk_param_5]; ld.param.u64 %rd11, [adddmibulk_param_6]; ld.param.f32 %f361, [adddmibulk_param_7]; ld.param.u64 %rd15, [adddmibulk_param_8]; ld.param.u64 %rd16, [adddmibulk_param_9]; ld.param.u64 %rd17, [adddmibulk_param_10]; ld.param.f32 %f180, [adddmibulk_param_11]; ld.param.f32 %f181, [adddmibulk_param_12]; ld.param.f32 %f182, [adddmibulk_param_13]; ld.param.u32 %r36, [adddmibulk_param_14]; ld.param.u32 %r37, [adddmibulk_param_15]; ld.param.u32 
%r38, [adddmibulk_param_16]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd17; cvta.to.global.u64 %rd4, %rd14; cvta.to.global.u64 %rd5, %rd13; cvta.to.global.u64 %rd6, %rd12; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r40, %r39, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r43, %r42, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r46, %r45, %r47; setp.ge.s32 %p1, %r1, %r36; setp.ge.s32 %p2, %r2, %r37; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_74; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r49, %r5, %r1; cvt.s64.s32 %rd7, %r49; mul.wide.s32 %rd18, %r49, 4; add.s64 %rd19, %rd6, %rd18; add.s64 %rd20, %rd5, %rd18; add.s64 %rd21, %rd4, %rd18; add.s64 %rd22, %rd3, %rd7; ld.global.nc.u8 %rs1, [%rd22]; ld.global.nc.f32 %f1, [%rd19]; ld.global.nc.f32 %f5, [%rd20]; mul.f32 %f185, %f5, %f5; fma.rn.f32 %f186, %f1, %f1, %f185; ld.global.nc.f32 %f6, [%rd21]; fma.rn.f32 %f187, %f6, %f6, %f186; setp.eq.f32 %p6, %f187, 0f00000000; @%p6 bra $L__BB0_74; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r50, %r6, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r105, %r51, %r36; $L__BB0_5: add.s32 %r10, %r105, %r5; setp.lt.s32 %p9, %r1, 1; mov.f32 %f312, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f311, %f312; mov.f32 %f310, %f312; @%p10 bra $L__BB0_7; mul.wide.s32 %rd23, %r10, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f310, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f311, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f312, [%rd26]; $L__BB0_7: mul.f32 %f191, %f310, %f310; fma.rn.f32 %f192, %f311, %f311, %f191; fma.rn.f32 %f13, %f312, %f312, %f192; setp.eq.f32 
%p11, %f13, 0f00000000; mov.u16 %rs43, %rs1; @%p11 bra $L__BB0_9; cvt.s64.s32 %rd27, %r10; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs43, [%rd28]; $L__BB0_9: min.u16 %rs21, %rs43, %rs1; cvt.u32.u16 %r52, %rs21; max.u16 %rs22, %rs43, %rs1; cvt.u32.u16 %r53, %rs22; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd29, %r57, 4; add.s64 %rd30, %rd2, %rd29; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f196, [%rd30]; add.f32 %f14, %f196, %f196; ld.global.nc.f32 %f15, [%rd31]; setp.ne.s16 %p12, %rs18, 0; mov.f32 %f322, 0f00000000; and.pred %p14, %p12, %p11; mov.f32 %f323, %f322; mov.f32 %f324, %f322; @%p14 bra $L__BB0_13; setp.neu.f32 %p15, %f13, 0f00000000; @%p15 bra $L__BB0_12; div.rn.f32 %f197, %f15, %f14; mul.f32 %f198, %f197, %f180; fma.rn.f32 %f311, %f6, %f198, %f5; mul.f32 %f199, %f5, %f198; sub.f32 %f312, %f6, %f199; mov.f32 %f310, %f1; $L__BB0_12: mul.f32 %f200, %f180, %f180; div.rn.f32 %f201, %f14, %f200; sub.f32 %f202, %f310, %f1; sub.f32 %f203, %f311, %f5; sub.f32 %f204, %f312, %f6; fma.rn.f32 %f322, %f201, %f202, 0f00000000; fma.rn.f32 %f205, %f201, %f203, 0f00000000; fma.rn.f32 %f206, %f201, %f204, 0f00000000; div.rn.f32 %f207, %f15, %f180; mul.f32 %f208, %f207, %f312; sub.f32 %f323, %f205, %f208; fma.rn.f32 %f324, %f207, %f311, %f206; $L__BB0_13: add.s32 %r11, %r1, 1; @%p7 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: add.s32 %r60, %r36, -1; min.s32 %r106, %r11, %r60; bra.uni $L__BB0_16; $L__BB0_14: rem.s32 %r58, %r11, %r36; add.s32 %r59, %r58, %r36; rem.s32 %r106, %r59, %r36; $L__BB0_16: add.s32 %r15, %r106, %r5; setp.ge.s32 %p17, %r11, %r36; mov.f32 %f321, 0f00000000; and.pred %p19, %p17, %p7; mov.f32 %f320, %f321; mov.f32 %f319, %f321; @%p19 bra $L__BB0_18; mul.wide.s32 %rd32, %r15, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f319, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f320, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f321, [%rd35]; $L__BB0_18: 
mul.f32 %f212, %f319, %f319; fma.rn.f32 %f213, %f320, %f320, %f212; fma.rn.f32 %f41, %f321, %f321, %f213; setp.eq.f32 %p20, %f41, 0f00000000; mov.u16 %rs44, %rs1; @%p20 bra $L__BB0_20; cvt.s64.s32 %rd36, %r15; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs44, [%rd37]; $L__BB0_20: min.u16 %rs25, %rs44, %rs1; cvt.u32.u16 %r61, %rs25; max.u16 %rs26, %rs44, %rs1; cvt.u32.u16 %r62, %rs26; add.s32 %r63, %r62, 1; mul.lo.s32 %r64, %r63, %r62; shr.u32 %r65, %r64, 1; add.s32 %r66, %r65, %r61; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd2, %rd38; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f214, [%rd39]; add.f32 %f42, %f214, %f214; ld.global.nc.f32 %f43, [%rd40]; and.pred %p23, %p12, %p20; @%p23 bra $L__BB0_24; setp.neu.f32 %p24, %f41, 0f00000000; @%p24 bra $L__BB0_23; div.rn.f32 %f215, %f43, %f42; mul.f32 %f216, %f215, %f180; mul.f32 %f217, %f6, %f216; sub.f32 %f320, %f5, %f217; fma.rn.f32 %f321, %f5, %f216, %f6; mov.f32 %f319, %f1; $L__BB0_23: mul.f32 %f218, %f180, %f180; div.rn.f32 %f219, %f42, %f218; sub.f32 %f220, %f319, %f1; sub.f32 %f221, %f320, %f5; sub.f32 %f222, %f321, %f6; fma.rn.f32 %f322, %f219, %f220, %f322; fma.rn.f32 %f223, %f219, %f221, %f323; fma.rn.f32 %f224, %f219, %f222, %f324; div.rn.f32 %f225, %f43, %f180; fma.rn.f32 %f323, %f225, %f321, %f223; mul.f32 %f226, %f225, %f320; sub.f32 %f324, %f224, %f226; $L__BB0_24: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p25, %rs7, 0; add.s32 %r16, %r2, -1; @%p25 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: max.s32 %r107, %r16, 0; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r67, %r16, %r37; add.s32 %r68, %r67, %r37; rem.s32 %r107, %r68, %r37; $L__BB0_27: add.s32 %r69, %r107, %r4; mad.lo.s32 %r20, %r69, %r36, %r1; setp.lt.s32 %p27, %r2, 1; mov.f32 %f330, 0f00000000; and.pred %p28, %p27, %p25; mov.f32 %f329, %f330; mov.f32 %f328, %f330; @%p28 bra $L__BB0_29; mul.wide.s32 %rd41, %r20, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f328, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f329, [%rd43]; add.s64 %rd44, 
%rd4, %rd41; ld.global.nc.f32 %f330, [%rd44]; $L__BB0_29: mul.f32 %f230, %f328, %f328; fma.rn.f32 %f231, %f329, %f329, %f230; fma.rn.f32 %f69, %f330, %f330, %f231; setp.eq.f32 %p29, %f69, 0f00000000; mov.u16 %rs45, %rs1; @%p29 bra $L__BB0_31; cvt.s64.s32 %rd45, %r20; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs45, [%rd46]; $L__BB0_31: min.u16 %rs29, %rs45, %rs1; cvt.u32.u16 %r70, %rs29; max.u16 %rs30, %rs45, %rs1; cvt.u32.u16 %r71, %rs30; add.s32 %r72, %r71, 1; mul.lo.s32 %r73, %r72, %r71; shr.u32 %r74, %r73, 1; add.s32 %r75, %r74, %r70; mul.wide.s32 %rd47, %r75, 4; add.s64 %rd48, %rd2, %rd47; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f232, [%rd48]; add.f32 %f70, %f232, %f232; ld.global.nc.f32 %f71, [%rd49]; and.pred %p32, %p12, %p29; @%p32 bra $L__BB0_35; setp.neu.f32 %p33, %f69, 0f00000000; @%p33 bra $L__BB0_34; div.rn.f32 %f233, %f71, %f70; mul.f32 %f234, %f233, %f181; mul.f32 %f235, %f6, %f234; sub.f32 %f328, %f1, %f235; fma.rn.f32 %f330, %f1, %f234, %f6; mov.f32 %f329, %f5; $L__BB0_34: mul.f32 %f236, %f181, %f181; div.rn.f32 %f237, %f70, %f236; sub.f32 %f238, %f328, %f1; sub.f32 %f239, %f329, %f5; sub.f32 %f240, %f330, %f6; fma.rn.f32 %f241, %f237, %f238, %f322; fma.rn.f32 %f323, %f237, %f239, %f323; fma.rn.f32 %f242, %f237, %f240, %f324; div.rn.f32 %f243, %f71, %f181; fma.rn.f32 %f322, %f243, %f330, %f241; mul.f32 %f244, %f243, %f328; sub.f32 %f324, %f242, %f244; $L__BB0_35: add.s32 %r21, %r2, 1; @%p25 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: add.s32 %r78, %r37, -1; min.s32 %r108, %r21, %r78; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r76, %r21, %r37; add.s32 %r77, %r76, %r37; rem.s32 %r108, %r77, %r37; $L__BB0_38: add.s32 %r79, %r108, %r4; mad.lo.s32 %r25, %r79, %r36, %r1; setp.ge.s32 %p35, %r21, %r37; mov.f32 %f339, 0f00000000; and.pred %p37, %p35, %p25; mov.f32 %f338, %f339; mov.f32 %f337, %f339; @%p37 bra $L__BB0_40; mul.wide.s32 %rd50, %r25, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f337, [%rd51]; add.s64 %rd52, %rd5, %rd50; 
ld.global.nc.f32 %f338, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f339, [%rd53]; $L__BB0_40: mul.f32 %f248, %f337, %f337; fma.rn.f32 %f249, %f338, %f338, %f248; fma.rn.f32 %f97, %f339, %f339, %f249; setp.eq.f32 %p38, %f97, 0f00000000; mov.u16 %rs46, %rs1; @%p38 bra $L__BB0_42; cvt.s64.s32 %rd54, %r25; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs46, [%rd55]; $L__BB0_42: min.u16 %rs33, %rs46, %rs1; cvt.u32.u16 %r80, %rs33; max.u16 %rs34, %rs46, %rs1; cvt.u32.u16 %r81, %rs34; add.s32 %r82, %r81, 1; mul.lo.s32 %r83, %r82, %r81; shr.u32 %r84, %r83, 1; add.s32 %r85, %r84, %r80; mul.wide.s32 %rd56, %r85, 4; add.s64 %rd57, %rd2, %rd56; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f250, [%rd57]; add.f32 %f98, %f250, %f250; ld.global.nc.f32 %f99, [%rd58]; and.pred %p41, %p12, %p38; @%p41 bra $L__BB0_46; setp.neu.f32 %p42, %f97, 0f00000000; @%p42 bra $L__BB0_45; div.rn.f32 %f251, %f99, %f98; mul.f32 %f252, %f251, %f181; fma.rn.f32 %f337, %f6, %f252, %f1; mul.f32 %f253, %f1, %f252; sub.f32 %f339, %f6, %f253; mov.f32 %f338, %f5; $L__BB0_45: mul.f32 %f254, %f181, %f181; div.rn.f32 %f255, %f98, %f254; sub.f32 %f256, %f337, %f1; sub.f32 %f257, %f338, %f5; sub.f32 %f258, %f339, %f6; fma.rn.f32 %f259, %f255, %f256, %f322; fma.rn.f32 %f323, %f255, %f257, %f323; fma.rn.f32 %f260, %f255, %f258, %f324; div.rn.f32 %f261, %f99, %f181; mul.f32 %f262, %f261, %f339; sub.f32 %f322, %f259, %f262; fma.rn.f32 %f324, %f261, %f337, %f260; $L__BB0_46: setp.eq.s32 %p43, %r38, 1; and.pred %p45, %p43, %p12; @%p45 bra $L__BB0_69; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p46, %rs12, 0; add.s32 %r26, %r3, -1; @%p46 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r109, %r26, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r86, %r26, %r38; add.s32 %r87, %r86, %r38; rem.s32 %r109, %r87, %r38; $L__BB0_50: mad.lo.s32 %r88, %r109, %r37, %r2; mad.lo.s32 %r30, %r88, %r36, %r1; setp.lt.s32 %p48, %r3, 1; mov.f32 %f348, 0f00000000; and.pred %p49, %p48, %p46; mov.f32 %f347, %f348; mov.f32 %f346, 
%f348; @%p49 bra $L__BB0_52; mul.wide.s32 %rd59, %r30, 4; add.s64 %rd60, %rd6, %rd59; ld.global.nc.f32 %f346, [%rd60]; add.s64 %rd61, %rd5, %rd59; ld.global.nc.f32 %f347, [%rd61]; add.s64 %rd62, %rd4, %rd59; ld.global.nc.f32 %f348, [%rd62]; $L__BB0_52: mul.f32 %f266, %f346, %f346; fma.rn.f32 %f267, %f347, %f347, %f266; fma.rn.f32 %f125, %f348, %f348, %f267; setp.eq.f32 %p50, %f125, 0f00000000; mov.u16 %rs47, %rs1; @%p50 bra $L__BB0_54; cvt.s64.s32 %rd63, %r30; add.s64 %rd64, %rd3, %rd63; ld.global.nc.u8 %rs47, [%rd64]; $L__BB0_54: min.u16 %rs37, %rs47, %rs1; cvt.u32.u16 %r89, %rs37; max.u16 %rs38, %rs47, %rs1; cvt.u32.u16 %r90, %rs38; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; add.s32 %r94, %r93, %r89; mul.wide.s32 %rd65, %r94, 4; add.s64 %rd66, %rd2, %rd65; add.s64 %rd67, %rd1, %rd65; ld.global.nc.f32 %f268, [%rd66]; add.f32 %f126, %f268, %f268; ld.global.nc.f32 %f127, [%rd67]; and.pred %p53, %p12, %p50; @%p53 bra $L__BB0_58; setp.neu.f32 %p54, %f125, 0f00000000; @%p54 bra $L__BB0_57; div.rn.f32 %f269, %f127, %f126; mul.f32 %f270, %f269, %f182; fma.rn.f32 %f346, %f5, %f270, %f1; mul.f32 %f271, %f1, %f270; sub.f32 %f347, %f5, %f271; mov.f32 %f348, %f6; $L__BB0_57: mul.f32 %f272, %f182, %f182; div.rn.f32 %f273, %f126, %f272; sub.f32 %f274, %f346, %f1; sub.f32 %f275, %f347, %f5; sub.f32 %f276, %f348, %f6; fma.rn.f32 %f277, %f273, %f274, %f322; fma.rn.f32 %f278, %f273, %f275, %f323; fma.rn.f32 %f324, %f273, %f276, %f324; div.rn.f32 %f279, %f127, %f182; mul.f32 %f280, %f279, %f347; sub.f32 %f322, %f277, %f280; fma.rn.f32 %f323, %f279, %f346, %f278; $L__BB0_58: add.s32 %r31, %r3, 1; @%p46 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r97, %r38, -1; min.s32 %r110, %r31, %r97; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r95, %r31, %r38; add.s32 %r96, %r95, %r38; rem.s32 %r110, %r96, %r38; $L__BB0_61: mad.lo.s32 %r98, %r110, %r37, %r2; mad.lo.s32 %r35, %r98, %r36, %r1; setp.ge.s32 %p56, %r31, %r38; mov.f32 %f357, 0f00000000; and.pred 
%p58, %p56, %p46; mov.f32 %f356, %f357; mov.f32 %f355, %f357; @%p58 bra $L__BB0_63; mul.wide.s32 %rd68, %r35, 4; add.s64 %rd69, %rd6, %rd68; ld.global.nc.f32 %f355, [%rd69]; add.s64 %rd70, %rd5, %rd68; ld.global.nc.f32 %f356, [%rd70]; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f357, [%rd71]; $L__BB0_63: mul.f32 %f284, %f355, %f355; fma.rn.f32 %f285, %f356, %f356, %f284; fma.rn.f32 %f153, %f357, %f357, %f285; setp.eq.f32 %p59, %f153, 0f00000000; mov.u16 %rs48, %rs1; @%p59 bra $L__BB0_65; cvt.s64.s32 %rd72, %r35; add.s64 %rd73, %rd3, %rd72; ld.global.nc.u8 %rs48, [%rd73]; $L__BB0_65: min.u16 %rs41, %rs48, %rs1; cvt.u32.u16 %r99, %rs41; max.u16 %rs42, %rs48, %rs1; cvt.u32.u16 %r100, %rs42; add.s32 %r101, %r100, 1; mul.lo.s32 %r102, %r101, %r100; shr.u32 %r103, %r102, 1; add.s32 %r104, %r103, %r99; mul.wide.s32 %rd74, %r104, 4; add.s64 %rd75, %rd2, %rd74; add.s64 %rd76, %rd1, %rd74; ld.global.nc.f32 %f286, [%rd75]; add.f32 %f154, %f286, %f286; ld.global.nc.f32 %f155, [%rd76]; and.pred %p62, %p12, %p59; @%p62 bra $L__BB0_69; setp.neu.f32 %p63, %f153, 0f00000000; @%p63 bra $L__BB0_68; div.rn.f32 %f287, %f155, %f154; mul.f32 %f288, %f287, %f182; mul.f32 %f289, %f5, %f288; sub.f32 %f355, %f1, %f289; fma.rn.f32 %f356, %f1, %f288, %f5; mov.f32 %f357, %f6; $L__BB0_68: mul.f32 %f290, %f182, %f182; div.rn.f32 %f291, %f154, %f290; sub.f32 %f292, %f355, %f1; sub.f32 %f293, %f356, %f5; sub.f32 %f294, %f357, %f6; fma.rn.f32 %f295, %f291, %f292, %f322; fma.rn.f32 %f296, %f291, %f293, %f323; fma.rn.f32 %f324, %f291, %f294, %f324; div.rn.f32 %f297, %f155, %f182; fma.rn.f32 %f322, %f297, %f356, %f295; mul.f32 %f298, %f297, %f355; sub.f32 %f323, %f296, %f298; $L__BB0_69: setp.eq.s64 %p64, %rd11, 0; @%p64 bra $L__BB0_71; cvta.to.global.u64 %rd77, %rd11; shl.b64 %rd78, %rd7, 2; add.s64 %rd79, %rd77, %rd78; ld.global.nc.f32 %f299, [%rd79]; mul.f32 %f361, %f299, %f361; $L__BB0_71: setp.eq.f32 %p65, %f361, 0f00000000; mov.f32 %f362, 0f00000000; @%p65 bra $L__BB0_73; rcp.rn.f32 %f362, 
%f361; $L__BB0_73: cvta.to.global.u64 %rd80, %rd8; shl.b64 %rd81, %rd7, 2; add.s64 %rd82, %rd80, %rd81; ld.global.f32 %f301, [%rd82]; fma.rn.f32 %f302, %f322, %f362, %f301; st.global.f32 [%rd82], %f302; cvta.to.global.u64 %rd83, %rd9; add.s64 %rd84, %rd83, %rd81; ld.global.f32 %f303, [%rd84]; fma.rn.f32 %f304, %f323, %f362, %f303; st.global.f32 [%rd84], %f304; cvta.to.global.u64 %rd85, %rd10; add.s64 %rd86, %rd85, %rd81; ld.global.f32 %f305, [%rd86]; fma.rn.f32 %f306, %f324, %f362, %f305; st.global.f32 [%rd86], %f306; $L__BB0_74: ret; } ` ) 3-3.11.1/cuda/dotproduct.cu000066400000000000000000000010461503346766200154100ustar00rootroot00000000000000 #include "float3.h" // dst += prefactor * dot(a,b) extern "C" __global__ void dotproduct(float* __restrict__ dst, float prefactor, float* __restrict__ ax, float* __restrict__ ay, float* __restrict__ az, float* __restrict__ bx, float* __restrict__ by, float* __restrict__ bz, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { float3 A = {ax[i], ay[i], az[i]}; float3 B = {bx[i], by[i], bz[i]}; dst[i] += prefactor * dot(A, B); } } 3-3.11.1/cuda/dotproduct.go000066400000000000000000000010141503346766200154010ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) // dst += prefactor * dot(a, b), as used for energy density func AddDotProduct(dst *data.Slice, prefactor float32, a, b *data.Slice) { util.Argument(dst.NComp() == 1 && a.NComp() == 3 && b.NComp() == 3) util.Argument(dst.Len() == a.Len() && dst.Len() == b.Len()) N := dst.Len() cfg := make1DConf(N) k_dotproduct_async(dst.DevPtr(0), prefactor, a.DevPtr(X), a.DevPtr(Y), a.DevPtr(Z), b.DevPtr(X), b.DevPtr(Y), b.DevPtr(Z), N, cfg) } 3-3.11.1/cuda/dotproduct_wrapper.go000066400000000000000000000746271503346766200171650ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
// dotproduct_args_t is the argument block for one dotproduct kernel launch.
// Each arg_* field holds the value of one kernel parameter, in parameter
// order; argptr holds the address of each of those fields, which is the form
// the CUDA driver launch call takes its arguments in. The embedded mutex is
// locked by k_dotproduct_async while it fills in and uses the block.
type dotproduct_args_t struct {
	arg_dst       unsafe.Pointer
	arg_prefactor float32
	arg_ax        unsafe.Pointer
	arg_ay        unsafe.Pointer
	arg_az        unsafe.Pointer
	arg_bx        unsafe.Pointer
	arg_by        unsafe.Pointer
	arg_bz        unsafe.Pointer
	arg_N         int
	argptr        [9]unsafe.Pointer
	sync.Mutex
}

// dotproduct_args is the single, package-wide argument block reused by every
// dotproduct kernel invocation.
var dotproduct_args dotproduct_args_t

// init fills dotproduct_args.argptr once at package load. The field
// addresses never change afterwards, so later launches only have to
// overwrite the field values.
func init() {
	a := &dotproduct_args
	a.argptr = [9]unsafe.Pointer{
		unsafe.Pointer(&a.arg_dst),
		unsafe.Pointer(&a.arg_prefactor),
		unsafe.Pointer(&a.arg_ax),
		unsafe.Pointer(&a.arg_ay),
		unsafe.Pointer(&a.arg_az),
		unsafe.Pointer(&a.arg_bx),
		unsafe.Pointer(&a.arg_by),
		unsafe.Pointer(&a.arg_bz),
		unsafe.Pointer(&a.arg_N),
	}
}
func k_dotproduct_async(dst unsafe.Pointer, prefactor float32, ax unsafe.Pointer, ay unsafe.Pointer, az unsafe.Pointer, bx unsafe.Pointer, by unsafe.Pointer, bz unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("dotproduct") } dotproduct_args.Lock() defer dotproduct_args.Unlock() if dotproduct_code == 0 { dotproduct_code = fatbinLoad(dotproduct_map, "dotproduct") } dotproduct_args.arg_dst = dst dotproduct_args.arg_prefactor = prefactor dotproduct_args.arg_ax = ax dotproduct_args.arg_ay = ay dotproduct_args.arg_az = az dotproduct_args.arg_bx = bx dotproduct_args.arg_by = by dotproduct_args.arg_bz = bz dotproduct_args.arg_N = N args := dotproduct_args.argptr[:] cu.LaunchKernel(dotproduct_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("dotproduct") } } // maps compute capability on PTX code for dotproduct kernel. var dotproduct_map = map[int]string{0: "", 50: dotproduct_ptx_50, 52: dotproduct_ptx_52, 53: dotproduct_ptx_53, 60: dotproduct_ptx_60, 61: dotproduct_ptx_61, 62: dotproduct_ptx_62, 70: dotproduct_ptx_70, 72: dotproduct_ptx_72, 75: dotproduct_ptx_75, 80: dotproduct_ptx_80, 86: dotproduct_ptx_86, 87: dotproduct_ptx_87, 89: dotproduct_ptx_89, 90: dotproduct_ptx_90} // dotproduct PTX code for various compute capabilities. 
const ( dotproduct_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; $L__BB0_2: ret; } ` dotproduct_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl dotproduct .visible 
.entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; $L__BB0_2: ret; } ` dotproduct_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 
dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; $L__BB0_2: ret; } ` dotproduct_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 
dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; $L__BB0_2: ret; } ` dotproduct_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 
dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; $L__BB0_2: ret; } ` dotproduct_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; 
ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; $L__BB0_2: ret; } ` dotproduct_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, 
[dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; $L__BB0_2: ret; } ` dotproduct_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; 
ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; $L__BB0_2: ret; } ` dotproduct_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, 
[dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; $L__BB0_2: ret; } ` dotproduct_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, 
%nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; $L__BB0_2: ret; } ` dotproduct_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, 
%tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; $L__BB0_2: ret; } ` dotproduct_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; 
cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; $L__BB0_2: ret; } ` dotproduct_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 
%rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; $L__BB0_2: ret; } ` dotproduct_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; 
cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; $L__BB0_2: ret; } ` )
3-3.11.1/cuda/exchange.cu000066400000000000000000000052371503346766200150110ustar00rootroot00000000000000
#include <stdint.h>
#include "exchange.h"
#include "float3.h"
#include "stencil.h"
#include "amul.h"

// Adds the exchange field to (Bx, By, Bz). One thread handles one cell.
// See exchange.go for more details.
//
//   Bx, By, Bz : field components, accumulated in place (+=)
//   mx, my, mz : magnetization components
//   Ms_, Ms_mul: saturation magnetization, passed to inv_Msat
//   aLUT2d     : exchange stiffness look-up table, indexed by symidx(region, region)
//   regions    : region index of each cell
//   wx, wy, wz : per-axis weights applied to the neighbor differences
//   Nx, Ny, Nz : grid size
//   PBC        : periodic boundary condition code used by the clamp macros
extern "C" __global__ void
addexchange(float* __restrict__ Bx, float* __restrict__ By, float* __restrict__ Bz,
            float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
            float* __restrict__ Ms_, float Ms_mul,
            float* __restrict__ aLUT2d, uint8_t* __restrict__ regions,
            float wx, float wy, float wz, int Nx, int Ny, int Nz, uint8_t PBC) {

    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    // threads outside the grid have nothing to do
    if (ix >= Nx || iy >= Ny || iz >= Nz) {
        return;
    }

    // central cell
    int I = idx(ix, iy, iz);
    float3 m0 = make_float3(mx[I], my[I], mz[I]);

    // empty cell: leave B untouched
    if (is0(m0)) {
        return;
    }

    uint8_t r0 = regions[I];
    float3 B  = make_float3(0.0f, 0.0f, 0.0f);

    int i_;    // neighbor index
    float3 m_; // neighbor mag
    float a__; // inter-cell exchange stiffness

    // left neighbor
    i_  = idx(lclampx(ix-1), iy, iz);           // clamps or wraps index according to PBC
    m_  = make_float3(mx[i_], my[i_], mz[i_]);  // load m
    m_  = ( is0(m_)? m0: m_ );                  // replace missing non-boundary neighbor (term becomes zero)
    a__ = aLUT2d[symidx(r0, regions[i_])];
    B += wx * a__ *(m_ - m0);

    // right neighbor
    i_  = idx(hclampx(ix+1), iy, iz);
    m_  = make_float3(mx[i_], my[i_], mz[i_]);
    m_  = ( is0(m_)? m0: m_ );
    a__ = aLUT2d[symidx(r0, regions[i_])];
    B += wx * a__ *(m_ - m0);

    // back neighbor
    i_  = idx(ix, lclampy(iy-1), iz);
    m_  = make_float3(mx[i_], my[i_], mz[i_]);
    m_  = ( is0(m_)? m0: m_ );
    a__ = aLUT2d[symidx(r0, regions[i_])];
    B += wy * a__ *(m_ - m0);

    // front neighbor
    i_  = idx(ix, hclampy(iy+1), iz);
    m_  = make_float3(mx[i_], my[i_], mz[i_]);
    m_  = ( is0(m_)? m0: m_ );
    a__ = aLUT2d[symidx(r0, regions[i_])];
    B += wy * a__ *(m_ - m0);

    // only take vertical derivative for 3D sim
    if (Nz != 1) {
        // bottom neighbor
        i_  = idx(ix, iy, lclampz(iz-1));
        m_  = make_float3(mx[i_], my[i_], mz[i_]);
        m_  = ( is0(m_)? m0: m_ );
        a__ = aLUT2d[symidx(r0, regions[i_])];
        B += wz * a__ *(m_ - m0);

        // top neighbor
        i_  = idx(ix, iy, hclampz(iz+1));
        m_  = make_float3(mx[i_], my[i_], mz[i_]);
        m_  = ( is0(m_)? m0: m_ );
        a__ = aLUT2d[symidx(r0, regions[i_])];
        B += wz * a__ *(m_ - m0);
    }

    // scale the accumulated sum by inv_Msat(...) before adding it to the field
    float invMs = inv_Msat(Ms_, Ms_mul, I);
    Bx[I] += B.x*invMs;
    By[I] += B.y*invMs;
    Bz[I] += B.z*invMs;
}
3-3.11.1/cuda/exchange.go000066400000000000000000000022161503346766200150010ustar00rootroot00000000000000
package cuda

import (
	"unsafe"

	"github.com/mumax/3/data"
)

// Add exchange field to Beff.
// // m: normalized magnetization // B: effective field in Tesla // Aex: exchange stiffness // // see exchange.cu func AddExchange(B, m *data.Slice, Aex SymmLUT, Msat MSlice, regions *Bytes, mesh *data.Mesh) { c := mesh.CellSize() wx := float32(2 / (c[X] * c[X])) wy := float32(2 / (c[Y] * c[Y])) wz := float32(2 / (c[Z] * c[Z])) N := mesh.Size() pbc := mesh.PBC_code() cfg := make3DConf(N) k_addexchange_async(B.DevPtr(X), B.DevPtr(Y), B.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), Msat.DevPtr(0), Msat.Mul(0), unsafe.Pointer(Aex), regions.Ptr, wx, wy, wz, N[X], N[Y], N[Z], pbc, cfg) } // Finds the average exchange strength around each cell, for debugging. func ExchangeDecode(dst *data.Slice, Aex SymmLUT, regions *Bytes, mesh *data.Mesh) { c := mesh.CellSize() wx := float32(2 / (c[X] * c[X])) wy := float32(2 / (c[Y] * c[Y])) wz := float32(2 / (c[Z] * c[Z])) N := mesh.Size() pbc := mesh.PBC_code() cfg := make3DConf(N) k_exchangedecode_async(dst.DevPtr(0), unsafe.Pointer(Aex), regions.Ptr, wx, wy, wz, N[X], N[Y], N[Z], pbc, cfg) } 3-3.11.1/cuda/exchange.h000066400000000000000000000002601503346766200146200ustar00rootroot00000000000000#ifndef _EXCHANGE_H_ #define _EXCHANGE_H_ // indexing in symmetric matrix #define symidx(i, j) ( (j<=i)? ( (((i)*((i)+1)) /2 )+(j) ) : ( (((j)*((j)+1)) /2 )+(i) ) ) #endif 3-3.11.1/cuda/exchange_wrapper.go000066400000000000000000005347241503346766200165570ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for addexchange kernel
var addexchange_code cu.Function

// Stores the arguments for addexchange kernel invocation.
// The fields mirror the kernel parameters in order; argptr holds one pointer
// per field and the embedded mutex guards the struct while a launch is set up.
// NOTE(review): the size arguments are Go ints while the PTX declares them as
// .u32 parameters — presumably only the low 4 bytes are read; confirm.
type addexchange_args_t struct {
	arg_Bx      unsafe.Pointer
	arg_By      unsafe.Pointer
	arg_Bz      unsafe.Pointer
	arg_mx      unsafe.Pointer
	arg_my      unsafe.Pointer
	arg_mz      unsafe.Pointer
	arg_Ms_     unsafe.Pointer
	arg_Ms_mul  float32
	arg_aLUT2d  unsafe.Pointer
	arg_regions unsafe.Pointer
	arg_wx      float32
	arg_wy      float32
	arg_wz      float32
	arg_Nx      int
	arg_Ny      int
	arg_Nz      int
	arg_PBC     byte
	argptr      [17]unsafe.Pointer
	sync.Mutex
}

// Stores the arguments for addexchange kernel invocation
var addexchange_args addexchange_args_t

func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	a := &addexchange_args
	a.argptr = [17]unsafe.Pointer{
		unsafe.Pointer(&a.arg_Bx),
		unsafe.Pointer(&a.arg_By),
		unsafe.Pointer(&a.arg_Bz),
		unsafe.Pointer(&a.arg_mx),
		unsafe.Pointer(&a.arg_my),
		unsafe.Pointer(&a.arg_mz),
		unsafe.Pointer(&a.arg_Ms_),
		unsafe.Pointer(&a.arg_Ms_mul),
		unsafe.Pointer(&a.arg_aLUT2d),
		unsafe.Pointer(&a.arg_regions),
		unsafe.Pointer(&a.arg_wx),
		unsafe.Pointer(&a.arg_wy),
		unsafe.Pointer(&a.arg_wz),
		unsafe.Pointer(&a.arg_Nx),
		unsafe.Pointer(&a.arg_Ny),
		unsafe.Pointer(&a.arg_Nz),
		unsafe.Pointer(&a.arg_PBC),
	}
}

// Wrapper for addexchange CUDA kernel, asynchronous.
// The kernel is loaded on first use. Arguments are copied into the shared
// argument struct under its mutex and the kernel is launched on stream0.
// When Synchronous is set, the call is bracketed by Sync() and timed.
func k_addexchange_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, aLUT2d unsafe.Pointer, regions unsafe.Pointer, wx float32, wy float32, wz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("addexchange")
	}

	a := &addexchange_args
	a.Lock()
	defer a.Unlock()

	if addexchange_code == 0 {
		addexchange_code = fatbinLoad(addexchange_map, "addexchange")
	}

	a.arg_Bx, a.arg_By, a.arg_Bz = Bx, By, Bz
	a.arg_mx, a.arg_my, a.arg_mz = mx, my, mz
	a.arg_Ms_, a.arg_Ms_mul = Ms_, Ms_mul
	a.arg_aLUT2d, a.arg_regions = aLUT2d, regions
	a.arg_wx, a.arg_wy, a.arg_wz = wx, wy, wz
	a.arg_Nx, a.arg_Ny, a.arg_Nz = Nx, Ny, Nz
	a.arg_PBC = PBC

	cu.LaunchKernel(addexchange_code,
		cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z,
		cfg.Block.X, cfg.Block.Y, cfg.Block.Z,
		0, stream0, a.argptr[:])

	if Synchronous { // debug
		Sync()
		timer.Stop("addexchange")
	}
}

// maps compute capability on PTX code for addexchange kernel.
var addexchange_map = map[int]string{
	0:  "",
	50: addexchange_ptx_50,
	52: addexchange_ptx_52,
	53: addexchange_ptx_53,
	60: addexchange_ptx_60,
	61: addexchange_ptx_61,
	62: addexchange_ptx_62,
	70: addexchange_ptx_70,
	72: addexchange_ptx_72,
	75: addexchange_ptx_75,
	80: addexchange_ptx_80,
	86: addexchange_ptx_86,
	87: addexchange_ptx_87,
	89: addexchange_ptx_89,
	90: addexchange_ptx_90,
}

// addexchange PTX code for various compute capabilities.
const ( addexchange_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<22>; .reg .b16 %rs<36>; .reg .f32 %f<136>; .reg .b32 %r<111>; .reg .b64 %rd<79>; ld.param.u8 %rs5, [addexchange_param_16]; ld.param.u64 %rd7, [addexchange_param_0]; ld.param.u64 %rd8, [addexchange_param_1]; ld.param.u64 %rd9, [addexchange_param_2]; ld.param.u64 %rd11, [addexchange_param_3]; ld.param.u64 %rd12, [addexchange_param_4]; ld.param.u64 %rd13, [addexchange_param_5]; ld.param.u64 %rd10, [addexchange_param_6]; ld.param.f32 %f134, [addexchange_param_7]; ld.param.u64 %rd14, [addexchange_param_8]; ld.param.u64 %rd15, [addexchange_param_9]; ld.param.f32 %f33, [addexchange_param_10]; ld.param.f32 %f34, [addexchange_param_11]; ld.param.f32 %f35, [addexchange_param_12]; ld.param.u32 %r30, [addexchange_param_13]; ld.param.u32 %r31, [addexchange_param_14]; ld.param.u32 %r32, [addexchange_param_15]; cvta.to.global.u64 %rd1, %rd14; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd13; cvta.to.global.u64 %rd4, %rd12; cvta.to.global.u64 %rd5, %rd11; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 
%p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_27; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd16, %r43, 4; add.s64 %rd17, %rd5, %rd16; add.s64 %rd18, %rd4, %rd16; add.s64 %rd19, %rd3, %rd16; ld.global.nc.f32 %f1, [%rd17]; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd19]; mul.f32 %f36, %f2, %f2; fma.rn.f32 %f37, %f1, %f1, %f36; fma.rn.f32 %f38, %f3, %f3, %f37; setp.eq.f32 %p6, %f38, 0f00000000; @%p6 bra $L__BB0_27; add.s64 %rd20, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd20]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_5: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd21, %r46; mul.wide.s32 %rd22, %r46, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f39, [%rd25]; ld.global.nc.f32 %f40, [%rd23]; ld.global.nc.f32 %f41, [%rd24]; mul.f32 %f42, %f41, %f41; fma.rn.f32 %f43, %f40, %f40, %f42; fma.rn.f32 %f44, %f39, %f39, %f43; setp.eq.f32 %p8, %f44, 0f00000000; selp.f32 %f45, %f3, %f39, %p8; selp.f32 %f46, %f2, %f41, %p8; selp.f32 %f47, %f1, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd27, %r52, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f48, [%rd28]; mul.f32 %f49, %f48, %f33; sub.f32 %f50, %f47, %f1; sub.f32 %f51, %f46, %f2; sub.f32 %f52, %f45, %f3; fma.rn.f32 %f7, %f50, %f49, 0f00000000; fma.rn.f32 %f8, %f51, %f49, 0f00000000; fma.rn.f32 %f9, %f52, %f49, 0f00000000; 
add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_8: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd29, %r56; mul.wide.s32 %rd30, %r56, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f53, [%rd33]; ld.global.nc.f32 %f54, [%rd31]; ld.global.nc.f32 %f55, [%rd32]; mul.f32 %f56, %f55, %f55; fma.rn.f32 %f57, %f54, %f54, %f56; fma.rn.f32 %f58, %f53, %f53, %f57; setp.eq.f32 %p10, %f58, 0f00000000; selp.f32 %f59, %f3, %f53, %p10; selp.f32 %f60, %f2, %f55, %p10; selp.f32 %f61, %f1, %f54, %p10; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs11, [%rd34]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd35, %r62, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f62, [%rd36]; mul.f32 %f63, %f62, %f33; sub.f32 %f64, %f61, %f1; sub.f32 %f65, %f60, %f2; sub.f32 %f66, %f59, %f3; fma.rn.f32 %f10, %f64, %f63, %f7; fma.rn.f32 %f11, %f65, %f63, %f8; fma.rn.f32 %f12, %f66, %f63, %f9; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p11, %rs3, 0; add.s32 %r14, %r2, -1; @%p11 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r107, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_11: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd37, %r66; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f67, [%rd41]; ld.global.nc.f32 %f68, [%rd39]; ld.global.nc.f32 %f69, [%rd40]; mul.f32 %f70, %f69, %f69; fma.rn.f32 %f71, %f68, %f68, %f70; fma.rn.f32 %f72, %f67, %f67, %f71; setp.eq.f32 %p12, %f72, 0f00000000; selp.f32 
%f73, %f3, %f67, %p12; selp.f32 %f74, %f2, %f69, %p12; selp.f32 %f75, %f1, %f68, %p12; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs16, [%rd42]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd43, %r72, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f76, [%rd44]; mul.f32 %f77, %f76, %f34; sub.f32 %f78, %f75, %f1; sub.f32 %f79, %f74, %f2; sub.f32 %f80, %f73, %f3; fma.rn.f32 %f13, %f78, %f77, %f10; fma.rn.f32 %f14, %f79, %f77, %f11; fma.rn.f32 %f15, %f80, %f77, %f12; add.s32 %r18, %r2, 1; @%p11 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_14: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd45, %r77; mul.wide.s32 %rd46, %r77, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f81, [%rd49]; ld.global.nc.f32 %f82, [%rd47]; ld.global.nc.f32 %f83, [%rd48]; mul.f32 %f84, %f83, %f83; fma.rn.f32 %f85, %f82, %f82, %f84; fma.rn.f32 %f86, %f81, %f81, %f85; setp.eq.f32 %p14, %f86, 0f00000000; selp.f32 %f87, %f3, %f81, %p14; selp.f32 %f88, %f2, %f83, %p14; selp.f32 %f89, %f1, %f82, %p14; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs21, [%rd50]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd51, %r83, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f90, [%rd52]; mul.f32 %f91, %f90, %f34; sub.f32 %f92, %f89, %f1; sub.f32 %f93, %f88, %f2; sub.f32 %f94, %f87, %f3; fma.rn.f32 %f133, %f92, %f91, %f13; fma.rn.f32 %f132, %f93, %f91, %f14; fma.rn.f32 %f131, %f94, %f91, %f15; setp.eq.s32 
%p15, %r32, 1; @%p15 bra $L__BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p16, %rs4, 0; add.s32 %r22, %r3, -1; @%p16 bra $L__BB0_17; bra.uni $L__BB0_16; $L__BB0_17: max.s32 %r109, %r22, 0; bra.uni $L__BB0_18; $L__BB0_16: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_18: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd53, %r87; mul.wide.s32 %rd54, %r87, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f95, [%rd57]; ld.global.nc.f32 %f96, [%rd55]; ld.global.nc.f32 %f97, [%rd56]; mul.f32 %f98, %f97, %f97; fma.rn.f32 %f99, %f96, %f96, %f98; fma.rn.f32 %f100, %f95, %f95, %f99; setp.eq.f32 %p17, %f100, 0f00000000; selp.f32 %f101, %f3, %f95, %p17; selp.f32 %f102, %f2, %f97, %p17; selp.f32 %f103, %f1, %f96, %p17; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs26, [%rd58]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd59, %r93, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f104, [%rd60]; mul.f32 %f105, %f104, %f35; sub.f32 %f106, %f103, %f1; sub.f32 %f107, %f102, %f2; sub.f32 %f108, %f101, %f3; fma.rn.f32 %f19, %f106, %f105, %f133; fma.rn.f32 %f20, %f107, %f105, %f132; fma.rn.f32 %f21, %f108, %f105, %f131; add.s32 %r26, %r3, 1; @%p16 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_21: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd61, %r98; mul.wide.s32 %rd62, %r98, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f109, [%rd65]; ld.global.nc.f32 %f110, [%rd63]; ld.global.nc.f32 %f111, [%rd64]; mul.f32 %f112, 
%f111, %f111; fma.rn.f32 %f113, %f110, %f110, %f112; fma.rn.f32 %f114, %f109, %f109, %f113; setp.eq.f32 %p19, %f114, 0f00000000; selp.f32 %f115, %f3, %f109, %p19; selp.f32 %f116, %f2, %f111, %p19; selp.f32 %f117, %f1, %f110, %p19; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs31, [%rd66]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd67, %r104, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f118, [%rd68]; mul.f32 %f119, %f118, %f35; sub.f32 %f120, %f117, %f1; sub.f32 %f121, %f116, %f2; sub.f32 %f122, %f115, %f3; fma.rn.f32 %f133, %f120, %f119, %f19; fma.rn.f32 %f132, %f121, %f119, %f20; fma.rn.f32 %f131, %f122, %f119, %f21; $L__BB0_22: setp.eq.s64 %p20, %rd10, 0; @%p20 bra $L__BB0_24; cvta.to.global.u64 %rd69, %rd10; shl.b64 %rd70, %rd6, 2; add.s64 %rd71, %rd69, %rd70; ld.global.nc.f32 %f123, [%rd71]; mul.f32 %f134, %f123, %f134; $L__BB0_24: setp.eq.f32 %p21, %f134, 0f00000000; mov.f32 %f135, 0f00000000; @%p21 bra $L__BB0_26; rcp.rn.f32 %f135, %f134; $L__BB0_26: cvta.to.global.u64 %rd72, %rd7; shl.b64 %rd73, %rd6, 2; add.s64 %rd74, %rd72, %rd73; ld.global.f32 %f125, [%rd74]; fma.rn.f32 %f126, %f133, %f135, %f125; st.global.f32 [%rd74], %f126; cvta.to.global.u64 %rd75, %rd8; add.s64 %rd76, %rd75, %rd73; ld.global.f32 %f127, [%rd76]; fma.rn.f32 %f128, %f132, %f135, %f127; st.global.f32 [%rd76], %f128; cvta.to.global.u64 %rd77, %rd9; add.s64 %rd78, %rd77, %rd73; ld.global.f32 %f129, [%rd78]; fma.rn.f32 %f130, %f131, %f135, %f129; st.global.f32 [%rd78], %f130; $L__BB0_27: ret; } ` addexchange_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 
addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<22>; .reg .b16 %rs<36>; .reg .f32 %f<136>; .reg .b32 %r<111>; .reg .b64 %rd<79>; ld.param.u8 %rs5, [addexchange_param_16]; ld.param.u64 %rd7, [addexchange_param_0]; ld.param.u64 %rd8, [addexchange_param_1]; ld.param.u64 %rd9, [addexchange_param_2]; ld.param.u64 %rd11, [addexchange_param_3]; ld.param.u64 %rd12, [addexchange_param_4]; ld.param.u64 %rd13, [addexchange_param_5]; ld.param.u64 %rd10, [addexchange_param_6]; ld.param.f32 %f134, [addexchange_param_7]; ld.param.u64 %rd14, [addexchange_param_8]; ld.param.u64 %rd15, [addexchange_param_9]; ld.param.f32 %f33, [addexchange_param_10]; ld.param.f32 %f34, [addexchange_param_11]; ld.param.f32 %f35, [addexchange_param_12]; ld.param.u32 %r30, [addexchange_param_13]; ld.param.u32 %r31, [addexchange_param_14]; ld.param.u32 %r32, [addexchange_param_15]; cvta.to.global.u64 %rd1, %rd14; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd13; cvta.to.global.u64 %rd4, %rd12; cvta.to.global.u64 %rd5, %rd11; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_27; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd16, %r43, 4; add.s64 
%rd17, %rd5, %rd16; add.s64 %rd18, %rd4, %rd16; add.s64 %rd19, %rd3, %rd16; ld.global.nc.f32 %f1, [%rd17]; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd19]; mul.f32 %f36, %f2, %f2; fma.rn.f32 %f37, %f1, %f1, %f36; fma.rn.f32 %f38, %f3, %f3, %f37; setp.eq.f32 %p6, %f38, 0f00000000; @%p6 bra $L__BB0_27; add.s64 %rd20, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd20]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_5: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd21, %r46; mul.wide.s32 %rd22, %r46, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f39, [%rd25]; ld.global.nc.f32 %f40, [%rd23]; ld.global.nc.f32 %f41, [%rd24]; mul.f32 %f42, %f41, %f41; fma.rn.f32 %f43, %f40, %f40, %f42; fma.rn.f32 %f44, %f39, %f39, %f43; setp.eq.f32 %p8, %f44, 0f00000000; selp.f32 %f45, %f3, %f39, %p8; selp.f32 %f46, %f2, %f41, %p8; selp.f32 %f47, %f1, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd27, %r52, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f48, [%rd28]; mul.f32 %f49, %f48, %f33; sub.f32 %f50, %f47, %f1; sub.f32 %f51, %f46, %f2; sub.f32 %f52, %f45, %f3; fma.rn.f32 %f7, %f50, %f49, 0f00000000; fma.rn.f32 %f8, %f51, %f49, 0f00000000; fma.rn.f32 %f9, %f52, %f49, 0f00000000; add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_8: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd29, %r56; mul.wide.s32 
%rd30, %r56, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f53, [%rd33]; ld.global.nc.f32 %f54, [%rd31]; ld.global.nc.f32 %f55, [%rd32]; mul.f32 %f56, %f55, %f55; fma.rn.f32 %f57, %f54, %f54, %f56; fma.rn.f32 %f58, %f53, %f53, %f57; setp.eq.f32 %p10, %f58, 0f00000000; selp.f32 %f59, %f3, %f53, %p10; selp.f32 %f60, %f2, %f55, %p10; selp.f32 %f61, %f1, %f54, %p10; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs11, [%rd34]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd35, %r62, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f62, [%rd36]; mul.f32 %f63, %f62, %f33; sub.f32 %f64, %f61, %f1; sub.f32 %f65, %f60, %f2; sub.f32 %f66, %f59, %f3; fma.rn.f32 %f10, %f64, %f63, %f7; fma.rn.f32 %f11, %f65, %f63, %f8; fma.rn.f32 %f12, %f66, %f63, %f9; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p11, %rs3, 0; add.s32 %r14, %r2, -1; @%p11 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r107, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_11: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd37, %r66; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f67, [%rd41]; ld.global.nc.f32 %f68, [%rd39]; ld.global.nc.f32 %f69, [%rd40]; mul.f32 %f70, %f69, %f69; fma.rn.f32 %f71, %f68, %f68, %f70; fma.rn.f32 %f72, %f67, %f67, %f71; setp.eq.f32 %p12, %f72, 0f00000000; selp.f32 %f73, %f3, %f67, %p12; selp.f32 %f74, %f2, %f69, %p12; selp.f32 %f75, %f1, %f68, %p12; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs16, [%rd42]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; 
cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd43, %r72, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f76, [%rd44]; mul.f32 %f77, %f76, %f34; sub.f32 %f78, %f75, %f1; sub.f32 %f79, %f74, %f2; sub.f32 %f80, %f73, %f3; fma.rn.f32 %f13, %f78, %f77, %f10; fma.rn.f32 %f14, %f79, %f77, %f11; fma.rn.f32 %f15, %f80, %f77, %f12; add.s32 %r18, %r2, 1; @%p11 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_14: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd45, %r77; mul.wide.s32 %rd46, %r77, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f81, [%rd49]; ld.global.nc.f32 %f82, [%rd47]; ld.global.nc.f32 %f83, [%rd48]; mul.f32 %f84, %f83, %f83; fma.rn.f32 %f85, %f82, %f82, %f84; fma.rn.f32 %f86, %f81, %f81, %f85; setp.eq.f32 %p14, %f86, 0f00000000; selp.f32 %f87, %f3, %f81, %p14; selp.f32 %f88, %f2, %f83, %p14; selp.f32 %f89, %f1, %f82, %p14; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs21, [%rd50]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd51, %r83, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f90, [%rd52]; mul.f32 %f91, %f90, %f34; sub.f32 %f92, %f89, %f1; sub.f32 %f93, %f88, %f2; sub.f32 %f94, %f87, %f3; fma.rn.f32 %f133, %f92, %f91, %f13; fma.rn.f32 %f132, %f93, %f91, %f14; fma.rn.f32 %f131, %f94, %f91, %f15; setp.eq.s32 %p15, %r32, 1; @%p15 bra $L__BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p16, %rs4, 0; add.s32 %r22, %r3, -1; @%p16 bra $L__BB0_17; bra.uni $L__BB0_16; $L__BB0_17: max.s32 %r109, %r22, 0; bra.uni $L__BB0_18; $L__BB0_16: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; 
$L__BB0_18: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd53, %r87; mul.wide.s32 %rd54, %r87, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f95, [%rd57]; ld.global.nc.f32 %f96, [%rd55]; ld.global.nc.f32 %f97, [%rd56]; mul.f32 %f98, %f97, %f97; fma.rn.f32 %f99, %f96, %f96, %f98; fma.rn.f32 %f100, %f95, %f95, %f99; setp.eq.f32 %p17, %f100, 0f00000000; selp.f32 %f101, %f3, %f95, %p17; selp.f32 %f102, %f2, %f97, %p17; selp.f32 %f103, %f1, %f96, %p17; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs26, [%rd58]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd59, %r93, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f104, [%rd60]; mul.f32 %f105, %f104, %f35; sub.f32 %f106, %f103, %f1; sub.f32 %f107, %f102, %f2; sub.f32 %f108, %f101, %f3; fma.rn.f32 %f19, %f106, %f105, %f133; fma.rn.f32 %f20, %f107, %f105, %f132; fma.rn.f32 %f21, %f108, %f105, %f131; add.s32 %r26, %r3, 1; @%p16 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_21: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd61, %r98; mul.wide.s32 %rd62, %r98, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f109, [%rd65]; ld.global.nc.f32 %f110, [%rd63]; ld.global.nc.f32 %f111, [%rd64]; mul.f32 %f112, %f111, %f111; fma.rn.f32 %f113, %f110, %f110, %f112; fma.rn.f32 %f114, %f109, %f109, %f113; setp.eq.f32 %p19, %f114, 0f00000000; selp.f32 %f115, %f3, %f109, %p19; selp.f32 %f116, %f2, %f111, %p19; selp.f32 %f117, %f1, %f110, %p19; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs31, [%rd66]; min.u16 
%rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd67, %r104, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f118, [%rd68]; mul.f32 %f119, %f118, %f35; sub.f32 %f120, %f117, %f1; sub.f32 %f121, %f116, %f2; sub.f32 %f122, %f115, %f3; fma.rn.f32 %f133, %f120, %f119, %f19; fma.rn.f32 %f132, %f121, %f119, %f20; fma.rn.f32 %f131, %f122, %f119, %f21; $L__BB0_22: setp.eq.s64 %p20, %rd10, 0; @%p20 bra $L__BB0_24; cvta.to.global.u64 %rd69, %rd10; shl.b64 %rd70, %rd6, 2; add.s64 %rd71, %rd69, %rd70; ld.global.nc.f32 %f123, [%rd71]; mul.f32 %f134, %f123, %f134; $L__BB0_24: setp.eq.f32 %p21, %f134, 0f00000000; mov.f32 %f135, 0f00000000; @%p21 bra $L__BB0_26; rcp.rn.f32 %f135, %f134; $L__BB0_26: cvta.to.global.u64 %rd72, %rd7; shl.b64 %rd73, %rd6, 2; add.s64 %rd74, %rd72, %rd73; ld.global.f32 %f125, [%rd74]; fma.rn.f32 %f126, %f133, %f135, %f125; st.global.f32 [%rd74], %f126; cvta.to.global.u64 %rd75, %rd8; add.s64 %rd76, %rd75, %rd73; ld.global.f32 %f127, [%rd76]; fma.rn.f32 %f128, %f132, %f135, %f127; st.global.f32 [%rd76], %f128; cvta.to.global.u64 %rd77, %rd9; add.s64 %rd78, %rd77, %rd73; ld.global.f32 %f129, [%rd78]; fma.rn.f32 %f130, %f131, %f135, %f129; st.global.f32 [%rd78], %f130; $L__BB0_27: ret; } ` addexchange_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 
addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<22>; .reg .b16 %rs<36>; .reg .f32 %f<136>; .reg .b32 %r<111>; .reg .b64 %rd<79>; ld.param.u8 %rs5, [addexchange_param_16]; ld.param.u64 %rd7, [addexchange_param_0]; ld.param.u64 %rd8, [addexchange_param_1]; ld.param.u64 %rd9, [addexchange_param_2]; ld.param.u64 %rd11, [addexchange_param_3]; ld.param.u64 %rd12, [addexchange_param_4]; ld.param.u64 %rd13, [addexchange_param_5]; ld.param.u64 %rd10, [addexchange_param_6]; ld.param.f32 %f134, [addexchange_param_7]; ld.param.u64 %rd14, [addexchange_param_8]; ld.param.u64 %rd15, [addexchange_param_9]; ld.param.f32 %f33, [addexchange_param_10]; ld.param.f32 %f34, [addexchange_param_11]; ld.param.f32 %f35, [addexchange_param_12]; ld.param.u32 %r30, [addexchange_param_13]; ld.param.u32 %r31, [addexchange_param_14]; ld.param.u32 %r32, [addexchange_param_15]; cvta.to.global.u64 %rd1, %rd14; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd13; cvta.to.global.u64 %rd4, %rd12; cvta.to.global.u64 %rd5, %rd11; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_27; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd16, %r43, 4; add.s64 %rd17, %rd5, %rd16; add.s64 %rd18, %rd4, %rd16; add.s64 %rd19, %rd3, %rd16; ld.global.nc.f32 %f1, [%rd17]; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd19]; mul.f32 %f36, %f2, %f2; fma.rn.f32 %f37, %f1, %f1, %f36; fma.rn.f32 %f38, %f3, %f3, %f37; setp.eq.f32 %p6, %f38, 0f00000000; @%p6 
bra $L__BB0_27; add.s64 %rd20, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd20]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_5: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd21, %r46; mul.wide.s32 %rd22, %r46, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f39, [%rd25]; ld.global.nc.f32 %f40, [%rd23]; ld.global.nc.f32 %f41, [%rd24]; mul.f32 %f42, %f41, %f41; fma.rn.f32 %f43, %f40, %f40, %f42; fma.rn.f32 %f44, %f39, %f39, %f43; setp.eq.f32 %p8, %f44, 0f00000000; selp.f32 %f45, %f3, %f39, %p8; selp.f32 %f46, %f2, %f41, %p8; selp.f32 %f47, %f1, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd27, %r52, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f48, [%rd28]; mul.f32 %f49, %f48, %f33; sub.f32 %f50, %f47, %f1; sub.f32 %f51, %f46, %f2; sub.f32 %f52, %f45, %f3; fma.rn.f32 %f7, %f50, %f49, 0f00000000; fma.rn.f32 %f8, %f51, %f49, 0f00000000; fma.rn.f32 %f9, %f52, %f49, 0f00000000; add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_8: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd29, %r56; mul.wide.s32 %rd30, %r56, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f53, [%rd33]; ld.global.nc.f32 %f54, [%rd31]; ld.global.nc.f32 %f55, [%rd32]; mul.f32 %f56, %f55, %f55; fma.rn.f32 %f57, %f54, %f54, %f56; fma.rn.f32 %f58, %f53, %f53, %f57; 
setp.eq.f32 %p10, %f58, 0f00000000; selp.f32 %f59, %f3, %f53, %p10; selp.f32 %f60, %f2, %f55, %p10; selp.f32 %f61, %f1, %f54, %p10; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs11, [%rd34]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd35, %r62, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f62, [%rd36]; mul.f32 %f63, %f62, %f33; sub.f32 %f64, %f61, %f1; sub.f32 %f65, %f60, %f2; sub.f32 %f66, %f59, %f3; fma.rn.f32 %f10, %f64, %f63, %f7; fma.rn.f32 %f11, %f65, %f63, %f8; fma.rn.f32 %f12, %f66, %f63, %f9; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p11, %rs3, 0; add.s32 %r14, %r2, -1; @%p11 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r107, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_11: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd37, %r66; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f67, [%rd41]; ld.global.nc.f32 %f68, [%rd39]; ld.global.nc.f32 %f69, [%rd40]; mul.f32 %f70, %f69, %f69; fma.rn.f32 %f71, %f68, %f68, %f70; fma.rn.f32 %f72, %f67, %f67, %f71; setp.eq.f32 %p12, %f72, 0f00000000; selp.f32 %f73, %f3, %f67, %p12; selp.f32 %f74, %f2, %f69, %p12; selp.f32 %f75, %f1, %f68, %p12; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs16, [%rd42]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd43, %r72, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f76, [%rd44]; mul.f32 %f77, %f76, %f34; sub.f32 %f78, %f75, %f1; sub.f32 %f79, %f74, %f2; sub.f32 %f80, %f73, %f3; fma.rn.f32 %f13, %f78, %f77, %f10; fma.rn.f32 %f14, %f79, 
%f77, %f11; fma.rn.f32 %f15, %f80, %f77, %f12; add.s32 %r18, %r2, 1; @%p11 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_14: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd45, %r77; mul.wide.s32 %rd46, %r77, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f81, [%rd49]; ld.global.nc.f32 %f82, [%rd47]; ld.global.nc.f32 %f83, [%rd48]; mul.f32 %f84, %f83, %f83; fma.rn.f32 %f85, %f82, %f82, %f84; fma.rn.f32 %f86, %f81, %f81, %f85; setp.eq.f32 %p14, %f86, 0f00000000; selp.f32 %f87, %f3, %f81, %p14; selp.f32 %f88, %f2, %f83, %p14; selp.f32 %f89, %f1, %f82, %p14; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs21, [%rd50]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd51, %r83, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f90, [%rd52]; mul.f32 %f91, %f90, %f34; sub.f32 %f92, %f89, %f1; sub.f32 %f93, %f88, %f2; sub.f32 %f94, %f87, %f3; fma.rn.f32 %f133, %f92, %f91, %f13; fma.rn.f32 %f132, %f93, %f91, %f14; fma.rn.f32 %f131, %f94, %f91, %f15; setp.eq.s32 %p15, %r32, 1; @%p15 bra $L__BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p16, %rs4, 0; add.s32 %r22, %r3, -1; @%p16 bra $L__BB0_17; bra.uni $L__BB0_16; $L__BB0_17: max.s32 %r109, %r22, 0; bra.uni $L__BB0_18; $L__BB0_16: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_18: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd53, %r87; mul.wide.s32 %rd54, %r87, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f95, [%rd57]; ld.global.nc.f32 %f96, [%rd55]; ld.global.nc.f32 
%f97, [%rd56]; mul.f32 %f98, %f97, %f97; fma.rn.f32 %f99, %f96, %f96, %f98; fma.rn.f32 %f100, %f95, %f95, %f99; setp.eq.f32 %p17, %f100, 0f00000000; selp.f32 %f101, %f3, %f95, %p17; selp.f32 %f102, %f2, %f97, %p17; selp.f32 %f103, %f1, %f96, %p17; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs26, [%rd58]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd59, %r93, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f104, [%rd60]; mul.f32 %f105, %f104, %f35; sub.f32 %f106, %f103, %f1; sub.f32 %f107, %f102, %f2; sub.f32 %f108, %f101, %f3; fma.rn.f32 %f19, %f106, %f105, %f133; fma.rn.f32 %f20, %f107, %f105, %f132; fma.rn.f32 %f21, %f108, %f105, %f131; add.s32 %r26, %r3, 1; @%p16 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_21: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd61, %r98; mul.wide.s32 %rd62, %r98, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f109, [%rd65]; ld.global.nc.f32 %f110, [%rd63]; ld.global.nc.f32 %f111, [%rd64]; mul.f32 %f112, %f111, %f111; fma.rn.f32 %f113, %f110, %f110, %f112; fma.rn.f32 %f114, %f109, %f109, %f113; setp.eq.f32 %p19, %f114, 0f00000000; selp.f32 %f115, %f3, %f109, %p19; selp.f32 %f116, %f2, %f111, %p19; selp.f32 %f117, %f1, %f110, %p19; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs31, [%rd66]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd67, %r104, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f118, [%rd68]; 
mul.f32 %f119, %f118, %f35; sub.f32 %f120, %f117, %f1; sub.f32 %f121, %f116, %f2; sub.f32 %f122, %f115, %f3; fma.rn.f32 %f133, %f120, %f119, %f19; fma.rn.f32 %f132, %f121, %f119, %f20; fma.rn.f32 %f131, %f122, %f119, %f21; $L__BB0_22: setp.eq.s64 %p20, %rd10, 0; @%p20 bra $L__BB0_24; cvta.to.global.u64 %rd69, %rd10; shl.b64 %rd70, %rd6, 2; add.s64 %rd71, %rd69, %rd70; ld.global.nc.f32 %f123, [%rd71]; mul.f32 %f134, %f123, %f134; $L__BB0_24: setp.eq.f32 %p21, %f134, 0f00000000; mov.f32 %f135, 0f00000000; @%p21 bra $L__BB0_26; rcp.rn.f32 %f135, %f134; $L__BB0_26: cvta.to.global.u64 %rd72, %rd7; shl.b64 %rd73, %rd6, 2; add.s64 %rd74, %rd72, %rd73; ld.global.f32 %f125, [%rd74]; fma.rn.f32 %f126, %f133, %f135, %f125; st.global.f32 [%rd74], %f126; cvta.to.global.u64 %rd75, %rd8; add.s64 %rd76, %rd75, %rd73; ld.global.f32 %f127, [%rd76]; fma.rn.f32 %f128, %f132, %f135, %f127; st.global.f32 [%rd76], %f128; cvta.to.global.u64 %rd77, %rd9; add.s64 %rd78, %rd77, %rd73; ld.global.f32 %f129, [%rd78]; fma.rn.f32 %f130, %f131, %f135, %f129; st.global.f32 [%rd78], %f130; $L__BB0_27: ret; } ` addexchange_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<22>; .reg .b16 %rs<36>; .reg .f32 %f<136>; .reg .b32 %r<111>; .reg .b64 %rd<79>; ld.param.u8 %rs5, [addexchange_param_16]; ld.param.u64 %rd7, [addexchange_param_0]; ld.param.u64 %rd8, 
[addexchange_param_1]; ld.param.u64 %rd9, [addexchange_param_2]; ld.param.u64 %rd11, [addexchange_param_3]; ld.param.u64 %rd12, [addexchange_param_4]; ld.param.u64 %rd13, [addexchange_param_5]; ld.param.u64 %rd10, [addexchange_param_6]; ld.param.f32 %f134, [addexchange_param_7]; ld.param.u64 %rd14, [addexchange_param_8]; ld.param.u64 %rd15, [addexchange_param_9]; ld.param.f32 %f33, [addexchange_param_10]; ld.param.f32 %f34, [addexchange_param_11]; ld.param.f32 %f35, [addexchange_param_12]; ld.param.u32 %r30, [addexchange_param_13]; ld.param.u32 %r31, [addexchange_param_14]; ld.param.u32 %r32, [addexchange_param_15]; cvta.to.global.u64 %rd1, %rd14; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd13; cvta.to.global.u64 %rd4, %rd12; cvta.to.global.u64 %rd5, %rd11; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_27; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd16, %r43, 4; add.s64 %rd17, %rd5, %rd16; add.s64 %rd18, %rd4, %rd16; add.s64 %rd19, %rd3, %rd16; ld.global.nc.f32 %f1, [%rd17]; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd19]; mul.f32 %f36, %f2, %f2; fma.rn.f32 %f37, %f1, %f1, %f36; fma.rn.f32 %f38, %f3, %f3, %f37; setp.eq.f32 %p6, %f38, 0f00000000; @%p6 bra $L__BB0_27; add.s64 %rd20, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd20]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, 
%r30; rem.s32 %r105, %r45, %r30; $L__BB0_5: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd21, %r46; mul.wide.s32 %rd22, %r46, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f39, [%rd25]; ld.global.nc.f32 %f40, [%rd23]; ld.global.nc.f32 %f41, [%rd24]; mul.f32 %f42, %f41, %f41; fma.rn.f32 %f43, %f40, %f40, %f42; fma.rn.f32 %f44, %f39, %f39, %f43; setp.eq.f32 %p8, %f44, 0f00000000; selp.f32 %f45, %f3, %f39, %p8; selp.f32 %f46, %f2, %f41, %p8; selp.f32 %f47, %f1, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd27, %r52, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f48, [%rd28]; mul.f32 %f49, %f48, %f33; sub.f32 %f50, %f47, %f1; sub.f32 %f51, %f46, %f2; sub.f32 %f52, %f45, %f3; fma.rn.f32 %f7, %f50, %f49, 0f00000000; fma.rn.f32 %f8, %f51, %f49, 0f00000000; fma.rn.f32 %f9, %f52, %f49, 0f00000000; add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_8: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd29, %r56; mul.wide.s32 %rd30, %r56, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f53, [%rd33]; ld.global.nc.f32 %f54, [%rd31]; ld.global.nc.f32 %f55, [%rd32]; mul.f32 %f56, %f55, %f55; fma.rn.f32 %f57, %f54, %f54, %f56; fma.rn.f32 %f58, %f53, %f53, %f57; setp.eq.f32 %p10, %f58, 0f00000000; selp.f32 %f59, %f3, %f53, %p10; selp.f32 %f60, %f2, %f55, %p10; selp.f32 %f61, %f1, %f54, %p10; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs11, [%rd34]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; 
mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd35, %r62, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f62, [%rd36]; mul.f32 %f63, %f62, %f33; sub.f32 %f64, %f61, %f1; sub.f32 %f65, %f60, %f2; sub.f32 %f66, %f59, %f3; fma.rn.f32 %f10, %f64, %f63, %f7; fma.rn.f32 %f11, %f65, %f63, %f8; fma.rn.f32 %f12, %f66, %f63, %f9; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p11, %rs3, 0; add.s32 %r14, %r2, -1; @%p11 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r107, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_11: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd37, %r66; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f67, [%rd41]; ld.global.nc.f32 %f68, [%rd39]; ld.global.nc.f32 %f69, [%rd40]; mul.f32 %f70, %f69, %f69; fma.rn.f32 %f71, %f68, %f68, %f70; fma.rn.f32 %f72, %f67, %f67, %f71; setp.eq.f32 %p12, %f72, 0f00000000; selp.f32 %f73, %f3, %f67, %p12; selp.f32 %f74, %f2, %f69, %p12; selp.f32 %f75, %f1, %f68, %p12; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs16, [%rd42]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd43, %r72, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f76, [%rd44]; mul.f32 %f77, %f76, %f34; sub.f32 %f78, %f75, %f1; sub.f32 %f79, %f74, %f2; sub.f32 %f80, %f73, %f3; fma.rn.f32 %f13, %f78, %f77, %f10; fma.rn.f32 %f14, %f79, %f77, %f11; fma.rn.f32 %f15, %f80, %f77, %f12; add.s32 %r18, %r2, 1; @%p11 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; 
$L__BB0_14: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd45, %r77; mul.wide.s32 %rd46, %r77, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f81, [%rd49]; ld.global.nc.f32 %f82, [%rd47]; ld.global.nc.f32 %f83, [%rd48]; mul.f32 %f84, %f83, %f83; fma.rn.f32 %f85, %f82, %f82, %f84; fma.rn.f32 %f86, %f81, %f81, %f85; setp.eq.f32 %p14, %f86, 0f00000000; selp.f32 %f87, %f3, %f81, %p14; selp.f32 %f88, %f2, %f83, %p14; selp.f32 %f89, %f1, %f82, %p14; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs21, [%rd50]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd51, %r83, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f90, [%rd52]; mul.f32 %f91, %f90, %f34; sub.f32 %f92, %f89, %f1; sub.f32 %f93, %f88, %f2; sub.f32 %f94, %f87, %f3; fma.rn.f32 %f133, %f92, %f91, %f13; fma.rn.f32 %f132, %f93, %f91, %f14; fma.rn.f32 %f131, %f94, %f91, %f15; setp.eq.s32 %p15, %r32, 1; @%p15 bra $L__BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p16, %rs4, 0; add.s32 %r22, %r3, -1; @%p16 bra $L__BB0_17; bra.uni $L__BB0_16; $L__BB0_17: max.s32 %r109, %r22, 0; bra.uni $L__BB0_18; $L__BB0_16: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_18: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd53, %r87; mul.wide.s32 %rd54, %r87, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f95, [%rd57]; ld.global.nc.f32 %f96, [%rd55]; ld.global.nc.f32 %f97, [%rd56]; mul.f32 %f98, %f97, %f97; fma.rn.f32 %f99, %f96, %f96, %f98; fma.rn.f32 %f100, %f95, %f95, %f99; setp.eq.f32 %p17, %f100, 0f00000000; selp.f32 %f101, %f3, %f95, %p17; selp.f32 %f102, %f2, %f97, %p17; selp.f32 %f103, %f1, %f96, %p17; add.s64 %rd58, %rd2, %rd53; 
ld.global.nc.u8 %rs26, [%rd58]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd59, %r93, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f104, [%rd60]; mul.f32 %f105, %f104, %f35; sub.f32 %f106, %f103, %f1; sub.f32 %f107, %f102, %f2; sub.f32 %f108, %f101, %f3; fma.rn.f32 %f19, %f106, %f105, %f133; fma.rn.f32 %f20, %f107, %f105, %f132; fma.rn.f32 %f21, %f108, %f105, %f131; add.s32 %r26, %r3, 1; @%p16 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_21: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd61, %r98; mul.wide.s32 %rd62, %r98, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f109, [%rd65]; ld.global.nc.f32 %f110, [%rd63]; ld.global.nc.f32 %f111, [%rd64]; mul.f32 %f112, %f111, %f111; fma.rn.f32 %f113, %f110, %f110, %f112; fma.rn.f32 %f114, %f109, %f109, %f113; setp.eq.f32 %p19, %f114, 0f00000000; selp.f32 %f115, %f3, %f109, %p19; selp.f32 %f116, %f2, %f111, %p19; selp.f32 %f117, %f1, %f110, %p19; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs31, [%rd66]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd67, %r104, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f118, [%rd68]; mul.f32 %f119, %f118, %f35; sub.f32 %f120, %f117, %f1; sub.f32 %f121, %f116, %f2; sub.f32 %f122, %f115, %f3; fma.rn.f32 %f133, %f120, %f119, %f19; fma.rn.f32 %f132, %f121, %f119, %f20; fma.rn.f32 %f131, %f122, %f119, %f21; $L__BB0_22: setp.eq.s64 %p20, %rd10, 0; @%p20 bra 
$L__BB0_24; cvta.to.global.u64 %rd69, %rd10; shl.b64 %rd70, %rd6, 2; add.s64 %rd71, %rd69, %rd70; ld.global.nc.f32 %f123, [%rd71]; mul.f32 %f134, %f123, %f134; $L__BB0_24: setp.eq.f32 %p21, %f134, 0f00000000; mov.f32 %f135, 0f00000000; @%p21 bra $L__BB0_26; rcp.rn.f32 %f135, %f134; $L__BB0_26: cvta.to.global.u64 %rd72, %rd7; shl.b64 %rd73, %rd6, 2; add.s64 %rd74, %rd72, %rd73; ld.global.f32 %f125, [%rd74]; fma.rn.f32 %f126, %f133, %f135, %f125; st.global.f32 [%rd74], %f126; cvta.to.global.u64 %rd75, %rd8; add.s64 %rd76, %rd75, %rd73; ld.global.f32 %f127, [%rd76]; fma.rn.f32 %f128, %f132, %f135, %f127; st.global.f32 [%rd76], %f128; cvta.to.global.u64 %rd77, %rd9; add.s64 %rd78, %rd77, %rd73; ld.global.f32 %f129, [%rd78]; fma.rn.f32 %f130, %f131, %f135, %f129; st.global.f32 [%rd78], %f130; $L__BB0_27: ret; } ` addexchange_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<22>; .reg .b16 %rs<36>; .reg .f32 %f<136>; .reg .b32 %r<111>; .reg .b64 %rd<79>; ld.param.u8 %rs5, [addexchange_param_16]; ld.param.u64 %rd7, [addexchange_param_0]; ld.param.u64 %rd8, [addexchange_param_1]; ld.param.u64 %rd9, [addexchange_param_2]; ld.param.u64 %rd11, [addexchange_param_3]; ld.param.u64 %rd12, [addexchange_param_4]; ld.param.u64 %rd13, [addexchange_param_5]; ld.param.u64 %rd10, [addexchange_param_6]; ld.param.f32 %f134, [addexchange_param_7]; ld.param.u64 
%rd14, [addexchange_param_8]; ld.param.u64 %rd15, [addexchange_param_9]; ld.param.f32 %f33, [addexchange_param_10]; ld.param.f32 %f34, [addexchange_param_11]; ld.param.f32 %f35, [addexchange_param_12]; ld.param.u32 %r30, [addexchange_param_13]; ld.param.u32 %r31, [addexchange_param_14]; ld.param.u32 %r32, [addexchange_param_15]; cvta.to.global.u64 %rd1, %rd14; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd13; cvta.to.global.u64 %rd4, %rd12; cvta.to.global.u64 %rd5, %rd11; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_27; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd16, %r43, 4; add.s64 %rd17, %rd5, %rd16; add.s64 %rd18, %rd4, %rd16; add.s64 %rd19, %rd3, %rd16; ld.global.nc.f32 %f1, [%rd17]; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd19]; mul.f32 %f36, %f2, %f2; fma.rn.f32 %f37, %f1, %f1, %f36; fma.rn.f32 %f38, %f3, %f3, %f37; setp.eq.f32 %p6, %f38, 0f00000000; @%p6 bra $L__BB0_27; add.s64 %rd20, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd20]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_5: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd21, %r46; mul.wide.s32 %rd22, %r46, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f39, [%rd25]; ld.global.nc.f32 %f40, [%rd23]; ld.global.nc.f32 
%f41, [%rd24]; mul.f32 %f42, %f41, %f41; fma.rn.f32 %f43, %f40, %f40, %f42; fma.rn.f32 %f44, %f39, %f39, %f43; setp.eq.f32 %p8, %f44, 0f00000000; selp.f32 %f45, %f3, %f39, %p8; selp.f32 %f46, %f2, %f41, %p8; selp.f32 %f47, %f1, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd27, %r52, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f48, [%rd28]; mul.f32 %f49, %f48, %f33; sub.f32 %f50, %f47, %f1; sub.f32 %f51, %f46, %f2; sub.f32 %f52, %f45, %f3; fma.rn.f32 %f7, %f50, %f49, 0f00000000; fma.rn.f32 %f8, %f51, %f49, 0f00000000; fma.rn.f32 %f9, %f52, %f49, 0f00000000; add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_8: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd29, %r56; mul.wide.s32 %rd30, %r56, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f53, [%rd33]; ld.global.nc.f32 %f54, [%rd31]; ld.global.nc.f32 %f55, [%rd32]; mul.f32 %f56, %f55, %f55; fma.rn.f32 %f57, %f54, %f54, %f56; fma.rn.f32 %f58, %f53, %f53, %f57; setp.eq.f32 %p10, %f58, 0f00000000; selp.f32 %f59, %f3, %f53, %p10; selp.f32 %f60, %f2, %f55, %p10; selp.f32 %f61, %f1, %f54, %p10; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs11, [%rd34]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd35, %r62, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f62, [%rd36]; mul.f32 %f63, %f62, %f33; sub.f32 %f64, %f61, %f1; sub.f32 %f65, %f60, %f2; sub.f32 %f66, %f59, %f3; 
fma.rn.f32 %f10, %f64, %f63, %f7; fma.rn.f32 %f11, %f65, %f63, %f8; fma.rn.f32 %f12, %f66, %f63, %f9; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p11, %rs3, 0; add.s32 %r14, %r2, -1; @%p11 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r107, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_11: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd37, %r66; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f67, [%rd41]; ld.global.nc.f32 %f68, [%rd39]; ld.global.nc.f32 %f69, [%rd40]; mul.f32 %f70, %f69, %f69; fma.rn.f32 %f71, %f68, %f68, %f70; fma.rn.f32 %f72, %f67, %f67, %f71; setp.eq.f32 %p12, %f72, 0f00000000; selp.f32 %f73, %f3, %f67, %p12; selp.f32 %f74, %f2, %f69, %p12; selp.f32 %f75, %f1, %f68, %p12; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs16, [%rd42]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd43, %r72, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f76, [%rd44]; mul.f32 %f77, %f76, %f34; sub.f32 %f78, %f75, %f1; sub.f32 %f79, %f74, %f2; sub.f32 %f80, %f73, %f3; fma.rn.f32 %f13, %f78, %f77, %f10; fma.rn.f32 %f14, %f79, %f77, %f11; fma.rn.f32 %f15, %f80, %f77, %f12; add.s32 %r18, %r2, 1; @%p11 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_14: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd45, %r77; mul.wide.s32 %rd46, %r77, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f81, [%rd49]; ld.global.nc.f32 %f82, [%rd47]; ld.global.nc.f32 %f83, 
[%rd48]; mul.f32 %f84, %f83, %f83; fma.rn.f32 %f85, %f82, %f82, %f84; fma.rn.f32 %f86, %f81, %f81, %f85; setp.eq.f32 %p14, %f86, 0f00000000; selp.f32 %f87, %f3, %f81, %p14; selp.f32 %f88, %f2, %f83, %p14; selp.f32 %f89, %f1, %f82, %p14; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs21, [%rd50]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd51, %r83, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f90, [%rd52]; mul.f32 %f91, %f90, %f34; sub.f32 %f92, %f89, %f1; sub.f32 %f93, %f88, %f2; sub.f32 %f94, %f87, %f3; fma.rn.f32 %f133, %f92, %f91, %f13; fma.rn.f32 %f132, %f93, %f91, %f14; fma.rn.f32 %f131, %f94, %f91, %f15; setp.eq.s32 %p15, %r32, 1; @%p15 bra $L__BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p16, %rs4, 0; add.s32 %r22, %r3, -1; @%p16 bra $L__BB0_17; bra.uni $L__BB0_16; $L__BB0_17: max.s32 %r109, %r22, 0; bra.uni $L__BB0_18; $L__BB0_16: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_18: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd53, %r87; mul.wide.s32 %rd54, %r87, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f95, [%rd57]; ld.global.nc.f32 %f96, [%rd55]; ld.global.nc.f32 %f97, [%rd56]; mul.f32 %f98, %f97, %f97; fma.rn.f32 %f99, %f96, %f96, %f98; fma.rn.f32 %f100, %f95, %f95, %f99; setp.eq.f32 %p17, %f100, 0f00000000; selp.f32 %f101, %f3, %f95, %p17; selp.f32 %f102, %f2, %f97, %p17; selp.f32 %f103, %f1, %f96, %p17; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs26, [%rd58]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd59, %r93, 4; add.s64 %rd60, %rd1, %rd59; 
ld.global.nc.f32 %f104, [%rd60]; mul.f32 %f105, %f104, %f35; sub.f32 %f106, %f103, %f1; sub.f32 %f107, %f102, %f2; sub.f32 %f108, %f101, %f3; fma.rn.f32 %f19, %f106, %f105, %f133; fma.rn.f32 %f20, %f107, %f105, %f132; fma.rn.f32 %f21, %f108, %f105, %f131; add.s32 %r26, %r3, 1; @%p16 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_21: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd61, %r98; mul.wide.s32 %rd62, %r98, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f109, [%rd65]; ld.global.nc.f32 %f110, [%rd63]; ld.global.nc.f32 %f111, [%rd64]; mul.f32 %f112, %f111, %f111; fma.rn.f32 %f113, %f110, %f110, %f112; fma.rn.f32 %f114, %f109, %f109, %f113; setp.eq.f32 %p19, %f114, 0f00000000; selp.f32 %f115, %f3, %f109, %p19; selp.f32 %f116, %f2, %f111, %p19; selp.f32 %f117, %f1, %f110, %p19; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs31, [%rd66]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd67, %r104, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f118, [%rd68]; mul.f32 %f119, %f118, %f35; sub.f32 %f120, %f117, %f1; sub.f32 %f121, %f116, %f2; sub.f32 %f122, %f115, %f3; fma.rn.f32 %f133, %f120, %f119, %f19; fma.rn.f32 %f132, %f121, %f119, %f20; fma.rn.f32 %f131, %f122, %f119, %f21; $L__BB0_22: setp.eq.s64 %p20, %rd10, 0; @%p20 bra $L__BB0_24; cvta.to.global.u64 %rd69, %rd10; shl.b64 %rd70, %rd6, 2; add.s64 %rd71, %rd69, %rd70; ld.global.nc.f32 %f123, [%rd71]; mul.f32 %f134, %f123, %f134; $L__BB0_24: setp.eq.f32 %p21, %f134, 0f00000000; mov.f32 %f135, 0f00000000; @%p21 bra $L__BB0_26; rcp.rn.f32 %f135, %f134; $L__BB0_26: 
cvta.to.global.u64 %rd72, %rd7; shl.b64 %rd73, %rd6, 2; add.s64 %rd74, %rd72, %rd73; ld.global.f32 %f125, [%rd74]; fma.rn.f32 %f126, %f133, %f135, %f125; st.global.f32 [%rd74], %f126; cvta.to.global.u64 %rd75, %rd8; add.s64 %rd76, %rd75, %rd73; ld.global.f32 %f127, [%rd76]; fma.rn.f32 %f128, %f132, %f135, %f127; st.global.f32 [%rd76], %f128; cvta.to.global.u64 %rd77, %rd9; add.s64 %rd78, %rd77, %rd73; ld.global.f32 %f129, [%rd78]; fma.rn.f32 %f130, %f131, %f135, %f129; st.global.f32 [%rd78], %f130; $L__BB0_27: ret; } ` addexchange_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<22>; .reg .b16 %rs<36>; .reg .f32 %f<136>; .reg .b32 %r<111>; .reg .b64 %rd<79>; ld.param.u8 %rs5, [addexchange_param_16]; ld.param.u64 %rd7, [addexchange_param_0]; ld.param.u64 %rd8, [addexchange_param_1]; ld.param.u64 %rd9, [addexchange_param_2]; ld.param.u64 %rd11, [addexchange_param_3]; ld.param.u64 %rd12, [addexchange_param_4]; ld.param.u64 %rd13, [addexchange_param_5]; ld.param.u64 %rd10, [addexchange_param_6]; ld.param.f32 %f134, [addexchange_param_7]; ld.param.u64 %rd14, [addexchange_param_8]; ld.param.u64 %rd15, [addexchange_param_9]; ld.param.f32 %f33, [addexchange_param_10]; ld.param.f32 %f34, [addexchange_param_11]; ld.param.f32 %f35, [addexchange_param_12]; ld.param.u32 %r30, [addexchange_param_13]; ld.param.u32 %r31, [addexchange_param_14]; 
ld.param.u32 %r32, [addexchange_param_15]; cvta.to.global.u64 %rd1, %rd14; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd13; cvta.to.global.u64 %rd4, %rd12; cvta.to.global.u64 %rd5, %rd11; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_27; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd16, %r43, 4; add.s64 %rd17, %rd5, %rd16; add.s64 %rd18, %rd4, %rd16; add.s64 %rd19, %rd3, %rd16; ld.global.nc.f32 %f1, [%rd17]; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd19]; mul.f32 %f36, %f2, %f2; fma.rn.f32 %f37, %f1, %f1, %f36; fma.rn.f32 %f38, %f3, %f3, %f37; setp.eq.f32 %p6, %f38, 0f00000000; @%p6 bra $L__BB0_27; add.s64 %rd20, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd20]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_5: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd21, %r46; mul.wide.s32 %rd22, %r46, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f39, [%rd25]; ld.global.nc.f32 %f40, [%rd23]; ld.global.nc.f32 %f41, [%rd24]; mul.f32 %f42, %f41, %f41; fma.rn.f32 %f43, %f40, %f40, %f42; fma.rn.f32 %f44, %f39, %f39, %f43; setp.eq.f32 %p8, %f44, 0f00000000; selp.f32 %f45, %f3, %f39, %p8; selp.f32 %f46, %f2, %f41, %p8; selp.f32 %f47, %f1, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, 
[%rd26]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd27, %r52, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f48, [%rd28]; mul.f32 %f49, %f48, %f33; sub.f32 %f50, %f47, %f1; sub.f32 %f51, %f46, %f2; sub.f32 %f52, %f45, %f3; fma.rn.f32 %f7, %f50, %f49, 0f00000000; fma.rn.f32 %f8, %f51, %f49, 0f00000000; fma.rn.f32 %f9, %f52, %f49, 0f00000000; add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_8: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd29, %r56; mul.wide.s32 %rd30, %r56, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f53, [%rd33]; ld.global.nc.f32 %f54, [%rd31]; ld.global.nc.f32 %f55, [%rd32]; mul.f32 %f56, %f55, %f55; fma.rn.f32 %f57, %f54, %f54, %f56; fma.rn.f32 %f58, %f53, %f53, %f57; setp.eq.f32 %p10, %f58, 0f00000000; selp.f32 %f59, %f3, %f53, %p10; selp.f32 %f60, %f2, %f55, %p10; selp.f32 %f61, %f1, %f54, %p10; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs11, [%rd34]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd35, %r62, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f62, [%rd36]; mul.f32 %f63, %f62, %f33; sub.f32 %f64, %f61, %f1; sub.f32 %f65, %f60, %f2; sub.f32 %f66, %f59, %f3; fma.rn.f32 %f10, %f64, %f63, %f7; fma.rn.f32 %f11, %f65, %f63, %f8; fma.rn.f32 %f12, %f66, %f63, %f9; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p11, %rs3, 0; add.s32 %r14, %r2, -1; @%p11 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r107, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: 
rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_11: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd37, %r66; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f67, [%rd41]; ld.global.nc.f32 %f68, [%rd39]; ld.global.nc.f32 %f69, [%rd40]; mul.f32 %f70, %f69, %f69; fma.rn.f32 %f71, %f68, %f68, %f70; fma.rn.f32 %f72, %f67, %f67, %f71; setp.eq.f32 %p12, %f72, 0f00000000; selp.f32 %f73, %f3, %f67, %p12; selp.f32 %f74, %f2, %f69, %p12; selp.f32 %f75, %f1, %f68, %p12; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs16, [%rd42]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd43, %r72, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f76, [%rd44]; mul.f32 %f77, %f76, %f34; sub.f32 %f78, %f75, %f1; sub.f32 %f79, %f74, %f2; sub.f32 %f80, %f73, %f3; fma.rn.f32 %f13, %f78, %f77, %f10; fma.rn.f32 %f14, %f79, %f77, %f11; fma.rn.f32 %f15, %f80, %f77, %f12; add.s32 %r18, %r2, 1; @%p11 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_14: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd45, %r77; mul.wide.s32 %rd46, %r77, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f81, [%rd49]; ld.global.nc.f32 %f82, [%rd47]; ld.global.nc.f32 %f83, [%rd48]; mul.f32 %f84, %f83, %f83; fma.rn.f32 %f85, %f82, %f82, %f84; fma.rn.f32 %f86, %f81, %f81, %f85; setp.eq.f32 %p14, %f86, 0f00000000; selp.f32 %f87, %f3, %f81, %p14; selp.f32 %f88, %f2, %f83, %p14; selp.f32 %f89, %f1, %f82, %p14; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs21, 
[%rd50]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd51, %r83, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f90, [%rd52]; mul.f32 %f91, %f90, %f34; sub.f32 %f92, %f89, %f1; sub.f32 %f93, %f88, %f2; sub.f32 %f94, %f87, %f3; fma.rn.f32 %f133, %f92, %f91, %f13; fma.rn.f32 %f132, %f93, %f91, %f14; fma.rn.f32 %f131, %f94, %f91, %f15; setp.eq.s32 %p15, %r32, 1; @%p15 bra $L__BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p16, %rs4, 0; add.s32 %r22, %r3, -1; @%p16 bra $L__BB0_17; bra.uni $L__BB0_16; $L__BB0_17: max.s32 %r109, %r22, 0; bra.uni $L__BB0_18; $L__BB0_16: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_18: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd53, %r87; mul.wide.s32 %rd54, %r87, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f95, [%rd57]; ld.global.nc.f32 %f96, [%rd55]; ld.global.nc.f32 %f97, [%rd56]; mul.f32 %f98, %f97, %f97; fma.rn.f32 %f99, %f96, %f96, %f98; fma.rn.f32 %f100, %f95, %f95, %f99; setp.eq.f32 %p17, %f100, 0f00000000; selp.f32 %f101, %f3, %f95, %p17; selp.f32 %f102, %f2, %f97, %p17; selp.f32 %f103, %f1, %f96, %p17; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs26, [%rd58]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd59, %r93, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f104, [%rd60]; mul.f32 %f105, %f104, %f35; sub.f32 %f106, %f103, %f1; sub.f32 %f107, %f102, %f2; sub.f32 %f108, %f101, %f3; fma.rn.f32 %f19, %f106, %f105, %f133; fma.rn.f32 %f20, %f107, %f105, %f132; fma.rn.f32 %f21, %f108, %f105, %f131; add.s32 %r26, %r3, 1; @%p16 bra $L__BB0_20; 
bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_21: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd61, %r98; mul.wide.s32 %rd62, %r98, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f109, [%rd65]; ld.global.nc.f32 %f110, [%rd63]; ld.global.nc.f32 %f111, [%rd64]; mul.f32 %f112, %f111, %f111; fma.rn.f32 %f113, %f110, %f110, %f112; fma.rn.f32 %f114, %f109, %f109, %f113; setp.eq.f32 %p19, %f114, 0f00000000; selp.f32 %f115, %f3, %f109, %p19; selp.f32 %f116, %f2, %f111, %p19; selp.f32 %f117, %f1, %f110, %p19; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs31, [%rd66]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd67, %r104, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f118, [%rd68]; mul.f32 %f119, %f118, %f35; sub.f32 %f120, %f117, %f1; sub.f32 %f121, %f116, %f2; sub.f32 %f122, %f115, %f3; fma.rn.f32 %f133, %f120, %f119, %f19; fma.rn.f32 %f132, %f121, %f119, %f20; fma.rn.f32 %f131, %f122, %f119, %f21; $L__BB0_22: setp.eq.s64 %p20, %rd10, 0; @%p20 bra $L__BB0_24; cvta.to.global.u64 %rd69, %rd10; shl.b64 %rd70, %rd6, 2; add.s64 %rd71, %rd69, %rd70; ld.global.nc.f32 %f123, [%rd71]; mul.f32 %f134, %f123, %f134; $L__BB0_24: setp.eq.f32 %p21, %f134, 0f00000000; mov.f32 %f135, 0f00000000; @%p21 bra $L__BB0_26; rcp.rn.f32 %f135, %f134; $L__BB0_26: cvta.to.global.u64 %rd72, %rd7; shl.b64 %rd73, %rd6, 2; add.s64 %rd74, %rd72, %rd73; ld.global.f32 %f125, [%rd74]; fma.rn.f32 %f126, %f133, %f135, %f125; st.global.f32 [%rd74], %f126; cvta.to.global.u64 %rd75, %rd8; add.s64 %rd76, %rd75, %rd73; ld.global.f32 %f127, [%rd76]; fma.rn.f32 %f128, %f132, %f135, 
%f127; st.global.f32 [%rd76], %f128; cvta.to.global.u64 %rd77, %rd9; add.s64 %rd78, %rd77, %rd73; ld.global.f32 %f129, [%rd78]; fma.rn.f32 %f130, %f131, %f135, %f129; st.global.f32 [%rd78], %f130; $L__BB0_27: ret; } ` addexchange_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<22>; .reg .b16 %rs<36>; .reg .f32 %f<136>; .reg .b32 %r<111>; .reg .b64 %rd<79>; ld.param.u8 %rs5, [addexchange_param_16]; ld.param.u64 %rd7, [addexchange_param_0]; ld.param.u64 %rd8, [addexchange_param_1]; ld.param.u64 %rd9, [addexchange_param_2]; ld.param.u64 %rd11, [addexchange_param_3]; ld.param.u64 %rd12, [addexchange_param_4]; ld.param.u64 %rd13, [addexchange_param_5]; ld.param.u64 %rd10, [addexchange_param_6]; ld.param.f32 %f134, [addexchange_param_7]; ld.param.u64 %rd14, [addexchange_param_8]; ld.param.u64 %rd15, [addexchange_param_9]; ld.param.f32 %f33, [addexchange_param_10]; ld.param.f32 %f34, [addexchange_param_11]; ld.param.f32 %f35, [addexchange_param_12]; ld.param.u32 %r30, [addexchange_param_13]; ld.param.u32 %r31, [addexchange_param_14]; ld.param.u32 %r32, [addexchange_param_15]; cvta.to.global.u64 %rd1, %rd14; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd13; cvta.to.global.u64 %rd4, %rd12; cvta.to.global.u64 %rd5, %rd11; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 
%r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_27; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd16, %r43, 4; add.s64 %rd17, %rd5, %rd16; add.s64 %rd18, %rd4, %rd16; add.s64 %rd19, %rd3, %rd16; ld.global.nc.f32 %f1, [%rd17]; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd19]; mul.f32 %f36, %f2, %f2; fma.rn.f32 %f37, %f1, %f1, %f36; fma.rn.f32 %f38, %f3, %f3, %f37; setp.eq.f32 %p6, %f38, 0f00000000; @%p6 bra $L__BB0_27; add.s64 %rd20, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd20]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_5: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd21, %r46; mul.wide.s32 %rd22, %r46, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f39, [%rd25]; ld.global.nc.f32 %f40, [%rd23]; ld.global.nc.f32 %f41, [%rd24]; mul.f32 %f42, %f41, %f41; fma.rn.f32 %f43, %f40, %f40, %f42; fma.rn.f32 %f44, %f39, %f39, %f43; setp.eq.f32 %p8, %f44, 0f00000000; selp.f32 %f45, %f3, %f39, %p8; selp.f32 %f46, %f2, %f41, %p8; selp.f32 %f47, %f1, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd27, %r52, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f48, [%rd28]; mul.f32 %f49, 
%f48, %f33; sub.f32 %f50, %f47, %f1; sub.f32 %f51, %f46, %f2; sub.f32 %f52, %f45, %f3; fma.rn.f32 %f7, %f50, %f49, 0f00000000; fma.rn.f32 %f8, %f51, %f49, 0f00000000; fma.rn.f32 %f9, %f52, %f49, 0f00000000; add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_8: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd29, %r56; mul.wide.s32 %rd30, %r56, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f53, [%rd33]; ld.global.nc.f32 %f54, [%rd31]; ld.global.nc.f32 %f55, [%rd32]; mul.f32 %f56, %f55, %f55; fma.rn.f32 %f57, %f54, %f54, %f56; fma.rn.f32 %f58, %f53, %f53, %f57; setp.eq.f32 %p10, %f58, 0f00000000; selp.f32 %f59, %f3, %f53, %p10; selp.f32 %f60, %f2, %f55, %p10; selp.f32 %f61, %f1, %f54, %p10; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs11, [%rd34]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd35, %r62, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f62, [%rd36]; mul.f32 %f63, %f62, %f33; sub.f32 %f64, %f61, %f1; sub.f32 %f65, %f60, %f2; sub.f32 %f66, %f59, %f3; fma.rn.f32 %f10, %f64, %f63, %f7; fma.rn.f32 %f11, %f65, %f63, %f8; fma.rn.f32 %f12, %f66, %f63, %f9; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p11, %rs3, 0; add.s32 %r14, %r2, -1; @%p11 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r107, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_11: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd37, %r66; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f67, [%rd41]; 
ld.global.nc.f32 %f68, [%rd39]; ld.global.nc.f32 %f69, [%rd40]; mul.f32 %f70, %f69, %f69; fma.rn.f32 %f71, %f68, %f68, %f70; fma.rn.f32 %f72, %f67, %f67, %f71; setp.eq.f32 %p12, %f72, 0f00000000; selp.f32 %f73, %f3, %f67, %p12; selp.f32 %f74, %f2, %f69, %p12; selp.f32 %f75, %f1, %f68, %p12; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs16, [%rd42]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd43, %r72, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f76, [%rd44]; mul.f32 %f77, %f76, %f34; sub.f32 %f78, %f75, %f1; sub.f32 %f79, %f74, %f2; sub.f32 %f80, %f73, %f3; fma.rn.f32 %f13, %f78, %f77, %f10; fma.rn.f32 %f14, %f79, %f77, %f11; fma.rn.f32 %f15, %f80, %f77, %f12; add.s32 %r18, %r2, 1; @%p11 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_14: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd45, %r77; mul.wide.s32 %rd46, %r77, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f81, [%rd49]; ld.global.nc.f32 %f82, [%rd47]; ld.global.nc.f32 %f83, [%rd48]; mul.f32 %f84, %f83, %f83; fma.rn.f32 %f85, %f82, %f82, %f84; fma.rn.f32 %f86, %f81, %f81, %f85; setp.eq.f32 %p14, %f86, 0f00000000; selp.f32 %f87, %f3, %f81, %p14; selp.f32 %f88, %f2, %f83, %p14; selp.f32 %f89, %f1, %f82, %p14; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs21, [%rd50]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd51, %r83, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f90, [%rd52]; mul.f32 %f91, 
%f90, %f34; sub.f32 %f92, %f89, %f1; sub.f32 %f93, %f88, %f2; sub.f32 %f94, %f87, %f3; fma.rn.f32 %f133, %f92, %f91, %f13; fma.rn.f32 %f132, %f93, %f91, %f14; fma.rn.f32 %f131, %f94, %f91, %f15; setp.eq.s32 %p15, %r32, 1; @%p15 bra $L__BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p16, %rs4, 0; add.s32 %r22, %r3, -1; @%p16 bra $L__BB0_17; bra.uni $L__BB0_16; $L__BB0_17: max.s32 %r109, %r22, 0; bra.uni $L__BB0_18; $L__BB0_16: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_18: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd53, %r87; mul.wide.s32 %rd54, %r87, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f95, [%rd57]; ld.global.nc.f32 %f96, [%rd55]; ld.global.nc.f32 %f97, [%rd56]; mul.f32 %f98, %f97, %f97; fma.rn.f32 %f99, %f96, %f96, %f98; fma.rn.f32 %f100, %f95, %f95, %f99; setp.eq.f32 %p17, %f100, 0f00000000; selp.f32 %f101, %f3, %f95, %p17; selp.f32 %f102, %f2, %f97, %p17; selp.f32 %f103, %f1, %f96, %p17; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs26, [%rd58]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd59, %r93, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f104, [%rd60]; mul.f32 %f105, %f104, %f35; sub.f32 %f106, %f103, %f1; sub.f32 %f107, %f102, %f2; sub.f32 %f108, %f101, %f3; fma.rn.f32 %f19, %f106, %f105, %f133; fma.rn.f32 %f20, %f107, %f105, %f132; fma.rn.f32 %f21, %f108, %f105, %f131; add.s32 %r26, %r3, 1; @%p16 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_21: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd61, %r98; mul.wide.s32 %rd62, 
%r98, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f109, [%rd65]; ld.global.nc.f32 %f110, [%rd63]; ld.global.nc.f32 %f111, [%rd64]; mul.f32 %f112, %f111, %f111; fma.rn.f32 %f113, %f110, %f110, %f112; fma.rn.f32 %f114, %f109, %f109, %f113; setp.eq.f32 %p19, %f114, 0f00000000; selp.f32 %f115, %f3, %f109, %p19; selp.f32 %f116, %f2, %f111, %p19; selp.f32 %f117, %f1, %f110, %p19; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs31, [%rd66]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd67, %r104, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f118, [%rd68]; mul.f32 %f119, %f118, %f35; sub.f32 %f120, %f117, %f1; sub.f32 %f121, %f116, %f2; sub.f32 %f122, %f115, %f3; fma.rn.f32 %f133, %f120, %f119, %f19; fma.rn.f32 %f132, %f121, %f119, %f20; fma.rn.f32 %f131, %f122, %f119, %f21; $L__BB0_22: setp.eq.s64 %p20, %rd10, 0; @%p20 bra $L__BB0_24; cvta.to.global.u64 %rd69, %rd10; shl.b64 %rd70, %rd6, 2; add.s64 %rd71, %rd69, %rd70; ld.global.nc.f32 %f123, [%rd71]; mul.f32 %f134, %f123, %f134; $L__BB0_24: setp.eq.f32 %p21, %f134, 0f00000000; mov.f32 %f135, 0f00000000; @%p21 bra $L__BB0_26; rcp.rn.f32 %f135, %f134; $L__BB0_26: cvta.to.global.u64 %rd72, %rd7; shl.b64 %rd73, %rd6, 2; add.s64 %rd74, %rd72, %rd73; ld.global.f32 %f125, [%rd74]; fma.rn.f32 %f126, %f133, %f135, %f125; st.global.f32 [%rd74], %f126; cvta.to.global.u64 %rd75, %rd8; add.s64 %rd76, %rd75, %rd73; ld.global.f32 %f127, [%rd76]; fma.rn.f32 %f128, %f132, %f135, %f127; st.global.f32 [%rd76], %f128; cvta.to.global.u64 %rd77, %rd9; add.s64 %rd78, %rd77, %rd73; ld.global.f32 %f129, [%rd78]; fma.rn.f32 %f130, %f131, %f135, %f129; st.global.f32 [%rd78], %f130; $L__BB0_27: ret; } ` addexchange_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl addexchange .visible 
.entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<22>; .reg .b16 %rs<36>; .reg .f32 %f<136>; .reg .b32 %r<111>; .reg .b64 %rd<79>; ld.param.u8 %rs5, [addexchange_param_16]; ld.param.u64 %rd7, [addexchange_param_0]; ld.param.u64 %rd8, [addexchange_param_1]; ld.param.u64 %rd9, [addexchange_param_2]; ld.param.u64 %rd11, [addexchange_param_3]; ld.param.u64 %rd12, [addexchange_param_4]; ld.param.u64 %rd13, [addexchange_param_5]; ld.param.u64 %rd10, [addexchange_param_6]; ld.param.f32 %f134, [addexchange_param_7]; ld.param.u64 %rd14, [addexchange_param_8]; ld.param.u64 %rd15, [addexchange_param_9]; ld.param.f32 %f33, [addexchange_param_10]; ld.param.f32 %f34, [addexchange_param_11]; ld.param.f32 %f35, [addexchange_param_12]; ld.param.u32 %r30, [addexchange_param_13]; ld.param.u32 %r31, [addexchange_param_14]; ld.param.u32 %r32, [addexchange_param_15]; cvta.to.global.u64 %rd1, %rd14; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd13; cvta.to.global.u64 %rd4, %rd12; cvta.to.global.u64 %rd5, %rd11; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred 
%p5, %p3, %p4; @%p5 bra $L__BB0_27; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd16, %r43, 4; add.s64 %rd17, %rd5, %rd16; add.s64 %rd18, %rd4, %rd16; add.s64 %rd19, %rd3, %rd16; ld.global.nc.f32 %f1, [%rd17]; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd19]; mul.f32 %f36, %f2, %f2; fma.rn.f32 %f37, %f1, %f1, %f36; fma.rn.f32 %f38, %f3, %f3, %f37; setp.eq.f32 %p6, %f38, 0f00000000; @%p6 bra $L__BB0_27; add.s64 %rd20, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd20]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_5: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd21, %r46; mul.wide.s32 %rd22, %r46, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f39, [%rd25]; ld.global.nc.f32 %f40, [%rd23]; ld.global.nc.f32 %f41, [%rd24]; mul.f32 %f42, %f41, %f41; fma.rn.f32 %f43, %f40, %f40, %f42; fma.rn.f32 %f44, %f39, %f39, %f43; setp.eq.f32 %p8, %f44, 0f00000000; selp.f32 %f45, %f3, %f39, %p8; selp.f32 %f46, %f2, %f41, %p8; selp.f32 %f47, %f1, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd27, %r52, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f48, [%rd28]; mul.f32 %f49, %f48, %f33; sub.f32 %f50, %f47, %f1; sub.f32 %f51, %f46, %f2; sub.f32 %f52, %f45, %f3; fma.rn.f32 %f7, %f50, %f49, 0f00000000; fma.rn.f32 %f8, %f51, %f49, 0f00000000; fma.rn.f32 %f9, %f52, %f49, 0f00000000; add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r55, %r30, -1; min.s32 
%r106, %r10, %r55; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_8: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd29, %r56; mul.wide.s32 %rd30, %r56, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f53, [%rd33]; ld.global.nc.f32 %f54, [%rd31]; ld.global.nc.f32 %f55, [%rd32]; mul.f32 %f56, %f55, %f55; fma.rn.f32 %f57, %f54, %f54, %f56; fma.rn.f32 %f58, %f53, %f53, %f57; setp.eq.f32 %p10, %f58, 0f00000000; selp.f32 %f59, %f3, %f53, %p10; selp.f32 %f60, %f2, %f55, %p10; selp.f32 %f61, %f1, %f54, %p10; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs11, [%rd34]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd35, %r62, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f62, [%rd36]; mul.f32 %f63, %f62, %f33; sub.f32 %f64, %f61, %f1; sub.f32 %f65, %f60, %f2; sub.f32 %f66, %f59, %f3; fma.rn.f32 %f10, %f64, %f63, %f7; fma.rn.f32 %f11, %f65, %f63, %f8; fma.rn.f32 %f12, %f66, %f63, %f9; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p11, %rs3, 0; add.s32 %r14, %r2, -1; @%p11 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r107, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_11: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd37, %r66; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f67, [%rd41]; ld.global.nc.f32 %f68, [%rd39]; ld.global.nc.f32 %f69, [%rd40]; mul.f32 %f70, %f69, %f69; fma.rn.f32 %f71, %f68, %f68, %f70; fma.rn.f32 %f72, %f67, %f67, %f71; setp.eq.f32 %p12, %f72, 0f00000000; selp.f32 %f73, %f3, %f67, %p12; selp.f32 %f74, %f2, %f69, %p12; selp.f32 %f75, %f1, %f68, %p12; add.s64 %rd42, 
%rd2, %rd37; ld.global.nc.u8 %rs16, [%rd42]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd43, %r72, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f76, [%rd44]; mul.f32 %f77, %f76, %f34; sub.f32 %f78, %f75, %f1; sub.f32 %f79, %f74, %f2; sub.f32 %f80, %f73, %f3; fma.rn.f32 %f13, %f78, %f77, %f10; fma.rn.f32 %f14, %f79, %f77, %f11; fma.rn.f32 %f15, %f80, %f77, %f12; add.s32 %r18, %r2, 1; @%p11 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_14: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd45, %r77; mul.wide.s32 %rd46, %r77, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f81, [%rd49]; ld.global.nc.f32 %f82, [%rd47]; ld.global.nc.f32 %f83, [%rd48]; mul.f32 %f84, %f83, %f83; fma.rn.f32 %f85, %f82, %f82, %f84; fma.rn.f32 %f86, %f81, %f81, %f85; setp.eq.f32 %p14, %f86, 0f00000000; selp.f32 %f87, %f3, %f81, %p14; selp.f32 %f88, %f2, %f83, %p14; selp.f32 %f89, %f1, %f82, %p14; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs21, [%rd50]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd51, %r83, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f90, [%rd52]; mul.f32 %f91, %f90, %f34; sub.f32 %f92, %f89, %f1; sub.f32 %f93, %f88, %f2; sub.f32 %f94, %f87, %f3; fma.rn.f32 %f133, %f92, %f91, %f13; fma.rn.f32 %f132, %f93, %f91, %f14; fma.rn.f32 %f131, %f94, %f91, %f15; setp.eq.s32 %p15, %r32, 1; @%p15 bra $L__BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p16, %rs4, 0; add.s32 %r22, 
%r3, -1; @%p16 bra $L__BB0_17; bra.uni $L__BB0_16; $L__BB0_17: max.s32 %r109, %r22, 0; bra.uni $L__BB0_18; $L__BB0_16: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_18: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd53, %r87; mul.wide.s32 %rd54, %r87, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f95, [%rd57]; ld.global.nc.f32 %f96, [%rd55]; ld.global.nc.f32 %f97, [%rd56]; mul.f32 %f98, %f97, %f97; fma.rn.f32 %f99, %f96, %f96, %f98; fma.rn.f32 %f100, %f95, %f95, %f99; setp.eq.f32 %p17, %f100, 0f00000000; selp.f32 %f101, %f3, %f95, %p17; selp.f32 %f102, %f2, %f97, %p17; selp.f32 %f103, %f1, %f96, %p17; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs26, [%rd58]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd59, %r93, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f104, [%rd60]; mul.f32 %f105, %f104, %f35; sub.f32 %f106, %f103, %f1; sub.f32 %f107, %f102, %f2; sub.f32 %f108, %f101, %f3; fma.rn.f32 %f19, %f106, %f105, %f133; fma.rn.f32 %f20, %f107, %f105, %f132; fma.rn.f32 %f21, %f108, %f105, %f131; add.s32 %r26, %r3, 1; @%p16 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_21: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd61, %r98; mul.wide.s32 %rd62, %r98, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f109, [%rd65]; ld.global.nc.f32 %f110, [%rd63]; ld.global.nc.f32 %f111, [%rd64]; mul.f32 %f112, %f111, %f111; fma.rn.f32 %f113, %f110, %f110, %f112; fma.rn.f32 %f114, %f109, %f109, %f113; 
setp.eq.f32 %p19, %f114, 0f00000000; selp.f32 %f115, %f3, %f109, %p19; selp.f32 %f116, %f2, %f111, %p19; selp.f32 %f117, %f1, %f110, %p19; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs31, [%rd66]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd67, %r104, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f118, [%rd68]; mul.f32 %f119, %f118, %f35; sub.f32 %f120, %f117, %f1; sub.f32 %f121, %f116, %f2; sub.f32 %f122, %f115, %f3; fma.rn.f32 %f133, %f120, %f119, %f19; fma.rn.f32 %f132, %f121, %f119, %f20; fma.rn.f32 %f131, %f122, %f119, %f21; $L__BB0_22: setp.eq.s64 %p20, %rd10, 0; @%p20 bra $L__BB0_24; cvta.to.global.u64 %rd69, %rd10; shl.b64 %rd70, %rd6, 2; add.s64 %rd71, %rd69, %rd70; ld.global.nc.f32 %f123, [%rd71]; mul.f32 %f134, %f123, %f134; $L__BB0_24: setp.eq.f32 %p21, %f134, 0f00000000; mov.f32 %f135, 0f00000000; @%p21 bra $L__BB0_26; rcp.rn.f32 %f135, %f134; $L__BB0_26: cvta.to.global.u64 %rd72, %rd7; shl.b64 %rd73, %rd6, 2; add.s64 %rd74, %rd72, %rd73; ld.global.f32 %f125, [%rd74]; fma.rn.f32 %f126, %f133, %f135, %f125; st.global.f32 [%rd74], %f126; cvta.to.global.u64 %rd75, %rd8; add.s64 %rd76, %rd75, %rd73; ld.global.f32 %f127, [%rd76]; fma.rn.f32 %f128, %f132, %f135, %f127; st.global.f32 [%rd76], %f128; cvta.to.global.u64 %rd77, %rd9; add.s64 %rd78, %rd77, %rd73; ld.global.f32 %f129, [%rd78]; fma.rn.f32 %f130, %f131, %f135, %f129; st.global.f32 [%rd78], %f130; $L__BB0_27: ret; } ` addexchange_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 
addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<22>; .reg .b16 %rs<36>; .reg .f32 %f<136>; .reg .b32 %r<111>; .reg .b64 %rd<79>; ld.param.u8 %rs5, [addexchange_param_16]; ld.param.u64 %rd7, [addexchange_param_0]; ld.param.u64 %rd8, [addexchange_param_1]; ld.param.u64 %rd9, [addexchange_param_2]; ld.param.u64 %rd11, [addexchange_param_3]; ld.param.u64 %rd12, [addexchange_param_4]; ld.param.u64 %rd13, [addexchange_param_5]; ld.param.u64 %rd10, [addexchange_param_6]; ld.param.f32 %f134, [addexchange_param_7]; ld.param.u64 %rd14, [addexchange_param_8]; ld.param.u64 %rd15, [addexchange_param_9]; ld.param.f32 %f33, [addexchange_param_10]; ld.param.f32 %f34, [addexchange_param_11]; ld.param.f32 %f35, [addexchange_param_12]; ld.param.u32 %r30, [addexchange_param_13]; ld.param.u32 %r31, [addexchange_param_14]; ld.param.u32 %r32, [addexchange_param_15]; cvta.to.global.u64 %rd1, %rd14; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd13; cvta.to.global.u64 %rd4, %rd12; cvta.to.global.u64 %rd5, %rd11; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_27; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd16, %r43, 4; add.s64 %rd17, %rd5, %rd16; add.s64 %rd18, %rd4, %rd16; add.s64 %rd19, %rd3, %rd16; ld.global.nc.f32 %f1, 
[%rd17]; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd19]; mul.f32 %f36, %f2, %f2; fma.rn.f32 %f37, %f1, %f1, %f36; fma.rn.f32 %f38, %f3, %f3, %f37; setp.eq.f32 %p6, %f38, 0f00000000; @%p6 bra $L__BB0_27; add.s64 %rd20, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd20]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_5: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd21, %r46; mul.wide.s32 %rd22, %r46, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f39, [%rd25]; ld.global.nc.f32 %f40, [%rd23]; ld.global.nc.f32 %f41, [%rd24]; mul.f32 %f42, %f41, %f41; fma.rn.f32 %f43, %f40, %f40, %f42; fma.rn.f32 %f44, %f39, %f39, %f43; setp.eq.f32 %p8, %f44, 0f00000000; selp.f32 %f45, %f3, %f39, %p8; selp.f32 %f46, %f2, %f41, %p8; selp.f32 %f47, %f1, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd27, %r52, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f48, [%rd28]; mul.f32 %f49, %f48, %f33; sub.f32 %f50, %f47, %f1; sub.f32 %f51, %f46, %f2; sub.f32 %f52, %f45, %f3; fma.rn.f32 %f7, %f50, %f49, 0f00000000; fma.rn.f32 %f8, %f51, %f49, 0f00000000; fma.rn.f32 %f9, %f52, %f49, 0f00000000; add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_8: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd29, %r56; mul.wide.s32 %rd30, %r56, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, 
%rd30; ld.global.nc.f32 %f53, [%rd33]; ld.global.nc.f32 %f54, [%rd31]; ld.global.nc.f32 %f55, [%rd32]; mul.f32 %f56, %f55, %f55; fma.rn.f32 %f57, %f54, %f54, %f56; fma.rn.f32 %f58, %f53, %f53, %f57; setp.eq.f32 %p10, %f58, 0f00000000; selp.f32 %f59, %f3, %f53, %p10; selp.f32 %f60, %f2, %f55, %p10; selp.f32 %f61, %f1, %f54, %p10; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs11, [%rd34]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd35, %r62, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f62, [%rd36]; mul.f32 %f63, %f62, %f33; sub.f32 %f64, %f61, %f1; sub.f32 %f65, %f60, %f2; sub.f32 %f66, %f59, %f3; fma.rn.f32 %f10, %f64, %f63, %f7; fma.rn.f32 %f11, %f65, %f63, %f8; fma.rn.f32 %f12, %f66, %f63, %f9; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p11, %rs3, 0; add.s32 %r14, %r2, -1; @%p11 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r107, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_11: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd37, %r66; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f67, [%rd41]; ld.global.nc.f32 %f68, [%rd39]; ld.global.nc.f32 %f69, [%rd40]; mul.f32 %f70, %f69, %f69; fma.rn.f32 %f71, %f68, %f68, %f70; fma.rn.f32 %f72, %f67, %f67, %f71; setp.eq.f32 %p12, %f72, 0f00000000; selp.f32 %f73, %f3, %f67, %p12; selp.f32 %f74, %f2, %f69, %p12; selp.f32 %f75, %f1, %f68, %p12; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs16, [%rd42]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd43, %r72, 4; add.s64 %rd44, 
%rd1, %rd43; ld.global.nc.f32 %f76, [%rd44]; mul.f32 %f77, %f76, %f34; sub.f32 %f78, %f75, %f1; sub.f32 %f79, %f74, %f2; sub.f32 %f80, %f73, %f3; fma.rn.f32 %f13, %f78, %f77, %f10; fma.rn.f32 %f14, %f79, %f77, %f11; fma.rn.f32 %f15, %f80, %f77, %f12; add.s32 %r18, %r2, 1; @%p11 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_14: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd45, %r77; mul.wide.s32 %rd46, %r77, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f81, [%rd49]; ld.global.nc.f32 %f82, [%rd47]; ld.global.nc.f32 %f83, [%rd48]; mul.f32 %f84, %f83, %f83; fma.rn.f32 %f85, %f82, %f82, %f84; fma.rn.f32 %f86, %f81, %f81, %f85; setp.eq.f32 %p14, %f86, 0f00000000; selp.f32 %f87, %f3, %f81, %p14; selp.f32 %f88, %f2, %f83, %p14; selp.f32 %f89, %f1, %f82, %p14; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs21, [%rd50]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd51, %r83, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f90, [%rd52]; mul.f32 %f91, %f90, %f34; sub.f32 %f92, %f89, %f1; sub.f32 %f93, %f88, %f2; sub.f32 %f94, %f87, %f3; fma.rn.f32 %f133, %f92, %f91, %f13; fma.rn.f32 %f132, %f93, %f91, %f14; fma.rn.f32 %f131, %f94, %f91, %f15; setp.eq.s32 %p15, %r32, 1; @%p15 bra $L__BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p16, %rs4, 0; add.s32 %r22, %r3, -1; @%p16 bra $L__BB0_17; bra.uni $L__BB0_16; $L__BB0_17: max.s32 %r109, %r22, 0; bra.uni $L__BB0_18; $L__BB0_16: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_18: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd53, 
%r87; mul.wide.s32 %rd54, %r87, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f95, [%rd57]; ld.global.nc.f32 %f96, [%rd55]; ld.global.nc.f32 %f97, [%rd56]; mul.f32 %f98, %f97, %f97; fma.rn.f32 %f99, %f96, %f96, %f98; fma.rn.f32 %f100, %f95, %f95, %f99; setp.eq.f32 %p17, %f100, 0f00000000; selp.f32 %f101, %f3, %f95, %p17; selp.f32 %f102, %f2, %f97, %p17; selp.f32 %f103, %f1, %f96, %p17; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs26, [%rd58]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd59, %r93, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f104, [%rd60]; mul.f32 %f105, %f104, %f35; sub.f32 %f106, %f103, %f1; sub.f32 %f107, %f102, %f2; sub.f32 %f108, %f101, %f3; fma.rn.f32 %f19, %f106, %f105, %f133; fma.rn.f32 %f20, %f107, %f105, %f132; fma.rn.f32 %f21, %f108, %f105, %f131; add.s32 %r26, %r3, 1; @%p16 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_21: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd61, %r98; mul.wide.s32 %rd62, %r98, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f109, [%rd65]; ld.global.nc.f32 %f110, [%rd63]; ld.global.nc.f32 %f111, [%rd64]; mul.f32 %f112, %f111, %f111; fma.rn.f32 %f113, %f110, %f110, %f112; fma.rn.f32 %f114, %f109, %f109, %f113; setp.eq.f32 %p19, %f114, 0f00000000; selp.f32 %f115, %f3, %f109, %p19; selp.f32 %f116, %f2, %f111, %p19; selp.f32 %f117, %f1, %f110, %p19; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs31, [%rd66]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; 
mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd67, %r104, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f118, [%rd68]; mul.f32 %f119, %f118, %f35; sub.f32 %f120, %f117, %f1; sub.f32 %f121, %f116, %f2; sub.f32 %f122, %f115, %f3; fma.rn.f32 %f133, %f120, %f119, %f19; fma.rn.f32 %f132, %f121, %f119, %f20; fma.rn.f32 %f131, %f122, %f119, %f21; $L__BB0_22: setp.eq.s64 %p20, %rd10, 0; @%p20 bra $L__BB0_24; cvta.to.global.u64 %rd69, %rd10; shl.b64 %rd70, %rd6, 2; add.s64 %rd71, %rd69, %rd70; ld.global.nc.f32 %f123, [%rd71]; mul.f32 %f134, %f123, %f134; $L__BB0_24: setp.eq.f32 %p21, %f134, 0f00000000; mov.f32 %f135, 0f00000000; @%p21 bra $L__BB0_26; rcp.rn.f32 %f135, %f134; $L__BB0_26: cvta.to.global.u64 %rd72, %rd7; shl.b64 %rd73, %rd6, 2; add.s64 %rd74, %rd72, %rd73; ld.global.f32 %f125, [%rd74]; fma.rn.f32 %f126, %f133, %f135, %f125; st.global.f32 [%rd74], %f126; cvta.to.global.u64 %rd75, %rd8; add.s64 %rd76, %rd75, %rd73; ld.global.f32 %f127, [%rd76]; fma.rn.f32 %f128, %f132, %f135, %f127; st.global.f32 [%rd76], %f128; cvta.to.global.u64 %rd77, %rd9; add.s64 %rd78, %rd77, %rd73; ld.global.f32 %f129, [%rd78]; fma.rn.f32 %f130, %f131, %f135, %f129; st.global.f32 [%rd78], %f130; $L__BB0_27: ret; } ` addexchange_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred 
%p<22>; .reg .b16 %rs<36>; .reg .f32 %f<136>; .reg .b32 %r<111>; .reg .b64 %rd<79>; ld.param.u8 %rs5, [addexchange_param_16]; ld.param.u64 %rd7, [addexchange_param_0]; ld.param.u64 %rd8, [addexchange_param_1]; ld.param.u64 %rd9, [addexchange_param_2]; ld.param.u64 %rd11, [addexchange_param_3]; ld.param.u64 %rd12, [addexchange_param_4]; ld.param.u64 %rd13, [addexchange_param_5]; ld.param.u64 %rd10, [addexchange_param_6]; ld.param.f32 %f134, [addexchange_param_7]; ld.param.u64 %rd14, [addexchange_param_8]; ld.param.u64 %rd15, [addexchange_param_9]; ld.param.f32 %f33, [addexchange_param_10]; ld.param.f32 %f34, [addexchange_param_11]; ld.param.f32 %f35, [addexchange_param_12]; ld.param.u32 %r30, [addexchange_param_13]; ld.param.u32 %r31, [addexchange_param_14]; ld.param.u32 %r32, [addexchange_param_15]; cvta.to.global.u64 %rd1, %rd14; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd13; cvta.to.global.u64 %rd4, %rd12; cvta.to.global.u64 %rd5, %rd11; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_27; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd16, %r43, 4; add.s64 %rd17, %rd5, %rd16; add.s64 %rd18, %rd4, %rd16; add.s64 %rd19, %rd3, %rd16; ld.global.nc.f32 %f1, [%rd17]; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd19]; mul.f32 %f36, %f2, %f2; fma.rn.f32 %f37, %f1, %f1, %f36; fma.rn.f32 %f38, %f3, %f3, %f37; setp.eq.f32 %p6, %f38, 0f00000000; @%p6 bra $L__BB0_27; add.s64 %rd20, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd20]; and.b16 %rs2, %rs5, 1; 
setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_5: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd21, %r46; mul.wide.s32 %rd22, %r46, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f39, [%rd25]; ld.global.nc.f32 %f40, [%rd23]; ld.global.nc.f32 %f41, [%rd24]; mul.f32 %f42, %f41, %f41; fma.rn.f32 %f43, %f40, %f40, %f42; fma.rn.f32 %f44, %f39, %f39, %f43; setp.eq.f32 %p8, %f44, 0f00000000; selp.f32 %f45, %f3, %f39, %p8; selp.f32 %f46, %f2, %f41, %p8; selp.f32 %f47, %f1, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd27, %r52, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f48, [%rd28]; mul.f32 %f49, %f48, %f33; sub.f32 %f50, %f47, %f1; sub.f32 %f51, %f46, %f2; sub.f32 %f52, %f45, %f3; fma.rn.f32 %f7, %f50, %f49, 0f00000000; fma.rn.f32 %f8, %f51, %f49, 0f00000000; fma.rn.f32 %f9, %f52, %f49, 0f00000000; add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_8: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd29, %r56; mul.wide.s32 %rd30, %r56, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f53, [%rd33]; ld.global.nc.f32 %f54, [%rd31]; ld.global.nc.f32 %f55, [%rd32]; mul.f32 %f56, %f55, %f55; fma.rn.f32 %f57, %f54, %f54, %f56; fma.rn.f32 %f58, %f53, %f53, %f57; setp.eq.f32 %p10, %f58, 0f00000000; selp.f32 %f59, %f3, %f53, %p10; selp.f32 %f60, %f2, %f55, %p10; 
selp.f32 %f61, %f1, %f54, %p10; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs11, [%rd34]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd35, %r62, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f62, [%rd36]; mul.f32 %f63, %f62, %f33; sub.f32 %f64, %f61, %f1; sub.f32 %f65, %f60, %f2; sub.f32 %f66, %f59, %f3; fma.rn.f32 %f10, %f64, %f63, %f7; fma.rn.f32 %f11, %f65, %f63, %f8; fma.rn.f32 %f12, %f66, %f63, %f9; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p11, %rs3, 0; add.s32 %r14, %r2, -1; @%p11 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r107, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_11: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd37, %r66; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f67, [%rd41]; ld.global.nc.f32 %f68, [%rd39]; ld.global.nc.f32 %f69, [%rd40]; mul.f32 %f70, %f69, %f69; fma.rn.f32 %f71, %f68, %f68, %f70; fma.rn.f32 %f72, %f67, %f67, %f71; setp.eq.f32 %p12, %f72, 0f00000000; selp.f32 %f73, %f3, %f67, %p12; selp.f32 %f74, %f2, %f69, %p12; selp.f32 %f75, %f1, %f68, %p12; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs16, [%rd42]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd43, %r72, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f76, [%rd44]; mul.f32 %f77, %f76, %f34; sub.f32 %f78, %f75, %f1; sub.f32 %f79, %f74, %f2; sub.f32 %f80, %f73, %f3; fma.rn.f32 %f13, %f78, %f77, %f10; fma.rn.f32 %f14, %f79, %f77, %f11; fma.rn.f32 %f15, %f80, %f77, %f12; add.s32 %r18, %r2, 1; @%p11 bra $L__BB0_13; bra.uni 
$L__BB0_12; $L__BB0_13: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_14: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd45, %r77; mul.wide.s32 %rd46, %r77, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f81, [%rd49]; ld.global.nc.f32 %f82, [%rd47]; ld.global.nc.f32 %f83, [%rd48]; mul.f32 %f84, %f83, %f83; fma.rn.f32 %f85, %f82, %f82, %f84; fma.rn.f32 %f86, %f81, %f81, %f85; setp.eq.f32 %p14, %f86, 0f00000000; selp.f32 %f87, %f3, %f81, %p14; selp.f32 %f88, %f2, %f83, %p14; selp.f32 %f89, %f1, %f82, %p14; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs21, [%rd50]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd51, %r83, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f90, [%rd52]; mul.f32 %f91, %f90, %f34; sub.f32 %f92, %f89, %f1; sub.f32 %f93, %f88, %f2; sub.f32 %f94, %f87, %f3; fma.rn.f32 %f133, %f92, %f91, %f13; fma.rn.f32 %f132, %f93, %f91, %f14; fma.rn.f32 %f131, %f94, %f91, %f15; setp.eq.s32 %p15, %r32, 1; @%p15 bra $L__BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p16, %rs4, 0; add.s32 %r22, %r3, -1; @%p16 bra $L__BB0_17; bra.uni $L__BB0_16; $L__BB0_17: max.s32 %r109, %r22, 0; bra.uni $L__BB0_18; $L__BB0_16: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_18: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd53, %r87; mul.wide.s32 %rd54, %r87, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f95, [%rd57]; ld.global.nc.f32 %f96, [%rd55]; ld.global.nc.f32 %f97, [%rd56]; mul.f32 %f98, %f97, %f97; fma.rn.f32 %f99, %f96, %f96, %f98; fma.rn.f32 %f100, %f95, 
%f95, %f99; setp.eq.f32 %p17, %f100, 0f00000000; selp.f32 %f101, %f3, %f95, %p17; selp.f32 %f102, %f2, %f97, %p17; selp.f32 %f103, %f1, %f96, %p17; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs26, [%rd58]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd59, %r93, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f104, [%rd60]; mul.f32 %f105, %f104, %f35; sub.f32 %f106, %f103, %f1; sub.f32 %f107, %f102, %f2; sub.f32 %f108, %f101, %f3; fma.rn.f32 %f19, %f106, %f105, %f133; fma.rn.f32 %f20, %f107, %f105, %f132; fma.rn.f32 %f21, %f108, %f105, %f131; add.s32 %r26, %r3, 1; @%p16 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_21: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd61, %r98; mul.wide.s32 %rd62, %r98, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f109, [%rd65]; ld.global.nc.f32 %f110, [%rd63]; ld.global.nc.f32 %f111, [%rd64]; mul.f32 %f112, %f111, %f111; fma.rn.f32 %f113, %f110, %f110, %f112; fma.rn.f32 %f114, %f109, %f109, %f113; setp.eq.f32 %p19, %f114, 0f00000000; selp.f32 %f115, %f3, %f109, %p19; selp.f32 %f116, %f2, %f111, %p19; selp.f32 %f117, %f1, %f110, %p19; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs31, [%rd66]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd67, %r104, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f118, [%rd68]; mul.f32 %f119, %f118, %f35; sub.f32 %f120, %f117, %f1; sub.f32 %f121, %f116, %f2; sub.f32 %f122, 
%f115, %f3; fma.rn.f32 %f133, %f120, %f119, %f19; fma.rn.f32 %f132, %f121, %f119, %f20; fma.rn.f32 %f131, %f122, %f119, %f21; $L__BB0_22: setp.eq.s64 %p20, %rd10, 0; @%p20 bra $L__BB0_24; cvta.to.global.u64 %rd69, %rd10; shl.b64 %rd70, %rd6, 2; add.s64 %rd71, %rd69, %rd70; ld.global.nc.f32 %f123, [%rd71]; mul.f32 %f134, %f123, %f134; $L__BB0_24: setp.eq.f32 %p21, %f134, 0f00000000; mov.f32 %f135, 0f00000000; @%p21 bra $L__BB0_26; rcp.rn.f32 %f135, %f134; $L__BB0_26: cvta.to.global.u64 %rd72, %rd7; shl.b64 %rd73, %rd6, 2; add.s64 %rd74, %rd72, %rd73; ld.global.f32 %f125, [%rd74]; fma.rn.f32 %f126, %f133, %f135, %f125; st.global.f32 [%rd74], %f126; cvta.to.global.u64 %rd75, %rd8; add.s64 %rd76, %rd75, %rd73; ld.global.f32 %f127, [%rd76]; fma.rn.f32 %f128, %f132, %f135, %f127; st.global.f32 [%rd76], %f128; cvta.to.global.u64 %rd77, %rd9; add.s64 %rd78, %rd77, %rd73; ld.global.f32 %f129, [%rd78]; fma.rn.f32 %f130, %f131, %f135, %f129; st.global.f32 [%rd78], %f130; $L__BB0_27: ret; } ` addexchange_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<22>; .reg .b16 %rs<36>; .reg .f32 %f<136>; .reg .b32 %r<111>; .reg .b64 %rd<79>; ld.param.u8 %rs5, [addexchange_param_16]; ld.param.u64 %rd7, [addexchange_param_0]; ld.param.u64 %rd8, [addexchange_param_1]; ld.param.u64 %rd9, [addexchange_param_2]; ld.param.u64 %rd11, [addexchange_param_3]; 
ld.param.u64 %rd12, [addexchange_param_4]; ld.param.u64 %rd13, [addexchange_param_5]; ld.param.u64 %rd10, [addexchange_param_6]; ld.param.f32 %f134, [addexchange_param_7]; ld.param.u64 %rd14, [addexchange_param_8]; ld.param.u64 %rd15, [addexchange_param_9]; ld.param.f32 %f33, [addexchange_param_10]; ld.param.f32 %f34, [addexchange_param_11]; ld.param.f32 %f35, [addexchange_param_12]; ld.param.u32 %r30, [addexchange_param_13]; ld.param.u32 %r31, [addexchange_param_14]; ld.param.u32 %r32, [addexchange_param_15]; cvta.to.global.u64 %rd1, %rd14; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd13; cvta.to.global.u64 %rd4, %rd12; cvta.to.global.u64 %rd5, %rd11; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_27; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd16, %r43, 4; add.s64 %rd17, %rd5, %rd16; add.s64 %rd18, %rd4, %rd16; add.s64 %rd19, %rd3, %rd16; ld.global.nc.f32 %f1, [%rd17]; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd19]; mul.f32 %f36, %f2, %f2; fma.rn.f32 %f37, %f1, %f1, %f36; fma.rn.f32 %f38, %f3, %f3, %f37; setp.eq.f32 %p6, %f38, 0f00000000; @%p6 bra $L__BB0_27; add.s64 %rd20, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd20]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_5: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd21, %r46; mul.wide.s32 
%rd22, %r46, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f39, [%rd25]; ld.global.nc.f32 %f40, [%rd23]; ld.global.nc.f32 %f41, [%rd24]; mul.f32 %f42, %f41, %f41; fma.rn.f32 %f43, %f40, %f40, %f42; fma.rn.f32 %f44, %f39, %f39, %f43; setp.eq.f32 %p8, %f44, 0f00000000; selp.f32 %f45, %f3, %f39, %p8; selp.f32 %f46, %f2, %f41, %p8; selp.f32 %f47, %f1, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd27, %r52, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f48, [%rd28]; mul.f32 %f49, %f48, %f33; sub.f32 %f50, %f47, %f1; sub.f32 %f51, %f46, %f2; sub.f32 %f52, %f45, %f3; fma.rn.f32 %f7, %f50, %f49, 0f00000000; fma.rn.f32 %f8, %f51, %f49, 0f00000000; fma.rn.f32 %f9, %f52, %f49, 0f00000000; add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_8: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd29, %r56; mul.wide.s32 %rd30, %r56, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f53, [%rd33]; ld.global.nc.f32 %f54, [%rd31]; ld.global.nc.f32 %f55, [%rd32]; mul.f32 %f56, %f55, %f55; fma.rn.f32 %f57, %f54, %f54, %f56; fma.rn.f32 %f58, %f53, %f53, %f57; setp.eq.f32 %p10, %f58, 0f00000000; selp.f32 %f59, %f3, %f53, %p10; selp.f32 %f60, %f2, %f55, %p10; selp.f32 %f61, %f1, %f54, %p10; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs11, [%rd34]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; 
mul.wide.s32 %rd35, %r62, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f62, [%rd36]; mul.f32 %f63, %f62, %f33; sub.f32 %f64, %f61, %f1; sub.f32 %f65, %f60, %f2; sub.f32 %f66, %f59, %f3; fma.rn.f32 %f10, %f64, %f63, %f7; fma.rn.f32 %f11, %f65, %f63, %f8; fma.rn.f32 %f12, %f66, %f63, %f9; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p11, %rs3, 0; add.s32 %r14, %r2, -1; @%p11 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r107, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_11: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd37, %r66; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f67, [%rd41]; ld.global.nc.f32 %f68, [%rd39]; ld.global.nc.f32 %f69, [%rd40]; mul.f32 %f70, %f69, %f69; fma.rn.f32 %f71, %f68, %f68, %f70; fma.rn.f32 %f72, %f67, %f67, %f71; setp.eq.f32 %p12, %f72, 0f00000000; selp.f32 %f73, %f3, %f67, %p12; selp.f32 %f74, %f2, %f69, %p12; selp.f32 %f75, %f1, %f68, %p12; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs16, [%rd42]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd43, %r72, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f76, [%rd44]; mul.f32 %f77, %f76, %f34; sub.f32 %f78, %f75, %f1; sub.f32 %f79, %f74, %f2; sub.f32 %f80, %f73, %f3; fma.rn.f32 %f13, %f78, %f77, %f10; fma.rn.f32 %f14, %f79, %f77, %f11; fma.rn.f32 %f15, %f80, %f77, %f12; add.s32 %r18, %r2, 1; @%p11 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_14: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd45, %r77; mul.wide.s32 
%rd46, %r77, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f81, [%rd49]; ld.global.nc.f32 %f82, [%rd47]; ld.global.nc.f32 %f83, [%rd48]; mul.f32 %f84, %f83, %f83; fma.rn.f32 %f85, %f82, %f82, %f84; fma.rn.f32 %f86, %f81, %f81, %f85; setp.eq.f32 %p14, %f86, 0f00000000; selp.f32 %f87, %f3, %f81, %p14; selp.f32 %f88, %f2, %f83, %p14; selp.f32 %f89, %f1, %f82, %p14; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs21, [%rd50]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd51, %r83, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f90, [%rd52]; mul.f32 %f91, %f90, %f34; sub.f32 %f92, %f89, %f1; sub.f32 %f93, %f88, %f2; sub.f32 %f94, %f87, %f3; fma.rn.f32 %f133, %f92, %f91, %f13; fma.rn.f32 %f132, %f93, %f91, %f14; fma.rn.f32 %f131, %f94, %f91, %f15; setp.eq.s32 %p15, %r32, 1; @%p15 bra $L__BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p16, %rs4, 0; add.s32 %r22, %r3, -1; @%p16 bra $L__BB0_17; bra.uni $L__BB0_16; $L__BB0_17: max.s32 %r109, %r22, 0; bra.uni $L__BB0_18; $L__BB0_16: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_18: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd53, %r87; mul.wide.s32 %rd54, %r87, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f95, [%rd57]; ld.global.nc.f32 %f96, [%rd55]; ld.global.nc.f32 %f97, [%rd56]; mul.f32 %f98, %f97, %f97; fma.rn.f32 %f99, %f96, %f96, %f98; fma.rn.f32 %f100, %f95, %f95, %f99; setp.eq.f32 %p17, %f100, 0f00000000; selp.f32 %f101, %f3, %f95, %p17; selp.f32 %f102, %f2, %f97, %p17; selp.f32 %f103, %f1, %f96, %p17; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs26, [%rd58]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 
%r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd59, %r93, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f104, [%rd60]; mul.f32 %f105, %f104, %f35; sub.f32 %f106, %f103, %f1; sub.f32 %f107, %f102, %f2; sub.f32 %f108, %f101, %f3; fma.rn.f32 %f19, %f106, %f105, %f133; fma.rn.f32 %f20, %f107, %f105, %f132; fma.rn.f32 %f21, %f108, %f105, %f131; add.s32 %r26, %r3, 1; @%p16 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_21: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd61, %r98; mul.wide.s32 %rd62, %r98, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f109, [%rd65]; ld.global.nc.f32 %f110, [%rd63]; ld.global.nc.f32 %f111, [%rd64]; mul.f32 %f112, %f111, %f111; fma.rn.f32 %f113, %f110, %f110, %f112; fma.rn.f32 %f114, %f109, %f109, %f113; setp.eq.f32 %p19, %f114, 0f00000000; selp.f32 %f115, %f3, %f109, %p19; selp.f32 %f116, %f2, %f111, %p19; selp.f32 %f117, %f1, %f110, %p19; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs31, [%rd66]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd67, %r104, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f118, [%rd68]; mul.f32 %f119, %f118, %f35; sub.f32 %f120, %f117, %f1; sub.f32 %f121, %f116, %f2; sub.f32 %f122, %f115, %f3; fma.rn.f32 %f133, %f120, %f119, %f19; fma.rn.f32 %f132, %f121, %f119, %f20; fma.rn.f32 %f131, %f122, %f119, %f21; $L__BB0_22: setp.eq.s64 %p20, %rd10, 0; @%p20 bra $L__BB0_24; cvta.to.global.u64 %rd69, %rd10; shl.b64 %rd70, %rd6, 2; add.s64 %rd71, %rd69, %rd70; ld.global.nc.f32 %f123, 
[%rd71]; mul.f32 %f134, %f123, %f134; $L__BB0_24: setp.eq.f32 %p21, %f134, 0f00000000; mov.f32 %f135, 0f00000000; @%p21 bra $L__BB0_26; rcp.rn.f32 %f135, %f134; $L__BB0_26: cvta.to.global.u64 %rd72, %rd7; shl.b64 %rd73, %rd6, 2; add.s64 %rd74, %rd72, %rd73; ld.global.f32 %f125, [%rd74]; fma.rn.f32 %f126, %f133, %f135, %f125; st.global.f32 [%rd74], %f126; cvta.to.global.u64 %rd75, %rd8; add.s64 %rd76, %rd75, %rd73; ld.global.f32 %f127, [%rd76]; fma.rn.f32 %f128, %f132, %f135, %f127; st.global.f32 [%rd76], %f128; cvta.to.global.u64 %rd77, %rd9; add.s64 %rd78, %rd77, %rd73; ld.global.f32 %f129, [%rd78]; fma.rn.f32 %f130, %f131, %f135, %f129; st.global.f32 [%rd78], %f130; $L__BB0_27: ret; } ` addexchange_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<22>; .reg .b16 %rs<36>; .reg .f32 %f<136>; .reg .b32 %r<111>; .reg .b64 %rd<79>; ld.param.u8 %rs5, [addexchange_param_16]; ld.param.u64 %rd7, [addexchange_param_0]; ld.param.u64 %rd8, [addexchange_param_1]; ld.param.u64 %rd9, [addexchange_param_2]; ld.param.u64 %rd11, [addexchange_param_3]; ld.param.u64 %rd12, [addexchange_param_4]; ld.param.u64 %rd13, [addexchange_param_5]; ld.param.u64 %rd10, [addexchange_param_6]; ld.param.f32 %f134, [addexchange_param_7]; ld.param.u64 %rd14, [addexchange_param_8]; ld.param.u64 %rd15, [addexchange_param_9]; ld.param.f32 %f33, [addexchange_param_10]; 
ld.param.f32 %f34, [addexchange_param_11]; ld.param.f32 %f35, [addexchange_param_12]; ld.param.u32 %r30, [addexchange_param_13]; ld.param.u32 %r31, [addexchange_param_14]; ld.param.u32 %r32, [addexchange_param_15]; cvta.to.global.u64 %rd1, %rd14; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd13; cvta.to.global.u64 %rd4, %rd12; cvta.to.global.u64 %rd5, %rd11; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_27; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd16, %r43, 4; add.s64 %rd17, %rd5, %rd16; add.s64 %rd18, %rd4, %rd16; add.s64 %rd19, %rd3, %rd16; ld.global.nc.f32 %f1, [%rd17]; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd19]; mul.f32 %f36, %f2, %f2; fma.rn.f32 %f37, %f1, %f1, %f36; fma.rn.f32 %f38, %f3, %f3, %f37; setp.eq.f32 %p6, %f38, 0f00000000; @%p6 bra $L__BB0_27; add.s64 %rd20, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd20]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_5: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd21, %r46; mul.wide.s32 %rd22, %r46, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f39, [%rd25]; ld.global.nc.f32 %f40, [%rd23]; ld.global.nc.f32 %f41, [%rd24]; mul.f32 %f42, %f41, %f41; fma.rn.f32 %f43, %f40, %f40, %f42; fma.rn.f32 %f44, %f39, %f39, %f43; 
setp.eq.f32 %p8, %f44, 0f00000000; selp.f32 %f45, %f3, %f39, %p8; selp.f32 %f46, %f2, %f41, %p8; selp.f32 %f47, %f1, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd27, %r52, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f48, [%rd28]; mul.f32 %f49, %f48, %f33; sub.f32 %f50, %f47, %f1; sub.f32 %f51, %f46, %f2; sub.f32 %f52, %f45, %f3; fma.rn.f32 %f7, %f50, %f49, 0f00000000; fma.rn.f32 %f8, %f51, %f49, 0f00000000; fma.rn.f32 %f9, %f52, %f49, 0f00000000; add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_8: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd29, %r56; mul.wide.s32 %rd30, %r56, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f53, [%rd33]; ld.global.nc.f32 %f54, [%rd31]; ld.global.nc.f32 %f55, [%rd32]; mul.f32 %f56, %f55, %f55; fma.rn.f32 %f57, %f54, %f54, %f56; fma.rn.f32 %f58, %f53, %f53, %f57; setp.eq.f32 %p10, %f58, 0f00000000; selp.f32 %f59, %f3, %f53, %p10; selp.f32 %f60, %f2, %f55, %p10; selp.f32 %f61, %f1, %f54, %p10; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs11, [%rd34]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd35, %r62, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f62, [%rd36]; mul.f32 %f63, %f62, %f33; sub.f32 %f64, %f61, %f1; sub.f32 %f65, %f60, %f2; sub.f32 %f66, %f59, %f3; fma.rn.f32 %f10, %f64, %f63, %f7; fma.rn.f32 %f11, %f65, %f63, %f8; fma.rn.f32 %f12, %f66, %f63, %f9; and.b16 
%rs3, %rs5, 2; setp.eq.s16 %p11, %rs3, 0; add.s32 %r14, %r2, -1; @%p11 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r107, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_11: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd37, %r66; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f67, [%rd41]; ld.global.nc.f32 %f68, [%rd39]; ld.global.nc.f32 %f69, [%rd40]; mul.f32 %f70, %f69, %f69; fma.rn.f32 %f71, %f68, %f68, %f70; fma.rn.f32 %f72, %f67, %f67, %f71; setp.eq.f32 %p12, %f72, 0f00000000; selp.f32 %f73, %f3, %f67, %p12; selp.f32 %f74, %f2, %f69, %p12; selp.f32 %f75, %f1, %f68, %p12; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs16, [%rd42]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd43, %r72, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f76, [%rd44]; mul.f32 %f77, %f76, %f34; sub.f32 %f78, %f75, %f1; sub.f32 %f79, %f74, %f2; sub.f32 %f80, %f73, %f3; fma.rn.f32 %f13, %f78, %f77, %f10; fma.rn.f32 %f14, %f79, %f77, %f11; fma.rn.f32 %f15, %f80, %f77, %f12; add.s32 %r18, %r2, 1; @%p11 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_14: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd45, %r77; mul.wide.s32 %rd46, %r77, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f81, [%rd49]; ld.global.nc.f32 %f82, [%rd47]; ld.global.nc.f32 %f83, [%rd48]; mul.f32 %f84, %f83, %f83; fma.rn.f32 %f85, %f82, %f82, %f84; fma.rn.f32 %f86, %f81, %f81, %f85; setp.eq.f32 
%p14, %f86, 0f00000000; selp.f32 %f87, %f3, %f81, %p14; selp.f32 %f88, %f2, %f83, %p14; selp.f32 %f89, %f1, %f82, %p14; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs21, [%rd50]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd51, %r83, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f90, [%rd52]; mul.f32 %f91, %f90, %f34; sub.f32 %f92, %f89, %f1; sub.f32 %f93, %f88, %f2; sub.f32 %f94, %f87, %f3; fma.rn.f32 %f133, %f92, %f91, %f13; fma.rn.f32 %f132, %f93, %f91, %f14; fma.rn.f32 %f131, %f94, %f91, %f15; setp.eq.s32 %p15, %r32, 1; @%p15 bra $L__BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p16, %rs4, 0; add.s32 %r22, %r3, -1; @%p16 bra $L__BB0_17; bra.uni $L__BB0_16; $L__BB0_17: max.s32 %r109, %r22, 0; bra.uni $L__BB0_18; $L__BB0_16: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_18: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd53, %r87; mul.wide.s32 %rd54, %r87, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f95, [%rd57]; ld.global.nc.f32 %f96, [%rd55]; ld.global.nc.f32 %f97, [%rd56]; mul.f32 %f98, %f97, %f97; fma.rn.f32 %f99, %f96, %f96, %f98; fma.rn.f32 %f100, %f95, %f95, %f99; setp.eq.f32 %p17, %f100, 0f00000000; selp.f32 %f101, %f3, %f95, %p17; selp.f32 %f102, %f2, %f97, %p17; selp.f32 %f103, %f1, %f96, %p17; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs26, [%rd58]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd59, %r93, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f104, [%rd60]; mul.f32 %f105, %f104, %f35; sub.f32 %f106, %f103, %f1; sub.f32 %f107, %f102, %f2; sub.f32 %f108, 
%f101, %f3; fma.rn.f32 %f19, %f106, %f105, %f133; fma.rn.f32 %f20, %f107, %f105, %f132; fma.rn.f32 %f21, %f108, %f105, %f131; add.s32 %r26, %r3, 1; @%p16 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_21: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd61, %r98; mul.wide.s32 %rd62, %r98, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f109, [%rd65]; ld.global.nc.f32 %f110, [%rd63]; ld.global.nc.f32 %f111, [%rd64]; mul.f32 %f112, %f111, %f111; fma.rn.f32 %f113, %f110, %f110, %f112; fma.rn.f32 %f114, %f109, %f109, %f113; setp.eq.f32 %p19, %f114, 0f00000000; selp.f32 %f115, %f3, %f109, %p19; selp.f32 %f116, %f2, %f111, %p19; selp.f32 %f117, %f1, %f110, %p19; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs31, [%rd66]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd67, %r104, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f118, [%rd68]; mul.f32 %f119, %f118, %f35; sub.f32 %f120, %f117, %f1; sub.f32 %f121, %f116, %f2; sub.f32 %f122, %f115, %f3; fma.rn.f32 %f133, %f120, %f119, %f19; fma.rn.f32 %f132, %f121, %f119, %f20; fma.rn.f32 %f131, %f122, %f119, %f21; $L__BB0_22: setp.eq.s64 %p20, %rd10, 0; @%p20 bra $L__BB0_24; cvta.to.global.u64 %rd69, %rd10; shl.b64 %rd70, %rd6, 2; add.s64 %rd71, %rd69, %rd70; ld.global.nc.f32 %f123, [%rd71]; mul.f32 %f134, %f123, %f134; $L__BB0_24: setp.eq.f32 %p21, %f134, 0f00000000; mov.f32 %f135, 0f00000000; @%p21 bra $L__BB0_26; rcp.rn.f32 %f135, %f134; $L__BB0_26: cvta.to.global.u64 %rd72, %rd7; shl.b64 %rd73, %rd6, 2; add.s64 %rd74, %rd72, %rd73; ld.global.f32 %f125, [%rd74]; fma.rn.f32 %f126, 
%f133, %f135, %f125; st.global.f32 [%rd74], %f126; cvta.to.global.u64 %rd75, %rd8; add.s64 %rd76, %rd75, %rd73; ld.global.f32 %f127, [%rd76]; fma.rn.f32 %f128, %f132, %f135, %f127; st.global.f32 [%rd76], %f128; cvta.to.global.u64 %rd77, %rd9; add.s64 %rd78, %rd77, %rd73; ld.global.f32 %f129, [%rd78]; fma.rn.f32 %f130, %f131, %f135, %f129; st.global.f32 [%rd78], %f130; $L__BB0_27: ret; } ` addexchange_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<22>; .reg .b16 %rs<36>; .reg .f32 %f<136>; .reg .b32 %r<111>; .reg .b64 %rd<79>; ld.param.u8 %rs5, [addexchange_param_16]; ld.param.u64 %rd7, [addexchange_param_0]; ld.param.u64 %rd8, [addexchange_param_1]; ld.param.u64 %rd9, [addexchange_param_2]; ld.param.u64 %rd11, [addexchange_param_3]; ld.param.u64 %rd12, [addexchange_param_4]; ld.param.u64 %rd13, [addexchange_param_5]; ld.param.u64 %rd10, [addexchange_param_6]; ld.param.f32 %f134, [addexchange_param_7]; ld.param.u64 %rd14, [addexchange_param_8]; ld.param.u64 %rd15, [addexchange_param_9]; ld.param.f32 %f33, [addexchange_param_10]; ld.param.f32 %f34, [addexchange_param_11]; ld.param.f32 %f35, [addexchange_param_12]; ld.param.u32 %r30, [addexchange_param_13]; ld.param.u32 %r31, [addexchange_param_14]; ld.param.u32 %r32, [addexchange_param_15]; cvta.to.global.u64 %rd1, %rd14; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd13; 
cvta.to.global.u64 %rd4, %rd12; cvta.to.global.u64 %rd5, %rd11; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_27; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd16, %r43, 4; add.s64 %rd17, %rd5, %rd16; add.s64 %rd18, %rd4, %rd16; add.s64 %rd19, %rd3, %rd16; ld.global.nc.f32 %f1, [%rd17]; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd19]; mul.f32 %f36, %f2, %f2; fma.rn.f32 %f37, %f1, %f1, %f36; fma.rn.f32 %f38, %f3, %f3, %f37; setp.eq.f32 %p6, %f38, 0f00000000; @%p6 bra $L__BB0_27; add.s64 %rd20, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd20]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_5: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd21, %r46; mul.wide.s32 %rd22, %r46, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f39, [%rd25]; ld.global.nc.f32 %f40, [%rd23]; ld.global.nc.f32 %f41, [%rd24]; mul.f32 %f42, %f41, %f41; fma.rn.f32 %f43, %f40, %f40, %f42; fma.rn.f32 %f44, %f39, %f39, %f43; setp.eq.f32 %p8, %f44, 0f00000000; selp.f32 %f45, %f3, %f39, %p8; selp.f32 %f46, %f2, %f41, %p8; selp.f32 %f47, %f1, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; 
shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd27, %r52, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f48, [%rd28]; mul.f32 %f49, %f48, %f33; sub.f32 %f50, %f47, %f1; sub.f32 %f51, %f46, %f2; sub.f32 %f52, %f45, %f3; fma.rn.f32 %f7, %f50, %f49, 0f00000000; fma.rn.f32 %f8, %f51, %f49, 0f00000000; fma.rn.f32 %f9, %f52, %f49, 0f00000000; add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_8: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd29, %r56; mul.wide.s32 %rd30, %r56, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f53, [%rd33]; ld.global.nc.f32 %f54, [%rd31]; ld.global.nc.f32 %f55, [%rd32]; mul.f32 %f56, %f55, %f55; fma.rn.f32 %f57, %f54, %f54, %f56; fma.rn.f32 %f58, %f53, %f53, %f57; setp.eq.f32 %p10, %f58, 0f00000000; selp.f32 %f59, %f3, %f53, %p10; selp.f32 %f60, %f2, %f55, %p10; selp.f32 %f61, %f1, %f54, %p10; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs11, [%rd34]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd35, %r62, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f62, [%rd36]; mul.f32 %f63, %f62, %f33; sub.f32 %f64, %f61, %f1; sub.f32 %f65, %f60, %f2; sub.f32 %f66, %f59, %f3; fma.rn.f32 %f10, %f64, %f63, %f7; fma.rn.f32 %f11, %f65, %f63, %f8; fma.rn.f32 %f12, %f66, %f63, %f9; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p11, %rs3, 0; add.s32 %r14, %r2, -1; @%p11 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r107, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_11: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, 
%r1; cvt.s64.s32 %rd37, %r66; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f67, [%rd41]; ld.global.nc.f32 %f68, [%rd39]; ld.global.nc.f32 %f69, [%rd40]; mul.f32 %f70, %f69, %f69; fma.rn.f32 %f71, %f68, %f68, %f70; fma.rn.f32 %f72, %f67, %f67, %f71; setp.eq.f32 %p12, %f72, 0f00000000; selp.f32 %f73, %f3, %f67, %p12; selp.f32 %f74, %f2, %f69, %p12; selp.f32 %f75, %f1, %f68, %p12; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs16, [%rd42]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd43, %r72, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f76, [%rd44]; mul.f32 %f77, %f76, %f34; sub.f32 %f78, %f75, %f1; sub.f32 %f79, %f74, %f2; sub.f32 %f80, %f73, %f3; fma.rn.f32 %f13, %f78, %f77, %f10; fma.rn.f32 %f14, %f79, %f77, %f11; fma.rn.f32 %f15, %f80, %f77, %f12; add.s32 %r18, %r2, 1; @%p11 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_14: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd45, %r77; mul.wide.s32 %rd46, %r77, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f81, [%rd49]; ld.global.nc.f32 %f82, [%rd47]; ld.global.nc.f32 %f83, [%rd48]; mul.f32 %f84, %f83, %f83; fma.rn.f32 %f85, %f82, %f82, %f84; fma.rn.f32 %f86, %f81, %f81, %f85; setp.eq.f32 %p14, %f86, 0f00000000; selp.f32 %f87, %f3, %f81, %p14; selp.f32 %f88, %f2, %f83, %p14; selp.f32 %f89, %f1, %f82, %p14; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs21, [%rd50]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; 
shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd51, %r83, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f90, [%rd52]; mul.f32 %f91, %f90, %f34; sub.f32 %f92, %f89, %f1; sub.f32 %f93, %f88, %f2; sub.f32 %f94, %f87, %f3; fma.rn.f32 %f133, %f92, %f91, %f13; fma.rn.f32 %f132, %f93, %f91, %f14; fma.rn.f32 %f131, %f94, %f91, %f15; setp.eq.s32 %p15, %r32, 1; @%p15 bra $L__BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p16, %rs4, 0; add.s32 %r22, %r3, -1; @%p16 bra $L__BB0_17; bra.uni $L__BB0_16; $L__BB0_17: max.s32 %r109, %r22, 0; bra.uni $L__BB0_18; $L__BB0_16: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_18: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd53, %r87; mul.wide.s32 %rd54, %r87, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f95, [%rd57]; ld.global.nc.f32 %f96, [%rd55]; ld.global.nc.f32 %f97, [%rd56]; mul.f32 %f98, %f97, %f97; fma.rn.f32 %f99, %f96, %f96, %f98; fma.rn.f32 %f100, %f95, %f95, %f99; setp.eq.f32 %p17, %f100, 0f00000000; selp.f32 %f101, %f3, %f95, %p17; selp.f32 %f102, %f2, %f97, %p17; selp.f32 %f103, %f1, %f96, %p17; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs26, [%rd58]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd59, %r93, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f104, [%rd60]; mul.f32 %f105, %f104, %f35; sub.f32 %f106, %f103, %f1; sub.f32 %f107, %f102, %f2; sub.f32 %f108, %f101, %f3; fma.rn.f32 %f19, %f106, %f105, %f133; fma.rn.f32 %f20, %f107, %f105, %f132; fma.rn.f32 %f21, %f108, %f105, %f131; add.s32 %r26, %r3, 1; @%p16 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r94, %r26, %r32; 
add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_21: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd61, %r98; mul.wide.s32 %rd62, %r98, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f109, [%rd65]; ld.global.nc.f32 %f110, [%rd63]; ld.global.nc.f32 %f111, [%rd64]; mul.f32 %f112, %f111, %f111; fma.rn.f32 %f113, %f110, %f110, %f112; fma.rn.f32 %f114, %f109, %f109, %f113; setp.eq.f32 %p19, %f114, 0f00000000; selp.f32 %f115, %f3, %f109, %p19; selp.f32 %f116, %f2, %f111, %p19; selp.f32 %f117, %f1, %f110, %p19; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs31, [%rd66]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd67, %r104, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f118, [%rd68]; mul.f32 %f119, %f118, %f35; sub.f32 %f120, %f117, %f1; sub.f32 %f121, %f116, %f2; sub.f32 %f122, %f115, %f3; fma.rn.f32 %f133, %f120, %f119, %f19; fma.rn.f32 %f132, %f121, %f119, %f20; fma.rn.f32 %f131, %f122, %f119, %f21; $L__BB0_22: setp.eq.s64 %p20, %rd10, 0; @%p20 bra $L__BB0_24; cvta.to.global.u64 %rd69, %rd10; shl.b64 %rd70, %rd6, 2; add.s64 %rd71, %rd69, %rd70; ld.global.nc.f32 %f123, [%rd71]; mul.f32 %f134, %f123, %f134; $L__BB0_24: setp.eq.f32 %p21, %f134, 0f00000000; mov.f32 %f135, 0f00000000; @%p21 bra $L__BB0_26; rcp.rn.f32 %f135, %f134; $L__BB0_26: cvta.to.global.u64 %rd72, %rd7; shl.b64 %rd73, %rd6, 2; add.s64 %rd74, %rd72, %rd73; ld.global.f32 %f125, [%rd74]; fma.rn.f32 %f126, %f133, %f135, %f125; st.global.f32 [%rd74], %f126; cvta.to.global.u64 %rd75, %rd8; add.s64 %rd76, %rd75, %rd73; ld.global.f32 %f127, [%rd76]; fma.rn.f32 %f128, %f132, %f135, %f127; st.global.f32 [%rd76], %f128; cvta.to.global.u64 %rd77, %rd9; add.s64 %rd78, %rd77, %rd73; ld.global.f32 %f129, [%rd78]; fma.rn.f32 
%f130, %f131, %f135, %f129; st.global.f32 [%rd78], %f130; $L__BB0_27: ret; } ` addexchange_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<22>; .reg .b16 %rs<36>; .reg .f32 %f<136>; .reg .b32 %r<111>; .reg .b64 %rd<79>; ld.param.u8 %rs5, [addexchange_param_16]; ld.param.u64 %rd7, [addexchange_param_0]; ld.param.u64 %rd8, [addexchange_param_1]; ld.param.u64 %rd9, [addexchange_param_2]; ld.param.u64 %rd11, [addexchange_param_3]; ld.param.u64 %rd12, [addexchange_param_4]; ld.param.u64 %rd13, [addexchange_param_5]; ld.param.u64 %rd10, [addexchange_param_6]; ld.param.f32 %f134, [addexchange_param_7]; ld.param.u64 %rd14, [addexchange_param_8]; ld.param.u64 %rd15, [addexchange_param_9]; ld.param.f32 %f33, [addexchange_param_10]; ld.param.f32 %f34, [addexchange_param_11]; ld.param.f32 %f35, [addexchange_param_12]; ld.param.u32 %r30, [addexchange_param_13]; ld.param.u32 %r31, [addexchange_param_14]; ld.param.u32 %r32, [addexchange_param_15]; cvta.to.global.u64 %rd1, %rd14; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd13; cvta.to.global.u64 %rd4, %rd12; cvta.to.global.u64 %rd5, %rd11; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, 
%ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_27; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd16, %r43, 4; add.s64 %rd17, %rd5, %rd16; add.s64 %rd18, %rd4, %rd16; add.s64 %rd19, %rd3, %rd16; ld.global.nc.f32 %f1, [%rd17]; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd19]; mul.f32 %f36, %f2, %f2; fma.rn.f32 %f37, %f1, %f1, %f36; fma.rn.f32 %f38, %f3, %f3, %f37; setp.eq.f32 %p6, %f38, 0f00000000; @%p6 bra $L__BB0_27; add.s64 %rd20, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd20]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r105, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_5: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd21, %r46; mul.wide.s32 %rd22, %r46, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f39, [%rd25]; ld.global.nc.f32 %f40, [%rd23]; ld.global.nc.f32 %f41, [%rd24]; mul.f32 %f42, %f41, %f41; fma.rn.f32 %f43, %f40, %f40, %f42; fma.rn.f32 %f44, %f39, %f39, %f43; setp.eq.f32 %p8, %f44, 0f00000000; selp.f32 %f45, %f3, %f39, %p8; selp.f32 %f46, %f2, %f41, %p8; selp.f32 %f47, %f1, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd27, %r52, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f48, [%rd28]; mul.f32 %f49, %f48, %f33; sub.f32 %f50, %f47, %f1; sub.f32 %f51, %f46, %f2; sub.f32 %f52, %f45, %f3; fma.rn.f32 %f7, %f50, %f49, 0f00000000; 
fma.rn.f32 %f8, %f51, %f49, 0f00000000; fma.rn.f32 %f9, %f52, %f49, 0f00000000; add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_8: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd29, %r56; mul.wide.s32 %rd30, %r56, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f53, [%rd33]; ld.global.nc.f32 %f54, [%rd31]; ld.global.nc.f32 %f55, [%rd32]; mul.f32 %f56, %f55, %f55; fma.rn.f32 %f57, %f54, %f54, %f56; fma.rn.f32 %f58, %f53, %f53, %f57; setp.eq.f32 %p10, %f58, 0f00000000; selp.f32 %f59, %f3, %f53, %p10; selp.f32 %f60, %f2, %f55, %p10; selp.f32 %f61, %f1, %f54, %p10; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs11, [%rd34]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd35, %r62, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f62, [%rd36]; mul.f32 %f63, %f62, %f33; sub.f32 %f64, %f61, %f1; sub.f32 %f65, %f60, %f2; sub.f32 %f66, %f59, %f3; fma.rn.f32 %f10, %f64, %f63, %f7; fma.rn.f32 %f11, %f65, %f63, %f8; fma.rn.f32 %f12, %f66, %f63, %f9; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p11, %rs3, 0; add.s32 %r14, %r2, -1; @%p11 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r107, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_11: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd37, %r66; mul.wide.s32 %rd38, %r66, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f67, [%rd41]; ld.global.nc.f32 %f68, [%rd39]; ld.global.nc.f32 %f69, [%rd40]; mul.f32 %f70, %f69, %f69; fma.rn.f32 %f71, %f68, %f68, %f70; 
fma.rn.f32 %f72, %f67, %f67, %f71; setp.eq.f32 %p12, %f72, 0f00000000; selp.f32 %f73, %f3, %f67, %p12; selp.f32 %f74, %f2, %f69, %p12; selp.f32 %f75, %f1, %f68, %p12; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs16, [%rd42]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd43, %r72, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f76, [%rd44]; mul.f32 %f77, %f76, %f34; sub.f32 %f78, %f75, %f1; sub.f32 %f79, %f74, %f2; sub.f32 %f80, %f73, %f3; fma.rn.f32 %f13, %f78, %f77, %f10; fma.rn.f32 %f14, %f79, %f77, %f11; fma.rn.f32 %f15, %f80, %f77, %f12; add.s32 %r18, %r2, 1; @%p11 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_14: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd45, %r77; mul.wide.s32 %rd46, %r77, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f81, [%rd49]; ld.global.nc.f32 %f82, [%rd47]; ld.global.nc.f32 %f83, [%rd48]; mul.f32 %f84, %f83, %f83; fma.rn.f32 %f85, %f82, %f82, %f84; fma.rn.f32 %f86, %f81, %f81, %f85; setp.eq.f32 %p14, %f86, 0f00000000; selp.f32 %f87, %f3, %f81, %p14; selp.f32 %f88, %f2, %f83, %p14; selp.f32 %f89, %f1, %f82, %p14; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs21, [%rd50]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd51, %r83, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f90, [%rd52]; mul.f32 %f91, %f90, %f34; sub.f32 %f92, %f89, %f1; sub.f32 %f93, %f88, %f2; sub.f32 %f94, %f87, %f3; fma.rn.f32 %f133, %f92, %f91, %f13; 
fma.rn.f32 %f132, %f93, %f91, %f14; fma.rn.f32 %f131, %f94, %f91, %f15; setp.eq.s32 %p15, %r32, 1; @%p15 bra $L__BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p16, %rs4, 0; add.s32 %r22, %r3, -1; @%p16 bra $L__BB0_17; bra.uni $L__BB0_16; $L__BB0_17: max.s32 %r109, %r22, 0; bra.uni $L__BB0_18; $L__BB0_16: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_18: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd53, %r87; mul.wide.s32 %rd54, %r87, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f95, [%rd57]; ld.global.nc.f32 %f96, [%rd55]; ld.global.nc.f32 %f97, [%rd56]; mul.f32 %f98, %f97, %f97; fma.rn.f32 %f99, %f96, %f96, %f98; fma.rn.f32 %f100, %f95, %f95, %f99; setp.eq.f32 %p17, %f100, 0f00000000; selp.f32 %f101, %f3, %f95, %p17; selp.f32 %f102, %f2, %f97, %p17; selp.f32 %f103, %f1, %f96, %p17; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs26, [%rd58]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd59, %r93, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f104, [%rd60]; mul.f32 %f105, %f104, %f35; sub.f32 %f106, %f103, %f1; sub.f32 %f107, %f102, %f2; sub.f32 %f108, %f101, %f3; fma.rn.f32 %f19, %f106, %f105, %f133; fma.rn.f32 %f20, %f107, %f105, %f132; fma.rn.f32 %f21, %f108, %f105, %f131; add.s32 %r26, %r3, 1; @%p16 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_21: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd61, %r98; mul.wide.s32 %rd62, %r98, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f109, 
[%rd65]; ld.global.nc.f32 %f110, [%rd63]; ld.global.nc.f32 %f111, [%rd64]; mul.f32 %f112, %f111, %f111; fma.rn.f32 %f113, %f110, %f110, %f112; fma.rn.f32 %f114, %f109, %f109, %f113; setp.eq.f32 %p19, %f114, 0f00000000; selp.f32 %f115, %f3, %f109, %p19; selp.f32 %f116, %f2, %f111, %p19; selp.f32 %f117, %f1, %f110, %p19; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs31, [%rd66]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd67, %r104, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f118, [%rd68]; mul.f32 %f119, %f118, %f35; sub.f32 %f120, %f117, %f1; sub.f32 %f121, %f116, %f2; sub.f32 %f122, %f115, %f3; fma.rn.f32 %f133, %f120, %f119, %f19; fma.rn.f32 %f132, %f121, %f119, %f20; fma.rn.f32 %f131, %f122, %f119, %f21; $L__BB0_22: setp.eq.s64 %p20, %rd10, 0; @%p20 bra $L__BB0_24; cvta.to.global.u64 %rd69, %rd10; shl.b64 %rd70, %rd6, 2; add.s64 %rd71, %rd69, %rd70; ld.global.nc.f32 %f123, [%rd71]; mul.f32 %f134, %f123, %f134; $L__BB0_24: setp.eq.f32 %p21, %f134, 0f00000000; mov.f32 %f135, 0f00000000; @%p21 bra $L__BB0_26; rcp.rn.f32 %f135, %f134; $L__BB0_26: cvta.to.global.u64 %rd72, %rd7; shl.b64 %rd73, %rd6, 2; add.s64 %rd74, %rd72, %rd73; ld.global.f32 %f125, [%rd74]; fma.rn.f32 %f126, %f133, %f135, %f125; st.global.f32 [%rd74], %f126; cvta.to.global.u64 %rd75, %rd8; add.s64 %rd76, %rd75, %rd73; ld.global.f32 %f127, [%rd76]; fma.rn.f32 %f128, %f132, %f135, %f127; st.global.f32 [%rd76], %f128; cvta.to.global.u64 %rd77, %rd9; add.s64 %rd78, %rd77, %rd73; ld.global.f32 %f129, [%rd78]; fma.rn.f32 %f130, %f131, %f135, %f129; st.global.f32 [%rd78], %f130; $L__BB0_27: ret; } ` ) 3-3.11.1/cuda/exchangedecode.cu000066400000000000000000000026761503346766200161610ustar00rootroot00000000000000#include #include "stencil.h" #include "float3.h" #include "exchange.h" // see exchange.go extern "C" 
// exchangedecode: one thread per cell. For the cell handled by this thread,
// store in dst the sum of the aLUT2d entries looked up for the pair
// (region of this cell, region of neighbor), taken over the left/right and
// back/front neighbors and, when Nz != 1, the bottom/top neighbors as well.
// Note that the stored value is the plain sum; it is not divided by the
// number of neighbors.
//
//   dst      output, one float per cell
//   aLUT2d   lookup table indexed through symidx(regionA, regionB)
//   regions  region number of every cell
//   wx,wy,wz accepted but not referenced anywhere in this body
//   Nx,Ny,Nz grid size
//   PBC      periodic boundary flags, consumed by the clamp helpers
//
// NOTE(review): idx, symidx and the lclamp*/hclamp* helpers are not defined
// here; presumably they are macros from stencil.h / exchange.h that read
// Nx, Ny, Nz and PBC by name, so those parameters must keep their names.
__global__ void
exchangedecode(float* __restrict__ dst, float* __restrict__ aLUT2d, uint8_t* __restrict__ regions,
               float wx, float wy, float wz, int Nx, int Ny, int Nz, uint8_t PBC) {

    const int ix = blockIdx.x * blockDim.x + threadIdx.x;
    const int iy = blockIdx.y * blockDim.y + threadIdx.y;
    const int iz = blockIdx.z * blockDim.z + threadIdx.z;

    // threads that fall outside the grid have no cell to work on
    if (!(ix < Nx && iy < Ny && iz < Nz)) {
        return;
    }

    // central cell and its region
    const int     cell  = idx(ix, iy, iz);
    const uint8_t rCell = regions[cell];

    // accumulated in the fixed order left, right, back, front, bottom, top
    float sum = 0.0f;

    // x direction; the helpers clamp or wrap the index according to PBC
    const int iLeft = idx(lclampx(ix-1), iy, iz);
    sum += aLUT2d[symidx(rCell, regions[iLeft])];

    const int iRight = idx(hclampx(ix+1), iy, iz);
    sum += aLUT2d[symidx(rCell, regions[iRight])];

    // y direction
    const int iBack = idx(ix, lclampy(iy-1), iz);
    sum += aLUT2d[symidx(rCell, regions[iBack])];

    const int iFront = idx(ix, hclampy(iy+1), iz);
    sum += aLUT2d[symidx(rCell, regions[iFront])];

    // z direction: skipped for a single-layer (2D) simulation
    const bool is3D = (Nz != 1);
    if (is3D) {
        const int iBottom = idx(ix, iy, lclampz(iz-1));
        sum += aLUT2d[symidx(rCell, regions[iBottom])];

        const int iTop = idx(ix, iy, hclampz(iz+1));
        sum += aLUT2d[symidx(rCell, regions[iTop])];
    }

    dst[cell] = sum;
}
3-3.11.1/cuda/exchangedecode_wrapper.go000066400000000000000000002711611503346766200177140ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE.
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for exchangedecode kernel var exchangedecode_code cu.Function // Stores the arguments for exchangedecode kernel invocation type exchangedecode_args_t struct { arg_dst unsafe.Pointer arg_aLUT2d unsafe.Pointer arg_regions unsafe.Pointer arg_wx float32 arg_wy float32 arg_wz float32 arg_Nx int arg_Ny int arg_Nz int arg_PBC byte argptr [10]unsafe.Pointer sync.Mutex } // Stores the arguments for exchangedecode kernel invocation var exchangedecode_args exchangedecode_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. exchangedecode_args.argptr[0] = unsafe.Pointer(&exchangedecode_args.arg_dst) exchangedecode_args.argptr[1] = unsafe.Pointer(&exchangedecode_args.arg_aLUT2d) exchangedecode_args.argptr[2] = unsafe.Pointer(&exchangedecode_args.arg_regions) exchangedecode_args.argptr[3] = unsafe.Pointer(&exchangedecode_args.arg_wx) exchangedecode_args.argptr[4] = unsafe.Pointer(&exchangedecode_args.arg_wy) exchangedecode_args.argptr[5] = unsafe.Pointer(&exchangedecode_args.arg_wz) exchangedecode_args.argptr[6] = unsafe.Pointer(&exchangedecode_args.arg_Nx) exchangedecode_args.argptr[7] = unsafe.Pointer(&exchangedecode_args.arg_Ny) exchangedecode_args.argptr[8] = unsafe.Pointer(&exchangedecode_args.arg_Nz) exchangedecode_args.argptr[9] = unsafe.Pointer(&exchangedecode_args.arg_PBC) } // Wrapper for exchangedecode CUDA kernel, asynchronous. 
func k_exchangedecode_async(dst unsafe.Pointer, aLUT2d unsafe.Pointer, regions unsafe.Pointer, wx float32, wy float32, wz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("exchangedecode") } exchangedecode_args.Lock() defer exchangedecode_args.Unlock() if exchangedecode_code == 0 { exchangedecode_code = fatbinLoad(exchangedecode_map, "exchangedecode") } exchangedecode_args.arg_dst = dst exchangedecode_args.arg_aLUT2d = aLUT2d exchangedecode_args.arg_regions = regions exchangedecode_args.arg_wx = wx exchangedecode_args.arg_wy = wy exchangedecode_args.arg_wz = wz exchangedecode_args.arg_Nx = Nx exchangedecode_args.arg_Ny = Ny exchangedecode_args.arg_Nz = Nz exchangedecode_args.arg_PBC = PBC args := exchangedecode_args.argptr[:] cu.LaunchKernel(exchangedecode_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("exchangedecode") } } // maps compute capability on PTX code for exchangedecode kernel. var exchangedecode_map = map[int]string{0: "", 50: exchangedecode_ptx_50, 52: exchangedecode_ptx_52, 53: exchangedecode_ptx_53, 60: exchangedecode_ptx_60, 61: exchangedecode_ptx_61, 62: exchangedecode_ptx_62, 70: exchangedecode_ptx_70, 72: exchangedecode_ptx_72, 75: exchangedecode_ptx_75, 80: exchangedecode_ptx_80, 86: exchangedecode_ptx_86, 87: exchangedecode_ptx_87, 89: exchangedecode_ptx_89, 90: exchangedecode_ptx_90} // exchangedecode PTX code for various compute capabilities. 
const ( exchangedecode_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<13>; .reg .b16 %rs<36>; .reg .f32 %f<15>; .reg .b32 %r<111>; .reg .b64 %rd<35>; ld.param.u8 %rs5, [exchangedecode_param_9]; ld.param.u64 %rd4, [exchangedecode_param_0]; ld.param.u64 %rd5, [exchangedecode_param_1]; ld.param.u64 %rd6, [exchangedecode_param_2]; ld.param.u32 %r30, [exchangedecode_param_6]; ld.param.u32 %r31, [exchangedecode_param_7]; ld.param.u32 %r32, [exchangedecode_param_8]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd6; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_22; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd3, %r43; add.s64 %rd7, %rd2, %rd3; ld.global.nc.u8 %rs1, [%rd7]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r6, %r1, -1; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r105, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_4: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd8, %r46; add.s64 %rd9, %rd2, %rd8; ld.global.nc.u8 %rs6, [%rd9]; min.u16 %rs9, %rs6, 
%rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd10, %r52, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; add.s32 %r10, %r1, 1; @%p6 bra $L__BB0_6; bra.uni $L__BB0_5; $L__BB0_6: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_7; $L__BB0_5: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_7: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd12, %r56; add.s64 %rd13, %rd2, %rd12; ld.global.nc.u8 %rs11, [%rd13]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd14, %r62, 4; add.s64 %rd15, %rd1, %rd14; ld.global.nc.f32 %f8, [%rd15]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p8, %rs3, 0; add.s32 %r14, %r2, -1; @%p8 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r107, %r14, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_10: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd16, %r66; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs16, [%rd17]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd18, %r72, 4; add.s64 %rd19, %rd1, %rd18; ld.global.nc.f32 %f10, [%rd19]; add.f32 %f3, %f2, %f10; add.s32 %r18, %r2, 1; @%p8 bra $L__BB0_12; bra.uni $L__BB0_11; $L__BB0_12: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_13; $L__BB0_11: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_13: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, 
%r76, %r30, %r1; cvt.s64.s32 %rd20, %r77; add.s64 %rd21, %rd2, %rd20; ld.global.nc.u8 %rs21, [%rd21]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd22, %r83, 4; add.s64 %rd23, %rd1, %rd22; ld.global.nc.f32 %f11, [%rd23]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p10, %r32, 1; @%p10 bra $L__BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p11, %rs4, 0; add.s32 %r22, %r3, -1; @%p11 bra $L__BB0_16; bra.uni $L__BB0_15; $L__BB0_16: max.s32 %r109, %r22, 0; bra.uni $L__BB0_17; $L__BB0_15: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_17: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd24, %r87; add.s64 %rd25, %rd2, %rd24; ld.global.nc.u8 %rs26, [%rd25]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd26, %r93, 4; add.s64 %rd27, %rd1, %rd26; ld.global.nc.f32 %f12, [%rd27]; add.f32 %f5, %f14, %f12; add.s32 %r26, %r3, 1; @%p11 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_20: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd28, %r98; add.s64 %rd29, %rd2, %rd28; ld.global.nc.u8 %rs31, [%rd29]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd30, %r104, 4; add.s64 %rd31, %rd1, %rd30; ld.global.nc.f32 %f13, [%rd31]; add.f32 %f14, %f5, %f13; $L__BB0_21: cvta.to.global.u64 %rd32, %rd4; 
shl.b64 %rd33, %rd3, 2; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f14; $L__BB0_22: ret; } ` exchangedecode_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<13>; .reg .b16 %rs<36>; .reg .f32 %f<15>; .reg .b32 %r<111>; .reg .b64 %rd<35>; ld.param.u8 %rs5, [exchangedecode_param_9]; ld.param.u64 %rd4, [exchangedecode_param_0]; ld.param.u64 %rd5, [exchangedecode_param_1]; ld.param.u64 %rd6, [exchangedecode_param_2]; ld.param.u32 %r30, [exchangedecode_param_6]; ld.param.u32 %r31, [exchangedecode_param_7]; ld.param.u32 %r32, [exchangedecode_param_8]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd6; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_22; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd3, %r43; add.s64 %rd7, %rd2, %rd3; ld.global.nc.u8 %rs1, [%rd7]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r6, %r1, -1; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r105, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_4: add.s32 %r46, %r105, %r5; 
cvt.s64.s32 %rd8, %r46; add.s64 %rd9, %rd2, %rd8; ld.global.nc.u8 %rs6, [%rd9]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd10, %r52, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; add.s32 %r10, %r1, 1; @%p6 bra $L__BB0_6; bra.uni $L__BB0_5; $L__BB0_6: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_7; $L__BB0_5: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_7: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd12, %r56; add.s64 %rd13, %rd2, %rd12; ld.global.nc.u8 %rs11, [%rd13]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd14, %r62, 4; add.s64 %rd15, %rd1, %rd14; ld.global.nc.f32 %f8, [%rd15]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p8, %rs3, 0; add.s32 %r14, %r2, -1; @%p8 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r107, %r14, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_10: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd16, %r66; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs16, [%rd17]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd18, %r72, 4; add.s64 %rd19, %rd1, %rd18; ld.global.nc.f32 %f10, [%rd19]; add.f32 %f3, %f2, %f10; add.s32 %r18, %r2, 1; @%p8 bra $L__BB0_12; bra.uni $L__BB0_11; $L__BB0_12: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_13; $L__BB0_11: rem.s32 %r73, %r18, %r31; add.s32 
%r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_13: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd20, %r77; add.s64 %rd21, %rd2, %rd20; ld.global.nc.u8 %rs21, [%rd21]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd22, %r83, 4; add.s64 %rd23, %rd1, %rd22; ld.global.nc.f32 %f11, [%rd23]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p10, %r32, 1; @%p10 bra $L__BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p11, %rs4, 0; add.s32 %r22, %r3, -1; @%p11 bra $L__BB0_16; bra.uni $L__BB0_15; $L__BB0_16: max.s32 %r109, %r22, 0; bra.uni $L__BB0_17; $L__BB0_15: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_17: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd24, %r87; add.s64 %rd25, %rd2, %rd24; ld.global.nc.u8 %rs26, [%rd25]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd26, %r93, 4; add.s64 %rd27, %rd1, %rd26; ld.global.nc.f32 %f12, [%rd27]; add.f32 %f5, %f14, %f12; add.s32 %r26, %r3, 1; @%p11 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_20: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd28, %r98; add.s64 %rd29, %rd2, %rd28; ld.global.nc.u8 %rs31, [%rd29]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd30, %r104, 4; add.s64 %rd31, %rd1, %rd30; 
ld.global.nc.f32 %f13, [%rd31]; add.f32 %f14, %f5, %f13; $L__BB0_21: cvta.to.global.u64 %rd32, %rd4; shl.b64 %rd33, %rd3, 2; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f14; $L__BB0_22: ret; } ` exchangedecode_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<13>; .reg .b16 %rs<36>; .reg .f32 %f<15>; .reg .b32 %r<111>; .reg .b64 %rd<35>; ld.param.u8 %rs5, [exchangedecode_param_9]; ld.param.u64 %rd4, [exchangedecode_param_0]; ld.param.u64 %rd5, [exchangedecode_param_1]; ld.param.u64 %rd6, [exchangedecode_param_2]; ld.param.u32 %r30, [exchangedecode_param_6]; ld.param.u32 %r31, [exchangedecode_param_7]; ld.param.u32 %r32, [exchangedecode_param_8]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd6; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_22; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd3, %r43; add.s64 %rd7, %rd2, %rd3; ld.global.nc.u8 %rs1, [%rd7]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r6, %r1, -1; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r105, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r44, %r6, 
%r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_4: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd8, %r46; add.s64 %rd9, %rd2, %rd8; ld.global.nc.u8 %rs6, [%rd9]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd10, %r52, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; add.s32 %r10, %r1, 1; @%p6 bra $L__BB0_6; bra.uni $L__BB0_5; $L__BB0_6: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_7; $L__BB0_5: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_7: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd12, %r56; add.s64 %rd13, %rd2, %rd12; ld.global.nc.u8 %rs11, [%rd13]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd14, %r62, 4; add.s64 %rd15, %rd1, %rd14; ld.global.nc.f32 %f8, [%rd15]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p8, %rs3, 0; add.s32 %r14, %r2, -1; @%p8 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r107, %r14, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_10: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd16, %r66; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs16, [%rd17]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd18, %r72, 4; add.s64 %rd19, %rd1, %rd18; ld.global.nc.f32 %f10, [%rd19]; add.f32 %f3, %f2, %f10; add.s32 %r18, %r2, 1; @%p8 bra $L__BB0_12; bra.uni $L__BB0_11; $L__BB0_12: add.s32 %r75, %r31, 
-1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_13; $L__BB0_11: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_13: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd20, %r77; add.s64 %rd21, %rd2, %rd20; ld.global.nc.u8 %rs21, [%rd21]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd22, %r83, 4; add.s64 %rd23, %rd1, %rd22; ld.global.nc.f32 %f11, [%rd23]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p10, %r32, 1; @%p10 bra $L__BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p11, %rs4, 0; add.s32 %r22, %r3, -1; @%p11 bra $L__BB0_16; bra.uni $L__BB0_15; $L__BB0_16: max.s32 %r109, %r22, 0; bra.uni $L__BB0_17; $L__BB0_15: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_17: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd24, %r87; add.s64 %rd25, %rd2, %rd24; ld.global.nc.u8 %rs26, [%rd25]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd26, %r93, 4; add.s64 %rd27, %rd1, %rd26; ld.global.nc.f32 %f12, [%rd27]; add.f32 %f5, %f14, %f12; add.s32 %r26, %r3, 1; @%p11 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_20: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd28, %r98; add.s64 %rd29, %rd2, %rd28; ld.global.nc.u8 %rs31, [%rd29]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 
%r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd30, %r104, 4; add.s64 %rd31, %rd1, %rd30; ld.global.nc.f32 %f13, [%rd31]; add.f32 %f14, %f5, %f13; $L__BB0_21: cvta.to.global.u64 %rd32, %rd4; shl.b64 %rd33, %rd3, 2; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f14; $L__BB0_22: ret; } ` exchangedecode_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<13>; .reg .b16 %rs<36>; .reg .f32 %f<15>; .reg .b32 %r<111>; .reg .b64 %rd<35>; ld.param.u8 %rs5, [exchangedecode_param_9]; ld.param.u64 %rd4, [exchangedecode_param_0]; ld.param.u64 %rd5, [exchangedecode_param_1]; ld.param.u64 %rd6, [exchangedecode_param_2]; ld.param.u32 %r30, [exchangedecode_param_6]; ld.param.u32 %r31, [exchangedecode_param_7]; ld.param.u32 %r32, [exchangedecode_param_8]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd6; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_22; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd3, %r43; add.s64 %rd7, %rd2, %rd3; ld.global.nc.u8 %rs1, [%rd7]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r6, %r1, -1; @%p6 bra $L__BB0_3; 
bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r105, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_4: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd8, %r46; add.s64 %rd9, %rd2, %rd8; ld.global.nc.u8 %rs6, [%rd9]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd10, %r52, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; add.s32 %r10, %r1, 1; @%p6 bra $L__BB0_6; bra.uni $L__BB0_5; $L__BB0_6: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_7; $L__BB0_5: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_7: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd12, %r56; add.s64 %rd13, %rd2, %rd12; ld.global.nc.u8 %rs11, [%rd13]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd14, %r62, 4; add.s64 %rd15, %rd1, %rd14; ld.global.nc.f32 %f8, [%rd15]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p8, %rs3, 0; add.s32 %r14, %r2, -1; @%p8 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r107, %r14, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_10: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd16, %r66; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs16, [%rd17]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd18, %r72, 4; add.s64 %rd19, %rd1, %rd18; ld.global.nc.f32 %f10, [%rd19]; add.f32 %f3, %f2, 
%f10; add.s32 %r18, %r2, 1; @%p8 bra $L__BB0_12; bra.uni $L__BB0_11; $L__BB0_12: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_13; $L__BB0_11: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_13: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd20, %r77; add.s64 %rd21, %rd2, %rd20; ld.global.nc.u8 %rs21, [%rd21]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd22, %r83, 4; add.s64 %rd23, %rd1, %rd22; ld.global.nc.f32 %f11, [%rd23]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p10, %r32, 1; @%p10 bra $L__BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p11, %rs4, 0; add.s32 %r22, %r3, -1; @%p11 bra $L__BB0_16; bra.uni $L__BB0_15; $L__BB0_16: max.s32 %r109, %r22, 0; bra.uni $L__BB0_17; $L__BB0_15: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_17: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd24, %r87; add.s64 %rd25, %rd2, %rd24; ld.global.nc.u8 %rs26, [%rd25]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd26, %r93, 4; add.s64 %rd27, %rd1, %rd26; ld.global.nc.f32 %f12, [%rd27]; add.f32 %f5, %f14, %f12; add.s32 %r26, %r3, 1; @%p11 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_20: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd28, %r98; add.s64 %rd29, %rd2, %rd28; ld.global.nc.u8 %rs31, [%rd29]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, 
%rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd30, %r104, 4; add.s64 %rd31, %rd1, %rd30; ld.global.nc.f32 %f13, [%rd31]; add.f32 %f14, %f5, %f13; $L__BB0_21: cvta.to.global.u64 %rd32, %rd4; shl.b64 %rd33, %rd3, 2; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f14; $L__BB0_22: ret; } ` exchangedecode_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<13>; .reg .b16 %rs<36>; .reg .f32 %f<15>; .reg .b32 %r<111>; .reg .b64 %rd<35>; ld.param.u8 %rs5, [exchangedecode_param_9]; ld.param.u64 %rd4, [exchangedecode_param_0]; ld.param.u64 %rd5, [exchangedecode_param_1]; ld.param.u64 %rd6, [exchangedecode_param_2]; ld.param.u32 %r30, [exchangedecode_param_6]; ld.param.u32 %r31, [exchangedecode_param_7]; ld.param.u32 %r32, [exchangedecode_param_8]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd6; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_22; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd3, %r43; add.s64 %rd7, %rd2, %rd3; ld.global.nc.u8 %rs1, 
[%rd7]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r6, %r1, -1; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r105, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_4: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd8, %r46; add.s64 %rd9, %rd2, %rd8; ld.global.nc.u8 %rs6, [%rd9]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd10, %r52, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; add.s32 %r10, %r1, 1; @%p6 bra $L__BB0_6; bra.uni $L__BB0_5; $L__BB0_6: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_7; $L__BB0_5: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_7: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd12, %r56; add.s64 %rd13, %rd2, %rd12; ld.global.nc.u8 %rs11, [%rd13]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd14, %r62, 4; add.s64 %rd15, %rd1, %rd14; ld.global.nc.f32 %f8, [%rd15]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p8, %rs3, 0; add.s32 %r14, %r2, -1; @%p8 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r107, %r14, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_10: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd16, %r66; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs16, [%rd17]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; 
mul.wide.s32 %rd18, %r72, 4; add.s64 %rd19, %rd1, %rd18; ld.global.nc.f32 %f10, [%rd19]; add.f32 %f3, %f2, %f10; add.s32 %r18, %r2, 1; @%p8 bra $L__BB0_12; bra.uni $L__BB0_11; $L__BB0_12: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_13; $L__BB0_11: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_13: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd20, %r77; add.s64 %rd21, %rd2, %rd20; ld.global.nc.u8 %rs21, [%rd21]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd22, %r83, 4; add.s64 %rd23, %rd1, %rd22; ld.global.nc.f32 %f11, [%rd23]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p10, %r32, 1; @%p10 bra $L__BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p11, %rs4, 0; add.s32 %r22, %r3, -1; @%p11 bra $L__BB0_16; bra.uni $L__BB0_15; $L__BB0_16: max.s32 %r109, %r22, 0; bra.uni $L__BB0_17; $L__BB0_15: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_17: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd24, %r87; add.s64 %rd25, %rd2, %rd24; ld.global.nc.u8 %rs26, [%rd25]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd26, %r93, 4; add.s64 %rd27, %rd1, %rd26; ld.global.nc.f32 %f12, [%rd27]; add.f32 %f5, %f14, %f12; add.s32 %r26, %r3, 1; @%p11 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_20: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd28, %r98; add.s64 %rd29, %rd2, %rd28; 
ld.global.nc.u8 %rs31, [%rd29]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd30, %r104, 4; add.s64 %rd31, %rd1, %rd30; ld.global.nc.f32 %f13, [%rd31]; add.f32 %f14, %f5, %f13; $L__BB0_21: cvta.to.global.u64 %rd32, %rd4; shl.b64 %rd33, %rd3, 2; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f14; $L__BB0_22: ret; } ` exchangedecode_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<13>; .reg .b16 %rs<36>; .reg .f32 %f<15>; .reg .b32 %r<111>; .reg .b64 %rd<35>; ld.param.u8 %rs5, [exchangedecode_param_9]; ld.param.u64 %rd4, [exchangedecode_param_0]; ld.param.u64 %rd5, [exchangedecode_param_1]; ld.param.u64 %rd6, [exchangedecode_param_2]; ld.param.u32 %r30, [exchangedecode_param_6]; ld.param.u32 %r31, [exchangedecode_param_7]; ld.param.u32 %r32, [exchangedecode_param_8]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd6; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_22; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, 
%r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd3, %r43; add.s64 %rd7, %rd2, %rd3; ld.global.nc.u8 %rs1, [%rd7]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r6, %r1, -1; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r105, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_4: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd8, %r46; add.s64 %rd9, %rd2, %rd8; ld.global.nc.u8 %rs6, [%rd9]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd10, %r52, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; add.s32 %r10, %r1, 1; @%p6 bra $L__BB0_6; bra.uni $L__BB0_5; $L__BB0_6: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_7; $L__BB0_5: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_7: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd12, %r56; add.s64 %rd13, %rd2, %rd12; ld.global.nc.u8 %rs11, [%rd13]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd14, %r62, 4; add.s64 %rd15, %rd1, %rd14; ld.global.nc.f32 %f8, [%rd15]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p8, %rs3, 0; add.s32 %r14, %r2, -1; @%p8 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r107, %r14, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_10: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd16, %r66; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs16, [%rd17]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; 
mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd18, %r72, 4; add.s64 %rd19, %rd1, %rd18; ld.global.nc.f32 %f10, [%rd19]; add.f32 %f3, %f2, %f10; add.s32 %r18, %r2, 1; @%p8 bra $L__BB0_12; bra.uni $L__BB0_11; $L__BB0_12: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_13; $L__BB0_11: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_13: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd20, %r77; add.s64 %rd21, %rd2, %rd20; ld.global.nc.u8 %rs21, [%rd21]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd22, %r83, 4; add.s64 %rd23, %rd1, %rd22; ld.global.nc.f32 %f11, [%rd23]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p10, %r32, 1; @%p10 bra $L__BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p11, %rs4, 0; add.s32 %r22, %r3, -1; @%p11 bra $L__BB0_16; bra.uni $L__BB0_15; $L__BB0_16: max.s32 %r109, %r22, 0; bra.uni $L__BB0_17; $L__BB0_15: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_17: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd24, %r87; add.s64 %rd25, %rd2, %rd24; ld.global.nc.u8 %rs26, [%rd25]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd26, %r93, 4; add.s64 %rd27, %rd1, %rd26; ld.global.nc.f32 %f12, [%rd27]; add.f32 %f5, %f14, %f12; add.s32 %r26, %r3, 1; @%p11 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_20: mad.lo.s32 %r97, 
%r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd28, %r98; add.s64 %rd29, %rd2, %rd28; ld.global.nc.u8 %rs31, [%rd29]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd30, %r104, 4; add.s64 %rd31, %rd1, %rd30; ld.global.nc.f32 %f13, [%rd31]; add.f32 %f14, %f5, %f13; $L__BB0_21: cvta.to.global.u64 %rd32, %rd4; shl.b64 %rd33, %rd3, 2; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f14; $L__BB0_22: ret; } ` exchangedecode_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<13>; .reg .b16 %rs<36>; .reg .f32 %f<15>; .reg .b32 %r<111>; .reg .b64 %rd<35>; ld.param.u8 %rs5, [exchangedecode_param_9]; ld.param.u64 %rd4, [exchangedecode_param_0]; ld.param.u64 %rd5, [exchangedecode_param_1]; ld.param.u64 %rd6, [exchangedecode_param_2]; ld.param.u32 %r30, [exchangedecode_param_6]; ld.param.u32 %r31, [exchangedecode_param_7]; ld.param.u32 %r32, [exchangedecode_param_8]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd6; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred 
%p5, %p3, %p4; @%p5 bra $L__BB0_22; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd3, %r43; add.s64 %rd7, %rd2, %rd3; ld.global.nc.u8 %rs1, [%rd7]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r6, %r1, -1; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r105, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_4: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd8, %r46; add.s64 %rd9, %rd2, %rd8; ld.global.nc.u8 %rs6, [%rd9]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd10, %r52, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; add.s32 %r10, %r1, 1; @%p6 bra $L__BB0_6; bra.uni $L__BB0_5; $L__BB0_6: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_7; $L__BB0_5: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_7: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd12, %r56; add.s64 %rd13, %rd2, %rd12; ld.global.nc.u8 %rs11, [%rd13]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd14, %r62, 4; add.s64 %rd15, %rd1, %rd14; ld.global.nc.f32 %f8, [%rd15]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p8, %rs3, 0; add.s32 %r14, %r2, -1; @%p8 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r107, %r14, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_10: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd16, %r66; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs16, [%rd17]; 
min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd18, %r72, 4; add.s64 %rd19, %rd1, %rd18; ld.global.nc.f32 %f10, [%rd19]; add.f32 %f3, %f2, %f10; add.s32 %r18, %r2, 1; @%p8 bra $L__BB0_12; bra.uni $L__BB0_11; $L__BB0_12: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_13; $L__BB0_11: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_13: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd20, %r77; add.s64 %rd21, %rd2, %rd20; ld.global.nc.u8 %rs21, [%rd21]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd22, %r83, 4; add.s64 %rd23, %rd1, %rd22; ld.global.nc.f32 %f11, [%rd23]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p10, %r32, 1; @%p10 bra $L__BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p11, %rs4, 0; add.s32 %r22, %r3, -1; @%p11 bra $L__BB0_16; bra.uni $L__BB0_15; $L__BB0_16: max.s32 %r109, %r22, 0; bra.uni $L__BB0_17; $L__BB0_15: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_17: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd24, %r87; add.s64 %rd25, %rd2, %rd24; ld.global.nc.u8 %rs26, [%rd25]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd26, %r93, 4; add.s64 %rd27, %rd1, %rd26; ld.global.nc.f32 %f12, [%rd27]; add.f32 %f5, %f14, %f12; add.s32 %r26, %r3, 1; @%p11 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 
%r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_20: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd28, %r98; add.s64 %rd29, %rd2, %rd28; ld.global.nc.u8 %rs31, [%rd29]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd30, %r104, 4; add.s64 %rd31, %rd1, %rd30; ld.global.nc.f32 %f13, [%rd31]; add.f32 %f14, %f5, %f13; $L__BB0_21: cvta.to.global.u64 %rd32, %rd4; shl.b64 %rd33, %rd3, 2; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f14; $L__BB0_22: ret; } ` exchangedecode_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<13>; .reg .b16 %rs<36>; .reg .f32 %f<15>; .reg .b32 %r<111>; .reg .b64 %rd<35>; ld.param.u8 %rs5, [exchangedecode_param_9]; ld.param.u64 %rd4, [exchangedecode_param_0]; ld.param.u64 %rd5, [exchangedecode_param_1]; ld.param.u64 %rd6, [exchangedecode_param_2]; ld.param.u32 %r30, [exchangedecode_param_6]; ld.param.u32 %r31, [exchangedecode_param_7]; ld.param.u32 %r32, [exchangedecode_param_8]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd6; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, 
%r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_22; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd3, %r43; add.s64 %rd7, %rd2, %rd3; ld.global.nc.u8 %rs1, [%rd7]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r6, %r1, -1; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r105, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_4: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd8, %r46; add.s64 %rd9, %rd2, %rd8; ld.global.nc.u8 %rs6, [%rd9]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd10, %r52, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; add.s32 %r10, %r1, 1; @%p6 bra $L__BB0_6; bra.uni $L__BB0_5; $L__BB0_6: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_7; $L__BB0_5: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_7: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd12, %r56; add.s64 %rd13, %rd2, %rd12; ld.global.nc.u8 %rs11, [%rd13]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd14, %r62, 4; add.s64 %rd15, %rd1, %rd14; ld.global.nc.f32 %f8, [%rd15]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p8, %rs3, 0; add.s32 %r14, %r2, -1; @%p8 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r107, %r14, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_10: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, 
%r30, %r1; cvt.s64.s32 %rd16, %r66; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs16, [%rd17]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd18, %r72, 4; add.s64 %rd19, %rd1, %rd18; ld.global.nc.f32 %f10, [%rd19]; add.f32 %f3, %f2, %f10; add.s32 %r18, %r2, 1; @%p8 bra $L__BB0_12; bra.uni $L__BB0_11; $L__BB0_12: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_13; $L__BB0_11: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_13: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd20, %r77; add.s64 %rd21, %rd2, %rd20; ld.global.nc.u8 %rs21, [%rd21]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd22, %r83, 4; add.s64 %rd23, %rd1, %rd22; ld.global.nc.f32 %f11, [%rd23]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p10, %r32, 1; @%p10 bra $L__BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p11, %rs4, 0; add.s32 %r22, %r3, -1; @%p11 bra $L__BB0_16; bra.uni $L__BB0_15; $L__BB0_16: max.s32 %r109, %r22, 0; bra.uni $L__BB0_17; $L__BB0_15: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_17: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd24, %r87; add.s64 %rd25, %rd2, %rd24; ld.global.nc.u8 %rs26, [%rd25]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd26, %r93, 4; add.s64 %rd27, %rd1, %rd26; ld.global.nc.f32 %f12, [%rd27]; add.f32 %f5, %f14, %f12; add.s32 %r26, %r3, 1; @%p11 bra $L__BB0_19; bra.uni $L__BB0_18; 
$L__BB0_19: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_20: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd28, %r98; add.s64 %rd29, %rd2, %rd28; ld.global.nc.u8 %rs31, [%rd29]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd30, %r104, 4; add.s64 %rd31, %rd1, %rd30; ld.global.nc.f32 %f13, [%rd31]; add.f32 %f14, %f5, %f13; $L__BB0_21: cvta.to.global.u64 %rd32, %rd4; shl.b64 %rd33, %rd3, 2; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f14; $L__BB0_22: ret; } ` exchangedecode_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<13>; .reg .b16 %rs<36>; .reg .f32 %f<15>; .reg .b32 %r<111>; .reg .b64 %rd<35>; ld.param.u8 %rs5, [exchangedecode_param_9]; ld.param.u64 %rd4, [exchangedecode_param_0]; ld.param.u64 %rd5, [exchangedecode_param_1]; ld.param.u64 %rd6, [exchangedecode_param_2]; ld.param.u32 %r30, [exchangedecode_param_6]; ld.param.u32 %r31, [exchangedecode_param_7]; ld.param.u32 %r32, [exchangedecode_param_8]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd6; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, 
%ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_22; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd3, %r43; add.s64 %rd7, %rd2, %rd3; ld.global.nc.u8 %rs1, [%rd7]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r6, %r1, -1; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r105, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_4: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd8, %r46; add.s64 %rd9, %rd2, %rd8; ld.global.nc.u8 %rs6, [%rd9]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd10, %r52, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; add.s32 %r10, %r1, 1; @%p6 bra $L__BB0_6; bra.uni $L__BB0_5; $L__BB0_6: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_7; $L__BB0_5: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_7: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd12, %r56; add.s64 %rd13, %rd2, %rd12; ld.global.nc.u8 %rs11, [%rd13]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd14, %r62, 4; add.s64 %rd15, %rd1, %rd14; ld.global.nc.f32 %f8, [%rd15]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p8, %rs3, 0; add.s32 %r14, %r2, -1; @%p8 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r107, %r14, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r63, %r14, %r31; add.s32 
%r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_10: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd16, %r66; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs16, [%rd17]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd18, %r72, 4; add.s64 %rd19, %rd1, %rd18; ld.global.nc.f32 %f10, [%rd19]; add.f32 %f3, %f2, %f10; add.s32 %r18, %r2, 1; @%p8 bra $L__BB0_12; bra.uni $L__BB0_11; $L__BB0_12: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_13; $L__BB0_11: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_13: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd20, %r77; add.s64 %rd21, %rd2, %rd20; ld.global.nc.u8 %rs21, [%rd21]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd22, %r83, 4; add.s64 %rd23, %rd1, %rd22; ld.global.nc.f32 %f11, [%rd23]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p10, %r32, 1; @%p10 bra $L__BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p11, %rs4, 0; add.s32 %r22, %r3, -1; @%p11 bra $L__BB0_16; bra.uni $L__BB0_15; $L__BB0_16: max.s32 %r109, %r22, 0; bra.uni $L__BB0_17; $L__BB0_15: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_17: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd24, %r87; add.s64 %rd25, %rd2, %rd24; ld.global.nc.u8 %rs26, [%rd25]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd26, %r93, 4; add.s64 %rd27, %rd1, %rd26; ld.global.nc.f32 
%f12, [%rd27]; add.f32 %f5, %f14, %f12; add.s32 %r26, %r3, 1; @%p11 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_20: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd28, %r98; add.s64 %rd29, %rd2, %rd28; ld.global.nc.u8 %rs31, [%rd29]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd30, %r104, 4; add.s64 %rd31, %rd1, %rd30; ld.global.nc.f32 %f13, [%rd31]; add.f32 %f14, %f5, %f13; $L__BB0_21: cvta.to.global.u64 %rd32, %rd4; shl.b64 %rd33, %rd3, 2; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f14; $L__BB0_22: ret; } ` exchangedecode_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<13>; .reg .b16 %rs<36>; .reg .f32 %f<15>; .reg .b32 %r<111>; .reg .b64 %rd<35>; ld.param.u8 %rs5, [exchangedecode_param_9]; ld.param.u64 %rd4, [exchangedecode_param_0]; ld.param.u64 %rd5, [exchangedecode_param_1]; ld.param.u64 %rd6, [exchangedecode_param_2]; ld.param.u32 %r30, [exchangedecode_param_6]; ld.param.u32 %r31, [exchangedecode_param_7]; ld.param.u32 %r32, [exchangedecode_param_8]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd6; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, 
%ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_22; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd3, %r43; add.s64 %rd7, %rd2, %rd3; ld.global.nc.u8 %rs1, [%rd7]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r6, %r1, -1; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r105, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_4: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd8, %r46; add.s64 %rd9, %rd2, %rd8; ld.global.nc.u8 %rs6, [%rd9]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd10, %r52, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; add.s32 %r10, %r1, 1; @%p6 bra $L__BB0_6; bra.uni $L__BB0_5; $L__BB0_6: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_7; $L__BB0_5: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_7: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd12, %r56; add.s64 %rd13, %rd2, %rd12; ld.global.nc.u8 %rs11, [%rd13]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd14, %r62, 4; add.s64 %rd15, %rd1, %rd14; ld.global.nc.f32 %f8, [%rd15]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p8, %rs3, 0; add.s32 %r14, %r2, -1; @%p8 bra $L__BB0_9; bra.uni $L__BB0_8; 
$L__BB0_9: max.s32 %r107, %r14, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_10: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd16, %r66; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs16, [%rd17]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd18, %r72, 4; add.s64 %rd19, %rd1, %rd18; ld.global.nc.f32 %f10, [%rd19]; add.f32 %f3, %f2, %f10; add.s32 %r18, %r2, 1; @%p8 bra $L__BB0_12; bra.uni $L__BB0_11; $L__BB0_12: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_13; $L__BB0_11: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_13: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd20, %r77; add.s64 %rd21, %rd2, %rd20; ld.global.nc.u8 %rs21, [%rd21]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd22, %r83, 4; add.s64 %rd23, %rd1, %rd22; ld.global.nc.f32 %f11, [%rd23]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p10, %r32, 1; @%p10 bra $L__BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p11, %rs4, 0; add.s32 %r22, %r3, -1; @%p11 bra $L__BB0_16; bra.uni $L__BB0_15; $L__BB0_16: max.s32 %r109, %r22, 0; bra.uni $L__BB0_17; $L__BB0_15: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_17: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd24, %r87; add.s64 %rd25, %rd2, %rd24; ld.global.nc.u8 %rs26, [%rd25]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; 
add.s32 %r93, %r91, %r92; mul.wide.s32 %rd26, %r93, 4; add.s64 %rd27, %rd1, %rd26; ld.global.nc.f32 %f12, [%rd27]; add.f32 %f5, %f14, %f12; add.s32 %r26, %r3, 1; @%p11 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_20: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd28, %r98; add.s64 %rd29, %rd2, %rd28; ld.global.nc.u8 %rs31, [%rd29]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd30, %r104, 4; add.s64 %rd31, %rd1, %rd30; ld.global.nc.f32 %f13, [%rd31]; add.f32 %f14, %f5, %f13; $L__BB0_21: cvta.to.global.u64 %rd32, %rd4; shl.b64 %rd33, %rd3, 2; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f14; $L__BB0_22: ret; } ` exchangedecode_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<13>; .reg .b16 %rs<36>; .reg .f32 %f<15>; .reg .b32 %r<111>; .reg .b64 %rd<35>; ld.param.u8 %rs5, [exchangedecode_param_9]; ld.param.u64 %rd4, [exchangedecode_param_0]; ld.param.u64 %rd5, [exchangedecode_param_1]; ld.param.u64 %rd6, [exchangedecode_param_2]; ld.param.u32 %r30, [exchangedecode_param_6]; ld.param.u32 %r31, [exchangedecode_param_7]; ld.param.u32 %r32, [exchangedecode_param_8]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd6; mov.u32 %r33, 
%ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_22; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd3, %r43; add.s64 %rd7, %rd2, %rd3; ld.global.nc.u8 %rs1, [%rd7]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r6, %r1, -1; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r105, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_4: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd8, %r46; add.s64 %rd9, %rd2, %rd8; ld.global.nc.u8 %rs6, [%rd9]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd10, %r52, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; add.s32 %r10, %r1, 1; @%p6 bra $L__BB0_6; bra.uni $L__BB0_5; $L__BB0_6: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_7; $L__BB0_5: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_7: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd12, %r56; add.s64 %rd13, %rd2, %rd12; ld.global.nc.u8 %rs11, [%rd13]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd14, %r62, 4; add.s64 %rd15, %rd1, %rd14; ld.global.nc.f32 %f8, [%rd15]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 
%rs3, %rs5, 2; setp.eq.s16 %p8, %rs3, 0; add.s32 %r14, %r2, -1; @%p8 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r107, %r14, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_10: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd16, %r66; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs16, [%rd17]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd18, %r72, 4; add.s64 %rd19, %rd1, %rd18; ld.global.nc.f32 %f10, [%rd19]; add.f32 %f3, %f2, %f10; add.s32 %r18, %r2, 1; @%p8 bra $L__BB0_12; bra.uni $L__BB0_11; $L__BB0_12: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_13; $L__BB0_11: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_13: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd20, %r77; add.s64 %rd21, %rd2, %rd20; ld.global.nc.u8 %rs21, [%rd21]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd22, %r83, 4; add.s64 %rd23, %rd1, %rd22; ld.global.nc.f32 %f11, [%rd23]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p10, %r32, 1; @%p10 bra $L__BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p11, %rs4, 0; add.s32 %r22, %r3, -1; @%p11 bra $L__BB0_16; bra.uni $L__BB0_15; $L__BB0_16: max.s32 %r109, %r22, 0; bra.uni $L__BB0_17; $L__BB0_15: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_17: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd24, %r87; add.s64 %rd25, %rd2, %rd24; ld.global.nc.u8 %rs26, [%rd25]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; 
add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd26, %r93, 4; add.s64 %rd27, %rd1, %rd26; ld.global.nc.f32 %f12, [%rd27]; add.f32 %f5, %f14, %f12; add.s32 %r26, %r3, 1; @%p11 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_20: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd28, %r98; add.s64 %rd29, %rd2, %rd28; ld.global.nc.u8 %rs31, [%rd29]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd30, %r104, 4; add.s64 %rd31, %rd1, %rd30; ld.global.nc.f32 %f13, [%rd31]; add.f32 %f14, %f5, %f13; $L__BB0_21: cvta.to.global.u64 %rd32, %rd4; shl.b64 %rd33, %rd3, 2; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f14; $L__BB0_22: ret; } ` exchangedecode_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<13>; .reg .b16 %rs<36>; .reg .f32 %f<15>; .reg .b32 %r<111>; .reg .b64 %rd<35>; ld.param.u8 %rs5, [exchangedecode_param_9]; ld.param.u64 %rd4, [exchangedecode_param_0]; ld.param.u64 %rd5, [exchangedecode_param_1]; ld.param.u64 %rd6, [exchangedecode_param_2]; ld.param.u32 %r30, [exchangedecode_param_6]; ld.param.u32 %r31, [exchangedecode_param_7]; ld.param.u32 %r32, 
[exchangedecode_param_8]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd6; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_22; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd3, %r43; add.s64 %rd7, %rd2, %rd3; ld.global.nc.u8 %rs1, [%rd7]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r6, %r1, -1; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r105, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_4: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd8, %r46; add.s64 %rd9, %rd2, %rd8; ld.global.nc.u8 %rs6, [%rd9]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd10, %r52, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; add.s32 %r10, %r1, 1; @%p6 bra $L__BB0_6; bra.uni $L__BB0_5; $L__BB0_6: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_7; $L__BB0_5: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_7: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd12, %r56; add.s64 %rd13, %rd2, %rd12; ld.global.nc.u8 %rs11, [%rd13]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd14, %r62, 4; add.s64 %rd15, 
%rd1, %rd14; ld.global.nc.f32 %f8, [%rd15]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p8, %rs3, 0; add.s32 %r14, %r2, -1; @%p8 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r107, %r14, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_10: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd16, %r66; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs16, [%rd17]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd18, %r72, 4; add.s64 %rd19, %rd1, %rd18; ld.global.nc.f32 %f10, [%rd19]; add.f32 %f3, %f2, %f10; add.s32 %r18, %r2, 1; @%p8 bra $L__BB0_12; bra.uni $L__BB0_11; $L__BB0_12: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_13; $L__BB0_11: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_13: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd20, %r77; add.s64 %rd21, %rd2, %rd20; ld.global.nc.u8 %rs21, [%rd21]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd22, %r83, 4; add.s64 %rd23, %rd1, %rd22; ld.global.nc.f32 %f11, [%rd23]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p10, %r32, 1; @%p10 bra $L__BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p11, %rs4, 0; add.s32 %r22, %r3, -1; @%p11 bra $L__BB0_16; bra.uni $L__BB0_15; $L__BB0_16: max.s32 %r109, %r22, 0; bra.uni $L__BB0_17; $L__BB0_15: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_17: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd24, %r87; add.s64 %rd25, %rd2, %rd24; 
ld.global.nc.u8 %rs26, [%rd25]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd26, %r93, 4; add.s64 %rd27, %rd1, %rd26; ld.global.nc.f32 %f12, [%rd27]; add.f32 %f5, %f14, %f12; add.s32 %r26, %r3, 1; @%p11 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_20: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd28, %r98; add.s64 %rd29, %rd2, %rd28; ld.global.nc.u8 %rs31, [%rd29]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd30, %r104, 4; add.s64 %rd31, %rd1, %rd30; ld.global.nc.f32 %f13, [%rd31]; add.f32 %f14, %f5, %f13; $L__BB0_21: cvta.to.global.u64 %rd32, %rd4; shl.b64 %rd33, %rd3, 2; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f14; $L__BB0_22: ret; } ` exchangedecode_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<13>; .reg .b16 %rs<36>; .reg .f32 %f<15>; .reg .b32 %r<111>; .reg .b64 %rd<35>; ld.param.u8 %rs5, [exchangedecode_param_9]; ld.param.u64 %rd4, [exchangedecode_param_0]; ld.param.u64 %rd5, [exchangedecode_param_1]; ld.param.u64 %rd6, [exchangedecode_param_2]; 
ld.param.u32 %r30, [exchangedecode_param_6]; ld.param.u32 %r31, [exchangedecode_param_7]; ld.param.u32 %r32, [exchangedecode_param_8]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd6; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_22; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd3, %r43; add.s64 %rd7, %rd2, %rd3; ld.global.nc.u8 %rs1, [%rd7]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r6, %r1, -1; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r105, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_4: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd8, %r46; add.s64 %rd9, %rd2, %rd8; ld.global.nc.u8 %rs6, [%rd9]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd10, %r52, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; add.s32 %r10, %r1, 1; @%p6 bra $L__BB0_6; bra.uni $L__BB0_5; $L__BB0_6: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_7; $L__BB0_5: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_7: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd12, %r56; add.s64 %rd13, %rd2, %rd12; ld.global.nc.u8 %rs11, [%rd13]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 
%r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd14, %r62, 4; add.s64 %rd15, %rd1, %rd14; ld.global.nc.f32 %f8, [%rd15]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p8, %rs3, 0; add.s32 %r14, %r2, -1; @%p8 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r107, %r14, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_10: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd16, %r66; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs16, [%rd17]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd18, %r72, 4; add.s64 %rd19, %rd1, %rd18; ld.global.nc.f32 %f10, [%rd19]; add.f32 %f3, %f2, %f10; add.s32 %r18, %r2, 1; @%p8 bra $L__BB0_12; bra.uni $L__BB0_11; $L__BB0_12: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_13; $L__BB0_11: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_13: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd20, %r77; add.s64 %rd21, %rd2, %rd20; ld.global.nc.u8 %rs21, [%rd21]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd22, %r83, 4; add.s64 %rd23, %rd1, %rd22; ld.global.nc.f32 %f11, [%rd23]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p10, %r32, 1; @%p10 bra $L__BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p11, %rs4, 0; add.s32 %r22, %r3, -1; @%p11 bra $L__BB0_16; bra.uni $L__BB0_15; $L__BB0_16: max.s32 %r109, %r22, 0; bra.uni $L__BB0_17; $L__BB0_15: rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_17: mad.lo.s32 %r86, 
%r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd24, %r87; add.s64 %rd25, %rd2, %rd24; ld.global.nc.u8 %rs26, [%rd25]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd26, %r93, 4; add.s64 %rd27, %rd1, %rd26; ld.global.nc.f32 %f12, [%rd27]; add.f32 %f5, %f14, %f12; add.s32 %r26, %r3, 1; @%p11 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_20: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd28, %r98; add.s64 %rd29, %rd2, %rd28; ld.global.nc.u8 %rs31, [%rd29]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd30, %r104, 4; add.s64 %rd31, %rd1, %rd30; ld.global.nc.f32 %f13, [%rd31]; add.f32 %f14, %f5, %f13; $L__BB0_21: cvta.to.global.u64 %rd32, %rd4; shl.b64 %rd33, %rd3, 2; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f14; $L__BB0_22: ret; } ` exchangedecode_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<13>; .reg .b16 %rs<36>; .reg .f32 %f<15>; .reg .b32 %r<111>; .reg .b64 %rd<35>; ld.param.u8 %rs5, [exchangedecode_param_9]; ld.param.u64 %rd4, 
[exchangedecode_param_0]; ld.param.u64 %rd5, [exchangedecode_param_1]; ld.param.u64 %rd6, [exchangedecode_param_2]; ld.param.u32 %r30, [exchangedecode_param_6]; ld.param.u32 %r31, [exchangedecode_param_7]; ld.param.u32 %r32, [exchangedecode_param_8]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd6; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_22; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd3, %r43; add.s64 %rd7, %rd2, %rd3; ld.global.nc.u8 %rs1, [%rd7]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r6, %r1, -1; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r105, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r105, %r45, %r30; $L__BB0_4: add.s32 %r46, %r105, %r5; cvt.s64.s32 %rd8, %r46; add.s64 %rd9, %rd2, %rd8; ld.global.nc.u8 %rs6, [%rd9]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd10, %r52, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; add.s32 %r10, %r1, 1; @%p6 bra $L__BB0_6; bra.uni $L__BB0_5; $L__BB0_6: add.s32 %r55, %r30, -1; min.s32 %r106, %r10, %r55; bra.uni $L__BB0_7; $L__BB0_5: rem.s32 %r53, %r10, %r30; add.s32 %r54, %r53, %r30; rem.s32 %r106, %r54, %r30; $L__BB0_7: add.s32 %r56, %r106, %r5; cvt.s64.s32 %rd12, %r56; add.s64 %rd13, %rd2, %rd12; ld.global.nc.u8 %rs11, [%rd13]; min.u16 %rs14, %rs11, %rs1; 
max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r57, %rs15; add.s32 %r58, %r57, 1; mul.lo.s32 %r59, %r58, %r57; shr.u32 %r60, %r59, 1; cvt.u32.u16 %r61, %rs14; add.s32 %r62, %r60, %r61; mul.wide.s32 %rd14, %r62, 4; add.s64 %rd15, %rd1, %rd14; ld.global.nc.f32 %f8, [%rd15]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p8, %rs3, 0; add.s32 %r14, %r2, -1; @%p8 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r107, %r14, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r63, %r14, %r31; add.s32 %r64, %r63, %r31; rem.s32 %r107, %r64, %r31; $L__BB0_10: add.s32 %r65, %r107, %r4; mad.lo.s32 %r66, %r65, %r30, %r1; cvt.s64.s32 %rd16, %r66; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs16, [%rd17]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r67, %rs20; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; cvt.u32.u16 %r71, %rs19; add.s32 %r72, %r70, %r71; mul.wide.s32 %rd18, %r72, 4; add.s64 %rd19, %rd1, %rd18; ld.global.nc.f32 %f10, [%rd19]; add.f32 %f3, %f2, %f10; add.s32 %r18, %r2, 1; @%p8 bra $L__BB0_12; bra.uni $L__BB0_11; $L__BB0_12: add.s32 %r75, %r31, -1; min.s32 %r108, %r18, %r75; bra.uni $L__BB0_13; $L__BB0_11: rem.s32 %r73, %r18, %r31; add.s32 %r74, %r73, %r31; rem.s32 %r108, %r74, %r31; $L__BB0_13: add.s32 %r76, %r108, %r4; mad.lo.s32 %r77, %r76, %r30, %r1; cvt.s64.s32 %rd20, %r77; add.s64 %rd21, %rd2, %rd20; ld.global.nc.u8 %rs21, [%rd21]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r78, %rs25; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; cvt.u32.u16 %r82, %rs24; add.s32 %r83, %r81, %r82; mul.wide.s32 %rd22, %r83, 4; add.s64 %rd23, %rd1, %rd22; ld.global.nc.f32 %f11, [%rd23]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p10, %r32, 1; @%p10 bra $L__BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p11, %rs4, 0; add.s32 %r22, %r3, -1; @%p11 bra $L__BB0_16; bra.uni $L__BB0_15; $L__BB0_16: max.s32 %r109, %r22, 0; bra.uni $L__BB0_17; $L__BB0_15: 
rem.s32 %r84, %r22, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r109, %r85, %r32; $L__BB0_17: mad.lo.s32 %r86, %r109, %r31, %r2; mad.lo.s32 %r87, %r86, %r30, %r1; cvt.s64.s32 %rd24, %r87; add.s64 %rd25, %rd2, %rd24; ld.global.nc.u8 %rs26, [%rd25]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r88, %rs30; add.s32 %r89, %r88, 1; mul.lo.s32 %r90, %r89, %r88; shr.u32 %r91, %r90, 1; cvt.u32.u16 %r92, %rs29; add.s32 %r93, %r91, %r92; mul.wide.s32 %rd26, %r93, 4; add.s64 %rd27, %rd1, %rd26; ld.global.nc.f32 %f12, [%rd27]; add.f32 %f5, %f14, %f12; add.s32 %r26, %r3, 1; @%p11 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r96, %r32, -1; min.s32 %r110, %r26, %r96; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r94, %r26, %r32; add.s32 %r95, %r94, %r32; rem.s32 %r110, %r95, %r32; $L__BB0_20: mad.lo.s32 %r97, %r110, %r31, %r2; mad.lo.s32 %r98, %r97, %r30, %r1; cvt.s64.s32 %rd28, %r98; add.s64 %rd29, %rd2, %rd28; ld.global.nc.u8 %rs31, [%rd29]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r99, %rs35; add.s32 %r100, %r99, 1; mul.lo.s32 %r101, %r100, %r99; shr.u32 %r102, %r101, 1; cvt.u32.u16 %r103, %rs34; add.s32 %r104, %r102, %r103; mul.wide.s32 %rd30, %r104, 4; add.s64 %rd31, %rd1, %rd30; ld.global.nc.f32 %f13, [%rd31]; add.f32 %f14, %f5, %f13; $L__BB0_21: cvta.to.global.u64 %rd32, %rd4; shl.b64 %rd33, %rd3, 2; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f14; $L__BB0_22: ret; } ` ) 3-3.11.1/cuda/fatbin.go000066400000000000000000000014231503346766200144610ustar00rootroot00000000000000package cuda import ( "log" "github.com/mumax/3/cuda/cu" ) // load PTX code for function name, find highest SM that matches our card. 
func fatbinLoad(sm map[int]string, fn string) cu.Function { cc := determineCC() return cu.ModuleLoadData(sm[cc]).GetFunction(fn) } var UseCC = 0 func determineCC() int { if UseCC != 0 { return UseCC } for k := range madd2_map { if k > UseCC && ccIsOK(k) { UseCC = k } } if UseCC == 0 { log.Fatalln("\nNo binary for GPU. Your nvidia driver may be out-of-date\n") } return UseCC } // check whether compute capability cc works func ccIsOK(cc int) (ok bool) { defer func() { if err := recover(); err == cu.ERROR_NO_BINARY_FOR_GPU { ok = false } }() cu.ModuleLoadData(madd2_map[cc]).GetFunction("madd2") return true } 3-3.11.1/cuda/fft3dc2r.go000066400000000000000000000032351503346766200146360ustar00rootroot00000000000000package cuda import ( "fmt" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/cuda/cufft" "github.com/mumax/3/data" "github.com/mumax/3/timer" ) // 3D single-precision complex-to-real FFT plan. type fft3DC2RPlan struct { fftplan size [3]int } // 3D single-precision complex-to-real FFT plan. func newFFT3DC2R(Nx, Ny, Nz int) fft3DC2RPlan { handle := cufft.Plan3d(Nz, Ny, Nx, cufft.C2R) // new xyz swap handle.SetStream(stream0) return fft3DC2RPlan{fftplan{handle}, [3]int{Nx, Ny, Nz}} } // Execute the FFT plan, asynchronous. // src and dst are 3D arrays stored 1D arrays. func (p *fft3DC2RPlan) ExecAsync(src, dst *data.Slice) { if Synchronous { Sync() timer.Start("fft") } oksrclen := p.InputLenFloats() if src.Len() != oksrclen { panic(fmt.Errorf("fft size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) } okdstlen := p.OutputLenFloats() if dst.Len() != okdstlen { panic(fmt.Errorf("fft size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) } p.handle.ExecC2R(cu.DevicePtr(uintptr(src.DevPtr(0))), cu.DevicePtr(uintptr(dst.DevPtr(0)))) if Synchronous { Sync() timer.Stop("fft") } } // 3D size of the input array. 
func (p *fft3DC2RPlan) InputSizeFloats() (Nx, Ny, Nz int) { return 2 * (p.size[X]/2 + 1), p.size[Y], p.size[Z] } // 3D size of the output array. func (p *fft3DC2RPlan) OutputSizeFloats() (Nx, Ny, Nz int) { return p.size[X], p.size[Y], p.size[Z] } // Required length of the (1D) input array. func (p *fft3DC2RPlan) InputLenFloats() int { return prod3(p.InputSizeFloats()) } // Required length of the (1D) output array. func (p *fft3DC2RPlan) OutputLenFloats() int { return prod3(p.OutputSizeFloats()) } 3-3.11.1/cuda/fft3dr2c.go000066400000000000000000000033071503346766200146360ustar00rootroot00000000000000package cuda import ( "log" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/cuda/cufft" "github.com/mumax/3/data" "github.com/mumax/3/timer" "github.com/mumax/3/util" ) // 3D single-precision real-to-complex FFT plan. type fft3DR2CPlan struct { fftplan size [3]int } // 3D single-precision real-to-complex FFT plan. func newFFT3DR2C(Nx, Ny, Nz int) fft3DR2CPlan { handle := cufft.Plan3d(Nz, Ny, Nx, cufft.R2C) // new xyz swap handle.SetStream(stream0) return fft3DR2CPlan{fftplan{handle}, [3]int{Nx, Ny, Nz}} } // Execute the FFT plan, asynchronous. // src and dst are 3D arrays stored 1D arrays. func (p *fft3DR2CPlan) ExecAsync(src, dst *data.Slice) { if Synchronous { Sync() timer.Start("fft") } util.Argument(src.NComp() == 1 && dst.NComp() == 1) oksrclen := p.InputLen() if src.Len() != oksrclen { log.Panicf("fft size mismatch: expecting src len %v, got %v", oksrclen, src.Len()) } okdstlen := p.OutputLen() if dst.Len() != okdstlen { log.Panicf("fft size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()) } p.handle.ExecR2C(cu.DevicePtr(uintptr(src.DevPtr(0))), cu.DevicePtr(uintptr(dst.DevPtr(0)))) if Synchronous { Sync() timer.Stop("fft") } } // 3D size of the input array. func (p *fft3DR2CPlan) InputSizeFloats() (Nx, Ny, Nz int) { return p.size[X], p.size[Y], p.size[Z] } // 3D size of the output array. 
func (p *fft3DR2CPlan) OutputSizeFloats() (Nx, Ny, Nz int) { return 2 * (p.size[X]/2 + 1), p.size[Y], p.size[Z] } // Required length of the (1D) input array. func (p *fft3DR2CPlan) InputLen() int { return prod3(p.InputSizeFloats()) } // Required length of the (1D) output array. func (p *fft3DR2CPlan) OutputLen() int { return prod3(p.OutputSizeFloats()) } 3-3.11.1/cuda/fftplan.go000066400000000000000000000010511503346766200146450ustar00rootroot00000000000000package cuda // INTERNAL // Base implementation for all FFT plans. import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/cuda/cufft" ) // Base implementation for all FFT plans. type fftplan struct { handle cufft.Handle } func prod3(x, y, z int) int { return x * y * z } // Releases all resources associated with the FFT plan. func (p *fftplan) Free() { if p.handle != 0 { p.handle.Destroy() p.handle = 0 } } // Associates a CUDA stream with the FFT plan. func (p *fftplan) setStream(stream cu.Stream) { p.handle.SetStream(stream) } 3-3.11.1/cuda/float3.h000066400000000000000000000034301503346766200142300ustar00rootroot00000000000000#ifndef _FLOAT3_H_ #define _FLOAT3_H_ // This file implements common functions on float3 (vector). 
// Author: Mykola Dvornik, Arne Vansteenkiste inline __device__ float3 operator+(float3 a, float3 b) { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); } inline __device__ void operator+=(float3 &a, float3 b) { a.x += b.x; a.y += b.y; a.z += b.z; } inline __device__ float3 operator-(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } inline __device__ float3 operator-(float3 a) { return make_float3(-a.x, -a.y, -a.z); } inline __device__ void operator-=(float3 &a, float3 b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; } inline __device__ float3 operator*(float s, float3 a) { return make_float3(s*a.x, s*a.y, s*a.z); } inline __device__ float3 operator*(float3 a, float s) { return make_float3(s*a.x, s*a.y, s*a.z); } inline __device__ void operator*=(float3 &a, float s) { a.x *= s; a.y *= s; a.z *= s; } // dot product inline __device__ float dot(float3 a, float3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; } // cross product inline __device__ float3 cross(float3 a, float3 b) { return make_float3( a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); } // length of the 3-components vector inline __device__ float len(float3 a) { return sqrtf(dot(a,a)); } // returns a normalized copy of the 3-components vector inline __device__ float3 normalized(float3 a){ float veclen = (len(a) != 0.0f) ? 
1.0f / len(a) : 0.0f; return veclen * a; } // square inline __device__ float pow2(float x){ return x * x; } // pow(x, 3) inline __device__ float pow3(float x){ return x * x * x; } // pow(x, 4) inline __device__ float pow4(float x){ float s = x*x; return s*s; } #define is0(m) ( dot(m, m) == 0.0f ) #endif 3-3.11.1/cuda/hopf-emergentmagneticfield-solidangle.cu000066400000000000000000000167461503346766200226310ustar00rootroot00000000000000#include #include #include "float3.h" #include "stencil.h" // Returns the topological charge contribution on an elementary triangle ijk // Order of arguments is important here to preserve the same measure of chirality // Note: the result is zero if an argument is zero, or when two arguments are the same __device__ inline float triangleCharge(float3 mi, float3 mj, float3 mk) { float numer = dot(mi, cross(mj, mk)); float denom = 1.0f + dot(mi, mj) + dot(mi, mk) + dot(mj, mk); return 2.0f * atan2(numer, denom); } // Set the emergent magnetic field F_i = (1/8π) ε_{ijk} m · (∂m/∂x_j × ∂m/∂x_k) based on the solid angle // subtended by triangle associated with three spins: a,b,c // // q_{a,b,c} = 2 atan[(a . 
b x c /(1 + a.b + a.c + b.c)] // // F_i = (1/16) (q_{0,1,2} + q_{0,2,3} + q_{0,3,4} + q_{0,4,1}) // // analogous to the method for calculating the topological charge density in topologicalchargelattice.cu extern "C" __global__ void setemergentmagneticfieldsolidangle(float* __restrict__ Fx, float* __restrict__ Fy, float* __restrict__ Fz, float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float prefactor, float icycz, float iczcx, float icxcy, int Nx, int Ny, int Nz, uint8_t PBC) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix >= Nx || iy >= Ny || iz >= Nz) { return; } int i0 = idx(ix, iy, iz); // central cell index float3 m0 = make_float3(mx[i0], my[i0], mz[i0]); // central cell magnetization if(is0(m0)) { Fx[i0] = 0.0f; Fy[i0] = 0.0f; Fz[i0] = 0.0f; return; } //////// // Fx // //////// // accumulator for Fx float fx = 0.0; // indices of the 4 neighbors (counter clockwise) int i1 = idx(ix, hclampy(iy+1), iz); // (i+1,j) int i2 = idx(ix, iy, hclampz(iz+1)); // (i,j+1) int i3 = idx(ix, lclampy(iy-1), iz); // (i-1,j) int i4 = idx(ix, iy, lclampz(iz-1)); // (i,j-1) // magnetization of the 4 neighbors float3 m1 = make_float3(mx[i1], my[i1], mz[i1]); float3 m2 = make_float3(mx[i2], my[i2], mz[i2]); float3 m3 = make_float3(mx[i3], my[i3], mz[i3]); float3 m4 = make_float3(mx[i4], my[i4], mz[i4]); // contribution from the upper right triangle // if diagonally opposite neighbor is not zero, use a weight of 1/2 to avoid counting charges twice if ((iy+1=0 || PBCy) && (iz+1=0 || PBCy) && (iz-1>=0 || PBCz)) { int i_ = idx(ix, lclampy(iy-1), lclampz(iz-1)); float3 m_ = make_float3(mx[i_], my[i_], mz[i_]); float weight = is0(m_) ? 1 : 0.5; fx += weight * triangleCharge(m0, m3, m4); } // bottom right if ((iy+1=0 || PBCz)) { int i_ = idx(ix, hclampy(iy+1), lclampz(iz-1)); float3 m_ = make_float3(mx[i_], my[i_], mz[i_]); float weight = is0(m_) ? 
1 : 0.5; fx += weight * triangleCharge(m0, m4, m1); } //////// // Fy // //////// // accumulator for Fy float fy = 0.0; // indices of the 4 neighbors (counter clockwise) i1 = idx(ix, iy, hclampz(iz+1)); // (i+1,j) i2 = idx(hclampx(ix+1), iy, iz); // (i,j+1) i3 = idx(ix, iy, lclampz(iz-1)); // (i-1,j) i4 = idx(lclampx(ix-1), iy, iz); // (i,j-1) // magnetization of the 4 neighbors m1 = make_float3(mx[i1], my[i1], mz[i1]); m2 = make_float3(mx[i2], my[i2], mz[i2]); m3 = make_float3(mx[i3], my[i3], mz[i3]); m4 = make_float3(mx[i4], my[i4], mz[i4]); // contribution from the upper right triangle // if diagonally opposite neighbor is not zero, use a weight of 1/2 to avoid counting charges twice if ((iz+1=0 || PBCz) && (ix+1=0 || PBCx) && (iy-1>=0 || PBCy)) { int i_ = idx(lclampx(ix-1), iy, lclampz(iz-1)); float3 m_ = make_float3(mx[i_], my[i_], mz[i_]); float weight = is0(m_) ? 1 : 0.5; fy += weight * triangleCharge(m0, m3, m4); } // bottom right if ((ix+1=0 || PBCy)) { int i_ = idx(lclampx(ix-1), iy, hclampz(iz+1)); float3 m_ = make_float3(mx[i_], my[i_], mz[i_]); float weight = is0(m_) ? 1 : 0.5; fy += weight * triangleCharge(m0, m4, m1); } //////// // Fz // //////// // accumulator for Fz float fz = 0.0; // indices of the 4 neighbors (counter clockwise) i1 = idx(hclampx(ix+1), iy, iz); // (i+1,j) i2 = idx(ix, hclampy(iy+1), iz); // (i,j+1) i3 = idx(lclampx(ix-1), iy, iz); // (i-1,j) i4 = idx(ix, lclampy(iy-1), iz); // (i,j-1) // magnetization of the 4 neighbors m1 = make_float3(mx[i1], my[i1], mz[i1]); m2 = make_float3(mx[i2], my[i2], mz[i2]); m3 = make_float3(mx[i3], my[i3], mz[i3]); m4 = make_float3(mx[i4], my[i4], mz[i4]); // contribution from the upper right triangle // if diagonally opposite neighbor is not zero, use a weight of 1/2 to avoid counting charges twice if ((ix+1=0 || PBCx) && (iy+1=0 || PBCx) && (iy-1>=0 || PBCy)) { int i_ = idx(lclampx(ix-1), lclampy(iy-1), iz); float3 m_ = make_float3(mx[i_], my[i_], mz[i_]); float weight = is0(m_) ? 
1 : 0.5; fz += weight * triangleCharge(m0, m3, m4); } // bottom right if ((ix+1=0 || PBCy)) { int i_ = idx(hclampx(ix+1), lclampy(iy-1), iz); float3 m_ = make_float3(mx[i_], my[i_], mz[i_]); float weight = is0(m_) ? 1 : 0.5; fz += weight * triangleCharge(m0, m4, m1); } Fx[i0] = 2 * prefactor * icycz * fx; Fy[i0] = 2 * prefactor * iczcx * fy; Fz[i0] = 2 * prefactor * icxcy * fz; } 3-3.11.1/cuda/hopf-emergentmagneticfield-solidangle_wrapper.go000066400000000000000000031662151503346766200243670ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for setemergentmagneticfieldsolidangle kernel var setemergentmagneticfieldsolidangle_code cu.Function // Stores the arguments for setemergentmagneticfieldsolidangle kernel invocation type setemergentmagneticfieldsolidangle_args_t struct { arg_Fx unsafe.Pointer arg_Fy unsafe.Pointer arg_Fz unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_prefactor float32 arg_icycz float32 arg_iczcx float32 arg_icxcy float32 arg_Nx int arg_Ny int arg_Nz int arg_PBC byte argptr [14]unsafe.Pointer sync.Mutex } // Stores the arguments for setemergentmagneticfieldsolidangle kernel invocation var setemergentmagneticfieldsolidangle_args setemergentmagneticfieldsolidangle_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
setemergentmagneticfieldsolidangle_args.argptr[0] = unsafe.Pointer(&setemergentmagneticfieldsolidangle_args.arg_Fx) setemergentmagneticfieldsolidangle_args.argptr[1] = unsafe.Pointer(&setemergentmagneticfieldsolidangle_args.arg_Fy) setemergentmagneticfieldsolidangle_args.argptr[2] = unsafe.Pointer(&setemergentmagneticfieldsolidangle_args.arg_Fz) setemergentmagneticfieldsolidangle_args.argptr[3] = unsafe.Pointer(&setemergentmagneticfieldsolidangle_args.arg_mx) setemergentmagneticfieldsolidangle_args.argptr[4] = unsafe.Pointer(&setemergentmagneticfieldsolidangle_args.arg_my) setemergentmagneticfieldsolidangle_args.argptr[5] = unsafe.Pointer(&setemergentmagneticfieldsolidangle_args.arg_mz) setemergentmagneticfieldsolidangle_args.argptr[6] = unsafe.Pointer(&setemergentmagneticfieldsolidangle_args.arg_prefactor) setemergentmagneticfieldsolidangle_args.argptr[7] = unsafe.Pointer(&setemergentmagneticfieldsolidangle_args.arg_icycz) setemergentmagneticfieldsolidangle_args.argptr[8] = unsafe.Pointer(&setemergentmagneticfieldsolidangle_args.arg_iczcx) setemergentmagneticfieldsolidangle_args.argptr[9] = unsafe.Pointer(&setemergentmagneticfieldsolidangle_args.arg_icxcy) setemergentmagneticfieldsolidangle_args.argptr[10] = unsafe.Pointer(&setemergentmagneticfieldsolidangle_args.arg_Nx) setemergentmagneticfieldsolidangle_args.argptr[11] = unsafe.Pointer(&setemergentmagneticfieldsolidangle_args.arg_Ny) setemergentmagneticfieldsolidangle_args.argptr[12] = unsafe.Pointer(&setemergentmagneticfieldsolidangle_args.arg_Nz) setemergentmagneticfieldsolidangle_args.argptr[13] = unsafe.Pointer(&setemergentmagneticfieldsolidangle_args.arg_PBC) } // Wrapper for setemergentmagneticfieldsolidangle CUDA kernel, asynchronous. 
func k_setemergentmagneticfieldsolidangle_async(Fx unsafe.Pointer, Fy unsafe.Pointer, Fz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, prefactor float32, icycz float32, iczcx float32, icxcy float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("setemergentmagneticfieldsolidangle") } setemergentmagneticfieldsolidangle_args.Lock() defer setemergentmagneticfieldsolidangle_args.Unlock() if setemergentmagneticfieldsolidangle_code == 0 { setemergentmagneticfieldsolidangle_code = fatbinLoad(setemergentmagneticfieldsolidangle_map, "setemergentmagneticfieldsolidangle") } setemergentmagneticfieldsolidangle_args.arg_Fx = Fx setemergentmagneticfieldsolidangle_args.arg_Fy = Fy setemergentmagneticfieldsolidangle_args.arg_Fz = Fz setemergentmagneticfieldsolidangle_args.arg_mx = mx setemergentmagneticfieldsolidangle_args.arg_my = my setemergentmagneticfieldsolidangle_args.arg_mz = mz setemergentmagneticfieldsolidangle_args.arg_prefactor = prefactor setemergentmagneticfieldsolidangle_args.arg_icycz = icycz setemergentmagneticfieldsolidangle_args.arg_iczcx = iczcx setemergentmagneticfieldsolidangle_args.arg_icxcy = icxcy setemergentmagneticfieldsolidangle_args.arg_Nx = Nx setemergentmagneticfieldsolidangle_args.arg_Ny = Ny setemergentmagneticfieldsolidangle_args.arg_Nz = Nz setemergentmagneticfieldsolidangle_args.arg_PBC = PBC args := setemergentmagneticfieldsolidangle_args.argptr[:] cu.LaunchKernel(setemergentmagneticfieldsolidangle_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("setemergentmagneticfieldsolidangle") } } // maps compute capability on PTX code for setemergentmagneticfieldsolidangle kernel. 
var setemergentmagneticfieldsolidangle_map = map[int]string{0: "", 50: setemergentmagneticfieldsolidangle_ptx_50, 52: setemergentmagneticfieldsolidangle_ptx_52, 53: setemergentmagneticfieldsolidangle_ptx_53, 60: setemergentmagneticfieldsolidangle_ptx_60, 61: setemergentmagneticfieldsolidangle_ptx_61, 62: setemergentmagneticfieldsolidangle_ptx_62, 70: setemergentmagneticfieldsolidangle_ptx_70, 72: setemergentmagneticfieldsolidangle_ptx_72, 75: setemergentmagneticfieldsolidangle_ptx_75, 80: setemergentmagneticfieldsolidangle_ptx_80, 86: setemergentmagneticfieldsolidangle_ptx_86, 87: setemergentmagneticfieldsolidangle_ptx_87, 89: setemergentmagneticfieldsolidangle_ptx_89, 90: setemergentmagneticfieldsolidangle_ptx_90} // setemergentmagneticfieldsolidangle PTX code for various compute capabilities. const ( setemergentmagneticfieldsolidangle_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl setemergentmagneticfieldsolidangle .visible .entry setemergentmagneticfieldsolidangle( .param .u64 setemergentmagneticfieldsolidangle_param_0, .param .u64 setemergentmagneticfieldsolidangle_param_1, .param .u64 setemergentmagneticfieldsolidangle_param_2, .param .u64 setemergentmagneticfieldsolidangle_param_3, .param .u64 setemergentmagneticfieldsolidangle_param_4, .param .u64 setemergentmagneticfieldsolidangle_param_5, .param .f32 setemergentmagneticfieldsolidangle_param_6, .param .f32 setemergentmagneticfieldsolidangle_param_7, .param .f32 setemergentmagneticfieldsolidangle_param_8, .param .f32 setemergentmagneticfieldsolidangle_param_9, .param .u32 setemergentmagneticfieldsolidangle_param_10, .param .u32 setemergentmagneticfieldsolidangle_param_11, .param .u32 setemergentmagneticfieldsolidangle_param_12, .param .u8 setemergentmagneticfieldsolidangle_param_13 ) { .reg .pred %p<225>; .reg .b16 %rs<8>; .reg .f32 %f<881>; .reg .b32 %r<518>; .reg .b64 %rd<124>; ld.param.u8 %rs4, [setemergentmagneticfieldsolidangle_param_13]; ld.param.u64 %rd7, 
[setemergentmagneticfieldsolidangle_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldsolidangle_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldsolidangle_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldsolidangle_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldsolidangle_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldsolidangle_param_5]; ld.param.u32 %r124, [setemergentmagneticfieldsolidangle_param_10]; ld.param.u32 %r125, [setemergentmagneticfieldsolidangle_param_11]; ld.param.u32 %r126, [setemergentmagneticfieldsolidangle_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r127, %ntid.x; mov.u32 %r128, %ctaid.x; mov.u32 %r129, %tid.x; mad.lo.s32 %r1, %r128, %r127, %r129; mov.u32 %r130, %ntid.y; mov.u32 %r131, %ctaid.y; mov.u32 %r132, %tid.y; mad.lo.s32 %r2, %r131, %r130, %r132; mov.u32 %r133, %ntid.z; mov.u32 %r134, %ctaid.z; mov.u32 %r135, %tid.z; mad.lo.s32 %r3, %r134, %r133, %r135; setp.ge.s32 %p6, %r1, %r124; setp.ge.s32 %p7, %r2, %r125; or.pred %p8, %p6, %p7; setp.ge.s32 %p9, %r3, %r126; or.pred %p10, %p8, %p9; @%p10 bra $L__BB0_202; mul.lo.s32 %r4, %r3, %r125; add.s32 %r136, %r4, %r2; mul.lo.s32 %r5, %r136, %r124; add.s32 %r137, %r5, %r1; mul.wide.s32 %rd13, %r137, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f176, %f2, %f2; fma.rn.f32 %f177, %f1, %f1, %f176; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f178, %f3, %f3, %f177; setp.eq.f32 %p11, %f178, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p11 bra $L__BB0_201; bra.uni $L__BB0_2; $L__BB0_201: mov.u32 %r460, 0; st.global.u32 [%rd4], %r460; st.global.u32 [%rd5], %r460; st.global.u32 [%rd6], %r460; bra.uni $L__BB0_202; $L__BB0_2: and.b16 %rs1, %rs4, 2; 
setp.eq.s16 %p12, %rs1, 0; add.s32 %r6, %r2, 1; @%p12 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r140, %r125, -1; min.s32 %r482, %r6, %r140; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r138, %r6, %r125; add.s32 %r139, %r138, %r125; rem.s32 %r482, %r139, %r125; $L__BB0_5: and.b16 %rs2, %rs4, 4; setp.eq.s16 %p13, %rs2, 0; add.s32 %r10, %r3, 1; @%p13 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r143, %r126, -1; min.s32 %r483, %r10, %r143; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r141, %r10, %r126; add.s32 %r142, %r141, %r126; rem.s32 %r483, %r142, %r126; $L__BB0_8: add.s32 %r14, %r2, -1; @%p12 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r484, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r144, %r14, %r125; add.s32 %r145, %r144, %r125; rem.s32 %r484, %r145, %r125; $L__BB0_11: add.s32 %r18, %r3, -1; @%p13 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r485, %r18, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r146, %r18, %r126; add.s32 %r147, %r146, %r126; rem.s32 %r485, %r147, %r126; $L__BB0_14: mad.lo.s32 %r148, %r485, %r125, %r2; mad.lo.s32 %r149, %r148, %r124, %r1; add.s32 %r150, %r482, %r4; mad.lo.s32 %r151, %r150, %r124, %r1; mul.wide.s32 %rd20, %r151, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f4, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f5, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f6, [%rd23]; mad.lo.s32 %r152, %r483, %r125, %r2; mad.lo.s32 %r153, %r152, %r124, %r1; mul.wide.s32 %rd24, %r153, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f7, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f8, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f9, [%rd27]; add.s32 %r154, %r484, %r4; mad.lo.s32 %r155, %r154, %r124, %r1; mul.wide.s32 %rd28, %r155, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f10, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f12, [%rd31]; mul.wide.s32 %rd32, %r149, 4; add.s64 %rd33, %rd3, %rd32; 
ld.global.nc.f32 %f13, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f14, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f15, [%rd35]; setp.ne.s16 %p16, %rs1, 0; setp.lt.s32 %p17, %r6, %r125; or.pred %p1, %p17, %p16; not.pred %p18, %p1; mov.f32 %f862, 0f00000000; @%p18 bra $L__BB0_28; setp.ge.s32 %p19, %r10, %r126; and.pred %p21, %p19, %p13; @%p21 bra $L__BB0_28; @%p13 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r158, %r126, -1; min.s32 %r486, %r10, %r158; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r156, %r10, %r126; add.s32 %r157, %r156, %r126; rem.s32 %r486, %r157, %r126; $L__BB0_19: @%p12 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r161, %r125, -1; min.s32 %r487, %r6, %r161; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r159, %r6, %r125; add.s32 %r160, %r159, %r125; rem.s32 %r487, %r160, %r125; $L__BB0_22: mad.lo.s32 %r162, %r486, %r125, %r487; mad.lo.s32 %r163, %r162, %r124, %r1; mul.wide.s32 %rd36, %r163, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f181, [%rd37]; ld.global.nc.f32 %f182, [%rd38]; mul.f32 %f183, %f182, %f182; fma.rn.f32 %f184, %f181, %f181, %f183; ld.global.nc.f32 %f185, [%rd39]; fma.rn.f32 %f16, %f185, %f185, %f184; mul.f32 %f186, %f6, %f8; mul.f32 %f187, %f5, %f9; sub.f32 %f188, %f187, %f186; mul.f32 %f189, %f4, %f9; mul.f32 %f190, %f6, %f7; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f5, %f7; mul.f32 %f193, %f4, %f8; sub.f32 %f194, %f193, %f192; mul.f32 %f195, %f2, %f191; fma.rn.f32 %f196, %f1, %f188, %f195; fma.rn.f32 %f17, %f3, %f194, %f196; mul.f32 %f197, %f2, %f5; fma.rn.f32 %f198, %f1, %f4, %f197; fma.rn.f32 %f199, %f3, %f6, %f198; add.f32 %f200, %f199, 0f3F800000; mul.f32 %f201, %f2, %f8; fma.rn.f32 %f202, %f1, %f7, %f201; fma.rn.f32 %f203, %f3, %f9, %f202; add.f32 %f204, %f200, %f203; mul.f32 %f205, %f5, %f8; fma.rn.f32 %f206, %f4, %f7, %f205; fma.rn.f32 %f207, %f6, %f9, %f206; add.f32 %f18, %f207, %f204; abs.f32 %f19, %f18; abs.f32 %f20, 
%f17; setp.eq.f32 %p24, %f19, 0f00000000; setp.eq.f32 %p25, %f20, 0f00000000; and.pred %p26, %p24, %p25; @%p26 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r174, %f18; shr.s32 %r175, %r174, 31; and.b32 %r176, %r175, 1078530011; mov.b32 %r177, %f17; and.b32 %r178, %r177, -2147483648; or.b32 %r179, %r178, %r176; mov.b32 %f857, %r179; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p27, %f19, 0f7F800000; setp.eq.f32 %p28, %f20, 0f7F800000; and.pred %p29, %p27, %p28; @%p29 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r169, %f18; setp.lt.s32 %p33, %r169, 0; selp.b32 %r170, 1075235812, 1061752795, %p33; mov.b32 %r171, %f17; and.b32 %r172, %r171, -2147483648; or.b32 %r173, %r172, %r170; mov.b32 %f857, %r173; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f208, %f20, %f19; min.f32 %f209, %f20, %f19; div.rn.f32 %f210, %f209, %f208; mul.rn.f32 %f211, %f210, %f210; mov.f32 %f212, 0fC0B59883; mov.f32 %f213, 0fBF52C7EA; fma.rn.f32 %f214, %f211, %f213, %f212; mov.f32 %f215, 0fC0D21907; fma.rn.f32 %f216, %f214, %f211, %f215; mul.f32 %f217, %f211, %f216; mul.f32 %f218, %f210, %f217; add.f32 %f219, %f211, 0f41355DC0; mov.f32 %f220, 0f41E6BD60; fma.rn.f32 %f221, %f219, %f211, %f220; mov.f32 %f222, 0f419D92C8; fma.rn.f32 %f223, %f221, %f211, %f222; rcp.rn.f32 %f224, %f223; fma.rn.f32 %f225, %f218, %f224, %f210; mov.f32 %f226, 0f3FC90FDB; sub.f32 %f227, %f226, %f225; setp.gt.f32 %p30, %f20, %f19; selp.f32 %f228, %f227, %f225, %p30; mov.b32 %r164, %f18; setp.lt.s32 %p31, %r164, 0; mov.f32 %f229, 0f40490FDB; sub.f32 %f230, %f229, %f228; selp.f32 %f231, %f230, %f228, %p31; mov.b32 %r165, %f231; mov.b32 %r166, %f17; and.b32 %r167, %r166, -2147483648; or.b32 %r168, %r167, %r165; mov.b32 %f232, %r168; add.f32 %f233, %f19, %f20; setp.le.f32 %p32, %f233, 0f7F800000; selp.f32 %f857, %f232, %f233, %p32; $L__BB0_27: add.f32 %f234, %f857, %f857; setp.eq.f32 %p34, %f16, 0f00000000; selp.f32 %f235, 0f3F800000, 0f3F000000, %p34; fma.rn.f32 %f862, %f235, %f234, 0f00000000; $L__BB0_28: 
setp.gt.s32 %p35, %r2, 0; or.pred %p2, %p35, %p16; not.pred %p37, %p2; @%p37 bra $L__BB0_55; setp.ge.s32 %p38, %r10, %r126; and.pred %p40, %p38, %p13; @%p40 bra $L__BB0_42; @%p13 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r182, %r126, -1; min.s32 %r488, %r10, %r182; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r180, %r10, %r126; add.s32 %r181, %r180, %r126; rem.s32 %r488, %r181, %r126; $L__BB0_33: @%p12 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r489, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r183, %r14, %r125; add.s32 %r184, %r183, %r125; rem.s32 %r489, %r184, %r125; $L__BB0_36: mad.lo.s32 %r185, %r488, %r125, %r489; mad.lo.s32 %r186, %r185, %r124, %r1; mul.wide.s32 %rd40, %r186, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f236, [%rd41]; ld.global.nc.f32 %f237, [%rd42]; mul.f32 %f238, %f237, %f237; fma.rn.f32 %f239, %f236, %f236, %f238; ld.global.nc.f32 %f240, [%rd43]; fma.rn.f32 %f27, %f240, %f240, %f239; mul.f32 %f241, %f9, %f11; mul.f32 %f242, %f8, %f12; sub.f32 %f243, %f242, %f241; mul.f32 %f244, %f7, %f12; mul.f32 %f245, %f9, %f10; sub.f32 %f246, %f245, %f244; mul.f32 %f247, %f8, %f10; mul.f32 %f248, %f7, %f11; sub.f32 %f249, %f248, %f247; mul.f32 %f250, %f2, %f246; fma.rn.f32 %f251, %f1, %f243, %f250; fma.rn.f32 %f28, %f3, %f249, %f251; mul.f32 %f252, %f2, %f8; fma.rn.f32 %f253, %f1, %f7, %f252; fma.rn.f32 %f254, %f3, %f9, %f253; add.f32 %f255, %f254, 0f3F800000; mul.f32 %f256, %f2, %f11; fma.rn.f32 %f257, %f1, %f10, %f256; fma.rn.f32 %f258, %f3, %f12, %f257; add.f32 %f259, %f255, %f258; mul.f32 %f260, %f8, %f11; fma.rn.f32 %f261, %f7, %f10, %f260; fma.rn.f32 %f262, %f9, %f12, %f261; add.f32 %f29, %f262, %f259; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p43, %f30, 0f00000000; setp.eq.f32 %p44, %f31, 0f00000000; and.pred %p45, %p43, %p44; @%p45 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r197, %f29; shr.s32 %r198, %r197, 31; and.b32 %r199, 
%r198, 1078530011; mov.b32 %r200, %f28; and.b32 %r201, %r200, -2147483648; or.b32 %r202, %r201, %r199; mov.b32 %f859, %r202; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p46, %f30, 0f7F800000; setp.eq.f32 %p47, %f31, 0f7F800000; and.pred %p48, %p46, %p47; @%p48 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r192, %f29; setp.lt.s32 %p52, %r192, 0; selp.b32 %r193, 1075235812, 1061752795, %p52; mov.b32 %r194, %f28; and.b32 %r195, %r194, -2147483648; or.b32 %r196, %r195, %r193; mov.b32 %f859, %r196; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f263, %f31, %f30; min.f32 %f264, %f31, %f30; div.rn.f32 %f265, %f264, %f263; mul.rn.f32 %f266, %f265, %f265; mov.f32 %f267, 0fC0B59883; mov.f32 %f268, 0fBF52C7EA; fma.rn.f32 %f269, %f266, %f268, %f267; mov.f32 %f270, 0fC0D21907; fma.rn.f32 %f271, %f269, %f266, %f270; mul.f32 %f272, %f266, %f271; mul.f32 %f273, %f265, %f272; add.f32 %f274, %f266, 0f41355DC0; mov.f32 %f275, 0f41E6BD60; fma.rn.f32 %f276, %f274, %f266, %f275; mov.f32 %f277, 0f419D92C8; fma.rn.f32 %f278, %f276, %f266, %f277; rcp.rn.f32 %f279, %f278; fma.rn.f32 %f280, %f273, %f279, %f265; mov.f32 %f281, 0f3FC90FDB; sub.f32 %f282, %f281, %f280; setp.gt.f32 %p49, %f31, %f30; selp.f32 %f283, %f282, %f280, %p49; mov.b32 %r187, %f29; setp.lt.s32 %p50, %r187, 0; mov.f32 %f284, 0f40490FDB; sub.f32 %f285, %f284, %f283; selp.f32 %f286, %f285, %f283, %p50; mov.b32 %r188, %f286; mov.b32 %r189, %f28; and.b32 %r190, %r189, -2147483648; or.b32 %r191, %r190, %r188; mov.b32 %f287, %r191; add.f32 %f288, %f30, %f31; setp.le.f32 %p51, %f288, 0f7F800000; selp.f32 %f859, %f287, %f288, %p51; $L__BB0_41: add.f32 %f289, %f859, %f859; setp.eq.f32 %p53, %f27, 0f00000000; selp.f32 %f290, 0f3F800000, 0f3F000000, %p53; fma.rn.f32 %f862, %f290, %f289, %f862; $L__BB0_42: setp.lt.s32 %p54, %r3, 1; and.pred %p56, %p54, %p13; @%p56 bra $L__BB0_55; @%p13 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r490, %r18, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r203, %r18, %r126; add.s32 
%r204, %r203, %r126; rem.s32 %r490, %r204, %r126; $L__BB0_46: @%p12 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r491, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r205, %r14, %r125; add.s32 %r206, %r205, %r125; rem.s32 %r491, %r206, %r125; $L__BB0_49: mad.lo.s32 %r207, %r490, %r125, %r491; mad.lo.s32 %r208, %r207, %r124, %r1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f291, [%rd45]; ld.global.nc.f32 %f292, [%rd46]; mul.f32 %f293, %f292, %f292; fma.rn.f32 %f294, %f291, %f291, %f293; ld.global.nc.f32 %f295, [%rd47]; fma.rn.f32 %f38, %f295, %f295, %f294; mul.f32 %f296, %f12, %f14; mul.f32 %f297, %f11, %f15; sub.f32 %f298, %f297, %f296; mul.f32 %f299, %f10, %f15; mul.f32 %f300, %f12, %f13; sub.f32 %f301, %f300, %f299; mul.f32 %f302, %f11, %f13; mul.f32 %f303, %f10, %f14; sub.f32 %f304, %f303, %f302; mul.f32 %f305, %f2, %f301; fma.rn.f32 %f306, %f1, %f298, %f305; fma.rn.f32 %f39, %f3, %f304, %f306; mul.f32 %f307, %f2, %f11; fma.rn.f32 %f308, %f1, %f10, %f307; fma.rn.f32 %f309, %f3, %f12, %f308; add.f32 %f310, %f309, 0f3F800000; mul.f32 %f311, %f2, %f14; fma.rn.f32 %f312, %f1, %f13, %f311; fma.rn.f32 %f313, %f3, %f15, %f312; add.f32 %f314, %f310, %f313; mul.f32 %f315, %f11, %f14; fma.rn.f32 %f316, %f10, %f13, %f315; fma.rn.f32 %f317, %f12, %f15, %f316; add.f32 %f40, %f317, %f314; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p59, %f41, 0f00000000; setp.eq.f32 %p60, %f42, 0f00000000; and.pred %p61, %p59, %p60; @%p61 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r219, %f40; shr.s32 %r220, %r219, 31; and.b32 %r221, %r220, 1078530011; mov.b32 %r222, %f39; and.b32 %r223, %r222, -2147483648; or.b32 %r224, %r223, %r221; mov.b32 %f861, %r224; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p62, %f41, 0f7F800000; setp.eq.f32 %p63, %f42, 0f7F800000; and.pred %p64, %p62, %p63; @%p64 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r214, %f40; setp.lt.s32 
%p68, %r214, 0; selp.b32 %r215, 1075235812, 1061752795, %p68; mov.b32 %r216, %f39; and.b32 %r217, %r216, -2147483648; or.b32 %r218, %r217, %r215; mov.b32 %f861, %r218; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f318, %f42, %f41; min.f32 %f319, %f42, %f41; div.rn.f32 %f320, %f319, %f318; mul.rn.f32 %f321, %f320, %f320; mov.f32 %f322, 0fC0B59883; mov.f32 %f323, 0fBF52C7EA; fma.rn.f32 %f324, %f321, %f323, %f322; mov.f32 %f325, 0fC0D21907; fma.rn.f32 %f326, %f324, %f321, %f325; mul.f32 %f327, %f321, %f326; mul.f32 %f328, %f320, %f327; add.f32 %f329, %f321, 0f41355DC0; mov.f32 %f330, 0f41E6BD60; fma.rn.f32 %f331, %f329, %f321, %f330; mov.f32 %f332, 0f419D92C8; fma.rn.f32 %f333, %f331, %f321, %f332; rcp.rn.f32 %f334, %f333; fma.rn.f32 %f335, %f328, %f334, %f320; mov.f32 %f336, 0f3FC90FDB; sub.f32 %f337, %f336, %f335; setp.gt.f32 %p65, %f42, %f41; selp.f32 %f338, %f337, %f335, %p65; mov.b32 %r209, %f40; setp.lt.s32 %p66, %r209, 0; mov.f32 %f339, 0f40490FDB; sub.f32 %f340, %f339, %f338; selp.f32 %f341, %f340, %f338, %p66; mov.b32 %r210, %f341; mov.b32 %r211, %f39; and.b32 %r212, %r211, -2147483648; or.b32 %r213, %r212, %r210; mov.b32 %f342, %r213; add.f32 %f343, %f41, %f42; setp.le.f32 %p67, %f343, 0f7F800000; selp.f32 %f861, %f342, %f343, %p67; $L__BB0_54: add.f32 %f344, %f861, %f861; setp.eq.f32 %p69, %f38, 0f00000000; selp.f32 %f345, 0f3F800000, 0f3F000000, %p69; fma.rn.f32 %f862, %f345, %f344, %f862; $L__BB0_55: @%p18 bra $L__BB0_69; setp.lt.s32 %p71, %r3, 1; and.pred %p73, %p71, %p13; @%p73 bra $L__BB0_69; @%p13 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r492, %r18, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r225, %r18, %r126; add.s32 %r226, %r225, %r126; rem.s32 %r492, %r226, %r126; $L__BB0_60: @%p12 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r229, %r125, -1; min.s32 %r493, %r6, %r229; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r227, %r6, %r125; add.s32 %r228, %r227, %r125; rem.s32 %r493, %r228, %r125; $L__BB0_63: mad.lo.s32 %r230, %r492, 
%r125, %r493; mad.lo.s32 %r231, %r230, %r124, %r1; mul.wide.s32 %rd48, %r231, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f346, [%rd49]; ld.global.nc.f32 %f347, [%rd50]; mul.f32 %f348, %f347, %f347; fma.rn.f32 %f349, %f346, %f346, %f348; ld.global.nc.f32 %f350, [%rd51]; fma.rn.f32 %f49, %f350, %f350, %f349; mul.f32 %f351, %f5, %f15; mul.f32 %f352, %f6, %f14; sub.f32 %f353, %f352, %f351; mul.f32 %f354, %f6, %f13; mul.f32 %f355, %f4, %f15; sub.f32 %f356, %f355, %f354; mul.f32 %f357, %f4, %f14; mul.f32 %f358, %f5, %f13; sub.f32 %f359, %f358, %f357; mul.f32 %f360, %f2, %f356; fma.rn.f32 %f361, %f1, %f353, %f360; fma.rn.f32 %f50, %f3, %f359, %f361; mul.f32 %f362, %f2, %f14; fma.rn.f32 %f363, %f1, %f13, %f362; fma.rn.f32 %f364, %f3, %f15, %f363; add.f32 %f365, %f364, 0f3F800000; mul.f32 %f366, %f2, %f5; fma.rn.f32 %f367, %f1, %f4, %f366; fma.rn.f32 %f368, %f3, %f6, %f367; add.f32 %f369, %f368, %f365; mul.f32 %f370, %f5, %f14; fma.rn.f32 %f371, %f4, %f13, %f370; fma.rn.f32 %f372, %f6, %f15, %f371; add.f32 %f51, %f372, %f369; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p76, %f52, 0f00000000; setp.eq.f32 %p77, %f53, 0f00000000; and.pred %p78, %p76, %p77; @%p78 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r242, %f51; shr.s32 %r243, %r242, 31; and.b32 %r244, %r243, 1078530011; mov.b32 %r245, %f50; and.b32 %r246, %r245, -2147483648; or.b32 %r247, %r244, %r246; mov.b32 %f863, %r247; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p79, %f52, 0f7F800000; setp.eq.f32 %p80, %f53, 0f7F800000; and.pred %p81, %p79, %p80; @%p81 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r237, %f51; setp.lt.s32 %p85, %r237, 0; selp.b32 %r238, 1075235812, 1061752795, %p85; mov.b32 %r239, %f50; and.b32 %r240, %r239, -2147483648; or.b32 %r241, %r238, %r240; mov.b32 %f863, %r241; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f373, %f53, %f52; min.f32 %f374, %f53, %f52; div.rn.f32 %f375, %f374, %f373; mul.rn.f32 %f376, 
%f375, %f375; mov.f32 %f377, 0fC0B59883; mov.f32 %f378, 0fBF52C7EA; fma.rn.f32 %f379, %f376, %f378, %f377; mov.f32 %f380, 0fC0D21907; fma.rn.f32 %f381, %f379, %f376, %f380; mul.f32 %f382, %f376, %f381; mul.f32 %f383, %f375, %f382; add.f32 %f384, %f376, 0f41355DC0; mov.f32 %f385, 0f41E6BD60; fma.rn.f32 %f386, %f384, %f376, %f385; mov.f32 %f387, 0f419D92C8; fma.rn.f32 %f388, %f386, %f376, %f387; rcp.rn.f32 %f389, %f388; fma.rn.f32 %f390, %f383, %f389, %f375; mov.f32 %f391, 0f3FC90FDB; sub.f32 %f392, %f391, %f390; setp.gt.f32 %p82, %f53, %f52; selp.f32 %f393, %f392, %f390, %p82; mov.b32 %r232, %f51; setp.lt.s32 %p83, %r232, 0; mov.f32 %f394, 0f40490FDB; sub.f32 %f395, %f394, %f393; selp.f32 %f396, %f395, %f393, %p83; mov.b32 %r233, %f396; mov.b32 %r234, %f50; and.b32 %r235, %r234, -2147483648; or.b32 %r236, %r235, %r233; mov.b32 %f397, %r236; add.f32 %f398, %f52, %f53; setp.le.f32 %p84, %f398, 0f7F800000; selp.f32 %f863, %f397, %f398, %p84; $L__BB0_68: add.f32 %f399, %f863, %f863; setp.eq.f32 %p86, %f49, 0f00000000; selp.f32 %f400, 0f3F800000, 0f3F000000, %p86; fma.rn.f32 %f862, %f400, %f399, %f862; $L__BB0_69: @%p13 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: add.s32 %r250, %r126, -1; min.s32 %r494, %r10, %r250; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r248, %r10, %r126; add.s32 %r249, %r248, %r126; rem.s32 %r494, %r249, %r126; $L__BB0_72: ld.param.u8 %rs7, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs3, %rs7, 1; setp.eq.s16 %p88, %rs3, 0; add.s32 %r49, %r1, 1; @%p88 bra $L__BB0_74; bra.uni $L__BB0_73; $L__BB0_74: add.s32 %r253, %r124, -1; min.s32 %r495, %r49, %r253; bra.uni $L__BB0_75; $L__BB0_73: rem.s32 %r251, %r49, %r124; add.s32 %r252, %r251, %r124; rem.s32 %r495, %r252, %r124; $L__BB0_75: @%p13 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r496, %r18, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r254, %r18, %r126; add.s32 %r255, %r254, %r126; rem.s32 %r496, %r255, %r126; $L__BB0_78: add.s32 %r56, %r1, -1; @%p88 bra $L__BB0_80; bra.uni 
$L__BB0_79; $L__BB0_80: max.s32 %r497, %r56, 0; bra.uni $L__BB0_81; $L__BB0_79: rem.s32 %r256, %r56, %r124; add.s32 %r257, %r256, %r124; rem.s32 %r497, %r257, %r124; $L__BB0_81: mad.lo.s32 %r258, %r494, %r125, %r2; mad.lo.s32 %r259, %r258, %r124, %r1; mul.wide.s32 %rd52, %r259, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f60, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f61, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f62, [%rd55]; add.s32 %r260, %r495, %r5; mul.wide.s32 %rd56, %r260, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f63, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f64, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f65, [%rd59]; mad.lo.s32 %r261, %r496, %r125, %r2; mad.lo.s32 %r262, %r261, %r124, %r1; mul.wide.s32 %rd60, %r262, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f66, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f67, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f68, [%rd63]; add.s32 %r263, %r497, %r5; mul.wide.s32 %rd64, %r263, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f69, [%rd65]; add.s64 %rd66, %rd2, %rd64; ld.global.nc.f32 %f70, [%rd66]; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f71, [%rd67]; setp.ge.s32 %p92, %r10, %r126; and.pred %p93, %p92, %p13; mov.f32 %f868, 0f00000000; @%p93 bra $L__BB0_95; setp.ge.s32 %p94, %r49, %r124; and.pred %p96, %p94, %p88; @%p96 bra $L__BB0_95; @%p13 bra $L__BB0_85; bra.uni $L__BB0_84; $L__BB0_85: add.s32 %r266, %r126, -1; min.s32 %r498, %r10, %r266; bra.uni $L__BB0_86; $L__BB0_84: rem.s32 %r264, %r10, %r126; add.s32 %r265, %r264, %r126; rem.s32 %r498, %r265, %r126; $L__BB0_86: @%p88 bra $L__BB0_88; bra.uni $L__BB0_87; $L__BB0_88: add.s32 %r269, %r124, -1; min.s32 %r499, %r49, %r269; bra.uni $L__BB0_89; $L__BB0_87: rem.s32 %r267, %r49, %r124; add.s32 %r268, %r267, %r124; rem.s32 %r499, %r268, %r124; $L__BB0_89: mad.lo.s32 %r270, %r498, %r125, %r2; mad.lo.s32 %r271, %r270, %r124, %r499; mul.wide.s32 %rd68, %r271, 4; add.s64 
%rd69, %rd3, %rd68; add.s64 %rd70, %rd2, %rd68; add.s64 %rd71, %rd1, %rd68; ld.global.nc.f32 %f403, [%rd69]; ld.global.nc.f32 %f404, [%rd70]; mul.f32 %f405, %f404, %f404; fma.rn.f32 %f406, %f403, %f403, %f405; ld.global.nc.f32 %f407, [%rd71]; fma.rn.f32 %f72, %f407, %f407, %f406; mul.f32 %f408, %f62, %f64; mul.f32 %f409, %f61, %f65; sub.f32 %f410, %f409, %f408; mul.f32 %f411, %f60, %f65; mul.f32 %f412, %f62, %f63; sub.f32 %f413, %f412, %f411; mul.f32 %f414, %f61, %f63; mul.f32 %f415, %f60, %f64; sub.f32 %f416, %f415, %f414; mul.f32 %f417, %f2, %f413; fma.rn.f32 %f418, %f1, %f410, %f417; fma.rn.f32 %f73, %f3, %f416, %f418; mul.f32 %f419, %f2, %f61; fma.rn.f32 %f420, %f1, %f60, %f419; fma.rn.f32 %f421, %f3, %f62, %f420; add.f32 %f422, %f421, 0f3F800000; mul.f32 %f423, %f2, %f64; fma.rn.f32 %f424, %f1, %f63, %f423; fma.rn.f32 %f425, %f3, %f65, %f424; add.f32 %f426, %f422, %f425; mul.f32 %f427, %f61, %f64; fma.rn.f32 %f428, %f60, %f63, %f427; fma.rn.f32 %f429, %f62, %f65, %f428; add.f32 %f74, %f429, %f426; abs.f32 %f75, %f74; abs.f32 %f76, %f73; setp.eq.f32 %p99, %f75, 0f00000000; setp.eq.f32 %p100, %f76, 0f00000000; and.pred %p101, %p99, %p100; @%p101 bra $L__BB0_93; bra.uni $L__BB0_90; $L__BB0_93: mov.b32 %r282, %f74; shr.s32 %r283, %r282, 31; and.b32 %r284, %r283, 1078530011; mov.b32 %r285, %f73; and.b32 %r286, %r285, -2147483648; or.b32 %r287, %r286, %r284; mov.b32 %f865, %r287; bra.uni $L__BB0_94; $L__BB0_90: setp.eq.f32 %p102, %f75, 0f7F800000; setp.eq.f32 %p103, %f76, 0f7F800000; and.pred %p104, %p102, %p103; @%p104 bra $L__BB0_92; bra.uni $L__BB0_91; $L__BB0_92: mov.b32 %r277, %f74; setp.lt.s32 %p108, %r277, 0; selp.b32 %r278, 1075235812, 1061752795, %p108; mov.b32 %r279, %f73; and.b32 %r280, %r279, -2147483648; or.b32 %r281, %r280, %r278; mov.b32 %f865, %r281; bra.uni $L__BB0_94; $L__BB0_91: max.f32 %f430, %f76, %f75; min.f32 %f431, %f76, %f75; div.rn.f32 %f432, %f431, %f430; mul.rn.f32 %f433, %f432, %f432; mov.f32 %f434, 0fC0B59883; mov.f32 %f435, 0fBF52C7EA; 
fma.rn.f32 %f436, %f433, %f435, %f434; mov.f32 %f437, 0fC0D21907; fma.rn.f32 %f438, %f436, %f433, %f437; mul.f32 %f439, %f433, %f438; mul.f32 %f440, %f432, %f439; add.f32 %f441, %f433, 0f41355DC0; mov.f32 %f442, 0f41E6BD60; fma.rn.f32 %f443, %f441, %f433, %f442; mov.f32 %f444, 0f419D92C8; fma.rn.f32 %f445, %f443, %f433, %f444; rcp.rn.f32 %f446, %f445; fma.rn.f32 %f447, %f440, %f446, %f432; mov.f32 %f448, 0f3FC90FDB; sub.f32 %f449, %f448, %f447; setp.gt.f32 %p105, %f76, %f75; selp.f32 %f450, %f449, %f447, %p105; mov.b32 %r272, %f74; setp.lt.s32 %p106, %r272, 0; mov.f32 %f451, 0f40490FDB; sub.f32 %f452, %f451, %f450; selp.f32 %f453, %f452, %f450, %p106; mov.b32 %r273, %f453; mov.b32 %r274, %f73; and.b32 %r275, %r274, -2147483648; or.b32 %r276, %r275, %r273; mov.b32 %f454, %r276; add.f32 %f455, %f75, %f76; setp.le.f32 %p107, %f455, 0f7F800000; selp.f32 %f865, %f454, %f455, %p107; $L__BB0_94: add.f32 %f456, %f865, %f865; setp.eq.f32 %p109, %f72, 0f00000000; selp.f32 %f457, 0f3F800000, 0f3F000000, %p109; fma.rn.f32 %f868, %f457, %f456, 0f00000000; $L__BB0_95: setp.lt.s32 %p110, %r3, 1; and.pred %p112, %p110, %p13; @%p112 bra $L__BB0_109; setp.ge.s32 %p113, %r49, %r124; and.pred %p115, %p113, %p88; @%p115 bra $L__BB0_109; @%p13 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: max.s32 %r500, %r18, 0; bra.uni $L__BB0_100; $L__BB0_98: rem.s32 %r288, %r18, %r126; add.s32 %r289, %r288, %r126; rem.s32 %r500, %r289, %r126; $L__BB0_100: @%p88 bra $L__BB0_102; bra.uni $L__BB0_101; $L__BB0_102: add.s32 %r292, %r124, -1; min.s32 %r501, %r49, %r292; bra.uni $L__BB0_103; $L__BB0_101: rem.s32 %r290, %r49, %r124; add.s32 %r291, %r290, %r124; rem.s32 %r501, %r291, %r124; $L__BB0_103: mad.lo.s32 %r293, %r500, %r125, %r2; mad.lo.s32 %r294, %r293, %r124, %r501; mul.wide.s32 %rd72, %r294, 4; add.s64 %rd73, %rd3, %rd72; add.s64 %rd74, %rd2, %rd72; add.s64 %rd75, %rd1, %rd72; ld.global.nc.f32 %f458, [%rd73]; ld.global.nc.f32 %f459, [%rd74]; mul.f32 %f460, %f459, %f459; fma.rn.f32 %f461, %f458, 
%f458, %f460; ld.global.nc.f32 %f462, [%rd75]; fma.rn.f32 %f83, %f462, %f462, %f461; mul.f32 %f463, %f65, %f67; mul.f32 %f464, %f64, %f68; sub.f32 %f465, %f464, %f463; mul.f32 %f466, %f63, %f68; mul.f32 %f467, %f65, %f66; sub.f32 %f468, %f467, %f466; mul.f32 %f469, %f64, %f66; mul.f32 %f470, %f63, %f67; sub.f32 %f471, %f470, %f469; mul.f32 %f472, %f2, %f468; fma.rn.f32 %f473, %f1, %f465, %f472; fma.rn.f32 %f84, %f3, %f471, %f473; mul.f32 %f474, %f2, %f64; fma.rn.f32 %f475, %f1, %f63, %f474; fma.rn.f32 %f476, %f3, %f65, %f475; add.f32 %f477, %f476, 0f3F800000; mul.f32 %f478, %f2, %f67; fma.rn.f32 %f479, %f1, %f66, %f478; fma.rn.f32 %f480, %f3, %f68, %f479; add.f32 %f481, %f477, %f480; mul.f32 %f482, %f64, %f67; fma.rn.f32 %f483, %f63, %f66, %f482; fma.rn.f32 %f484, %f65, %f68, %f483; add.f32 %f85, %f484, %f481; abs.f32 %f86, %f85; abs.f32 %f87, %f84; setp.eq.f32 %p118, %f86, 0f00000000; setp.eq.f32 %p119, %f87, 0f00000000; and.pred %p120, %p118, %p119; @%p120 bra $L__BB0_107; bra.uni $L__BB0_104; $L__BB0_107: mov.b32 %r305, %f85; shr.s32 %r306, %r305, 31; and.b32 %r307, %r306, 1078530011; mov.b32 %r308, %f84; and.b32 %r309, %r308, -2147483648; or.b32 %r310, %r309, %r307; mov.b32 %f867, %r310; bra.uni $L__BB0_108; $L__BB0_104: setp.eq.f32 %p121, %f86, 0f7F800000; setp.eq.f32 %p122, %f87, 0f7F800000; and.pred %p123, %p121, %p122; @%p123 bra $L__BB0_106; bra.uni $L__BB0_105; $L__BB0_106: mov.b32 %r300, %f85; setp.lt.s32 %p127, %r300, 0; selp.b32 %r301, 1075235812, 1061752795, %p127; mov.b32 %r302, %f84; and.b32 %r303, %r302, -2147483648; or.b32 %r304, %r303, %r301; mov.b32 %f867, %r304; bra.uni $L__BB0_108; $L__BB0_105: max.f32 %f485, %f87, %f86; min.f32 %f486, %f87, %f86; div.rn.f32 %f487, %f486, %f485; mul.rn.f32 %f488, %f487, %f487; mov.f32 %f489, 0fC0B59883; mov.f32 %f490, 0fBF52C7EA; fma.rn.f32 %f491, %f488, %f490, %f489; mov.f32 %f492, 0fC0D21907; fma.rn.f32 %f493, %f491, %f488, %f492; mul.f32 %f494, %f488, %f493; mul.f32 %f495, %f487, %f494; add.f32 %f496, 
%f488, 0f41355DC0; mov.f32 %f497, 0f41E6BD60; fma.rn.f32 %f498, %f496, %f488, %f497; mov.f32 %f499, 0f419D92C8; fma.rn.f32 %f500, %f498, %f488, %f499; rcp.rn.f32 %f501, %f500; fma.rn.f32 %f502, %f495, %f501, %f487; mov.f32 %f503, 0f3FC90FDB; sub.f32 %f504, %f503, %f502; setp.gt.f32 %p124, %f87, %f86; selp.f32 %f505, %f504, %f502, %p124; mov.b32 %r295, %f85; setp.lt.s32 %p125, %r295, 0; mov.f32 %f506, 0f40490FDB; sub.f32 %f507, %f506, %f505; selp.f32 %f508, %f507, %f505, %p125; mov.b32 %r296, %f508; mov.b32 %r297, %f84; and.b32 %r298, %r297, -2147483648; or.b32 %r299, %r298, %r296; mov.b32 %f509, %r299; add.f32 %f510, %f86, %f87; setp.le.f32 %p126, %f510, 0f7F800000; selp.f32 %f867, %f509, %f510, %p126; $L__BB0_108: add.f32 %f511, %f867, %f867; setp.eq.f32 %p128, %f83, 0f00000000; selp.f32 %f512, 0f3F800000, 0f3F000000, %p128; fma.rn.f32 %f868, %f512, %f511, %f868; $L__BB0_109: ld.param.u8 %rs6, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs5, %rs6, 1; setp.ne.s16 %p129, %rs5, 0; setp.gt.s32 %p130, %r1, 0; or.pred %p3, %p130, %p129; and.pred %p131, %p3, %p2; not.pred %p132, %p131; @%p132 bra $L__BB0_122; @%p13 bra $L__BB0_112; bra.uni $L__BB0_111; $L__BB0_112: max.s32 %r502, %r18, 0; bra.uni $L__BB0_113; $L__BB0_111: rem.s32 %r311, %r18, %r126; add.s32 %r312, %r311, %r126; rem.s32 %r502, %r312, %r126; $L__BB0_113: @%p88 bra $L__BB0_115; bra.uni $L__BB0_114; $L__BB0_115: max.s32 %r503, %r56, 0; bra.uni $L__BB0_116; $L__BB0_114: rem.s32 %r313, %r56, %r124; add.s32 %r314, %r313, %r124; rem.s32 %r503, %r314, %r124; $L__BB0_116: mad.lo.s32 %r315, %r502, %r125, %r2; mad.lo.s32 %r316, %r315, %r124, %r503; mul.wide.s32 %rd76, %r316, 4; add.s64 %rd77, %rd3, %rd76; add.s64 %rd78, %rd2, %rd76; add.s64 %rd79, %rd1, %rd76; ld.global.nc.f32 %f513, [%rd77]; ld.global.nc.f32 %f514, [%rd78]; mul.f32 %f515, %f514, %f514; fma.rn.f32 %f516, %f513, %f513, %f515; ld.global.nc.f32 %f517, [%rd79]; fma.rn.f32 %f94, %f517, %f517, %f516; mul.f32 %f518, %f68, %f70; mul.f32 %f519, 
%f67, %f71; sub.f32 %f520, %f519, %f518; mul.f32 %f521, %f66, %f71; mul.f32 %f522, %f68, %f69; sub.f32 %f523, %f522, %f521; mul.f32 %f524, %f67, %f69; mul.f32 %f525, %f66, %f70; sub.f32 %f526, %f525, %f524; mul.f32 %f527, %f2, %f523; fma.rn.f32 %f528, %f1, %f520, %f527; fma.rn.f32 %f95, %f3, %f526, %f528; mul.f32 %f529, %f2, %f67; fma.rn.f32 %f530, %f1, %f66, %f529; fma.rn.f32 %f531, %f3, %f68, %f530; add.f32 %f532, %f531, 0f3F800000; mul.f32 %f533, %f2, %f70; fma.rn.f32 %f534, %f1, %f69, %f533; fma.rn.f32 %f535, %f3, %f71, %f534; add.f32 %f536, %f532, %f535; mul.f32 %f537, %f67, %f70; fma.rn.f32 %f538, %f66, %f69, %f537; fma.rn.f32 %f539, %f68, %f71, %f538; add.f32 %f96, %f539, %f536; abs.f32 %f97, %f96; abs.f32 %f98, %f95; setp.eq.f32 %p135, %f97, 0f00000000; setp.eq.f32 %p136, %f98, 0f00000000; and.pred %p137, %p135, %p136; @%p137 bra $L__BB0_120; bra.uni $L__BB0_117; $L__BB0_120: mov.b32 %r327, %f96; shr.s32 %r328, %r327, 31; and.b32 %r329, %r328, 1078530011; mov.b32 %r330, %f95; and.b32 %r331, %r330, -2147483648; or.b32 %r332, %r331, %r329; mov.b32 %f869, %r332; bra.uni $L__BB0_121; $L__BB0_117: setp.eq.f32 %p138, %f97, 0f7F800000; setp.eq.f32 %p139, %f98, 0f7F800000; and.pred %p140, %p138, %p139; @%p140 bra $L__BB0_119; bra.uni $L__BB0_118; $L__BB0_119: mov.b32 %r322, %f96; setp.lt.s32 %p144, %r322, 0; selp.b32 %r323, 1075235812, 1061752795, %p144; mov.b32 %r324, %f95; and.b32 %r325, %r324, -2147483648; or.b32 %r326, %r325, %r323; mov.b32 %f869, %r326; bra.uni $L__BB0_121; $L__BB0_118: max.f32 %f540, %f98, %f97; min.f32 %f541, %f98, %f97; div.rn.f32 %f542, %f541, %f540; mul.rn.f32 %f543, %f542, %f542; mov.f32 %f544, 0fC0B59883; mov.f32 %f545, 0fBF52C7EA; fma.rn.f32 %f546, %f543, %f545, %f544; mov.f32 %f547, 0fC0D21907; fma.rn.f32 %f548, %f546, %f543, %f547; mul.f32 %f549, %f543, %f548; mul.f32 %f550, %f542, %f549; add.f32 %f551, %f543, 0f41355DC0; mov.f32 %f552, 0f41E6BD60; fma.rn.f32 %f553, %f551, %f543, %f552; mov.f32 %f554, 0f419D92C8; fma.rn.f32 %f555, 
%f553, %f543, %f554; rcp.rn.f32 %f556, %f555; fma.rn.f32 %f557, %f550, %f556, %f542; mov.f32 %f558, 0f3FC90FDB; sub.f32 %f559, %f558, %f557; setp.gt.f32 %p141, %f98, %f97; selp.f32 %f560, %f559, %f557, %p141; mov.b32 %r317, %f96; setp.lt.s32 %p142, %r317, 0; mov.f32 %f561, 0f40490FDB; sub.f32 %f562, %f561, %f560; selp.f32 %f563, %f562, %f560, %p142; mov.b32 %r318, %f563; mov.b32 %r319, %f95; and.b32 %r320, %r319, -2147483648; or.b32 %r321, %r320, %r318; mov.b32 %f564, %r321; add.f32 %f565, %f97, %f98; setp.le.f32 %p143, %f565, 0f7F800000; selp.f32 %f869, %f564, %f565, %p143; $L__BB0_121: add.f32 %f566, %f869, %f869; setp.eq.f32 %p145, %f94, 0f00000000; selp.f32 %f567, 0f3F800000, 0f3F000000, %p145; fma.rn.f32 %f868, %f567, %f566, %f868; $L__BB0_122: setp.lt.s32 %p146, %r49, %r124; or.pred %p4, %p146, %p129; and.pred %p5, %p4, %p2; not.pred %p148, %p5; @%p148 bra $L__BB0_135; @%p13 bra $L__BB0_125; bra.uni $L__BB0_124; $L__BB0_125: add.s32 %r335, %r126, -1; min.s32 %r504, %r10, %r335; bra.uni $L__BB0_126; $L__BB0_124: rem.s32 %r333, %r10, %r126; add.s32 %r334, %r333, %r126; rem.s32 %r504, %r334, %r126; $L__BB0_126: mad.lo.s32 %r81, %r504, %r125, %r2; @%p88 bra $L__BB0_128; bra.uni $L__BB0_127; $L__BB0_128: max.s32 %r505, %r56, 0; bra.uni $L__BB0_129; $L__BB0_127: rem.s32 %r336, %r56, %r124; add.s32 %r337, %r336, %r124; rem.s32 %r505, %r337, %r124; $L__BB0_129: mad.lo.s32 %r338, %r81, %r124, %r505; mul.wide.s32 %rd80, %r338, 4; add.s64 %rd81, %rd3, %rd80; add.s64 %rd82, %rd2, %rd80; add.s64 %rd83, %rd1, %rd80; ld.global.nc.f32 %f568, [%rd81]; ld.global.nc.f32 %f569, [%rd82]; mul.f32 %f570, %f569, %f569; fma.rn.f32 %f571, %f568, %f568, %f570; ld.global.nc.f32 %f572, [%rd83]; fma.rn.f32 %f105, %f572, %f572, %f571; mul.f32 %f573, %f61, %f71; mul.f32 %f574, %f62, %f70; sub.f32 %f575, %f574, %f573; mul.f32 %f576, %f62, %f69; mul.f32 %f577, %f60, %f71; sub.f32 %f578, %f577, %f576; mul.f32 %f579, %f60, %f70; mul.f32 %f580, %f61, %f69; sub.f32 %f581, %f580, %f579; mul.f32 
%f582, %f2, %f578; fma.rn.f32 %f583, %f1, %f575, %f582; fma.rn.f32 %f106, %f3, %f581, %f583; mul.f32 %f584, %f2, %f70; fma.rn.f32 %f585, %f1, %f69, %f584; fma.rn.f32 %f586, %f3, %f71, %f585; add.f32 %f587, %f586, 0f3F800000; mul.f32 %f588, %f2, %f61; fma.rn.f32 %f589, %f1, %f60, %f588; fma.rn.f32 %f590, %f3, %f62, %f589; add.f32 %f591, %f590, %f587; mul.f32 %f592, %f61, %f70; fma.rn.f32 %f593, %f60, %f69, %f592; fma.rn.f32 %f594, %f62, %f71, %f593; add.f32 %f107, %f594, %f591; abs.f32 %f108, %f107; abs.f32 %f109, %f106; setp.eq.f32 %p151, %f108, 0f00000000; setp.eq.f32 %p152, %f109, 0f00000000; and.pred %p153, %p151, %p152; @%p153 bra $L__BB0_133; bra.uni $L__BB0_130; $L__BB0_133: mov.b32 %r349, %f107; shr.s32 %r350, %r349, 31; and.b32 %r351, %r350, 1078530011; mov.b32 %r352, %f106; and.b32 %r353, %r352, -2147483648; or.b32 %r354, %r351, %r353; mov.b32 %f871, %r354; bra.uni $L__BB0_134; $L__BB0_130: setp.eq.f32 %p154, %f108, 0f7F800000; setp.eq.f32 %p155, %f109, 0f7F800000; and.pred %p156, %p154, %p155; @%p156 bra $L__BB0_132; bra.uni $L__BB0_131; $L__BB0_132: mov.b32 %r344, %f107; setp.lt.s32 %p160, %r344, 0; selp.b32 %r345, 1075235812, 1061752795, %p160; mov.b32 %r346, %f106; and.b32 %r347, %r346, -2147483648; or.b32 %r348, %r345, %r347; mov.b32 %f871, %r348; bra.uni $L__BB0_134; $L__BB0_131: max.f32 %f595, %f109, %f108; min.f32 %f596, %f109, %f108; div.rn.f32 %f597, %f596, %f595; mul.rn.f32 %f598, %f597, %f597; mov.f32 %f599, 0fC0B59883; mov.f32 %f600, 0fBF52C7EA; fma.rn.f32 %f601, %f598, %f600, %f599; mov.f32 %f602, 0fC0D21907; fma.rn.f32 %f603, %f601, %f598, %f602; mul.f32 %f604, %f598, %f603; mul.f32 %f605, %f597, %f604; add.f32 %f606, %f598, 0f41355DC0; mov.f32 %f607, 0f41E6BD60; fma.rn.f32 %f608, %f606, %f598, %f607; mov.f32 %f609, 0f419D92C8; fma.rn.f32 %f610, %f608, %f598, %f609; rcp.rn.f32 %f611, %f610; fma.rn.f32 %f612, %f605, %f611, %f597; mov.f32 %f613, 0f3FC90FDB; sub.f32 %f614, %f613, %f612; setp.gt.f32 %p157, %f109, %f108; selp.f32 %f615, %f614, 
%f612, %p157; mov.b32 %r339, %f107; setp.lt.s32 %p158, %r339, 0; mov.f32 %f616, 0f40490FDB; sub.f32 %f617, %f616, %f615; selp.f32 %f618, %f617, %f615, %p158; mov.b32 %r340, %f618; mov.b32 %r341, %f106; and.b32 %r342, %r341, -2147483648; or.b32 %r343, %r342, %r340; mov.b32 %f619, %r343; add.f32 %f620, %f108, %f109; setp.le.f32 %p159, %f620, 0f7F800000; selp.f32 %f871, %f619, %f620, %p159; $L__BB0_134: add.f32 %f621, %f871, %f871; setp.eq.f32 %p161, %f105, 0f00000000; selp.f32 %f622, 0f3F800000, 0f3F000000, %p161; fma.rn.f32 %f868, %f622, %f621, %f868; $L__BB0_135: @%p88 bra $L__BB0_137; bra.uni $L__BB0_136; $L__BB0_137: add.s32 %r357, %r124, -1; min.s32 %r506, %r49, %r357; bra.uni $L__BB0_138; $L__BB0_136: rem.s32 %r355, %r49, %r124; add.s32 %r356, %r355, %r124; rem.s32 %r506, %r356, %r124; $L__BB0_138: @%p12 bra $L__BB0_140; bra.uni $L__BB0_139; $L__BB0_140: add.s32 %r360, %r125, -1; min.s32 %r507, %r6, %r360; bra.uni $L__BB0_141; $L__BB0_139: rem.s32 %r358, %r6, %r125; add.s32 %r359, %r358, %r125; rem.s32 %r507, %r359, %r125; $L__BB0_141: @%p88 bra $L__BB0_143; bra.uni $L__BB0_142; $L__BB0_143: max.s32 %r508, %r56, 0; bra.uni $L__BB0_144; $L__BB0_142: rem.s32 %r361, %r56, %r124; add.s32 %r362, %r361, %r124; rem.s32 %r508, %r362, %r124; $L__BB0_144: add.s32 %r94, %r508, %r5; add.s32 %r95, %r506, %r5; @%p12 bra $L__BB0_146; bra.uni $L__BB0_145; $L__BB0_146: max.s32 %r509, %r14, 0; bra.uni $L__BB0_147; $L__BB0_145: rem.s32 %r363, %r14, %r125; add.s32 %r364, %r363, %r125; rem.s32 %r509, %r364, %r125; $L__BB0_147: add.s32 %r365, %r509, %r4; mad.lo.s32 %r366, %r365, %r124, %r1; mul.wide.s32 %rd84, %r95, 4; add.s64 %rd85, %rd3, %rd84; ld.global.nc.f32 %f116, [%rd85]; add.s64 %rd86, %rd2, %rd84; ld.global.nc.f32 %f117, [%rd86]; add.s64 %rd87, %rd1, %rd84; ld.global.nc.f32 %f118, [%rd87]; add.s32 %r367, %r507, %r4; mad.lo.s32 %r368, %r367, %r124, %r1; mul.wide.s32 %rd88, %r368, 4; add.s64 %rd89, %rd3, %rd88; ld.global.nc.f32 %f119, [%rd89]; add.s64 %rd90, %rd2, %rd88; 
ld.global.nc.f32 %f120, [%rd90]; add.s64 %rd91, %rd1, %rd88; ld.global.nc.f32 %f121, [%rd91]; mul.wide.s32 %rd92, %r94, 4; add.s64 %rd93, %rd3, %rd92; ld.global.nc.f32 %f122, [%rd93]; add.s64 %rd94, %rd2, %rd92; ld.global.nc.f32 %f123, [%rd94]; add.s64 %rd95, %rd1, %rd92; ld.global.nc.f32 %f124, [%rd95]; mul.wide.s32 %rd96, %r366, 4; add.s64 %rd97, %rd3, %rd96; ld.global.nc.f32 %f125, [%rd97]; add.s64 %rd98, %rd2, %rd96; ld.global.nc.f32 %f126, [%rd98]; add.s64 %rd99, %rd1, %rd96; ld.global.nc.f32 %f127, [%rd99]; and.pred %p166, %p4, %p1; mov.f32 %f878, 0f00000000; not.pred %p167, %p166; @%p167 bra $L__BB0_160; @%p12 bra $L__BB0_150; bra.uni $L__BB0_149; $L__BB0_150: add.s32 %r371, %r125, -1; min.s32 %r510, %r6, %r371; bra.uni $L__BB0_151; $L__BB0_149: rem.s32 %r369, %r6, %r125; add.s32 %r370, %r369, %r125; rem.s32 %r510, %r370, %r125; $L__BB0_151: @%p88 bra $L__BB0_153; bra.uni $L__BB0_152; $L__BB0_153: add.s32 %r374, %r124, -1; min.s32 %r511, %r49, %r374; bra.uni $L__BB0_154; $L__BB0_152: rem.s32 %r372, %r49, %r124; add.s32 %r373, %r372, %r124; rem.s32 %r511, %r373, %r124; $L__BB0_154: add.s32 %r375, %r510, %r4; mad.lo.s32 %r376, %r375, %r124, %r511; mul.wide.s32 %rd100, %r376, 4; add.s64 %rd101, %rd3, %rd100; add.s64 %rd102, %rd2, %rd100; add.s64 %rd103, %rd1, %rd100; ld.global.nc.f32 %f624, [%rd101]; ld.global.nc.f32 %f625, [%rd102]; mul.f32 %f626, %f625, %f625; fma.rn.f32 %f627, %f624, %f624, %f626; ld.global.nc.f32 %f628, [%rd103]; fma.rn.f32 %f128, %f628, %f628, %f627; mul.f32 %f629, %f118, %f120; mul.f32 %f630, %f117, %f121; sub.f32 %f631, %f630, %f629; mul.f32 %f632, %f116, %f121; mul.f32 %f633, %f118, %f119; sub.f32 %f634, %f633, %f632; mul.f32 %f635, %f117, %f119; mul.f32 %f636, %f116, %f120; sub.f32 %f637, %f636, %f635; mul.f32 %f638, %f2, %f634; fma.rn.f32 %f639, %f1, %f631, %f638; fma.rn.f32 %f129, %f3, %f637, %f639; mul.f32 %f640, %f2, %f117; fma.rn.f32 %f641, %f1, %f116, %f640; fma.rn.f32 %f642, %f3, %f118, %f641; add.f32 %f643, %f642, 0f3F800000; 
mul.f32 %f644, %f2, %f120; fma.rn.f32 %f645, %f1, %f119, %f644; fma.rn.f32 %f646, %f3, %f121, %f645; add.f32 %f647, %f643, %f646; mul.f32 %f648, %f117, %f120; fma.rn.f32 %f649, %f116, %f119, %f648; fma.rn.f32 %f650, %f118, %f121, %f649; add.f32 %f130, %f650, %f647; abs.f32 %f131, %f130; abs.f32 %f132, %f129; setp.eq.f32 %p170, %f131, 0f00000000; setp.eq.f32 %p171, %f132, 0f00000000; and.pred %p172, %p170, %p171; @%p172 bra $L__BB0_158; bra.uni $L__BB0_155; $L__BB0_158: mov.b32 %r387, %f130; shr.s32 %r388, %r387, 31; and.b32 %r389, %r388, 1078530011; mov.b32 %r390, %f129; and.b32 %r391, %r390, -2147483648; or.b32 %r392, %r391, %r389; mov.b32 %f873, %r392; bra.uni $L__BB0_159; $L__BB0_155: setp.eq.f32 %p173, %f131, 0f7F800000; setp.eq.f32 %p174, %f132, 0f7F800000; and.pred %p175, %p173, %p174; @%p175 bra $L__BB0_157; bra.uni $L__BB0_156; $L__BB0_157: mov.b32 %r382, %f130; setp.lt.s32 %p179, %r382, 0; selp.b32 %r383, 1075235812, 1061752795, %p179; mov.b32 %r384, %f129; and.b32 %r385, %r384, -2147483648; or.b32 %r386, %r385, %r383; mov.b32 %f873, %r386; bra.uni $L__BB0_159; $L__BB0_156: max.f32 %f651, %f132, %f131; min.f32 %f652, %f132, %f131; div.rn.f32 %f653, %f652, %f651; mul.rn.f32 %f654, %f653, %f653; mov.f32 %f655, 0fC0B59883; mov.f32 %f656, 0fBF52C7EA; fma.rn.f32 %f657, %f654, %f656, %f655; mov.f32 %f658, 0fC0D21907; fma.rn.f32 %f659, %f657, %f654, %f658; mul.f32 %f660, %f654, %f659; mul.f32 %f661, %f653, %f660; add.f32 %f662, %f654, 0f41355DC0; mov.f32 %f663, 0f41E6BD60; fma.rn.f32 %f664, %f662, %f654, %f663; mov.f32 %f665, 0f419D92C8; fma.rn.f32 %f666, %f664, %f654, %f665; rcp.rn.f32 %f667, %f666; fma.rn.f32 %f668, %f661, %f667, %f653; mov.f32 %f669, 0f3FC90FDB; sub.f32 %f670, %f669, %f668; setp.gt.f32 %p176, %f132, %f131; selp.f32 %f671, %f670, %f668, %p176; mov.b32 %r377, %f130; setp.lt.s32 %p177, %r377, 0; mov.f32 %f672, 0f40490FDB; sub.f32 %f673, %f672, %f671; selp.f32 %f674, %f673, %f671, %p177; mov.b32 %r378, %f674; mov.b32 %r379, %f129; and.b32 %r380, 
%r379, -2147483648; or.b32 %r381, %r380, %r378; mov.b32 %f675, %r381; add.f32 %f676, %f131, %f132; setp.le.f32 %p178, %f676, 0f7F800000; selp.f32 %f873, %f675, %f676, %p178; $L__BB0_159: add.f32 %f677, %f873, %f873; setp.eq.f32 %p180, %f128, 0f00000000; selp.f32 %f678, 0f3F800000, 0f3F000000, %p180; fma.rn.f32 %f878, %f678, %f677, 0f00000000; $L__BB0_160: not.pred %p181, %p3; @%p181 bra $L__BB0_187; @%p18 bra $L__BB0_174; @%p12 bra $L__BB0_164; bra.uni $L__BB0_163; $L__BB0_164: add.s32 %r395, %r125, -1; min.s32 %r512, %r6, %r395; bra.uni $L__BB0_165; $L__BB0_163: rem.s32 %r393, %r6, %r125; add.s32 %r394, %r393, %r125; rem.s32 %r512, %r394, %r125; $L__BB0_165: @%p88 bra $L__BB0_167; bra.uni $L__BB0_166; $L__BB0_167: max.s32 %r513, %r56, 0; bra.uni $L__BB0_168; $L__BB0_166: rem.s32 %r396, %r56, %r124; add.s32 %r397, %r396, %r124; rem.s32 %r513, %r397, %r124; $L__BB0_168: add.s32 %r398, %r512, %r4; mad.lo.s32 %r399, %r398, %r124, %r513; mul.wide.s32 %rd104, %r399, 4; add.s64 %rd105, %rd3, %rd104; add.s64 %rd106, %rd2, %rd104; add.s64 %rd107, %rd1, %rd104; ld.global.nc.f32 %f679, [%rd105]; ld.global.nc.f32 %f680, [%rd106]; mul.f32 %f681, %f680, %f680; fma.rn.f32 %f682, %f679, %f679, %f681; ld.global.nc.f32 %f683, [%rd107]; fma.rn.f32 %f139, %f683, %f683, %f682; mul.f32 %f684, %f121, %f123; mul.f32 %f685, %f120, %f124; sub.f32 %f686, %f685, %f684; mul.f32 %f687, %f119, %f124; mul.f32 %f688, %f121, %f122; sub.f32 %f689, %f688, %f687; mul.f32 %f690, %f120, %f122; mul.f32 %f691, %f119, %f123; sub.f32 %f692, %f691, %f690; mul.f32 %f693, %f2, %f689; fma.rn.f32 %f694, %f1, %f686, %f693; fma.rn.f32 %f140, %f3, %f692, %f694; mul.f32 %f695, %f2, %f120; fma.rn.f32 %f696, %f1, %f119, %f695; fma.rn.f32 %f697, %f3, %f121, %f696; add.f32 %f698, %f697, 0f3F800000; mul.f32 %f699, %f2, %f123; fma.rn.f32 %f700, %f1, %f122, %f699; fma.rn.f32 %f701, %f3, %f124, %f700; add.f32 %f702, %f698, %f701; mul.f32 %f703, %f120, %f123; fma.rn.f32 %f704, %f119, %f122, %f703; fma.rn.f32 %f705, %f121, 
%f124, %f704; add.f32 %f141, %f705, %f702; abs.f32 %f142, %f141; abs.f32 %f143, %f140; setp.eq.f32 %p185, %f142, 0f00000000; setp.eq.f32 %p186, %f143, 0f00000000; and.pred %p187, %p185, %p186; @%p187 bra $L__BB0_172; bra.uni $L__BB0_169; $L__BB0_172: mov.b32 %r410, %f141; shr.s32 %r411, %r410, 31; and.b32 %r412, %r411, 1078530011; mov.b32 %r413, %f140; and.b32 %r414, %r413, -2147483648; or.b32 %r415, %r414, %r412; mov.b32 %f875, %r415; bra.uni $L__BB0_173; $L__BB0_169: setp.eq.f32 %p188, %f142, 0f7F800000; setp.eq.f32 %p189, %f143, 0f7F800000; and.pred %p190, %p188, %p189; @%p190 bra $L__BB0_171; bra.uni $L__BB0_170; $L__BB0_171: mov.b32 %r405, %f141; setp.lt.s32 %p194, %r405, 0; selp.b32 %r406, 1075235812, 1061752795, %p194; mov.b32 %r407, %f140; and.b32 %r408, %r407, -2147483648; or.b32 %r409, %r408, %r406; mov.b32 %f875, %r409; bra.uni $L__BB0_173; $L__BB0_170: max.f32 %f706, %f143, %f142; min.f32 %f707, %f143, %f142; div.rn.f32 %f708, %f707, %f706; mul.rn.f32 %f709, %f708, %f708; mov.f32 %f710, 0fC0B59883; mov.f32 %f711, 0fBF52C7EA; fma.rn.f32 %f712, %f709, %f711, %f710; mov.f32 %f713, 0fC0D21907; fma.rn.f32 %f714, %f712, %f709, %f713; mul.f32 %f715, %f709, %f714; mul.f32 %f716, %f708, %f715; add.f32 %f717, %f709, 0f41355DC0; mov.f32 %f718, 0f41E6BD60; fma.rn.f32 %f719, %f717, %f709, %f718; mov.f32 %f720, 0f419D92C8; fma.rn.f32 %f721, %f719, %f709, %f720; rcp.rn.f32 %f722, %f721; fma.rn.f32 %f723, %f716, %f722, %f708; mov.f32 %f724, 0f3FC90FDB; sub.f32 %f725, %f724, %f723; setp.gt.f32 %p191, %f143, %f142; selp.f32 %f726, %f725, %f723, %p191; mov.b32 %r400, %f141; setp.lt.s32 %p192, %r400, 0; mov.f32 %f727, 0f40490FDB; sub.f32 %f728, %f727, %f726; selp.f32 %f729, %f728, %f726, %p192; mov.b32 %r401, %f729; mov.b32 %r402, %f140; and.b32 %r403, %r402, -2147483648; or.b32 %r404, %r403, %r401; mov.b32 %f730, %r404; add.f32 %f731, %f142, %f143; setp.le.f32 %p193, %f731, 0f7F800000; selp.f32 %f875, %f730, %f731, %p193; $L__BB0_173: add.f32 %f732, %f875, %f875; 
setp.eq.f32 %p195, %f139, 0f00000000; selp.f32 %f733, 0f3F800000, 0f3F000000, %p195; fma.rn.f32 %f878, %f733, %f732, %f878; $L__BB0_174: not.pred %p224, %p2; @%p224 bra $L__BB0_187; @%p12 bra $L__BB0_177; bra.uni $L__BB0_176; $L__BB0_177: max.s32 %r514, %r14, 0; bra.uni $L__BB0_178; $L__BB0_176: rem.s32 %r416, %r14, %r125; add.s32 %r417, %r416, %r125; rem.s32 %r514, %r417, %r125; $L__BB0_178: @%p88 bra $L__BB0_180; bra.uni $L__BB0_179; $L__BB0_180: max.s32 %r515, %r56, 0; bra.uni $L__BB0_181; $L__BB0_179: rem.s32 %r418, %r56, %r124; add.s32 %r419, %r418, %r124; rem.s32 %r515, %r419, %r124; $L__BB0_181: add.s32 %r420, %r514, %r4; mad.lo.s32 %r421, %r420, %r124, %r515; mul.wide.s32 %rd108, %r421, 4; add.s64 %rd109, %rd3, %rd108; add.s64 %rd110, %rd2, %rd108; add.s64 %rd111, %rd1, %rd108; ld.global.nc.f32 %f734, [%rd109]; ld.global.nc.f32 %f735, [%rd110]; mul.f32 %f736, %f735, %f735; fma.rn.f32 %f737, %f734, %f734, %f736; ld.global.nc.f32 %f738, [%rd111]; fma.rn.f32 %f150, %f738, %f738, %f737; mul.f32 %f739, %f124, %f126; mul.f32 %f740, %f123, %f127; sub.f32 %f741, %f740, %f739; mul.f32 %f742, %f122, %f127; mul.f32 %f743, %f124, %f125; sub.f32 %f744, %f743, %f742; mul.f32 %f745, %f123, %f125; mul.f32 %f746, %f122, %f126; sub.f32 %f747, %f746, %f745; mul.f32 %f748, %f2, %f744; fma.rn.f32 %f749, %f1, %f741, %f748; fma.rn.f32 %f151, %f3, %f747, %f749; mul.f32 %f750, %f2, %f123; fma.rn.f32 %f751, %f1, %f122, %f750; fma.rn.f32 %f752, %f3, %f124, %f751; add.f32 %f753, %f752, 0f3F800000; mul.f32 %f754, %f2, %f126; fma.rn.f32 %f755, %f1, %f125, %f754; fma.rn.f32 %f756, %f3, %f127, %f755; add.f32 %f757, %f753, %f756; mul.f32 %f758, %f123, %f126; fma.rn.f32 %f759, %f122, %f125, %f758; fma.rn.f32 %f760, %f124, %f127, %f759; add.f32 %f152, %f760, %f757; abs.f32 %f153, %f152; abs.f32 %f154, %f151; setp.eq.f32 %p199, %f153, 0f00000000; setp.eq.f32 %p200, %f154, 0f00000000; and.pred %p201, %p199, %p200; @%p201 bra $L__BB0_185; bra.uni $L__BB0_182; $L__BB0_185: mov.b32 %r432, %f152; 
shr.s32 %r433, %r432, 31; and.b32 %r434, %r433, 1078530011; mov.b32 %r435, %f151; and.b32 %r436, %r435, -2147483648; or.b32 %r437, %r436, %r434; mov.b32 %f877, %r437; bra.uni $L__BB0_186; $L__BB0_182: setp.eq.f32 %p202, %f153, 0f7F800000; setp.eq.f32 %p203, %f154, 0f7F800000; and.pred %p204, %p202, %p203; @%p204 bra $L__BB0_184; bra.uni $L__BB0_183; $L__BB0_184: mov.b32 %r427, %f152; setp.lt.s32 %p208, %r427, 0; selp.b32 %r428, 1075235812, 1061752795, %p208; mov.b32 %r429, %f151; and.b32 %r430, %r429, -2147483648; or.b32 %r431, %r430, %r428; mov.b32 %f877, %r431; bra.uni $L__BB0_186; $L__BB0_183: max.f32 %f761, %f154, %f153; min.f32 %f762, %f154, %f153; div.rn.f32 %f763, %f762, %f761; mul.rn.f32 %f764, %f763, %f763; mov.f32 %f765, 0fC0B59883; mov.f32 %f766, 0fBF52C7EA; fma.rn.f32 %f767, %f764, %f766, %f765; mov.f32 %f768, 0fC0D21907; fma.rn.f32 %f769, %f767, %f764, %f768; mul.f32 %f770, %f764, %f769; mul.f32 %f771, %f763, %f770; add.f32 %f772, %f764, 0f41355DC0; mov.f32 %f773, 0f41E6BD60; fma.rn.f32 %f774, %f772, %f764, %f773; mov.f32 %f775, 0f419D92C8; fma.rn.f32 %f776, %f774, %f764, %f775; rcp.rn.f32 %f777, %f776; fma.rn.f32 %f778, %f771, %f777, %f763; mov.f32 %f779, 0f3FC90FDB; sub.f32 %f780, %f779, %f778; setp.gt.f32 %p205, %f154, %f153; selp.f32 %f781, %f780, %f778, %p205; mov.b32 %r422, %f152; setp.lt.s32 %p206, %r422, 0; mov.f32 %f782, 0f40490FDB; sub.f32 %f783, %f782, %f781; selp.f32 %f784, %f783, %f781, %p206; mov.b32 %r423, %f784; mov.b32 %r424, %f151; and.b32 %r425, %r424, -2147483648; or.b32 %r426, %r425, %r423; mov.b32 %f785, %r426; add.f32 %f786, %f153, %f154; setp.le.f32 %p207, %f786, 0f7F800000; selp.f32 %f877, %f785, %f786, %p207; $L__BB0_186: add.f32 %f787, %f877, %f877; setp.eq.f32 %p209, %f150, 0f00000000; selp.f32 %f788, 0f3F800000, 0f3F000000, %p209; fma.rn.f32 %f878, %f788, %f787, %f878; $L__BB0_187: @%p148 bra $L__BB0_200; @%p12 bra $L__BB0_190; bra.uni $L__BB0_189; $L__BB0_190: max.s32 %r516, %r14, 0; bra.uni $L__BB0_191; $L__BB0_189: 
rem.s32 %r438, %r14, %r125; add.s32 %r439, %r438, %r125; rem.s32 %r516, %r439, %r125; $L__BB0_191: add.s32 %r120, %r516, %r4; @%p88 bra $L__BB0_193; bra.uni $L__BB0_192; $L__BB0_193: add.s32 %r442, %r124, -1; min.s32 %r517, %r49, %r442; bra.uni $L__BB0_194; $L__BB0_192: rem.s32 %r440, %r49, %r124; add.s32 %r441, %r440, %r124; rem.s32 %r517, %r441, %r124; $L__BB0_194: mad.lo.s32 %r443, %r120, %r124, %r517; mul.wide.s32 %rd112, %r443, 4; add.s64 %rd113, %rd3, %rd112; add.s64 %rd114, %rd2, %rd112; add.s64 %rd115, %rd1, %rd112; ld.global.nc.f32 %f789, [%rd113]; ld.global.nc.f32 %f790, [%rd114]; mul.f32 %f791, %f790, %f790; fma.rn.f32 %f792, %f789, %f789, %f791; ld.global.nc.f32 %f793, [%rd115]; fma.rn.f32 %f161, %f793, %f793, %f792; mul.f32 %f794, %f117, %f127; mul.f32 %f795, %f118, %f126; sub.f32 %f796, %f795, %f794; mul.f32 %f797, %f118, %f125; mul.f32 %f798, %f116, %f127; sub.f32 %f799, %f798, %f797; mul.f32 %f800, %f116, %f126; mul.f32 %f801, %f117, %f125; sub.f32 %f802, %f801, %f800; mul.f32 %f803, %f2, %f799; fma.rn.f32 %f804, %f1, %f796, %f803; fma.rn.f32 %f162, %f3, %f802, %f804; mul.f32 %f805, %f2, %f126; fma.rn.f32 %f806, %f1, %f125, %f805; fma.rn.f32 %f807, %f3, %f127, %f806; add.f32 %f808, %f807, 0f3F800000; mul.f32 %f809, %f2, %f117; fma.rn.f32 %f810, %f1, %f116, %f809; fma.rn.f32 %f811, %f3, %f118, %f810; add.f32 %f812, %f811, %f808; mul.f32 %f813, %f117, %f126; fma.rn.f32 %f814, %f116, %f125, %f813; fma.rn.f32 %f815, %f118, %f127, %f814; add.f32 %f163, %f815, %f812; abs.f32 %f164, %f163; abs.f32 %f165, %f162; setp.eq.f32 %p213, %f164, 0f00000000; setp.eq.f32 %p214, %f165, 0f00000000; and.pred %p215, %p213, %p214; @%p215 bra $L__BB0_198; bra.uni $L__BB0_195; $L__BB0_198: mov.b32 %r454, %f163; shr.s32 %r455, %r454, 31; and.b32 %r456, %r455, 1078530011; mov.b32 %r457, %f162; and.b32 %r458, %r457, -2147483648; or.b32 %r459, %r456, %r458; mov.b32 %f879, %r459; bra.uni $L__BB0_199; $L__BB0_195: setp.eq.f32 %p216, %f164, 0f7F800000; setp.eq.f32 %p217, %f165, 
0f7F800000; and.pred %p218, %p216, %p217; @%p218 bra $L__BB0_197; bra.uni $L__BB0_196; $L__BB0_197: mov.b32 %r449, %f163; setp.lt.s32 %p222, %r449, 0; selp.b32 %r450, 1075235812, 1061752795, %p222; mov.b32 %r451, %f162; and.b32 %r452, %r451, -2147483648; or.b32 %r453, %r450, %r452; mov.b32 %f879, %r453; bra.uni $L__BB0_199; $L__BB0_196: max.f32 %f816, %f165, %f164; min.f32 %f817, %f165, %f164; div.rn.f32 %f818, %f817, %f816; mul.rn.f32 %f819, %f818, %f818; mov.f32 %f820, 0fC0B59883; mov.f32 %f821, 0fBF52C7EA; fma.rn.f32 %f822, %f819, %f821, %f820; mov.f32 %f823, 0fC0D21907; fma.rn.f32 %f824, %f822, %f819, %f823; mul.f32 %f825, %f819, %f824; mul.f32 %f826, %f818, %f825; add.f32 %f827, %f819, 0f41355DC0; mov.f32 %f828, 0f41E6BD60; fma.rn.f32 %f829, %f827, %f819, %f828; mov.f32 %f830, 0f419D92C8; fma.rn.f32 %f831, %f829, %f819, %f830; rcp.rn.f32 %f832, %f831; fma.rn.f32 %f833, %f826, %f832, %f818; mov.f32 %f834, 0f3FC90FDB; sub.f32 %f835, %f834, %f833; setp.gt.f32 %p219, %f165, %f164; selp.f32 %f836, %f835, %f833, %p219; mov.b32 %r444, %f163; setp.lt.s32 %p220, %r444, 0; mov.f32 %f837, 0f40490FDB; sub.f32 %f838, %f837, %f836; selp.f32 %f839, %f838, %f836, %p220; mov.b32 %r445, %f839; mov.b32 %r446, %f162; and.b32 %r447, %r446, -2147483648; or.b32 %r448, %r447, %r445; mov.b32 %f840, %r448; add.f32 %f841, %f164, %f165; setp.le.f32 %p221, %f841, 0f7F800000; selp.f32 %f879, %f840, %f841, %p221; $L__BB0_199: add.f32 %f842, %f879, %f879; setp.eq.f32 %p223, %f161, 0f00000000; selp.f32 %f843, 0f3F800000, 0f3F000000, %p223; fma.rn.f32 %f878, %f843, %f842, %f878; $L__BB0_200: ld.param.u64 %rd123, [setemergentmagneticfieldsolidangle_param_1]; mov.u32 %r481, %tid.x; mov.u32 %r480, %ntid.x; mov.u32 %r479, %ctaid.x; mad.lo.s32 %r478, %r479, %r480, %r481; add.s32 %r477, %r5, %r478; mul.wide.s32 %rd122, %r477, 4; cvta.to.global.u64 %rd121, %rd123; add.s64 %rd120, %rd121, %rd122; ld.param.f32 %f856, [setemergentmagneticfieldsolidangle_param_9]; ld.param.f32 %f855, 
[setemergentmagneticfieldsolidangle_param_8]; ld.param.u64 %rd119, [setemergentmagneticfieldsolidangle_param_0]; mov.u32 %r476, %tid.z; mov.u32 %r475, %ntid.z; mov.u32 %r474, %ctaid.z; mad.lo.s32 %r473, %r474, %r475, %r476; mov.u32 %r472, %tid.y; mov.u32 %r471, %ntid.y; mov.u32 %r470, %ctaid.y; mad.lo.s32 %r469, %r470, %r471, %r472; mul.lo.s32 %r468, %r473, %r125; add.s32 %r467, %r468, %r469; mov.u32 %r466, %tid.x; mov.u32 %r465, %ntid.x; mov.u32 %r464, %ctaid.x; mad.lo.s32 %r463, %r464, %r465, %r466; mul.lo.s32 %r462, %r467, %r124; add.s32 %r461, %r462, %r463; mul.wide.s32 %rd118, %r461, 4; cvta.to.global.u64 %rd117, %rd119; add.s64 %rd116, %rd117, %rd118; ld.param.f32 %f854, [setemergentmagneticfieldsolidangle_param_7]; ld.param.f32 %f853, [setemergentmagneticfieldsolidangle_param_6]; add.f32 %f844, %f853, %f853; mul.f32 %f845, %f844, %f854; mul.f32 %f846, %f845, %f862; st.global.f32 [%rd116], %f846; mul.f32 %f847, %f844, %f855; mul.f32 %f848, %f847, %f868; st.global.f32 [%rd120], %f848; mul.f32 %f849, %f844, %f856; mul.f32 %f850, %f849, %f878; st.global.f32 [%rd6], %f850; $L__BB0_202: ret; } ` setemergentmagneticfieldsolidangle_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl setemergentmagneticfieldsolidangle .visible .entry setemergentmagneticfieldsolidangle( .param .u64 setemergentmagneticfieldsolidangle_param_0, .param .u64 setemergentmagneticfieldsolidangle_param_1, .param .u64 setemergentmagneticfieldsolidangle_param_2, .param .u64 setemergentmagneticfieldsolidangle_param_3, .param .u64 setemergentmagneticfieldsolidangle_param_4, .param .u64 setemergentmagneticfieldsolidangle_param_5, .param .f32 setemergentmagneticfieldsolidangle_param_6, .param .f32 setemergentmagneticfieldsolidangle_param_7, .param .f32 setemergentmagneticfieldsolidangle_param_8, .param .f32 setemergentmagneticfieldsolidangle_param_9, .param .u32 setemergentmagneticfieldsolidangle_param_10, .param .u32 setemergentmagneticfieldsolidangle_param_11, .param .u32 
setemergentmagneticfieldsolidangle_param_12, .param .u8 setemergentmagneticfieldsolidangle_param_13 ) { .reg .pred %p<225>; .reg .b16 %rs<8>; .reg .f32 %f<881>; .reg .b32 %r<518>; .reg .b64 %rd<124>; ld.param.u8 %rs4, [setemergentmagneticfieldsolidangle_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldsolidangle_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldsolidangle_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldsolidangle_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldsolidangle_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldsolidangle_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldsolidangle_param_5]; ld.param.u32 %r124, [setemergentmagneticfieldsolidangle_param_10]; ld.param.u32 %r125, [setemergentmagneticfieldsolidangle_param_11]; ld.param.u32 %r126, [setemergentmagneticfieldsolidangle_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r127, %ntid.x; mov.u32 %r128, %ctaid.x; mov.u32 %r129, %tid.x; mad.lo.s32 %r1, %r128, %r127, %r129; mov.u32 %r130, %ntid.y; mov.u32 %r131, %ctaid.y; mov.u32 %r132, %tid.y; mad.lo.s32 %r2, %r131, %r130, %r132; mov.u32 %r133, %ntid.z; mov.u32 %r134, %ctaid.z; mov.u32 %r135, %tid.z; mad.lo.s32 %r3, %r134, %r133, %r135; setp.ge.s32 %p6, %r1, %r124; setp.ge.s32 %p7, %r2, %r125; or.pred %p8, %p6, %p7; setp.ge.s32 %p9, %r3, %r126; or.pred %p10, %p8, %p9; @%p10 bra $L__BB0_202; mul.lo.s32 %r4, %r3, %r125; add.s32 %r136, %r4, %r2; mul.lo.s32 %r5, %r136, %r124; add.s32 %r137, %r5, %r1; mul.wide.s32 %rd13, %r137, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f176, %f2, %f2; fma.rn.f32 %f177, %f1, %f1, %f176; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f178, %f3, %f3, %f177; setp.eq.f32 %p11, %f178, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, 
%rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p11 bra $L__BB0_201; bra.uni $L__BB0_2; $L__BB0_201: mov.u32 %r460, 0; st.global.u32 [%rd4], %r460; st.global.u32 [%rd5], %r460; st.global.u32 [%rd6], %r460; bra.uni $L__BB0_202; $L__BB0_2: and.b16 %rs1, %rs4, 2; setp.eq.s16 %p12, %rs1, 0; add.s32 %r6, %r2, 1; @%p12 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r140, %r125, -1; min.s32 %r482, %r6, %r140; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r138, %r6, %r125; add.s32 %r139, %r138, %r125; rem.s32 %r482, %r139, %r125; $L__BB0_5: and.b16 %rs2, %rs4, 4; setp.eq.s16 %p13, %rs2, 0; add.s32 %r10, %r3, 1; @%p13 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r143, %r126, -1; min.s32 %r483, %r10, %r143; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r141, %r10, %r126; add.s32 %r142, %r141, %r126; rem.s32 %r483, %r142, %r126; $L__BB0_8: add.s32 %r14, %r2, -1; @%p12 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r484, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r144, %r14, %r125; add.s32 %r145, %r144, %r125; rem.s32 %r484, %r145, %r125; $L__BB0_11: add.s32 %r18, %r3, -1; @%p13 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r485, %r18, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r146, %r18, %r126; add.s32 %r147, %r146, %r126; rem.s32 %r485, %r147, %r126; $L__BB0_14: mad.lo.s32 %r148, %r485, %r125, %r2; mad.lo.s32 %r149, %r148, %r124, %r1; add.s32 %r150, %r482, %r4; mad.lo.s32 %r151, %r150, %r124, %r1; mul.wide.s32 %rd20, %r151, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f4, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f5, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f6, [%rd23]; mad.lo.s32 %r152, %r483, %r125, %r2; mad.lo.s32 %r153, %r152, %r124, %r1; mul.wide.s32 %rd24, %r153, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f7, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f8, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f9, [%rd27]; add.s32 %r154, %r484, %r4; mad.lo.s32 %r155, 
%r154, %r124, %r1; mul.wide.s32 %rd28, %r155, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f10, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f12, [%rd31]; mul.wide.s32 %rd32, %r149, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f13, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f14, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f15, [%rd35]; setp.ne.s16 %p16, %rs1, 0; setp.lt.s32 %p17, %r6, %r125; or.pred %p1, %p17, %p16; not.pred %p18, %p1; mov.f32 %f862, 0f00000000; @%p18 bra $L__BB0_28; setp.ge.s32 %p19, %r10, %r126; and.pred %p21, %p19, %p13; @%p21 bra $L__BB0_28; @%p13 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r158, %r126, -1; min.s32 %r486, %r10, %r158; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r156, %r10, %r126; add.s32 %r157, %r156, %r126; rem.s32 %r486, %r157, %r126; $L__BB0_19: @%p12 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r161, %r125, -1; min.s32 %r487, %r6, %r161; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r159, %r6, %r125; add.s32 %r160, %r159, %r125; rem.s32 %r487, %r160, %r125; $L__BB0_22: mad.lo.s32 %r162, %r486, %r125, %r487; mad.lo.s32 %r163, %r162, %r124, %r1; mul.wide.s32 %rd36, %r163, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f181, [%rd37]; ld.global.nc.f32 %f182, [%rd38]; mul.f32 %f183, %f182, %f182; fma.rn.f32 %f184, %f181, %f181, %f183; ld.global.nc.f32 %f185, [%rd39]; fma.rn.f32 %f16, %f185, %f185, %f184; mul.f32 %f186, %f6, %f8; mul.f32 %f187, %f5, %f9; sub.f32 %f188, %f187, %f186; mul.f32 %f189, %f4, %f9; mul.f32 %f190, %f6, %f7; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f5, %f7; mul.f32 %f193, %f4, %f8; sub.f32 %f194, %f193, %f192; mul.f32 %f195, %f2, %f191; fma.rn.f32 %f196, %f1, %f188, %f195; fma.rn.f32 %f17, %f3, %f194, %f196; mul.f32 %f197, %f2, %f5; fma.rn.f32 %f198, %f1, %f4, %f197; fma.rn.f32 %f199, %f3, %f6, %f198; add.f32 %f200, %f199, 
0f3F800000; mul.f32 %f201, %f2, %f8; fma.rn.f32 %f202, %f1, %f7, %f201; fma.rn.f32 %f203, %f3, %f9, %f202; add.f32 %f204, %f200, %f203; mul.f32 %f205, %f5, %f8; fma.rn.f32 %f206, %f4, %f7, %f205; fma.rn.f32 %f207, %f6, %f9, %f206; add.f32 %f18, %f207, %f204; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p24, %f19, 0f00000000; setp.eq.f32 %p25, %f20, 0f00000000; and.pred %p26, %p24, %p25; @%p26 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r174, %f18; shr.s32 %r175, %r174, 31; and.b32 %r176, %r175, 1078530011; mov.b32 %r177, %f17; and.b32 %r178, %r177, -2147483648; or.b32 %r179, %r178, %r176; mov.b32 %f857, %r179; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p27, %f19, 0f7F800000; setp.eq.f32 %p28, %f20, 0f7F800000; and.pred %p29, %p27, %p28; @%p29 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r169, %f18; setp.lt.s32 %p33, %r169, 0; selp.b32 %r170, 1075235812, 1061752795, %p33; mov.b32 %r171, %f17; and.b32 %r172, %r171, -2147483648; or.b32 %r173, %r172, %r170; mov.b32 %f857, %r173; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f208, %f20, %f19; min.f32 %f209, %f20, %f19; div.rn.f32 %f210, %f209, %f208; mul.rn.f32 %f211, %f210, %f210; mov.f32 %f212, 0fC0B59883; mov.f32 %f213, 0fBF52C7EA; fma.rn.f32 %f214, %f211, %f213, %f212; mov.f32 %f215, 0fC0D21907; fma.rn.f32 %f216, %f214, %f211, %f215; mul.f32 %f217, %f211, %f216; mul.f32 %f218, %f210, %f217; add.f32 %f219, %f211, 0f41355DC0; mov.f32 %f220, 0f41E6BD60; fma.rn.f32 %f221, %f219, %f211, %f220; mov.f32 %f222, 0f419D92C8; fma.rn.f32 %f223, %f221, %f211, %f222; rcp.rn.f32 %f224, %f223; fma.rn.f32 %f225, %f218, %f224, %f210; mov.f32 %f226, 0f3FC90FDB; sub.f32 %f227, %f226, %f225; setp.gt.f32 %p30, %f20, %f19; selp.f32 %f228, %f227, %f225, %p30; mov.b32 %r164, %f18; setp.lt.s32 %p31, %r164, 0; mov.f32 %f229, 0f40490FDB; sub.f32 %f230, %f229, %f228; selp.f32 %f231, %f230, %f228, %p31; mov.b32 %r165, %f231; mov.b32 %r166, %f17; and.b32 %r167, %r166, -2147483648; or.b32 %r168, %r167, %r165; mov.b32 
%f232, %r168; add.f32 %f233, %f19, %f20; setp.le.f32 %p32, %f233, 0f7F800000; selp.f32 %f857, %f232, %f233, %p32; $L__BB0_27: add.f32 %f234, %f857, %f857; setp.eq.f32 %p34, %f16, 0f00000000; selp.f32 %f235, 0f3F800000, 0f3F000000, %p34; fma.rn.f32 %f862, %f235, %f234, 0f00000000; $L__BB0_28: setp.gt.s32 %p35, %r2, 0; or.pred %p2, %p35, %p16; not.pred %p37, %p2; @%p37 bra $L__BB0_55; setp.ge.s32 %p38, %r10, %r126; and.pred %p40, %p38, %p13; @%p40 bra $L__BB0_42; @%p13 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r182, %r126, -1; min.s32 %r488, %r10, %r182; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r180, %r10, %r126; add.s32 %r181, %r180, %r126; rem.s32 %r488, %r181, %r126; $L__BB0_33: @%p12 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r489, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r183, %r14, %r125; add.s32 %r184, %r183, %r125; rem.s32 %r489, %r184, %r125; $L__BB0_36: mad.lo.s32 %r185, %r488, %r125, %r489; mad.lo.s32 %r186, %r185, %r124, %r1; mul.wide.s32 %rd40, %r186, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f236, [%rd41]; ld.global.nc.f32 %f237, [%rd42]; mul.f32 %f238, %f237, %f237; fma.rn.f32 %f239, %f236, %f236, %f238; ld.global.nc.f32 %f240, [%rd43]; fma.rn.f32 %f27, %f240, %f240, %f239; mul.f32 %f241, %f9, %f11; mul.f32 %f242, %f8, %f12; sub.f32 %f243, %f242, %f241; mul.f32 %f244, %f7, %f12; mul.f32 %f245, %f9, %f10; sub.f32 %f246, %f245, %f244; mul.f32 %f247, %f8, %f10; mul.f32 %f248, %f7, %f11; sub.f32 %f249, %f248, %f247; mul.f32 %f250, %f2, %f246; fma.rn.f32 %f251, %f1, %f243, %f250; fma.rn.f32 %f28, %f3, %f249, %f251; mul.f32 %f252, %f2, %f8; fma.rn.f32 %f253, %f1, %f7, %f252; fma.rn.f32 %f254, %f3, %f9, %f253; add.f32 %f255, %f254, 0f3F800000; mul.f32 %f256, %f2, %f11; fma.rn.f32 %f257, %f1, %f10, %f256; fma.rn.f32 %f258, %f3, %f12, %f257; add.f32 %f259, %f255, %f258; mul.f32 %f260, %f8, %f11; fma.rn.f32 %f261, %f7, %f10, %f260; fma.rn.f32 %f262, %f9, %f12, 
%f261; add.f32 %f29, %f262, %f259; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p43, %f30, 0f00000000; setp.eq.f32 %p44, %f31, 0f00000000; and.pred %p45, %p43, %p44; @%p45 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r197, %f29; shr.s32 %r198, %r197, 31; and.b32 %r199, %r198, 1078530011; mov.b32 %r200, %f28; and.b32 %r201, %r200, -2147483648; or.b32 %r202, %r201, %r199; mov.b32 %f859, %r202; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p46, %f30, 0f7F800000; setp.eq.f32 %p47, %f31, 0f7F800000; and.pred %p48, %p46, %p47; @%p48 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r192, %f29; setp.lt.s32 %p52, %r192, 0; selp.b32 %r193, 1075235812, 1061752795, %p52; mov.b32 %r194, %f28; and.b32 %r195, %r194, -2147483648; or.b32 %r196, %r195, %r193; mov.b32 %f859, %r196; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f263, %f31, %f30; min.f32 %f264, %f31, %f30; div.rn.f32 %f265, %f264, %f263; mul.rn.f32 %f266, %f265, %f265; mov.f32 %f267, 0fC0B59883; mov.f32 %f268, 0fBF52C7EA; fma.rn.f32 %f269, %f266, %f268, %f267; mov.f32 %f270, 0fC0D21907; fma.rn.f32 %f271, %f269, %f266, %f270; mul.f32 %f272, %f266, %f271; mul.f32 %f273, %f265, %f272; add.f32 %f274, %f266, 0f41355DC0; mov.f32 %f275, 0f41E6BD60; fma.rn.f32 %f276, %f274, %f266, %f275; mov.f32 %f277, 0f419D92C8; fma.rn.f32 %f278, %f276, %f266, %f277; rcp.rn.f32 %f279, %f278; fma.rn.f32 %f280, %f273, %f279, %f265; mov.f32 %f281, 0f3FC90FDB; sub.f32 %f282, %f281, %f280; setp.gt.f32 %p49, %f31, %f30; selp.f32 %f283, %f282, %f280, %p49; mov.b32 %r187, %f29; setp.lt.s32 %p50, %r187, 0; mov.f32 %f284, 0f40490FDB; sub.f32 %f285, %f284, %f283; selp.f32 %f286, %f285, %f283, %p50; mov.b32 %r188, %f286; mov.b32 %r189, %f28; and.b32 %r190, %r189, -2147483648; or.b32 %r191, %r190, %r188; mov.b32 %f287, %r191; add.f32 %f288, %f30, %f31; setp.le.f32 %p51, %f288, 0f7F800000; selp.f32 %f859, %f287, %f288, %p51; $L__BB0_41: add.f32 %f289, %f859, %f859; setp.eq.f32 %p53, %f27, 0f00000000; selp.f32 %f290, 0f3F800000, 
0f3F000000, %p53; fma.rn.f32 %f862, %f290, %f289, %f862; $L__BB0_42: setp.lt.s32 %p54, %r3, 1; and.pred %p56, %p54, %p13; @%p56 bra $L__BB0_55; @%p13 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r490, %r18, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r203, %r18, %r126; add.s32 %r204, %r203, %r126; rem.s32 %r490, %r204, %r126; $L__BB0_46: @%p12 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r491, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r205, %r14, %r125; add.s32 %r206, %r205, %r125; rem.s32 %r491, %r206, %r125; $L__BB0_49: mad.lo.s32 %r207, %r490, %r125, %r491; mad.lo.s32 %r208, %r207, %r124, %r1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f291, [%rd45]; ld.global.nc.f32 %f292, [%rd46]; mul.f32 %f293, %f292, %f292; fma.rn.f32 %f294, %f291, %f291, %f293; ld.global.nc.f32 %f295, [%rd47]; fma.rn.f32 %f38, %f295, %f295, %f294; mul.f32 %f296, %f12, %f14; mul.f32 %f297, %f11, %f15; sub.f32 %f298, %f297, %f296; mul.f32 %f299, %f10, %f15; mul.f32 %f300, %f12, %f13; sub.f32 %f301, %f300, %f299; mul.f32 %f302, %f11, %f13; mul.f32 %f303, %f10, %f14; sub.f32 %f304, %f303, %f302; mul.f32 %f305, %f2, %f301; fma.rn.f32 %f306, %f1, %f298, %f305; fma.rn.f32 %f39, %f3, %f304, %f306; mul.f32 %f307, %f2, %f11; fma.rn.f32 %f308, %f1, %f10, %f307; fma.rn.f32 %f309, %f3, %f12, %f308; add.f32 %f310, %f309, 0f3F800000; mul.f32 %f311, %f2, %f14; fma.rn.f32 %f312, %f1, %f13, %f311; fma.rn.f32 %f313, %f3, %f15, %f312; add.f32 %f314, %f310, %f313; mul.f32 %f315, %f11, %f14; fma.rn.f32 %f316, %f10, %f13, %f315; fma.rn.f32 %f317, %f12, %f15, %f316; add.f32 %f40, %f317, %f314; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p59, %f41, 0f00000000; setp.eq.f32 %p60, %f42, 0f00000000; and.pred %p61, %p59, %p60; @%p61 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r219, %f40; shr.s32 %r220, %r219, 31; and.b32 %r221, %r220, 1078530011; mov.b32 %r222, %f39; and.b32 
%r223, %r222, -2147483648; or.b32 %r224, %r223, %r221; mov.b32 %f861, %r224; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p62, %f41, 0f7F800000; setp.eq.f32 %p63, %f42, 0f7F800000; and.pred %p64, %p62, %p63; @%p64 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r214, %f40; setp.lt.s32 %p68, %r214, 0; selp.b32 %r215, 1075235812, 1061752795, %p68; mov.b32 %r216, %f39; and.b32 %r217, %r216, -2147483648; or.b32 %r218, %r217, %r215; mov.b32 %f861, %r218; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f318, %f42, %f41; min.f32 %f319, %f42, %f41; div.rn.f32 %f320, %f319, %f318; mul.rn.f32 %f321, %f320, %f320; mov.f32 %f322, 0fC0B59883; mov.f32 %f323, 0fBF52C7EA; fma.rn.f32 %f324, %f321, %f323, %f322; mov.f32 %f325, 0fC0D21907; fma.rn.f32 %f326, %f324, %f321, %f325; mul.f32 %f327, %f321, %f326; mul.f32 %f328, %f320, %f327; add.f32 %f329, %f321, 0f41355DC0; mov.f32 %f330, 0f41E6BD60; fma.rn.f32 %f331, %f329, %f321, %f330; mov.f32 %f332, 0f419D92C8; fma.rn.f32 %f333, %f331, %f321, %f332; rcp.rn.f32 %f334, %f333; fma.rn.f32 %f335, %f328, %f334, %f320; mov.f32 %f336, 0f3FC90FDB; sub.f32 %f337, %f336, %f335; setp.gt.f32 %p65, %f42, %f41; selp.f32 %f338, %f337, %f335, %p65; mov.b32 %r209, %f40; setp.lt.s32 %p66, %r209, 0; mov.f32 %f339, 0f40490FDB; sub.f32 %f340, %f339, %f338; selp.f32 %f341, %f340, %f338, %p66; mov.b32 %r210, %f341; mov.b32 %r211, %f39; and.b32 %r212, %r211, -2147483648; or.b32 %r213, %r212, %r210; mov.b32 %f342, %r213; add.f32 %f343, %f41, %f42; setp.le.f32 %p67, %f343, 0f7F800000; selp.f32 %f861, %f342, %f343, %p67; $L__BB0_54: add.f32 %f344, %f861, %f861; setp.eq.f32 %p69, %f38, 0f00000000; selp.f32 %f345, 0f3F800000, 0f3F000000, %p69; fma.rn.f32 %f862, %f345, %f344, %f862; $L__BB0_55: @%p18 bra $L__BB0_69; setp.lt.s32 %p71, %r3, 1; and.pred %p73, %p71, %p13; @%p73 bra $L__BB0_69; @%p13 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r492, %r18, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r225, %r18, %r126; add.s32 %r226, %r225, %r126; rem.s32 
%r492, %r226, %r126; $L__BB0_60: @%p12 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r229, %r125, -1; min.s32 %r493, %r6, %r229; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r227, %r6, %r125; add.s32 %r228, %r227, %r125; rem.s32 %r493, %r228, %r125; $L__BB0_63: mad.lo.s32 %r230, %r492, %r125, %r493; mad.lo.s32 %r231, %r230, %r124, %r1; mul.wide.s32 %rd48, %r231, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f346, [%rd49]; ld.global.nc.f32 %f347, [%rd50]; mul.f32 %f348, %f347, %f347; fma.rn.f32 %f349, %f346, %f346, %f348; ld.global.nc.f32 %f350, [%rd51]; fma.rn.f32 %f49, %f350, %f350, %f349; mul.f32 %f351, %f5, %f15; mul.f32 %f352, %f6, %f14; sub.f32 %f353, %f352, %f351; mul.f32 %f354, %f6, %f13; mul.f32 %f355, %f4, %f15; sub.f32 %f356, %f355, %f354; mul.f32 %f357, %f4, %f14; mul.f32 %f358, %f5, %f13; sub.f32 %f359, %f358, %f357; mul.f32 %f360, %f2, %f356; fma.rn.f32 %f361, %f1, %f353, %f360; fma.rn.f32 %f50, %f3, %f359, %f361; mul.f32 %f362, %f2, %f14; fma.rn.f32 %f363, %f1, %f13, %f362; fma.rn.f32 %f364, %f3, %f15, %f363; add.f32 %f365, %f364, 0f3F800000; mul.f32 %f366, %f2, %f5; fma.rn.f32 %f367, %f1, %f4, %f366; fma.rn.f32 %f368, %f3, %f6, %f367; add.f32 %f369, %f368, %f365; mul.f32 %f370, %f5, %f14; fma.rn.f32 %f371, %f4, %f13, %f370; fma.rn.f32 %f372, %f6, %f15, %f371; add.f32 %f51, %f372, %f369; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p76, %f52, 0f00000000; setp.eq.f32 %p77, %f53, 0f00000000; and.pred %p78, %p76, %p77; @%p78 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r242, %f51; shr.s32 %r243, %r242, 31; and.b32 %r244, %r243, 1078530011; mov.b32 %r245, %f50; and.b32 %r246, %r245, -2147483648; or.b32 %r247, %r244, %r246; mov.b32 %f863, %r247; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p79, %f52, 0f7F800000; setp.eq.f32 %p80, %f53, 0f7F800000; and.pred %p81, %p79, %p80; @%p81 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r237, %f51; setp.lt.s32 %p85, %r237, 
0; selp.b32 %r238, 1075235812, 1061752795, %p85; mov.b32 %r239, %f50; and.b32 %r240, %r239, -2147483648; or.b32 %r241, %r238, %r240; mov.b32 %f863, %r241; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f373, %f53, %f52; min.f32 %f374, %f53, %f52; div.rn.f32 %f375, %f374, %f373; mul.rn.f32 %f376, %f375, %f375; mov.f32 %f377, 0fC0B59883; mov.f32 %f378, 0fBF52C7EA; fma.rn.f32 %f379, %f376, %f378, %f377; mov.f32 %f380, 0fC0D21907; fma.rn.f32 %f381, %f379, %f376, %f380; mul.f32 %f382, %f376, %f381; mul.f32 %f383, %f375, %f382; add.f32 %f384, %f376, 0f41355DC0; mov.f32 %f385, 0f41E6BD60; fma.rn.f32 %f386, %f384, %f376, %f385; mov.f32 %f387, 0f419D92C8; fma.rn.f32 %f388, %f386, %f376, %f387; rcp.rn.f32 %f389, %f388; fma.rn.f32 %f390, %f383, %f389, %f375; mov.f32 %f391, 0f3FC90FDB; sub.f32 %f392, %f391, %f390; setp.gt.f32 %p82, %f53, %f52; selp.f32 %f393, %f392, %f390, %p82; mov.b32 %r232, %f51; setp.lt.s32 %p83, %r232, 0; mov.f32 %f394, 0f40490FDB; sub.f32 %f395, %f394, %f393; selp.f32 %f396, %f395, %f393, %p83; mov.b32 %r233, %f396; mov.b32 %r234, %f50; and.b32 %r235, %r234, -2147483648; or.b32 %r236, %r235, %r233; mov.b32 %f397, %r236; add.f32 %f398, %f52, %f53; setp.le.f32 %p84, %f398, 0f7F800000; selp.f32 %f863, %f397, %f398, %p84; $L__BB0_68: add.f32 %f399, %f863, %f863; setp.eq.f32 %p86, %f49, 0f00000000; selp.f32 %f400, 0f3F800000, 0f3F000000, %p86; fma.rn.f32 %f862, %f400, %f399, %f862; $L__BB0_69: @%p13 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: add.s32 %r250, %r126, -1; min.s32 %r494, %r10, %r250; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r248, %r10, %r126; add.s32 %r249, %r248, %r126; rem.s32 %r494, %r249, %r126; $L__BB0_72: ld.param.u8 %rs7, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs3, %rs7, 1; setp.eq.s16 %p88, %rs3, 0; add.s32 %r49, %r1, 1; @%p88 bra $L__BB0_74; bra.uni $L__BB0_73; $L__BB0_74: add.s32 %r253, %r124, -1; min.s32 %r495, %r49, %r253; bra.uni $L__BB0_75; $L__BB0_73: rem.s32 %r251, %r49, %r124; add.s32 %r252, %r251, %r124; rem.s32 
%r495, %r252, %r124; $L__BB0_75: @%p13 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r496, %r18, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r254, %r18, %r126; add.s32 %r255, %r254, %r126; rem.s32 %r496, %r255, %r126; $L__BB0_78: add.s32 %r56, %r1, -1; @%p88 bra $L__BB0_80; bra.uni $L__BB0_79; $L__BB0_80: max.s32 %r497, %r56, 0; bra.uni $L__BB0_81; $L__BB0_79: rem.s32 %r256, %r56, %r124; add.s32 %r257, %r256, %r124; rem.s32 %r497, %r257, %r124; $L__BB0_81: mad.lo.s32 %r258, %r494, %r125, %r2; mad.lo.s32 %r259, %r258, %r124, %r1; mul.wide.s32 %rd52, %r259, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f60, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f61, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f62, [%rd55]; add.s32 %r260, %r495, %r5; mul.wide.s32 %rd56, %r260, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f63, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f64, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f65, [%rd59]; mad.lo.s32 %r261, %r496, %r125, %r2; mad.lo.s32 %r262, %r261, %r124, %r1; mul.wide.s32 %rd60, %r262, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f66, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f67, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f68, [%rd63]; add.s32 %r263, %r497, %r5; mul.wide.s32 %rd64, %r263, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f69, [%rd65]; add.s64 %rd66, %rd2, %rd64; ld.global.nc.f32 %f70, [%rd66]; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f71, [%rd67]; setp.ge.s32 %p92, %r10, %r126; and.pred %p93, %p92, %p13; mov.f32 %f868, 0f00000000; @%p93 bra $L__BB0_95; setp.ge.s32 %p94, %r49, %r124; and.pred %p96, %p94, %p88; @%p96 bra $L__BB0_95; @%p13 bra $L__BB0_85; bra.uni $L__BB0_84; $L__BB0_85: add.s32 %r266, %r126, -1; min.s32 %r498, %r10, %r266; bra.uni $L__BB0_86; $L__BB0_84: rem.s32 %r264, %r10, %r126; add.s32 %r265, %r264, %r126; rem.s32 %r498, %r265, %r126; $L__BB0_86: @%p88 bra $L__BB0_88; bra.uni $L__BB0_87; $L__BB0_88: 
add.s32 %r269, %r124, -1; min.s32 %r499, %r49, %r269; bra.uni $L__BB0_89; $L__BB0_87: rem.s32 %r267, %r49, %r124; add.s32 %r268, %r267, %r124; rem.s32 %r499, %r268, %r124; $L__BB0_89: mad.lo.s32 %r270, %r498, %r125, %r2; mad.lo.s32 %r271, %r270, %r124, %r499; mul.wide.s32 %rd68, %r271, 4; add.s64 %rd69, %rd3, %rd68; add.s64 %rd70, %rd2, %rd68; add.s64 %rd71, %rd1, %rd68; ld.global.nc.f32 %f403, [%rd69]; ld.global.nc.f32 %f404, [%rd70]; mul.f32 %f405, %f404, %f404; fma.rn.f32 %f406, %f403, %f403, %f405; ld.global.nc.f32 %f407, [%rd71]; fma.rn.f32 %f72, %f407, %f407, %f406; mul.f32 %f408, %f62, %f64; mul.f32 %f409, %f61, %f65; sub.f32 %f410, %f409, %f408; mul.f32 %f411, %f60, %f65; mul.f32 %f412, %f62, %f63; sub.f32 %f413, %f412, %f411; mul.f32 %f414, %f61, %f63; mul.f32 %f415, %f60, %f64; sub.f32 %f416, %f415, %f414; mul.f32 %f417, %f2, %f413; fma.rn.f32 %f418, %f1, %f410, %f417; fma.rn.f32 %f73, %f3, %f416, %f418; mul.f32 %f419, %f2, %f61; fma.rn.f32 %f420, %f1, %f60, %f419; fma.rn.f32 %f421, %f3, %f62, %f420; add.f32 %f422, %f421, 0f3F800000; mul.f32 %f423, %f2, %f64; fma.rn.f32 %f424, %f1, %f63, %f423; fma.rn.f32 %f425, %f3, %f65, %f424; add.f32 %f426, %f422, %f425; mul.f32 %f427, %f61, %f64; fma.rn.f32 %f428, %f60, %f63, %f427; fma.rn.f32 %f429, %f62, %f65, %f428; add.f32 %f74, %f429, %f426; abs.f32 %f75, %f74; abs.f32 %f76, %f73; setp.eq.f32 %p99, %f75, 0f00000000; setp.eq.f32 %p100, %f76, 0f00000000; and.pred %p101, %p99, %p100; @%p101 bra $L__BB0_93; bra.uni $L__BB0_90; $L__BB0_93: mov.b32 %r282, %f74; shr.s32 %r283, %r282, 31; and.b32 %r284, %r283, 1078530011; mov.b32 %r285, %f73; and.b32 %r286, %r285, -2147483648; or.b32 %r287, %r286, %r284; mov.b32 %f865, %r287; bra.uni $L__BB0_94; $L__BB0_90: setp.eq.f32 %p102, %f75, 0f7F800000; setp.eq.f32 %p103, %f76, 0f7F800000; and.pred %p104, %p102, %p103; @%p104 bra $L__BB0_92; bra.uni $L__BB0_91; $L__BB0_92: mov.b32 %r277, %f74; setp.lt.s32 %p108, %r277, 0; selp.b32 %r278, 1075235812, 1061752795, %p108; mov.b32 
%r279, %f73; and.b32 %r280, %r279, -2147483648; or.b32 %r281, %r280, %r278; mov.b32 %f865, %r281; bra.uni $L__BB0_94; $L__BB0_91: max.f32 %f430, %f76, %f75; min.f32 %f431, %f76, %f75; div.rn.f32 %f432, %f431, %f430; mul.rn.f32 %f433, %f432, %f432; mov.f32 %f434, 0fC0B59883; mov.f32 %f435, 0fBF52C7EA; fma.rn.f32 %f436, %f433, %f435, %f434; mov.f32 %f437, 0fC0D21907; fma.rn.f32 %f438, %f436, %f433, %f437; mul.f32 %f439, %f433, %f438; mul.f32 %f440, %f432, %f439; add.f32 %f441, %f433, 0f41355DC0; mov.f32 %f442, 0f41E6BD60; fma.rn.f32 %f443, %f441, %f433, %f442; mov.f32 %f444, 0f419D92C8; fma.rn.f32 %f445, %f443, %f433, %f444; rcp.rn.f32 %f446, %f445; fma.rn.f32 %f447, %f440, %f446, %f432; mov.f32 %f448, 0f3FC90FDB; sub.f32 %f449, %f448, %f447; setp.gt.f32 %p105, %f76, %f75; selp.f32 %f450, %f449, %f447, %p105; mov.b32 %r272, %f74; setp.lt.s32 %p106, %r272, 0; mov.f32 %f451, 0f40490FDB; sub.f32 %f452, %f451, %f450; selp.f32 %f453, %f452, %f450, %p106; mov.b32 %r273, %f453; mov.b32 %r274, %f73; and.b32 %r275, %r274, -2147483648; or.b32 %r276, %r275, %r273; mov.b32 %f454, %r276; add.f32 %f455, %f75, %f76; setp.le.f32 %p107, %f455, 0f7F800000; selp.f32 %f865, %f454, %f455, %p107; $L__BB0_94: add.f32 %f456, %f865, %f865; setp.eq.f32 %p109, %f72, 0f00000000; selp.f32 %f457, 0f3F800000, 0f3F000000, %p109; fma.rn.f32 %f868, %f457, %f456, 0f00000000; $L__BB0_95: setp.lt.s32 %p110, %r3, 1; and.pred %p112, %p110, %p13; @%p112 bra $L__BB0_109; setp.ge.s32 %p113, %r49, %r124; and.pred %p115, %p113, %p88; @%p115 bra $L__BB0_109; @%p13 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: max.s32 %r500, %r18, 0; bra.uni $L__BB0_100; $L__BB0_98: rem.s32 %r288, %r18, %r126; add.s32 %r289, %r288, %r126; rem.s32 %r500, %r289, %r126; $L__BB0_100: @%p88 bra $L__BB0_102; bra.uni $L__BB0_101; $L__BB0_102: add.s32 %r292, %r124, -1; min.s32 %r501, %r49, %r292; bra.uni $L__BB0_103; $L__BB0_101: rem.s32 %r290, %r49, %r124; add.s32 %r291, %r290, %r124; rem.s32 %r501, %r291, %r124; $L__BB0_103: 
mad.lo.s32 %r293, %r500, %r125, %r2; mad.lo.s32 %r294, %r293, %r124, %r501; mul.wide.s32 %rd72, %r294, 4; add.s64 %rd73, %rd3, %rd72; add.s64 %rd74, %rd2, %rd72; add.s64 %rd75, %rd1, %rd72; ld.global.nc.f32 %f458, [%rd73]; ld.global.nc.f32 %f459, [%rd74]; mul.f32 %f460, %f459, %f459; fma.rn.f32 %f461, %f458, %f458, %f460; ld.global.nc.f32 %f462, [%rd75]; fma.rn.f32 %f83, %f462, %f462, %f461; mul.f32 %f463, %f65, %f67; mul.f32 %f464, %f64, %f68; sub.f32 %f465, %f464, %f463; mul.f32 %f466, %f63, %f68; mul.f32 %f467, %f65, %f66; sub.f32 %f468, %f467, %f466; mul.f32 %f469, %f64, %f66; mul.f32 %f470, %f63, %f67; sub.f32 %f471, %f470, %f469; mul.f32 %f472, %f2, %f468; fma.rn.f32 %f473, %f1, %f465, %f472; fma.rn.f32 %f84, %f3, %f471, %f473; mul.f32 %f474, %f2, %f64; fma.rn.f32 %f475, %f1, %f63, %f474; fma.rn.f32 %f476, %f3, %f65, %f475; add.f32 %f477, %f476, 0f3F800000; mul.f32 %f478, %f2, %f67; fma.rn.f32 %f479, %f1, %f66, %f478; fma.rn.f32 %f480, %f3, %f68, %f479; add.f32 %f481, %f477, %f480; mul.f32 %f482, %f64, %f67; fma.rn.f32 %f483, %f63, %f66, %f482; fma.rn.f32 %f484, %f65, %f68, %f483; add.f32 %f85, %f484, %f481; abs.f32 %f86, %f85; abs.f32 %f87, %f84; setp.eq.f32 %p118, %f86, 0f00000000; setp.eq.f32 %p119, %f87, 0f00000000; and.pred %p120, %p118, %p119; @%p120 bra $L__BB0_107; bra.uni $L__BB0_104; $L__BB0_107: mov.b32 %r305, %f85; shr.s32 %r306, %r305, 31; and.b32 %r307, %r306, 1078530011; mov.b32 %r308, %f84; and.b32 %r309, %r308, -2147483648; or.b32 %r310, %r309, %r307; mov.b32 %f867, %r310; bra.uni $L__BB0_108; $L__BB0_104: setp.eq.f32 %p121, %f86, 0f7F800000; setp.eq.f32 %p122, %f87, 0f7F800000; and.pred %p123, %p121, %p122; @%p123 bra $L__BB0_106; bra.uni $L__BB0_105; $L__BB0_106: mov.b32 %r300, %f85; setp.lt.s32 %p127, %r300, 0; selp.b32 %r301, 1075235812, 1061752795, %p127; mov.b32 %r302, %f84; and.b32 %r303, %r302, -2147483648; or.b32 %r304, %r303, %r301; mov.b32 %f867, %r304; bra.uni $L__BB0_108; $L__BB0_105: max.f32 %f485, %f87, %f86; min.f32 %f486, 
%f87, %f86; div.rn.f32 %f487, %f486, %f485; mul.rn.f32 %f488, %f487, %f487; mov.f32 %f489, 0fC0B59883; mov.f32 %f490, 0fBF52C7EA; fma.rn.f32 %f491, %f488, %f490, %f489; mov.f32 %f492, 0fC0D21907; fma.rn.f32 %f493, %f491, %f488, %f492; mul.f32 %f494, %f488, %f493; mul.f32 %f495, %f487, %f494; add.f32 %f496, %f488, 0f41355DC0; mov.f32 %f497, 0f41E6BD60; fma.rn.f32 %f498, %f496, %f488, %f497; mov.f32 %f499, 0f419D92C8; fma.rn.f32 %f500, %f498, %f488, %f499; rcp.rn.f32 %f501, %f500; fma.rn.f32 %f502, %f495, %f501, %f487; mov.f32 %f503, 0f3FC90FDB; sub.f32 %f504, %f503, %f502; setp.gt.f32 %p124, %f87, %f86; selp.f32 %f505, %f504, %f502, %p124; mov.b32 %r295, %f85; setp.lt.s32 %p125, %r295, 0; mov.f32 %f506, 0f40490FDB; sub.f32 %f507, %f506, %f505; selp.f32 %f508, %f507, %f505, %p125; mov.b32 %r296, %f508; mov.b32 %r297, %f84; and.b32 %r298, %r297, -2147483648; or.b32 %r299, %r298, %r296; mov.b32 %f509, %r299; add.f32 %f510, %f86, %f87; setp.le.f32 %p126, %f510, 0f7F800000; selp.f32 %f867, %f509, %f510, %p126; $L__BB0_108: add.f32 %f511, %f867, %f867; setp.eq.f32 %p128, %f83, 0f00000000; selp.f32 %f512, 0f3F800000, 0f3F000000, %p128; fma.rn.f32 %f868, %f512, %f511, %f868; $L__BB0_109: ld.param.u8 %rs6, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs5, %rs6, 1; setp.ne.s16 %p129, %rs5, 0; setp.gt.s32 %p130, %r1, 0; or.pred %p3, %p130, %p129; and.pred %p131, %p3, %p2; not.pred %p132, %p131; @%p132 bra $L__BB0_122; @%p13 bra $L__BB0_112; bra.uni $L__BB0_111; $L__BB0_112: max.s32 %r502, %r18, 0; bra.uni $L__BB0_113; $L__BB0_111: rem.s32 %r311, %r18, %r126; add.s32 %r312, %r311, %r126; rem.s32 %r502, %r312, %r126; $L__BB0_113: @%p88 bra $L__BB0_115; bra.uni $L__BB0_114; $L__BB0_115: max.s32 %r503, %r56, 0; bra.uni $L__BB0_116; $L__BB0_114: rem.s32 %r313, %r56, %r124; add.s32 %r314, %r313, %r124; rem.s32 %r503, %r314, %r124; $L__BB0_116: mad.lo.s32 %r315, %r502, %r125, %r2; mad.lo.s32 %r316, %r315, %r124, %r503; mul.wide.s32 %rd76, %r316, 4; add.s64 %rd77, %rd3, 
%rd76; add.s64 %rd78, %rd2, %rd76; add.s64 %rd79, %rd1, %rd76; ld.global.nc.f32 %f513, [%rd77]; ld.global.nc.f32 %f514, [%rd78]; mul.f32 %f515, %f514, %f514; fma.rn.f32 %f516, %f513, %f513, %f515; ld.global.nc.f32 %f517, [%rd79]; fma.rn.f32 %f94, %f517, %f517, %f516; mul.f32 %f518, %f68, %f70; mul.f32 %f519, %f67, %f71; sub.f32 %f520, %f519, %f518; mul.f32 %f521, %f66, %f71; mul.f32 %f522, %f68, %f69; sub.f32 %f523, %f522, %f521; mul.f32 %f524, %f67, %f69; mul.f32 %f525, %f66, %f70; sub.f32 %f526, %f525, %f524; mul.f32 %f527, %f2, %f523; fma.rn.f32 %f528, %f1, %f520, %f527; fma.rn.f32 %f95, %f3, %f526, %f528; mul.f32 %f529, %f2, %f67; fma.rn.f32 %f530, %f1, %f66, %f529; fma.rn.f32 %f531, %f3, %f68, %f530; add.f32 %f532, %f531, 0f3F800000; mul.f32 %f533, %f2, %f70; fma.rn.f32 %f534, %f1, %f69, %f533; fma.rn.f32 %f535, %f3, %f71, %f534; add.f32 %f536, %f532, %f535; mul.f32 %f537, %f67, %f70; fma.rn.f32 %f538, %f66, %f69, %f537; fma.rn.f32 %f539, %f68, %f71, %f538; add.f32 %f96, %f539, %f536; abs.f32 %f97, %f96; abs.f32 %f98, %f95; setp.eq.f32 %p135, %f97, 0f00000000; setp.eq.f32 %p136, %f98, 0f00000000; and.pred %p137, %p135, %p136; @%p137 bra $L__BB0_120; bra.uni $L__BB0_117; $L__BB0_120: mov.b32 %r327, %f96; shr.s32 %r328, %r327, 31; and.b32 %r329, %r328, 1078530011; mov.b32 %r330, %f95; and.b32 %r331, %r330, -2147483648; or.b32 %r332, %r331, %r329; mov.b32 %f869, %r332; bra.uni $L__BB0_121; $L__BB0_117: setp.eq.f32 %p138, %f97, 0f7F800000; setp.eq.f32 %p139, %f98, 0f7F800000; and.pred %p140, %p138, %p139; @%p140 bra $L__BB0_119; bra.uni $L__BB0_118; $L__BB0_119: mov.b32 %r322, %f96; setp.lt.s32 %p144, %r322, 0; selp.b32 %r323, 1075235812, 1061752795, %p144; mov.b32 %r324, %f95; and.b32 %r325, %r324, -2147483648; or.b32 %r326, %r325, %r323; mov.b32 %f869, %r326; bra.uni $L__BB0_121; $L__BB0_118: max.f32 %f540, %f98, %f97; min.f32 %f541, %f98, %f97; div.rn.f32 %f542, %f541, %f540; mul.rn.f32 %f543, %f542, %f542; mov.f32 %f544, 0fC0B59883; mov.f32 %f545, 0fBF52C7EA; 
fma.rn.f32 %f546, %f543, %f545, %f544; mov.f32 %f547, 0fC0D21907; fma.rn.f32 %f548, %f546, %f543, %f547; mul.f32 %f549, %f543, %f548; mul.f32 %f550, %f542, %f549; add.f32 %f551, %f543, 0f41355DC0; mov.f32 %f552, 0f41E6BD60; fma.rn.f32 %f553, %f551, %f543, %f552; mov.f32 %f554, 0f419D92C8; fma.rn.f32 %f555, %f553, %f543, %f554; rcp.rn.f32 %f556, %f555; fma.rn.f32 %f557, %f550, %f556, %f542; mov.f32 %f558, 0f3FC90FDB; sub.f32 %f559, %f558, %f557; setp.gt.f32 %p141, %f98, %f97; selp.f32 %f560, %f559, %f557, %p141; mov.b32 %r317, %f96; setp.lt.s32 %p142, %r317, 0; mov.f32 %f561, 0f40490FDB; sub.f32 %f562, %f561, %f560; selp.f32 %f563, %f562, %f560, %p142; mov.b32 %r318, %f563; mov.b32 %r319, %f95; and.b32 %r320, %r319, -2147483648; or.b32 %r321, %r320, %r318; mov.b32 %f564, %r321; add.f32 %f565, %f97, %f98; setp.le.f32 %p143, %f565, 0f7F800000; selp.f32 %f869, %f564, %f565, %p143; $L__BB0_121: add.f32 %f566, %f869, %f869; setp.eq.f32 %p145, %f94, 0f00000000; selp.f32 %f567, 0f3F800000, 0f3F000000, %p145; fma.rn.f32 %f868, %f567, %f566, %f868; $L__BB0_122: setp.lt.s32 %p146, %r49, %r124; or.pred %p4, %p146, %p129; and.pred %p5, %p4, %p2; not.pred %p148, %p5; @%p148 bra $L__BB0_135; @%p13 bra $L__BB0_125; bra.uni $L__BB0_124; $L__BB0_125: add.s32 %r335, %r126, -1; min.s32 %r504, %r10, %r335; bra.uni $L__BB0_126; $L__BB0_124: rem.s32 %r333, %r10, %r126; add.s32 %r334, %r333, %r126; rem.s32 %r504, %r334, %r126; $L__BB0_126: mad.lo.s32 %r81, %r504, %r125, %r2; @%p88 bra $L__BB0_128; bra.uni $L__BB0_127; $L__BB0_128: max.s32 %r505, %r56, 0; bra.uni $L__BB0_129; $L__BB0_127: rem.s32 %r336, %r56, %r124; add.s32 %r337, %r336, %r124; rem.s32 %r505, %r337, %r124; $L__BB0_129: mad.lo.s32 %r338, %r81, %r124, %r505; mul.wide.s32 %rd80, %r338, 4; add.s64 %rd81, %rd3, %rd80; add.s64 %rd82, %rd2, %rd80; add.s64 %rd83, %rd1, %rd80; ld.global.nc.f32 %f568, [%rd81]; ld.global.nc.f32 %f569, [%rd82]; mul.f32 %f570, %f569, %f569; fma.rn.f32 %f571, %f568, %f568, %f570; ld.global.nc.f32 %f572, 
[%rd83]; fma.rn.f32 %f105, %f572, %f572, %f571; mul.f32 %f573, %f61, %f71; mul.f32 %f574, %f62, %f70; sub.f32 %f575, %f574, %f573; mul.f32 %f576, %f62, %f69; mul.f32 %f577, %f60, %f71; sub.f32 %f578, %f577, %f576; mul.f32 %f579, %f60, %f70; mul.f32 %f580, %f61, %f69; sub.f32 %f581, %f580, %f579; mul.f32 %f582, %f2, %f578; fma.rn.f32 %f583, %f1, %f575, %f582; fma.rn.f32 %f106, %f3, %f581, %f583; mul.f32 %f584, %f2, %f70; fma.rn.f32 %f585, %f1, %f69, %f584; fma.rn.f32 %f586, %f3, %f71, %f585; add.f32 %f587, %f586, 0f3F800000; mul.f32 %f588, %f2, %f61; fma.rn.f32 %f589, %f1, %f60, %f588; fma.rn.f32 %f590, %f3, %f62, %f589; add.f32 %f591, %f590, %f587; mul.f32 %f592, %f61, %f70; fma.rn.f32 %f593, %f60, %f69, %f592; fma.rn.f32 %f594, %f62, %f71, %f593; add.f32 %f107, %f594, %f591; abs.f32 %f108, %f107; abs.f32 %f109, %f106; setp.eq.f32 %p151, %f108, 0f00000000; setp.eq.f32 %p152, %f109, 0f00000000; and.pred %p153, %p151, %p152; @%p153 bra $L__BB0_133; bra.uni $L__BB0_130; $L__BB0_133: mov.b32 %r349, %f107; shr.s32 %r350, %r349, 31; and.b32 %r351, %r350, 1078530011; mov.b32 %r352, %f106; and.b32 %r353, %r352, -2147483648; or.b32 %r354, %r351, %r353; mov.b32 %f871, %r354; bra.uni $L__BB0_134; $L__BB0_130: setp.eq.f32 %p154, %f108, 0f7F800000; setp.eq.f32 %p155, %f109, 0f7F800000; and.pred %p156, %p154, %p155; @%p156 bra $L__BB0_132; bra.uni $L__BB0_131; $L__BB0_132: mov.b32 %r344, %f107; setp.lt.s32 %p160, %r344, 0; selp.b32 %r345, 1075235812, 1061752795, %p160; mov.b32 %r346, %f106; and.b32 %r347, %r346, -2147483648; or.b32 %r348, %r345, %r347; mov.b32 %f871, %r348; bra.uni $L__BB0_134; $L__BB0_131: max.f32 %f595, %f109, %f108; min.f32 %f596, %f109, %f108; div.rn.f32 %f597, %f596, %f595; mul.rn.f32 %f598, %f597, %f597; mov.f32 %f599, 0fC0B59883; mov.f32 %f600, 0fBF52C7EA; fma.rn.f32 %f601, %f598, %f600, %f599; mov.f32 %f602, 0fC0D21907; fma.rn.f32 %f603, %f601, %f598, %f602; mul.f32 %f604, %f598, %f603; mul.f32 %f605, %f597, %f604; add.f32 %f606, %f598, 0f41355DC0; 
mov.f32 %f607, 0f41E6BD60; fma.rn.f32 %f608, %f606, %f598, %f607; mov.f32 %f609, 0f419D92C8; fma.rn.f32 %f610, %f608, %f598, %f609; rcp.rn.f32 %f611, %f610; fma.rn.f32 %f612, %f605, %f611, %f597; mov.f32 %f613, 0f3FC90FDB; sub.f32 %f614, %f613, %f612; setp.gt.f32 %p157, %f109, %f108; selp.f32 %f615, %f614, %f612, %p157; mov.b32 %r339, %f107; setp.lt.s32 %p158, %r339, 0; mov.f32 %f616, 0f40490FDB; sub.f32 %f617, %f616, %f615; selp.f32 %f618, %f617, %f615, %p158; mov.b32 %r340, %f618; mov.b32 %r341, %f106; and.b32 %r342, %r341, -2147483648; or.b32 %r343, %r342, %r340; mov.b32 %f619, %r343; add.f32 %f620, %f108, %f109; setp.le.f32 %p159, %f620, 0f7F800000; selp.f32 %f871, %f619, %f620, %p159; $L__BB0_134: add.f32 %f621, %f871, %f871; setp.eq.f32 %p161, %f105, 0f00000000; selp.f32 %f622, 0f3F800000, 0f3F000000, %p161; fma.rn.f32 %f868, %f622, %f621, %f868; $L__BB0_135: @%p88 bra $L__BB0_137; bra.uni $L__BB0_136; $L__BB0_137: add.s32 %r357, %r124, -1; min.s32 %r506, %r49, %r357; bra.uni $L__BB0_138; $L__BB0_136: rem.s32 %r355, %r49, %r124; add.s32 %r356, %r355, %r124; rem.s32 %r506, %r356, %r124; $L__BB0_138: @%p12 bra $L__BB0_140; bra.uni $L__BB0_139; $L__BB0_140: add.s32 %r360, %r125, -1; min.s32 %r507, %r6, %r360; bra.uni $L__BB0_141; $L__BB0_139: rem.s32 %r358, %r6, %r125; add.s32 %r359, %r358, %r125; rem.s32 %r507, %r359, %r125; $L__BB0_141: @%p88 bra $L__BB0_143; bra.uni $L__BB0_142; $L__BB0_143: max.s32 %r508, %r56, 0; bra.uni $L__BB0_144; $L__BB0_142: rem.s32 %r361, %r56, %r124; add.s32 %r362, %r361, %r124; rem.s32 %r508, %r362, %r124; $L__BB0_144: add.s32 %r94, %r508, %r5; add.s32 %r95, %r506, %r5; @%p12 bra $L__BB0_146; bra.uni $L__BB0_145; $L__BB0_146: max.s32 %r509, %r14, 0; bra.uni $L__BB0_147; $L__BB0_145: rem.s32 %r363, %r14, %r125; add.s32 %r364, %r363, %r125; rem.s32 %r509, %r364, %r125; $L__BB0_147: add.s32 %r365, %r509, %r4; mad.lo.s32 %r366, %r365, %r124, %r1; mul.wide.s32 %rd84, %r95, 4; add.s64 %rd85, %rd3, %rd84; ld.global.nc.f32 %f116, [%rd85]; 
add.s64 %rd86, %rd2, %rd84; ld.global.nc.f32 %f117, [%rd86]; add.s64 %rd87, %rd1, %rd84; ld.global.nc.f32 %f118, [%rd87]; add.s32 %r367, %r507, %r4; mad.lo.s32 %r368, %r367, %r124, %r1; mul.wide.s32 %rd88, %r368, 4; add.s64 %rd89, %rd3, %rd88; ld.global.nc.f32 %f119, [%rd89]; add.s64 %rd90, %rd2, %rd88; ld.global.nc.f32 %f120, [%rd90]; add.s64 %rd91, %rd1, %rd88; ld.global.nc.f32 %f121, [%rd91]; mul.wide.s32 %rd92, %r94, 4; add.s64 %rd93, %rd3, %rd92; ld.global.nc.f32 %f122, [%rd93]; add.s64 %rd94, %rd2, %rd92; ld.global.nc.f32 %f123, [%rd94]; add.s64 %rd95, %rd1, %rd92; ld.global.nc.f32 %f124, [%rd95]; mul.wide.s32 %rd96, %r366, 4; add.s64 %rd97, %rd3, %rd96; ld.global.nc.f32 %f125, [%rd97]; add.s64 %rd98, %rd2, %rd96; ld.global.nc.f32 %f126, [%rd98]; add.s64 %rd99, %rd1, %rd96; ld.global.nc.f32 %f127, [%rd99]; and.pred %p166, %p4, %p1; mov.f32 %f878, 0f00000000; not.pred %p167, %p166; @%p167 bra $L__BB0_160; @%p12 bra $L__BB0_150; bra.uni $L__BB0_149; $L__BB0_150: add.s32 %r371, %r125, -1; min.s32 %r510, %r6, %r371; bra.uni $L__BB0_151; $L__BB0_149: rem.s32 %r369, %r6, %r125; add.s32 %r370, %r369, %r125; rem.s32 %r510, %r370, %r125; $L__BB0_151: @%p88 bra $L__BB0_153; bra.uni $L__BB0_152; $L__BB0_153: add.s32 %r374, %r124, -1; min.s32 %r511, %r49, %r374; bra.uni $L__BB0_154; $L__BB0_152: rem.s32 %r372, %r49, %r124; add.s32 %r373, %r372, %r124; rem.s32 %r511, %r373, %r124; $L__BB0_154: add.s32 %r375, %r510, %r4; mad.lo.s32 %r376, %r375, %r124, %r511; mul.wide.s32 %rd100, %r376, 4; add.s64 %rd101, %rd3, %rd100; add.s64 %rd102, %rd2, %rd100; add.s64 %rd103, %rd1, %rd100; ld.global.nc.f32 %f624, [%rd101]; ld.global.nc.f32 %f625, [%rd102]; mul.f32 %f626, %f625, %f625; fma.rn.f32 %f627, %f624, %f624, %f626; ld.global.nc.f32 %f628, [%rd103]; fma.rn.f32 %f128, %f628, %f628, %f627; mul.f32 %f629, %f118, %f120; mul.f32 %f630, %f117, %f121; sub.f32 %f631, %f630, %f629; mul.f32 %f632, %f116, %f121; mul.f32 %f633, %f118, %f119; sub.f32 %f634, %f633, %f632; mul.f32 %f635, 
%f117, %f119; mul.f32 %f636, %f116, %f120; sub.f32 %f637, %f636, %f635; mul.f32 %f638, %f2, %f634; fma.rn.f32 %f639, %f1, %f631, %f638; fma.rn.f32 %f129, %f3, %f637, %f639; mul.f32 %f640, %f2, %f117; fma.rn.f32 %f641, %f1, %f116, %f640; fma.rn.f32 %f642, %f3, %f118, %f641; add.f32 %f643, %f642, 0f3F800000; mul.f32 %f644, %f2, %f120; fma.rn.f32 %f645, %f1, %f119, %f644; fma.rn.f32 %f646, %f3, %f121, %f645; add.f32 %f647, %f643, %f646; mul.f32 %f648, %f117, %f120; fma.rn.f32 %f649, %f116, %f119, %f648; fma.rn.f32 %f650, %f118, %f121, %f649; add.f32 %f130, %f650, %f647; abs.f32 %f131, %f130; abs.f32 %f132, %f129; setp.eq.f32 %p170, %f131, 0f00000000; setp.eq.f32 %p171, %f132, 0f00000000; and.pred %p172, %p170, %p171; @%p172 bra $L__BB0_158; bra.uni $L__BB0_155; $L__BB0_158: mov.b32 %r387, %f130; shr.s32 %r388, %r387, 31; and.b32 %r389, %r388, 1078530011; mov.b32 %r390, %f129; and.b32 %r391, %r390, -2147483648; or.b32 %r392, %r391, %r389; mov.b32 %f873, %r392; bra.uni $L__BB0_159; $L__BB0_155: setp.eq.f32 %p173, %f131, 0f7F800000; setp.eq.f32 %p174, %f132, 0f7F800000; and.pred %p175, %p173, %p174; @%p175 bra $L__BB0_157; bra.uni $L__BB0_156; $L__BB0_157: mov.b32 %r382, %f130; setp.lt.s32 %p179, %r382, 0; selp.b32 %r383, 1075235812, 1061752795, %p179; mov.b32 %r384, %f129; and.b32 %r385, %r384, -2147483648; or.b32 %r386, %r385, %r383; mov.b32 %f873, %r386; bra.uni $L__BB0_159; $L__BB0_156: max.f32 %f651, %f132, %f131; min.f32 %f652, %f132, %f131; div.rn.f32 %f653, %f652, %f651; mul.rn.f32 %f654, %f653, %f653; mov.f32 %f655, 0fC0B59883; mov.f32 %f656, 0fBF52C7EA; fma.rn.f32 %f657, %f654, %f656, %f655; mov.f32 %f658, 0fC0D21907; fma.rn.f32 %f659, %f657, %f654, %f658; mul.f32 %f660, %f654, %f659; mul.f32 %f661, %f653, %f660; add.f32 %f662, %f654, 0f41355DC0; mov.f32 %f663, 0f41E6BD60; fma.rn.f32 %f664, %f662, %f654, %f663; mov.f32 %f665, 0f419D92C8; fma.rn.f32 %f666, %f664, %f654, %f665; rcp.rn.f32 %f667, %f666; fma.rn.f32 %f668, %f661, %f667, %f653; mov.f32 %f669, 
0f3FC90FDB; sub.f32 %f670, %f669, %f668; setp.gt.f32 %p176, %f132, %f131; selp.f32 %f671, %f670, %f668, %p176; mov.b32 %r377, %f130; setp.lt.s32 %p177, %r377, 0; mov.f32 %f672, 0f40490FDB; sub.f32 %f673, %f672, %f671; selp.f32 %f674, %f673, %f671, %p177; mov.b32 %r378, %f674; mov.b32 %r379, %f129; and.b32 %r380, %r379, -2147483648; or.b32 %r381, %r380, %r378; mov.b32 %f675, %r381; add.f32 %f676, %f131, %f132; setp.le.f32 %p178, %f676, 0f7F800000; selp.f32 %f873, %f675, %f676, %p178; $L__BB0_159: add.f32 %f677, %f873, %f873; setp.eq.f32 %p180, %f128, 0f00000000; selp.f32 %f678, 0f3F800000, 0f3F000000, %p180; fma.rn.f32 %f878, %f678, %f677, 0f00000000; $L__BB0_160: not.pred %p181, %p3; @%p181 bra $L__BB0_187; @%p18 bra $L__BB0_174; @%p12 bra $L__BB0_164; bra.uni $L__BB0_163; $L__BB0_164: add.s32 %r395, %r125, -1; min.s32 %r512, %r6, %r395; bra.uni $L__BB0_165; $L__BB0_163: rem.s32 %r393, %r6, %r125; add.s32 %r394, %r393, %r125; rem.s32 %r512, %r394, %r125; $L__BB0_165: @%p88 bra $L__BB0_167; bra.uni $L__BB0_166; $L__BB0_167: max.s32 %r513, %r56, 0; bra.uni $L__BB0_168; $L__BB0_166: rem.s32 %r396, %r56, %r124; add.s32 %r397, %r396, %r124; rem.s32 %r513, %r397, %r124; $L__BB0_168: add.s32 %r398, %r512, %r4; mad.lo.s32 %r399, %r398, %r124, %r513; mul.wide.s32 %rd104, %r399, 4; add.s64 %rd105, %rd3, %rd104; add.s64 %rd106, %rd2, %rd104; add.s64 %rd107, %rd1, %rd104; ld.global.nc.f32 %f679, [%rd105]; ld.global.nc.f32 %f680, [%rd106]; mul.f32 %f681, %f680, %f680; fma.rn.f32 %f682, %f679, %f679, %f681; ld.global.nc.f32 %f683, [%rd107]; fma.rn.f32 %f139, %f683, %f683, %f682; mul.f32 %f684, %f121, %f123; mul.f32 %f685, %f120, %f124; sub.f32 %f686, %f685, %f684; mul.f32 %f687, %f119, %f124; mul.f32 %f688, %f121, %f122; sub.f32 %f689, %f688, %f687; mul.f32 %f690, %f120, %f122; mul.f32 %f691, %f119, %f123; sub.f32 %f692, %f691, %f690; mul.f32 %f693, %f2, %f689; fma.rn.f32 %f694, %f1, %f686, %f693; fma.rn.f32 %f140, %f3, %f692, %f694; mul.f32 %f695, %f2, %f120; fma.rn.f32 %f696, 
%f1, %f119, %f695; fma.rn.f32 %f697, %f3, %f121, %f696; add.f32 %f698, %f697, 0f3F800000; mul.f32 %f699, %f2, %f123; fma.rn.f32 %f700, %f1, %f122, %f699; fma.rn.f32 %f701, %f3, %f124, %f700; add.f32 %f702, %f698, %f701; mul.f32 %f703, %f120, %f123; fma.rn.f32 %f704, %f119, %f122, %f703; fma.rn.f32 %f705, %f121, %f124, %f704; add.f32 %f141, %f705, %f702; abs.f32 %f142, %f141; abs.f32 %f143, %f140; setp.eq.f32 %p185, %f142, 0f00000000; setp.eq.f32 %p186, %f143, 0f00000000; and.pred %p187, %p185, %p186; @%p187 bra $L__BB0_172; bra.uni $L__BB0_169; $L__BB0_172: mov.b32 %r410, %f141; shr.s32 %r411, %r410, 31; and.b32 %r412, %r411, 1078530011; mov.b32 %r413, %f140; and.b32 %r414, %r413, -2147483648; or.b32 %r415, %r414, %r412; mov.b32 %f875, %r415; bra.uni $L__BB0_173; $L__BB0_169: setp.eq.f32 %p188, %f142, 0f7F800000; setp.eq.f32 %p189, %f143, 0f7F800000; and.pred %p190, %p188, %p189; @%p190 bra $L__BB0_171; bra.uni $L__BB0_170; $L__BB0_171: mov.b32 %r405, %f141; setp.lt.s32 %p194, %r405, 0; selp.b32 %r406, 1075235812, 1061752795, %p194; mov.b32 %r407, %f140; and.b32 %r408, %r407, -2147483648; or.b32 %r409, %r408, %r406; mov.b32 %f875, %r409; bra.uni $L__BB0_173; $L__BB0_170: max.f32 %f706, %f143, %f142; min.f32 %f707, %f143, %f142; div.rn.f32 %f708, %f707, %f706; mul.rn.f32 %f709, %f708, %f708; mov.f32 %f710, 0fC0B59883; mov.f32 %f711, 0fBF52C7EA; fma.rn.f32 %f712, %f709, %f711, %f710; mov.f32 %f713, 0fC0D21907; fma.rn.f32 %f714, %f712, %f709, %f713; mul.f32 %f715, %f709, %f714; mul.f32 %f716, %f708, %f715; add.f32 %f717, %f709, 0f41355DC0; mov.f32 %f718, 0f41E6BD60; fma.rn.f32 %f719, %f717, %f709, %f718; mov.f32 %f720, 0f419D92C8; fma.rn.f32 %f721, %f719, %f709, %f720; rcp.rn.f32 %f722, %f721; fma.rn.f32 %f723, %f716, %f722, %f708; mov.f32 %f724, 0f3FC90FDB; sub.f32 %f725, %f724, %f723; setp.gt.f32 %p191, %f143, %f142; selp.f32 %f726, %f725, %f723, %p191; mov.b32 %r400, %f141; setp.lt.s32 %p192, %r400, 0; mov.f32 %f727, 0f40490FDB; sub.f32 %f728, %f727, %f726; 
selp.f32 %f729, %f728, %f726, %p192; mov.b32 %r401, %f729; mov.b32 %r402, %f140; and.b32 %r403, %r402, -2147483648; or.b32 %r404, %r403, %r401; mov.b32 %f730, %r404; add.f32 %f731, %f142, %f143; setp.le.f32 %p193, %f731, 0f7F800000; selp.f32 %f875, %f730, %f731, %p193; $L__BB0_173: add.f32 %f732, %f875, %f875; setp.eq.f32 %p195, %f139, 0f00000000; selp.f32 %f733, 0f3F800000, 0f3F000000, %p195; fma.rn.f32 %f878, %f733, %f732, %f878; $L__BB0_174: not.pred %p224, %p2; @%p224 bra $L__BB0_187; @%p12 bra $L__BB0_177; bra.uni $L__BB0_176; $L__BB0_177: max.s32 %r514, %r14, 0; bra.uni $L__BB0_178; $L__BB0_176: rem.s32 %r416, %r14, %r125; add.s32 %r417, %r416, %r125; rem.s32 %r514, %r417, %r125; $L__BB0_178: @%p88 bra $L__BB0_180; bra.uni $L__BB0_179; $L__BB0_180: max.s32 %r515, %r56, 0; bra.uni $L__BB0_181; $L__BB0_179: rem.s32 %r418, %r56, %r124; add.s32 %r419, %r418, %r124; rem.s32 %r515, %r419, %r124; $L__BB0_181: add.s32 %r420, %r514, %r4; mad.lo.s32 %r421, %r420, %r124, %r515; mul.wide.s32 %rd108, %r421, 4; add.s64 %rd109, %rd3, %rd108; add.s64 %rd110, %rd2, %rd108; add.s64 %rd111, %rd1, %rd108; ld.global.nc.f32 %f734, [%rd109]; ld.global.nc.f32 %f735, [%rd110]; mul.f32 %f736, %f735, %f735; fma.rn.f32 %f737, %f734, %f734, %f736; ld.global.nc.f32 %f738, [%rd111]; fma.rn.f32 %f150, %f738, %f738, %f737; mul.f32 %f739, %f124, %f126; mul.f32 %f740, %f123, %f127; sub.f32 %f741, %f740, %f739; mul.f32 %f742, %f122, %f127; mul.f32 %f743, %f124, %f125; sub.f32 %f744, %f743, %f742; mul.f32 %f745, %f123, %f125; mul.f32 %f746, %f122, %f126; sub.f32 %f747, %f746, %f745; mul.f32 %f748, %f2, %f744; fma.rn.f32 %f749, %f1, %f741, %f748; fma.rn.f32 %f151, %f3, %f747, %f749; mul.f32 %f750, %f2, %f123; fma.rn.f32 %f751, %f1, %f122, %f750; fma.rn.f32 %f752, %f3, %f124, %f751; add.f32 %f753, %f752, 0f3F800000; mul.f32 %f754, %f2, %f126; fma.rn.f32 %f755, %f1, %f125, %f754; fma.rn.f32 %f756, %f3, %f127, %f755; add.f32 %f757, %f753, %f756; mul.f32 %f758, %f123, %f126; fma.rn.f32 %f759, %f122, 
%f125, %f758; fma.rn.f32 %f760, %f124, %f127, %f759; add.f32 %f152, %f760, %f757; abs.f32 %f153, %f152; abs.f32 %f154, %f151; setp.eq.f32 %p199, %f153, 0f00000000; setp.eq.f32 %p200, %f154, 0f00000000; and.pred %p201, %p199, %p200; @%p201 bra $L__BB0_185; bra.uni $L__BB0_182; $L__BB0_185: mov.b32 %r432, %f152; shr.s32 %r433, %r432, 31; and.b32 %r434, %r433, 1078530011; mov.b32 %r435, %f151; and.b32 %r436, %r435, -2147483648; or.b32 %r437, %r436, %r434; mov.b32 %f877, %r437; bra.uni $L__BB0_186; $L__BB0_182: setp.eq.f32 %p202, %f153, 0f7F800000; setp.eq.f32 %p203, %f154, 0f7F800000; and.pred %p204, %p202, %p203; @%p204 bra $L__BB0_184; bra.uni $L__BB0_183; $L__BB0_184: mov.b32 %r427, %f152; setp.lt.s32 %p208, %r427, 0; selp.b32 %r428, 1075235812, 1061752795, %p208; mov.b32 %r429, %f151; and.b32 %r430, %r429, -2147483648; or.b32 %r431, %r430, %r428; mov.b32 %f877, %r431; bra.uni $L__BB0_186; $L__BB0_183: max.f32 %f761, %f154, %f153; min.f32 %f762, %f154, %f153; div.rn.f32 %f763, %f762, %f761; mul.rn.f32 %f764, %f763, %f763; mov.f32 %f765, 0fC0B59883; mov.f32 %f766, 0fBF52C7EA; fma.rn.f32 %f767, %f764, %f766, %f765; mov.f32 %f768, 0fC0D21907; fma.rn.f32 %f769, %f767, %f764, %f768; mul.f32 %f770, %f764, %f769; mul.f32 %f771, %f763, %f770; add.f32 %f772, %f764, 0f41355DC0; mov.f32 %f773, 0f41E6BD60; fma.rn.f32 %f774, %f772, %f764, %f773; mov.f32 %f775, 0f419D92C8; fma.rn.f32 %f776, %f774, %f764, %f775; rcp.rn.f32 %f777, %f776; fma.rn.f32 %f778, %f771, %f777, %f763; mov.f32 %f779, 0f3FC90FDB; sub.f32 %f780, %f779, %f778; setp.gt.f32 %p205, %f154, %f153; selp.f32 %f781, %f780, %f778, %p205; mov.b32 %r422, %f152; setp.lt.s32 %p206, %r422, 0; mov.f32 %f782, 0f40490FDB; sub.f32 %f783, %f782, %f781; selp.f32 %f784, %f783, %f781, %p206; mov.b32 %r423, %f784; mov.b32 %r424, %f151; and.b32 %r425, %r424, -2147483648; or.b32 %r426, %r425, %r423; mov.b32 %f785, %r426; add.f32 %f786, %f153, %f154; setp.le.f32 %p207, %f786, 0f7F800000; selp.f32 %f877, %f785, %f786, %p207; 
$L__BB0_186: add.f32 %f787, %f877, %f877; setp.eq.f32 %p209, %f150, 0f00000000; selp.f32 %f788, 0f3F800000, 0f3F000000, %p209; fma.rn.f32 %f878, %f788, %f787, %f878; $L__BB0_187: @%p148 bra $L__BB0_200; @%p12 bra $L__BB0_190; bra.uni $L__BB0_189; $L__BB0_190: max.s32 %r516, %r14, 0; bra.uni $L__BB0_191; $L__BB0_189: rem.s32 %r438, %r14, %r125; add.s32 %r439, %r438, %r125; rem.s32 %r516, %r439, %r125; $L__BB0_191: add.s32 %r120, %r516, %r4; @%p88 bra $L__BB0_193; bra.uni $L__BB0_192; $L__BB0_193: add.s32 %r442, %r124, -1; min.s32 %r517, %r49, %r442; bra.uni $L__BB0_194; $L__BB0_192: rem.s32 %r440, %r49, %r124; add.s32 %r441, %r440, %r124; rem.s32 %r517, %r441, %r124; $L__BB0_194: mad.lo.s32 %r443, %r120, %r124, %r517; mul.wide.s32 %rd112, %r443, 4; add.s64 %rd113, %rd3, %rd112; add.s64 %rd114, %rd2, %rd112; add.s64 %rd115, %rd1, %rd112; ld.global.nc.f32 %f789, [%rd113]; ld.global.nc.f32 %f790, [%rd114]; mul.f32 %f791, %f790, %f790; fma.rn.f32 %f792, %f789, %f789, %f791; ld.global.nc.f32 %f793, [%rd115]; fma.rn.f32 %f161, %f793, %f793, %f792; mul.f32 %f794, %f117, %f127; mul.f32 %f795, %f118, %f126; sub.f32 %f796, %f795, %f794; mul.f32 %f797, %f118, %f125; mul.f32 %f798, %f116, %f127; sub.f32 %f799, %f798, %f797; mul.f32 %f800, %f116, %f126; mul.f32 %f801, %f117, %f125; sub.f32 %f802, %f801, %f800; mul.f32 %f803, %f2, %f799; fma.rn.f32 %f804, %f1, %f796, %f803; fma.rn.f32 %f162, %f3, %f802, %f804; mul.f32 %f805, %f2, %f126; fma.rn.f32 %f806, %f1, %f125, %f805; fma.rn.f32 %f807, %f3, %f127, %f806; add.f32 %f808, %f807, 0f3F800000; mul.f32 %f809, %f2, %f117; fma.rn.f32 %f810, %f1, %f116, %f809; fma.rn.f32 %f811, %f3, %f118, %f810; add.f32 %f812, %f811, %f808; mul.f32 %f813, %f117, %f126; fma.rn.f32 %f814, %f116, %f125, %f813; fma.rn.f32 %f815, %f118, %f127, %f814; add.f32 %f163, %f815, %f812; abs.f32 %f164, %f163; abs.f32 %f165, %f162; setp.eq.f32 %p213, %f164, 0f00000000; setp.eq.f32 %p214, %f165, 0f00000000; and.pred %p215, %p213, %p214; @%p215 bra $L__BB0_198; 
bra.uni $L__BB0_195; $L__BB0_198: mov.b32 %r454, %f163; shr.s32 %r455, %r454, 31; and.b32 %r456, %r455, 1078530011; mov.b32 %r457, %f162; and.b32 %r458, %r457, -2147483648; or.b32 %r459, %r456, %r458; mov.b32 %f879, %r459; bra.uni $L__BB0_199; $L__BB0_195: setp.eq.f32 %p216, %f164, 0f7F800000; setp.eq.f32 %p217, %f165, 0f7F800000; and.pred %p218, %p216, %p217; @%p218 bra $L__BB0_197; bra.uni $L__BB0_196; $L__BB0_197: mov.b32 %r449, %f163; setp.lt.s32 %p222, %r449, 0; selp.b32 %r450, 1075235812, 1061752795, %p222; mov.b32 %r451, %f162; and.b32 %r452, %r451, -2147483648; or.b32 %r453, %r450, %r452; mov.b32 %f879, %r453; bra.uni $L__BB0_199; $L__BB0_196: max.f32 %f816, %f165, %f164; min.f32 %f817, %f165, %f164; div.rn.f32 %f818, %f817, %f816; mul.rn.f32 %f819, %f818, %f818; mov.f32 %f820, 0fC0B59883; mov.f32 %f821, 0fBF52C7EA; fma.rn.f32 %f822, %f819, %f821, %f820; mov.f32 %f823, 0fC0D21907; fma.rn.f32 %f824, %f822, %f819, %f823; mul.f32 %f825, %f819, %f824; mul.f32 %f826, %f818, %f825; add.f32 %f827, %f819, 0f41355DC0; mov.f32 %f828, 0f41E6BD60; fma.rn.f32 %f829, %f827, %f819, %f828; mov.f32 %f830, 0f419D92C8; fma.rn.f32 %f831, %f829, %f819, %f830; rcp.rn.f32 %f832, %f831; fma.rn.f32 %f833, %f826, %f832, %f818; mov.f32 %f834, 0f3FC90FDB; sub.f32 %f835, %f834, %f833; setp.gt.f32 %p219, %f165, %f164; selp.f32 %f836, %f835, %f833, %p219; mov.b32 %r444, %f163; setp.lt.s32 %p220, %r444, 0; mov.f32 %f837, 0f40490FDB; sub.f32 %f838, %f837, %f836; selp.f32 %f839, %f838, %f836, %p220; mov.b32 %r445, %f839; mov.b32 %r446, %f162; and.b32 %r447, %r446, -2147483648; or.b32 %r448, %r447, %r445; mov.b32 %f840, %r448; add.f32 %f841, %f164, %f165; setp.le.f32 %p221, %f841, 0f7F800000; selp.f32 %f879, %f840, %f841, %p221; $L__BB0_199: add.f32 %f842, %f879, %f879; setp.eq.f32 %p223, %f161, 0f00000000; selp.f32 %f843, 0f3F800000, 0f3F000000, %p223; fma.rn.f32 %f878, %f843, %f842, %f878; $L__BB0_200: ld.param.u64 %rd123, [setemergentmagneticfieldsolidangle_param_1]; mov.u32 %r481, 
%tid.x; mov.u32 %r480, %ntid.x; mov.u32 %r479, %ctaid.x; mad.lo.s32 %r478, %r479, %r480, %r481; add.s32 %r477, %r5, %r478; mul.wide.s32 %rd122, %r477, 4; cvta.to.global.u64 %rd121, %rd123; add.s64 %rd120, %rd121, %rd122; ld.param.f32 %f856, [setemergentmagneticfieldsolidangle_param_9]; ld.param.f32 %f855, [setemergentmagneticfieldsolidangle_param_8]; ld.param.u64 %rd119, [setemergentmagneticfieldsolidangle_param_0]; mov.u32 %r476, %tid.z; mov.u32 %r475, %ntid.z; mov.u32 %r474, %ctaid.z; mad.lo.s32 %r473, %r474, %r475, %r476; mov.u32 %r472, %tid.y; mov.u32 %r471, %ntid.y; mov.u32 %r470, %ctaid.y; mad.lo.s32 %r469, %r470, %r471, %r472; mul.lo.s32 %r468, %r473, %r125; add.s32 %r467, %r468, %r469; mov.u32 %r466, %tid.x; mov.u32 %r465, %ntid.x; mov.u32 %r464, %ctaid.x; mad.lo.s32 %r463, %r464, %r465, %r466; mul.lo.s32 %r462, %r467, %r124; add.s32 %r461, %r462, %r463; mul.wide.s32 %rd118, %r461, 4; cvta.to.global.u64 %rd117, %rd119; add.s64 %rd116, %rd117, %rd118; ld.param.f32 %f854, [setemergentmagneticfieldsolidangle_param_7]; ld.param.f32 %f853, [setemergentmagneticfieldsolidangle_param_6]; add.f32 %f844, %f853, %f853; mul.f32 %f845, %f844, %f854; mul.f32 %f846, %f845, %f862; st.global.f32 [%rd116], %f846; mul.f32 %f847, %f844, %f855; mul.f32 %f848, %f847, %f868; st.global.f32 [%rd120], %f848; mul.f32 %f849, %f844, %f856; mul.f32 %f850, %f849, %f878; st.global.f32 [%rd6], %f850; $L__BB0_202: ret; } ` setemergentmagneticfieldsolidangle_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl setemergentmagneticfieldsolidangle .visible .entry setemergentmagneticfieldsolidangle( .param .u64 setemergentmagneticfieldsolidangle_param_0, .param .u64 setemergentmagneticfieldsolidangle_param_1, .param .u64 setemergentmagneticfieldsolidangle_param_2, .param .u64 setemergentmagneticfieldsolidangle_param_3, .param .u64 setemergentmagneticfieldsolidangle_param_4, .param .u64 setemergentmagneticfieldsolidangle_param_5, .param .f32 setemergentmagneticfieldsolidangle_param_6, 
.param .f32 setemergentmagneticfieldsolidangle_param_7, .param .f32 setemergentmagneticfieldsolidangle_param_8, .param .f32 setemergentmagneticfieldsolidangle_param_9, .param .u32 setemergentmagneticfieldsolidangle_param_10, .param .u32 setemergentmagneticfieldsolidangle_param_11, .param .u32 setemergentmagneticfieldsolidangle_param_12, .param .u8 setemergentmagneticfieldsolidangle_param_13 ) { .reg .pred %p<225>; .reg .b16 %rs<8>; .reg .f32 %f<881>; .reg .b32 %r<518>; .reg .b64 %rd<124>; ld.param.u8 %rs4, [setemergentmagneticfieldsolidangle_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldsolidangle_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldsolidangle_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldsolidangle_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldsolidangle_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldsolidangle_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldsolidangle_param_5]; ld.param.u32 %r124, [setemergentmagneticfieldsolidangle_param_10]; ld.param.u32 %r125, [setemergentmagneticfieldsolidangle_param_11]; ld.param.u32 %r126, [setemergentmagneticfieldsolidangle_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r127, %ntid.x; mov.u32 %r128, %ctaid.x; mov.u32 %r129, %tid.x; mad.lo.s32 %r1, %r128, %r127, %r129; mov.u32 %r130, %ntid.y; mov.u32 %r131, %ctaid.y; mov.u32 %r132, %tid.y; mad.lo.s32 %r2, %r131, %r130, %r132; mov.u32 %r133, %ntid.z; mov.u32 %r134, %ctaid.z; mov.u32 %r135, %tid.z; mad.lo.s32 %r3, %r134, %r133, %r135; setp.ge.s32 %p6, %r1, %r124; setp.ge.s32 %p7, %r2, %r125; or.pred %p8, %p6, %p7; setp.ge.s32 %p9, %r3, %r126; or.pred %p10, %p8, %p9; @%p10 bra $L__BB0_202; mul.lo.s32 %r4, %r3, %r125; add.s32 %r136, %r4, %r2; mul.lo.s32 %r5, %r136, %r124; add.s32 %r137, %r5, %r1; mul.wide.s32 %rd13, %r137, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; 
ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f176, %f2, %f2; fma.rn.f32 %f177, %f1, %f1, %f176; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f178, %f3, %f3, %f177; setp.eq.f32 %p11, %f178, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p11 bra $L__BB0_201; bra.uni $L__BB0_2; $L__BB0_201: mov.u32 %r460, 0; st.global.u32 [%rd4], %r460; st.global.u32 [%rd5], %r460; st.global.u32 [%rd6], %r460; bra.uni $L__BB0_202; $L__BB0_2: and.b16 %rs1, %rs4, 2; setp.eq.s16 %p12, %rs1, 0; add.s32 %r6, %r2, 1; @%p12 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r140, %r125, -1; min.s32 %r482, %r6, %r140; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r138, %r6, %r125; add.s32 %r139, %r138, %r125; rem.s32 %r482, %r139, %r125; $L__BB0_5: and.b16 %rs2, %rs4, 4; setp.eq.s16 %p13, %rs2, 0; add.s32 %r10, %r3, 1; @%p13 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r143, %r126, -1; min.s32 %r483, %r10, %r143; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r141, %r10, %r126; add.s32 %r142, %r141, %r126; rem.s32 %r483, %r142, %r126; $L__BB0_8: add.s32 %r14, %r2, -1; @%p12 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r484, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r144, %r14, %r125; add.s32 %r145, %r144, %r125; rem.s32 %r484, %r145, %r125; $L__BB0_11: add.s32 %r18, %r3, -1; @%p13 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r485, %r18, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r146, %r18, %r126; add.s32 %r147, %r146, %r126; rem.s32 %r485, %r147, %r126; $L__BB0_14: mad.lo.s32 %r148, %r485, %r125, %r2; mad.lo.s32 %r149, %r148, %r124, %r1; add.s32 %r150, %r482, %r4; mad.lo.s32 %r151, %r150, %r124, %r1; mul.wide.s32 %rd20, %r151, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f4, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f5, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f6, [%rd23]; mad.lo.s32 %r152, 
%r483, %r125, %r2; mad.lo.s32 %r153, %r152, %r124, %r1; mul.wide.s32 %rd24, %r153, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f7, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f8, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f9, [%rd27]; add.s32 %r154, %r484, %r4; mad.lo.s32 %r155, %r154, %r124, %r1; mul.wide.s32 %rd28, %r155, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f10, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f12, [%rd31]; mul.wide.s32 %rd32, %r149, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f13, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f14, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f15, [%rd35]; setp.ne.s16 %p16, %rs1, 0; setp.lt.s32 %p17, %r6, %r125; or.pred %p1, %p17, %p16; not.pred %p18, %p1; mov.f32 %f862, 0f00000000; @%p18 bra $L__BB0_28; setp.ge.s32 %p19, %r10, %r126; and.pred %p21, %p19, %p13; @%p21 bra $L__BB0_28; @%p13 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r158, %r126, -1; min.s32 %r486, %r10, %r158; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r156, %r10, %r126; add.s32 %r157, %r156, %r126; rem.s32 %r486, %r157, %r126; $L__BB0_19: @%p12 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r161, %r125, -1; min.s32 %r487, %r6, %r161; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r159, %r6, %r125; add.s32 %r160, %r159, %r125; rem.s32 %r487, %r160, %r125; $L__BB0_22: mad.lo.s32 %r162, %r486, %r125, %r487; mad.lo.s32 %r163, %r162, %r124, %r1; mul.wide.s32 %rd36, %r163, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f181, [%rd37]; ld.global.nc.f32 %f182, [%rd38]; mul.f32 %f183, %f182, %f182; fma.rn.f32 %f184, %f181, %f181, %f183; ld.global.nc.f32 %f185, [%rd39]; fma.rn.f32 %f16, %f185, %f185, %f184; mul.f32 %f186, %f6, %f8; mul.f32 %f187, %f5, %f9; sub.f32 %f188, %f187, %f186; mul.f32 %f189, %f4, %f9; mul.f32 %f190, %f6, %f7; sub.f32 %f191, %f190, 
%f189; mul.f32 %f192, %f5, %f7; mul.f32 %f193, %f4, %f8; sub.f32 %f194, %f193, %f192; mul.f32 %f195, %f2, %f191; fma.rn.f32 %f196, %f1, %f188, %f195; fma.rn.f32 %f17, %f3, %f194, %f196; mul.f32 %f197, %f2, %f5; fma.rn.f32 %f198, %f1, %f4, %f197; fma.rn.f32 %f199, %f3, %f6, %f198; add.f32 %f200, %f199, 0f3F800000; mul.f32 %f201, %f2, %f8; fma.rn.f32 %f202, %f1, %f7, %f201; fma.rn.f32 %f203, %f3, %f9, %f202; add.f32 %f204, %f200, %f203; mul.f32 %f205, %f5, %f8; fma.rn.f32 %f206, %f4, %f7, %f205; fma.rn.f32 %f207, %f6, %f9, %f206; add.f32 %f18, %f207, %f204; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p24, %f19, 0f00000000; setp.eq.f32 %p25, %f20, 0f00000000; and.pred %p26, %p24, %p25; @%p26 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r174, %f18; shr.s32 %r175, %r174, 31; and.b32 %r176, %r175, 1078530011; mov.b32 %r177, %f17; and.b32 %r178, %r177, -2147483648; or.b32 %r179, %r178, %r176; mov.b32 %f857, %r179; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p27, %f19, 0f7F800000; setp.eq.f32 %p28, %f20, 0f7F800000; and.pred %p29, %p27, %p28; @%p29 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r169, %f18; setp.lt.s32 %p33, %r169, 0; selp.b32 %r170, 1075235812, 1061752795, %p33; mov.b32 %r171, %f17; and.b32 %r172, %r171, -2147483648; or.b32 %r173, %r172, %r170; mov.b32 %f857, %r173; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f208, %f20, %f19; min.f32 %f209, %f20, %f19; div.rn.f32 %f210, %f209, %f208; mul.rn.f32 %f211, %f210, %f210; mov.f32 %f212, 0fC0B59883; mov.f32 %f213, 0fBF52C7EA; fma.rn.f32 %f214, %f211, %f213, %f212; mov.f32 %f215, 0fC0D21907; fma.rn.f32 %f216, %f214, %f211, %f215; mul.f32 %f217, %f211, %f216; mul.f32 %f218, %f210, %f217; add.f32 %f219, %f211, 0f41355DC0; mov.f32 %f220, 0f41E6BD60; fma.rn.f32 %f221, %f219, %f211, %f220; mov.f32 %f222, 0f419D92C8; fma.rn.f32 %f223, %f221, %f211, %f222; rcp.rn.f32 %f224, %f223; fma.rn.f32 %f225, %f218, %f224, %f210; mov.f32 %f226, 0f3FC90FDB; sub.f32 %f227, %f226, %f225; setp.gt.f32 %p30, 
%f20, %f19; selp.f32 %f228, %f227, %f225, %p30; mov.b32 %r164, %f18; setp.lt.s32 %p31, %r164, 0; mov.f32 %f229, 0f40490FDB; sub.f32 %f230, %f229, %f228; selp.f32 %f231, %f230, %f228, %p31; mov.b32 %r165, %f231; mov.b32 %r166, %f17; and.b32 %r167, %r166, -2147483648; or.b32 %r168, %r167, %r165; mov.b32 %f232, %r168; add.f32 %f233, %f19, %f20; setp.le.f32 %p32, %f233, 0f7F800000; selp.f32 %f857, %f232, %f233, %p32; $L__BB0_27: add.f32 %f234, %f857, %f857; setp.eq.f32 %p34, %f16, 0f00000000; selp.f32 %f235, 0f3F800000, 0f3F000000, %p34; fma.rn.f32 %f862, %f235, %f234, 0f00000000; $L__BB0_28: setp.gt.s32 %p35, %r2, 0; or.pred %p2, %p35, %p16; not.pred %p37, %p2; @%p37 bra $L__BB0_55; setp.ge.s32 %p38, %r10, %r126; and.pred %p40, %p38, %p13; @%p40 bra $L__BB0_42; @%p13 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r182, %r126, -1; min.s32 %r488, %r10, %r182; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r180, %r10, %r126; add.s32 %r181, %r180, %r126; rem.s32 %r488, %r181, %r126; $L__BB0_33: @%p12 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r489, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r183, %r14, %r125; add.s32 %r184, %r183, %r125; rem.s32 %r489, %r184, %r125; $L__BB0_36: mad.lo.s32 %r185, %r488, %r125, %r489; mad.lo.s32 %r186, %r185, %r124, %r1; mul.wide.s32 %rd40, %r186, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f236, [%rd41]; ld.global.nc.f32 %f237, [%rd42]; mul.f32 %f238, %f237, %f237; fma.rn.f32 %f239, %f236, %f236, %f238; ld.global.nc.f32 %f240, [%rd43]; fma.rn.f32 %f27, %f240, %f240, %f239; mul.f32 %f241, %f9, %f11; mul.f32 %f242, %f8, %f12; sub.f32 %f243, %f242, %f241; mul.f32 %f244, %f7, %f12; mul.f32 %f245, %f9, %f10; sub.f32 %f246, %f245, %f244; mul.f32 %f247, %f8, %f10; mul.f32 %f248, %f7, %f11; sub.f32 %f249, %f248, %f247; mul.f32 %f250, %f2, %f246; fma.rn.f32 %f251, %f1, %f243, %f250; fma.rn.f32 %f28, %f3, %f249, %f251; mul.f32 %f252, %f2, %f8; fma.rn.f32 %f253, 
%f1, %f7, %f252; fma.rn.f32 %f254, %f3, %f9, %f253; add.f32 %f255, %f254, 0f3F800000; mul.f32 %f256, %f2, %f11; fma.rn.f32 %f257, %f1, %f10, %f256; fma.rn.f32 %f258, %f3, %f12, %f257; add.f32 %f259, %f255, %f258; mul.f32 %f260, %f8, %f11; fma.rn.f32 %f261, %f7, %f10, %f260; fma.rn.f32 %f262, %f9, %f12, %f261; add.f32 %f29, %f262, %f259; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p43, %f30, 0f00000000; setp.eq.f32 %p44, %f31, 0f00000000; and.pred %p45, %p43, %p44; @%p45 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r197, %f29; shr.s32 %r198, %r197, 31; and.b32 %r199, %r198, 1078530011; mov.b32 %r200, %f28; and.b32 %r201, %r200, -2147483648; or.b32 %r202, %r201, %r199; mov.b32 %f859, %r202; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p46, %f30, 0f7F800000; setp.eq.f32 %p47, %f31, 0f7F800000; and.pred %p48, %p46, %p47; @%p48 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r192, %f29; setp.lt.s32 %p52, %r192, 0; selp.b32 %r193, 1075235812, 1061752795, %p52; mov.b32 %r194, %f28; and.b32 %r195, %r194, -2147483648; or.b32 %r196, %r195, %r193; mov.b32 %f859, %r196; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f263, %f31, %f30; min.f32 %f264, %f31, %f30; div.rn.f32 %f265, %f264, %f263; mul.rn.f32 %f266, %f265, %f265; mov.f32 %f267, 0fC0B59883; mov.f32 %f268, 0fBF52C7EA; fma.rn.f32 %f269, %f266, %f268, %f267; mov.f32 %f270, 0fC0D21907; fma.rn.f32 %f271, %f269, %f266, %f270; mul.f32 %f272, %f266, %f271; mul.f32 %f273, %f265, %f272; add.f32 %f274, %f266, 0f41355DC0; mov.f32 %f275, 0f41E6BD60; fma.rn.f32 %f276, %f274, %f266, %f275; mov.f32 %f277, 0f419D92C8; fma.rn.f32 %f278, %f276, %f266, %f277; rcp.rn.f32 %f279, %f278; fma.rn.f32 %f280, %f273, %f279, %f265; mov.f32 %f281, 0f3FC90FDB; sub.f32 %f282, %f281, %f280; setp.gt.f32 %p49, %f31, %f30; selp.f32 %f283, %f282, %f280, %p49; mov.b32 %r187, %f29; setp.lt.s32 %p50, %r187, 0; mov.f32 %f284, 0f40490FDB; sub.f32 %f285, %f284, %f283; selp.f32 %f286, %f285, %f283, %p50; mov.b32 %r188, %f286; mov.b32 
%r189, %f28; and.b32 %r190, %r189, -2147483648; or.b32 %r191, %r190, %r188; mov.b32 %f287, %r191; add.f32 %f288, %f30, %f31; setp.le.f32 %p51, %f288, 0f7F800000; selp.f32 %f859, %f287, %f288, %p51; $L__BB0_41: add.f32 %f289, %f859, %f859; setp.eq.f32 %p53, %f27, 0f00000000; selp.f32 %f290, 0f3F800000, 0f3F000000, %p53; fma.rn.f32 %f862, %f290, %f289, %f862; $L__BB0_42: setp.lt.s32 %p54, %r3, 1; and.pred %p56, %p54, %p13; @%p56 bra $L__BB0_55; @%p13 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r490, %r18, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r203, %r18, %r126; add.s32 %r204, %r203, %r126; rem.s32 %r490, %r204, %r126; $L__BB0_46: @%p12 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r491, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r205, %r14, %r125; add.s32 %r206, %r205, %r125; rem.s32 %r491, %r206, %r125; $L__BB0_49: mad.lo.s32 %r207, %r490, %r125, %r491; mad.lo.s32 %r208, %r207, %r124, %r1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f291, [%rd45]; ld.global.nc.f32 %f292, [%rd46]; mul.f32 %f293, %f292, %f292; fma.rn.f32 %f294, %f291, %f291, %f293; ld.global.nc.f32 %f295, [%rd47]; fma.rn.f32 %f38, %f295, %f295, %f294; mul.f32 %f296, %f12, %f14; mul.f32 %f297, %f11, %f15; sub.f32 %f298, %f297, %f296; mul.f32 %f299, %f10, %f15; mul.f32 %f300, %f12, %f13; sub.f32 %f301, %f300, %f299; mul.f32 %f302, %f11, %f13; mul.f32 %f303, %f10, %f14; sub.f32 %f304, %f303, %f302; mul.f32 %f305, %f2, %f301; fma.rn.f32 %f306, %f1, %f298, %f305; fma.rn.f32 %f39, %f3, %f304, %f306; mul.f32 %f307, %f2, %f11; fma.rn.f32 %f308, %f1, %f10, %f307; fma.rn.f32 %f309, %f3, %f12, %f308; add.f32 %f310, %f309, 0f3F800000; mul.f32 %f311, %f2, %f14; fma.rn.f32 %f312, %f1, %f13, %f311; fma.rn.f32 %f313, %f3, %f15, %f312; add.f32 %f314, %f310, %f313; mul.f32 %f315, %f11, %f14; fma.rn.f32 %f316, %f10, %f13, %f315; fma.rn.f32 %f317, %f12, %f15, %f316; add.f32 %f40, %f317, %f314; 
abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p59, %f41, 0f00000000; setp.eq.f32 %p60, %f42, 0f00000000; and.pred %p61, %p59, %p60; @%p61 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r219, %f40; shr.s32 %r220, %r219, 31; and.b32 %r221, %r220, 1078530011; mov.b32 %r222, %f39; and.b32 %r223, %r222, -2147483648; or.b32 %r224, %r223, %r221; mov.b32 %f861, %r224; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p62, %f41, 0f7F800000; setp.eq.f32 %p63, %f42, 0f7F800000; and.pred %p64, %p62, %p63; @%p64 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r214, %f40; setp.lt.s32 %p68, %r214, 0; selp.b32 %r215, 1075235812, 1061752795, %p68; mov.b32 %r216, %f39; and.b32 %r217, %r216, -2147483648; or.b32 %r218, %r217, %r215; mov.b32 %f861, %r218; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f318, %f42, %f41; min.f32 %f319, %f42, %f41; div.rn.f32 %f320, %f319, %f318; mul.rn.f32 %f321, %f320, %f320; mov.f32 %f322, 0fC0B59883; mov.f32 %f323, 0fBF52C7EA; fma.rn.f32 %f324, %f321, %f323, %f322; mov.f32 %f325, 0fC0D21907; fma.rn.f32 %f326, %f324, %f321, %f325; mul.f32 %f327, %f321, %f326; mul.f32 %f328, %f320, %f327; add.f32 %f329, %f321, 0f41355DC0; mov.f32 %f330, 0f41E6BD60; fma.rn.f32 %f331, %f329, %f321, %f330; mov.f32 %f332, 0f419D92C8; fma.rn.f32 %f333, %f331, %f321, %f332; rcp.rn.f32 %f334, %f333; fma.rn.f32 %f335, %f328, %f334, %f320; mov.f32 %f336, 0f3FC90FDB; sub.f32 %f337, %f336, %f335; setp.gt.f32 %p65, %f42, %f41; selp.f32 %f338, %f337, %f335, %p65; mov.b32 %r209, %f40; setp.lt.s32 %p66, %r209, 0; mov.f32 %f339, 0f40490FDB; sub.f32 %f340, %f339, %f338; selp.f32 %f341, %f340, %f338, %p66; mov.b32 %r210, %f341; mov.b32 %r211, %f39; and.b32 %r212, %r211, -2147483648; or.b32 %r213, %r212, %r210; mov.b32 %f342, %r213; add.f32 %f343, %f41, %f42; setp.le.f32 %p67, %f343, 0f7F800000; selp.f32 %f861, %f342, %f343, %p67; $L__BB0_54: add.f32 %f344, %f861, %f861; setp.eq.f32 %p69, %f38, 0f00000000; selp.f32 %f345, 0f3F800000, 0f3F000000, %p69; fma.rn.f32 %f862, 
%f345, %f344, %f862; $L__BB0_55: @%p18 bra $L__BB0_69; setp.lt.s32 %p71, %r3, 1; and.pred %p73, %p71, %p13; @%p73 bra $L__BB0_69; @%p13 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r492, %r18, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r225, %r18, %r126; add.s32 %r226, %r225, %r126; rem.s32 %r492, %r226, %r126; $L__BB0_60: @%p12 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r229, %r125, -1; min.s32 %r493, %r6, %r229; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r227, %r6, %r125; add.s32 %r228, %r227, %r125; rem.s32 %r493, %r228, %r125; $L__BB0_63: mad.lo.s32 %r230, %r492, %r125, %r493; mad.lo.s32 %r231, %r230, %r124, %r1; mul.wide.s32 %rd48, %r231, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f346, [%rd49]; ld.global.nc.f32 %f347, [%rd50]; mul.f32 %f348, %f347, %f347; fma.rn.f32 %f349, %f346, %f346, %f348; ld.global.nc.f32 %f350, [%rd51]; fma.rn.f32 %f49, %f350, %f350, %f349; mul.f32 %f351, %f5, %f15; mul.f32 %f352, %f6, %f14; sub.f32 %f353, %f352, %f351; mul.f32 %f354, %f6, %f13; mul.f32 %f355, %f4, %f15; sub.f32 %f356, %f355, %f354; mul.f32 %f357, %f4, %f14; mul.f32 %f358, %f5, %f13; sub.f32 %f359, %f358, %f357; mul.f32 %f360, %f2, %f356; fma.rn.f32 %f361, %f1, %f353, %f360; fma.rn.f32 %f50, %f3, %f359, %f361; mul.f32 %f362, %f2, %f14; fma.rn.f32 %f363, %f1, %f13, %f362; fma.rn.f32 %f364, %f3, %f15, %f363; add.f32 %f365, %f364, 0f3F800000; mul.f32 %f366, %f2, %f5; fma.rn.f32 %f367, %f1, %f4, %f366; fma.rn.f32 %f368, %f3, %f6, %f367; add.f32 %f369, %f368, %f365; mul.f32 %f370, %f5, %f14; fma.rn.f32 %f371, %f4, %f13, %f370; fma.rn.f32 %f372, %f6, %f15, %f371; add.f32 %f51, %f372, %f369; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p76, %f52, 0f00000000; setp.eq.f32 %p77, %f53, 0f00000000; and.pred %p78, %p76, %p77; @%p78 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r242, %f51; shr.s32 %r243, %r242, 31; and.b32 %r244, %r243, 1078530011; mov.b32 %r245, %f50; and.b32 
%r246, %r245, -2147483648; or.b32 %r247, %r244, %r246; mov.b32 %f863, %r247; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p79, %f52, 0f7F800000; setp.eq.f32 %p80, %f53, 0f7F800000; and.pred %p81, %p79, %p80; @%p81 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r237, %f51; setp.lt.s32 %p85, %r237, 0; selp.b32 %r238, 1075235812, 1061752795, %p85; mov.b32 %r239, %f50; and.b32 %r240, %r239, -2147483648; or.b32 %r241, %r238, %r240; mov.b32 %f863, %r241; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f373, %f53, %f52; min.f32 %f374, %f53, %f52; div.rn.f32 %f375, %f374, %f373; mul.rn.f32 %f376, %f375, %f375; mov.f32 %f377, 0fC0B59883; mov.f32 %f378, 0fBF52C7EA; fma.rn.f32 %f379, %f376, %f378, %f377; mov.f32 %f380, 0fC0D21907; fma.rn.f32 %f381, %f379, %f376, %f380; mul.f32 %f382, %f376, %f381; mul.f32 %f383, %f375, %f382; add.f32 %f384, %f376, 0f41355DC0; mov.f32 %f385, 0f41E6BD60; fma.rn.f32 %f386, %f384, %f376, %f385; mov.f32 %f387, 0f419D92C8; fma.rn.f32 %f388, %f386, %f376, %f387; rcp.rn.f32 %f389, %f388; fma.rn.f32 %f390, %f383, %f389, %f375; mov.f32 %f391, 0f3FC90FDB; sub.f32 %f392, %f391, %f390; setp.gt.f32 %p82, %f53, %f52; selp.f32 %f393, %f392, %f390, %p82; mov.b32 %r232, %f51; setp.lt.s32 %p83, %r232, 0; mov.f32 %f394, 0f40490FDB; sub.f32 %f395, %f394, %f393; selp.f32 %f396, %f395, %f393, %p83; mov.b32 %r233, %f396; mov.b32 %r234, %f50; and.b32 %r235, %r234, -2147483648; or.b32 %r236, %r235, %r233; mov.b32 %f397, %r236; add.f32 %f398, %f52, %f53; setp.le.f32 %p84, %f398, 0f7F800000; selp.f32 %f863, %f397, %f398, %p84; $L__BB0_68: add.f32 %f399, %f863, %f863; setp.eq.f32 %p86, %f49, 0f00000000; selp.f32 %f400, 0f3F800000, 0f3F000000, %p86; fma.rn.f32 %f862, %f400, %f399, %f862; $L__BB0_69: @%p13 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: add.s32 %r250, %r126, -1; min.s32 %r494, %r10, %r250; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r248, %r10, %r126; add.s32 %r249, %r248, %r126; rem.s32 %r494, %r249, %r126; $L__BB0_72: ld.param.u8 %rs7, 
[setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs3, %rs7, 1; setp.eq.s16 %p88, %rs3, 0; add.s32 %r49, %r1, 1; @%p88 bra $L__BB0_74; bra.uni $L__BB0_73; $L__BB0_74: add.s32 %r253, %r124, -1; min.s32 %r495, %r49, %r253; bra.uni $L__BB0_75; $L__BB0_73: rem.s32 %r251, %r49, %r124; add.s32 %r252, %r251, %r124; rem.s32 %r495, %r252, %r124; $L__BB0_75: @%p13 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r496, %r18, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r254, %r18, %r126; add.s32 %r255, %r254, %r126; rem.s32 %r496, %r255, %r126; $L__BB0_78: add.s32 %r56, %r1, -1; @%p88 bra $L__BB0_80; bra.uni $L__BB0_79; $L__BB0_80: max.s32 %r497, %r56, 0; bra.uni $L__BB0_81; $L__BB0_79: rem.s32 %r256, %r56, %r124; add.s32 %r257, %r256, %r124; rem.s32 %r497, %r257, %r124; $L__BB0_81: mad.lo.s32 %r258, %r494, %r125, %r2; mad.lo.s32 %r259, %r258, %r124, %r1; mul.wide.s32 %rd52, %r259, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f60, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f61, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f62, [%rd55]; add.s32 %r260, %r495, %r5; mul.wide.s32 %rd56, %r260, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f63, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f64, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f65, [%rd59]; mad.lo.s32 %r261, %r496, %r125, %r2; mad.lo.s32 %r262, %r261, %r124, %r1; mul.wide.s32 %rd60, %r262, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f66, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f67, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f68, [%rd63]; add.s32 %r263, %r497, %r5; mul.wide.s32 %rd64, %r263, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f69, [%rd65]; add.s64 %rd66, %rd2, %rd64; ld.global.nc.f32 %f70, [%rd66]; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f71, [%rd67]; setp.ge.s32 %p92, %r10, %r126; and.pred %p93, %p92, %p13; mov.f32 %f868, 0f00000000; @%p93 bra $L__BB0_95; setp.ge.s32 %p94, %r49, %r124; and.pred %p96, %p94, 
%p88; @%p96 bra $L__BB0_95; @%p13 bra $L__BB0_85; bra.uni $L__BB0_84; $L__BB0_85: add.s32 %r266, %r126, -1; min.s32 %r498, %r10, %r266; bra.uni $L__BB0_86; $L__BB0_84: rem.s32 %r264, %r10, %r126; add.s32 %r265, %r264, %r126; rem.s32 %r498, %r265, %r126; $L__BB0_86: @%p88 bra $L__BB0_88; bra.uni $L__BB0_87; $L__BB0_88: add.s32 %r269, %r124, -1; min.s32 %r499, %r49, %r269; bra.uni $L__BB0_89; $L__BB0_87: rem.s32 %r267, %r49, %r124; add.s32 %r268, %r267, %r124; rem.s32 %r499, %r268, %r124; $L__BB0_89: mad.lo.s32 %r270, %r498, %r125, %r2; mad.lo.s32 %r271, %r270, %r124, %r499; mul.wide.s32 %rd68, %r271, 4; add.s64 %rd69, %rd3, %rd68; add.s64 %rd70, %rd2, %rd68; add.s64 %rd71, %rd1, %rd68; ld.global.nc.f32 %f403, [%rd69]; ld.global.nc.f32 %f404, [%rd70]; mul.f32 %f405, %f404, %f404; fma.rn.f32 %f406, %f403, %f403, %f405; ld.global.nc.f32 %f407, [%rd71]; fma.rn.f32 %f72, %f407, %f407, %f406; mul.f32 %f408, %f62, %f64; mul.f32 %f409, %f61, %f65; sub.f32 %f410, %f409, %f408; mul.f32 %f411, %f60, %f65; mul.f32 %f412, %f62, %f63; sub.f32 %f413, %f412, %f411; mul.f32 %f414, %f61, %f63; mul.f32 %f415, %f60, %f64; sub.f32 %f416, %f415, %f414; mul.f32 %f417, %f2, %f413; fma.rn.f32 %f418, %f1, %f410, %f417; fma.rn.f32 %f73, %f3, %f416, %f418; mul.f32 %f419, %f2, %f61; fma.rn.f32 %f420, %f1, %f60, %f419; fma.rn.f32 %f421, %f3, %f62, %f420; add.f32 %f422, %f421, 0f3F800000; mul.f32 %f423, %f2, %f64; fma.rn.f32 %f424, %f1, %f63, %f423; fma.rn.f32 %f425, %f3, %f65, %f424; add.f32 %f426, %f422, %f425; mul.f32 %f427, %f61, %f64; fma.rn.f32 %f428, %f60, %f63, %f427; fma.rn.f32 %f429, %f62, %f65, %f428; add.f32 %f74, %f429, %f426; abs.f32 %f75, %f74; abs.f32 %f76, %f73; setp.eq.f32 %p99, %f75, 0f00000000; setp.eq.f32 %p100, %f76, 0f00000000; and.pred %p101, %p99, %p100; @%p101 bra $L__BB0_93; bra.uni $L__BB0_90; $L__BB0_93: mov.b32 %r282, %f74; shr.s32 %r283, %r282, 31; and.b32 %r284, %r283, 1078530011; mov.b32 %r285, %f73; and.b32 %r286, %r285, -2147483648; or.b32 %r287, %r286, %r284; 
mov.b32 %f865, %r287; bra.uni $L__BB0_94; $L__BB0_90: setp.eq.f32 %p102, %f75, 0f7F800000; setp.eq.f32 %p103, %f76, 0f7F800000; and.pred %p104, %p102, %p103; @%p104 bra $L__BB0_92; bra.uni $L__BB0_91; $L__BB0_92: mov.b32 %r277, %f74; setp.lt.s32 %p108, %r277, 0; selp.b32 %r278, 1075235812, 1061752795, %p108; mov.b32 %r279, %f73; and.b32 %r280, %r279, -2147483648; or.b32 %r281, %r280, %r278; mov.b32 %f865, %r281; bra.uni $L__BB0_94; $L__BB0_91: max.f32 %f430, %f76, %f75; min.f32 %f431, %f76, %f75; div.rn.f32 %f432, %f431, %f430; mul.rn.f32 %f433, %f432, %f432; mov.f32 %f434, 0fC0B59883; mov.f32 %f435, 0fBF52C7EA; fma.rn.f32 %f436, %f433, %f435, %f434; mov.f32 %f437, 0fC0D21907; fma.rn.f32 %f438, %f436, %f433, %f437; mul.f32 %f439, %f433, %f438; mul.f32 %f440, %f432, %f439; add.f32 %f441, %f433, 0f41355DC0; mov.f32 %f442, 0f41E6BD60; fma.rn.f32 %f443, %f441, %f433, %f442; mov.f32 %f444, 0f419D92C8; fma.rn.f32 %f445, %f443, %f433, %f444; rcp.rn.f32 %f446, %f445; fma.rn.f32 %f447, %f440, %f446, %f432; mov.f32 %f448, 0f3FC90FDB; sub.f32 %f449, %f448, %f447; setp.gt.f32 %p105, %f76, %f75; selp.f32 %f450, %f449, %f447, %p105; mov.b32 %r272, %f74; setp.lt.s32 %p106, %r272, 0; mov.f32 %f451, 0f40490FDB; sub.f32 %f452, %f451, %f450; selp.f32 %f453, %f452, %f450, %p106; mov.b32 %r273, %f453; mov.b32 %r274, %f73; and.b32 %r275, %r274, -2147483648; or.b32 %r276, %r275, %r273; mov.b32 %f454, %r276; add.f32 %f455, %f75, %f76; setp.le.f32 %p107, %f455, 0f7F800000; selp.f32 %f865, %f454, %f455, %p107; $L__BB0_94: add.f32 %f456, %f865, %f865; setp.eq.f32 %p109, %f72, 0f00000000; selp.f32 %f457, 0f3F800000, 0f3F000000, %p109; fma.rn.f32 %f868, %f457, %f456, 0f00000000; $L__BB0_95: setp.lt.s32 %p110, %r3, 1; and.pred %p112, %p110, %p13; @%p112 bra $L__BB0_109; setp.ge.s32 %p113, %r49, %r124; and.pred %p115, %p113, %p88; @%p115 bra $L__BB0_109; @%p13 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: max.s32 %r500, %r18, 0; bra.uni $L__BB0_100; $L__BB0_98: rem.s32 %r288, %r18, %r126; 
add.s32 %r289, %r288, %r126; rem.s32 %r500, %r289, %r126; $L__BB0_100: @%p88 bra $L__BB0_102; bra.uni $L__BB0_101; $L__BB0_102: add.s32 %r292, %r124, -1; min.s32 %r501, %r49, %r292; bra.uni $L__BB0_103; $L__BB0_101: rem.s32 %r290, %r49, %r124; add.s32 %r291, %r290, %r124; rem.s32 %r501, %r291, %r124; $L__BB0_103: mad.lo.s32 %r293, %r500, %r125, %r2; mad.lo.s32 %r294, %r293, %r124, %r501; mul.wide.s32 %rd72, %r294, 4; add.s64 %rd73, %rd3, %rd72; add.s64 %rd74, %rd2, %rd72; add.s64 %rd75, %rd1, %rd72; ld.global.nc.f32 %f458, [%rd73]; ld.global.nc.f32 %f459, [%rd74]; mul.f32 %f460, %f459, %f459; fma.rn.f32 %f461, %f458, %f458, %f460; ld.global.nc.f32 %f462, [%rd75]; fma.rn.f32 %f83, %f462, %f462, %f461; mul.f32 %f463, %f65, %f67; mul.f32 %f464, %f64, %f68; sub.f32 %f465, %f464, %f463; mul.f32 %f466, %f63, %f68; mul.f32 %f467, %f65, %f66; sub.f32 %f468, %f467, %f466; mul.f32 %f469, %f64, %f66; mul.f32 %f470, %f63, %f67; sub.f32 %f471, %f470, %f469; mul.f32 %f472, %f2, %f468; fma.rn.f32 %f473, %f1, %f465, %f472; fma.rn.f32 %f84, %f3, %f471, %f473; mul.f32 %f474, %f2, %f64; fma.rn.f32 %f475, %f1, %f63, %f474; fma.rn.f32 %f476, %f3, %f65, %f475; add.f32 %f477, %f476, 0f3F800000; mul.f32 %f478, %f2, %f67; fma.rn.f32 %f479, %f1, %f66, %f478; fma.rn.f32 %f480, %f3, %f68, %f479; add.f32 %f481, %f477, %f480; mul.f32 %f482, %f64, %f67; fma.rn.f32 %f483, %f63, %f66, %f482; fma.rn.f32 %f484, %f65, %f68, %f483; add.f32 %f85, %f484, %f481; abs.f32 %f86, %f85; abs.f32 %f87, %f84; setp.eq.f32 %p118, %f86, 0f00000000; setp.eq.f32 %p119, %f87, 0f00000000; and.pred %p120, %p118, %p119; @%p120 bra $L__BB0_107; bra.uni $L__BB0_104; $L__BB0_107: mov.b32 %r305, %f85; shr.s32 %r306, %r305, 31; and.b32 %r307, %r306, 1078530011; mov.b32 %r308, %f84; and.b32 %r309, %r308, -2147483648; or.b32 %r310, %r309, %r307; mov.b32 %f867, %r310; bra.uni $L__BB0_108; $L__BB0_104: setp.eq.f32 %p121, %f86, 0f7F800000; setp.eq.f32 %p122, %f87, 0f7F800000; and.pred %p123, %p121, %p122; @%p123 bra $L__BB0_106; 
bra.uni $L__BB0_105; $L__BB0_106: mov.b32 %r300, %f85; setp.lt.s32 %p127, %r300, 0; selp.b32 %r301, 1075235812, 1061752795, %p127; mov.b32 %r302, %f84; and.b32 %r303, %r302, -2147483648; or.b32 %r304, %r303, %r301; mov.b32 %f867, %r304; bra.uni $L__BB0_108; $L__BB0_105: max.f32 %f485, %f87, %f86; min.f32 %f486, %f87, %f86; div.rn.f32 %f487, %f486, %f485; mul.rn.f32 %f488, %f487, %f487; mov.f32 %f489, 0fC0B59883; mov.f32 %f490, 0fBF52C7EA; fma.rn.f32 %f491, %f488, %f490, %f489; mov.f32 %f492, 0fC0D21907; fma.rn.f32 %f493, %f491, %f488, %f492; mul.f32 %f494, %f488, %f493; mul.f32 %f495, %f487, %f494; add.f32 %f496, %f488, 0f41355DC0; mov.f32 %f497, 0f41E6BD60; fma.rn.f32 %f498, %f496, %f488, %f497; mov.f32 %f499, 0f419D92C8; fma.rn.f32 %f500, %f498, %f488, %f499; rcp.rn.f32 %f501, %f500; fma.rn.f32 %f502, %f495, %f501, %f487; mov.f32 %f503, 0f3FC90FDB; sub.f32 %f504, %f503, %f502; setp.gt.f32 %p124, %f87, %f86; selp.f32 %f505, %f504, %f502, %p124; mov.b32 %r295, %f85; setp.lt.s32 %p125, %r295, 0; mov.f32 %f506, 0f40490FDB; sub.f32 %f507, %f506, %f505; selp.f32 %f508, %f507, %f505, %p125; mov.b32 %r296, %f508; mov.b32 %r297, %f84; and.b32 %r298, %r297, -2147483648; or.b32 %r299, %r298, %r296; mov.b32 %f509, %r299; add.f32 %f510, %f86, %f87; setp.le.f32 %p126, %f510, 0f7F800000; selp.f32 %f867, %f509, %f510, %p126; $L__BB0_108: add.f32 %f511, %f867, %f867; setp.eq.f32 %p128, %f83, 0f00000000; selp.f32 %f512, 0f3F800000, 0f3F000000, %p128; fma.rn.f32 %f868, %f512, %f511, %f868; $L__BB0_109: ld.param.u8 %rs6, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs5, %rs6, 1; setp.ne.s16 %p129, %rs5, 0; setp.gt.s32 %p130, %r1, 0; or.pred %p3, %p130, %p129; and.pred %p131, %p3, %p2; not.pred %p132, %p131; @%p132 bra $L__BB0_122; @%p13 bra $L__BB0_112; bra.uni $L__BB0_111; $L__BB0_112: max.s32 %r502, %r18, 0; bra.uni $L__BB0_113; $L__BB0_111: rem.s32 %r311, %r18, %r126; add.s32 %r312, %r311, %r126; rem.s32 %r502, %r312, %r126; $L__BB0_113: @%p88 bra $L__BB0_115; bra.uni 
$L__BB0_114; $L__BB0_115: max.s32 %r503, %r56, 0; bra.uni $L__BB0_116; $L__BB0_114: rem.s32 %r313, %r56, %r124; add.s32 %r314, %r313, %r124; rem.s32 %r503, %r314, %r124; $L__BB0_116: mad.lo.s32 %r315, %r502, %r125, %r2; mad.lo.s32 %r316, %r315, %r124, %r503; mul.wide.s32 %rd76, %r316, 4; add.s64 %rd77, %rd3, %rd76; add.s64 %rd78, %rd2, %rd76; add.s64 %rd79, %rd1, %rd76; ld.global.nc.f32 %f513, [%rd77]; ld.global.nc.f32 %f514, [%rd78]; mul.f32 %f515, %f514, %f514; fma.rn.f32 %f516, %f513, %f513, %f515; ld.global.nc.f32 %f517, [%rd79]; fma.rn.f32 %f94, %f517, %f517, %f516; mul.f32 %f518, %f68, %f70; mul.f32 %f519, %f67, %f71; sub.f32 %f520, %f519, %f518; mul.f32 %f521, %f66, %f71; mul.f32 %f522, %f68, %f69; sub.f32 %f523, %f522, %f521; mul.f32 %f524, %f67, %f69; mul.f32 %f525, %f66, %f70; sub.f32 %f526, %f525, %f524; mul.f32 %f527, %f2, %f523; fma.rn.f32 %f528, %f1, %f520, %f527; fma.rn.f32 %f95, %f3, %f526, %f528; mul.f32 %f529, %f2, %f67; fma.rn.f32 %f530, %f1, %f66, %f529; fma.rn.f32 %f531, %f3, %f68, %f530; add.f32 %f532, %f531, 0f3F800000; mul.f32 %f533, %f2, %f70; fma.rn.f32 %f534, %f1, %f69, %f533; fma.rn.f32 %f535, %f3, %f71, %f534; add.f32 %f536, %f532, %f535; mul.f32 %f537, %f67, %f70; fma.rn.f32 %f538, %f66, %f69, %f537; fma.rn.f32 %f539, %f68, %f71, %f538; add.f32 %f96, %f539, %f536; abs.f32 %f97, %f96; abs.f32 %f98, %f95; setp.eq.f32 %p135, %f97, 0f00000000; setp.eq.f32 %p136, %f98, 0f00000000; and.pred %p137, %p135, %p136; @%p137 bra $L__BB0_120; bra.uni $L__BB0_117; $L__BB0_120: mov.b32 %r327, %f96; shr.s32 %r328, %r327, 31; and.b32 %r329, %r328, 1078530011; mov.b32 %r330, %f95; and.b32 %r331, %r330, -2147483648; or.b32 %r332, %r331, %r329; mov.b32 %f869, %r332; bra.uni $L__BB0_121; $L__BB0_117: setp.eq.f32 %p138, %f97, 0f7F800000; setp.eq.f32 %p139, %f98, 0f7F800000; and.pred %p140, %p138, %p139; @%p140 bra $L__BB0_119; bra.uni $L__BB0_118; $L__BB0_119: mov.b32 %r322, %f96; setp.lt.s32 %p144, %r322, 0; selp.b32 %r323, 1075235812, 1061752795, %p144; 
mov.b32 %r324, %f95; and.b32 %r325, %r324, -2147483648; or.b32 %r326, %r325, %r323; mov.b32 %f869, %r326; bra.uni $L__BB0_121; $L__BB0_118: max.f32 %f540, %f98, %f97; min.f32 %f541, %f98, %f97; div.rn.f32 %f542, %f541, %f540; mul.rn.f32 %f543, %f542, %f542; mov.f32 %f544, 0fC0B59883; mov.f32 %f545, 0fBF52C7EA; fma.rn.f32 %f546, %f543, %f545, %f544; mov.f32 %f547, 0fC0D21907; fma.rn.f32 %f548, %f546, %f543, %f547; mul.f32 %f549, %f543, %f548; mul.f32 %f550, %f542, %f549; add.f32 %f551, %f543, 0f41355DC0; mov.f32 %f552, 0f41E6BD60; fma.rn.f32 %f553, %f551, %f543, %f552; mov.f32 %f554, 0f419D92C8; fma.rn.f32 %f555, %f553, %f543, %f554; rcp.rn.f32 %f556, %f555; fma.rn.f32 %f557, %f550, %f556, %f542; mov.f32 %f558, 0f3FC90FDB; sub.f32 %f559, %f558, %f557; setp.gt.f32 %p141, %f98, %f97; selp.f32 %f560, %f559, %f557, %p141; mov.b32 %r317, %f96; setp.lt.s32 %p142, %r317, 0; mov.f32 %f561, 0f40490FDB; sub.f32 %f562, %f561, %f560; selp.f32 %f563, %f562, %f560, %p142; mov.b32 %r318, %f563; mov.b32 %r319, %f95; and.b32 %r320, %r319, -2147483648; or.b32 %r321, %r320, %r318; mov.b32 %f564, %r321; add.f32 %f565, %f97, %f98; setp.le.f32 %p143, %f565, 0f7F800000; selp.f32 %f869, %f564, %f565, %p143; $L__BB0_121: add.f32 %f566, %f869, %f869; setp.eq.f32 %p145, %f94, 0f00000000; selp.f32 %f567, 0f3F800000, 0f3F000000, %p145; fma.rn.f32 %f868, %f567, %f566, %f868; $L__BB0_122: setp.lt.s32 %p146, %r49, %r124; or.pred %p4, %p146, %p129; and.pred %p5, %p4, %p2; not.pred %p148, %p5; @%p148 bra $L__BB0_135; @%p13 bra $L__BB0_125; bra.uni $L__BB0_124; $L__BB0_125: add.s32 %r335, %r126, -1; min.s32 %r504, %r10, %r335; bra.uni $L__BB0_126; $L__BB0_124: rem.s32 %r333, %r10, %r126; add.s32 %r334, %r333, %r126; rem.s32 %r504, %r334, %r126; $L__BB0_126: mad.lo.s32 %r81, %r504, %r125, %r2; @%p88 bra $L__BB0_128; bra.uni $L__BB0_127; $L__BB0_128: max.s32 %r505, %r56, 0; bra.uni $L__BB0_129; $L__BB0_127: rem.s32 %r336, %r56, %r124; add.s32 %r337, %r336, %r124; rem.s32 %r505, %r337, %r124; 
$L__BB0_129: mad.lo.s32 %r338, %r81, %r124, %r505; mul.wide.s32 %rd80, %r338, 4; add.s64 %rd81, %rd3, %rd80; add.s64 %rd82, %rd2, %rd80; add.s64 %rd83, %rd1, %rd80; ld.global.nc.f32 %f568, [%rd81]; ld.global.nc.f32 %f569, [%rd82]; mul.f32 %f570, %f569, %f569; fma.rn.f32 %f571, %f568, %f568, %f570; ld.global.nc.f32 %f572, [%rd83]; fma.rn.f32 %f105, %f572, %f572, %f571; mul.f32 %f573, %f61, %f71; mul.f32 %f574, %f62, %f70; sub.f32 %f575, %f574, %f573; mul.f32 %f576, %f62, %f69; mul.f32 %f577, %f60, %f71; sub.f32 %f578, %f577, %f576; mul.f32 %f579, %f60, %f70; mul.f32 %f580, %f61, %f69; sub.f32 %f581, %f580, %f579; mul.f32 %f582, %f2, %f578; fma.rn.f32 %f583, %f1, %f575, %f582; fma.rn.f32 %f106, %f3, %f581, %f583; mul.f32 %f584, %f2, %f70; fma.rn.f32 %f585, %f1, %f69, %f584; fma.rn.f32 %f586, %f3, %f71, %f585; add.f32 %f587, %f586, 0f3F800000; mul.f32 %f588, %f2, %f61; fma.rn.f32 %f589, %f1, %f60, %f588; fma.rn.f32 %f590, %f3, %f62, %f589; add.f32 %f591, %f590, %f587; mul.f32 %f592, %f61, %f70; fma.rn.f32 %f593, %f60, %f69, %f592; fma.rn.f32 %f594, %f62, %f71, %f593; add.f32 %f107, %f594, %f591; abs.f32 %f108, %f107; abs.f32 %f109, %f106; setp.eq.f32 %p151, %f108, 0f00000000; setp.eq.f32 %p152, %f109, 0f00000000; and.pred %p153, %p151, %p152; @%p153 bra $L__BB0_133; bra.uni $L__BB0_130; $L__BB0_133: mov.b32 %r349, %f107; shr.s32 %r350, %r349, 31; and.b32 %r351, %r350, 1078530011; mov.b32 %r352, %f106; and.b32 %r353, %r352, -2147483648; or.b32 %r354, %r351, %r353; mov.b32 %f871, %r354; bra.uni $L__BB0_134; $L__BB0_130: setp.eq.f32 %p154, %f108, 0f7F800000; setp.eq.f32 %p155, %f109, 0f7F800000; and.pred %p156, %p154, %p155; @%p156 bra $L__BB0_132; bra.uni $L__BB0_131; $L__BB0_132: mov.b32 %r344, %f107; setp.lt.s32 %p160, %r344, 0; selp.b32 %r345, 1075235812, 1061752795, %p160; mov.b32 %r346, %f106; and.b32 %r347, %r346, -2147483648; or.b32 %r348, %r345, %r347; mov.b32 %f871, %r348; bra.uni $L__BB0_134; $L__BB0_131: max.f32 %f595, %f109, %f108; min.f32 %f596, %f109, 
%f108; div.rn.f32 %f597, %f596, %f595; mul.rn.f32 %f598, %f597, %f597; mov.f32 %f599, 0fC0B59883; mov.f32 %f600, 0fBF52C7EA; fma.rn.f32 %f601, %f598, %f600, %f599; mov.f32 %f602, 0fC0D21907; fma.rn.f32 %f603, %f601, %f598, %f602; mul.f32 %f604, %f598, %f603; mul.f32 %f605, %f597, %f604; add.f32 %f606, %f598, 0f41355DC0; mov.f32 %f607, 0f41E6BD60; fma.rn.f32 %f608, %f606, %f598, %f607; mov.f32 %f609, 0f419D92C8; fma.rn.f32 %f610, %f608, %f598, %f609; rcp.rn.f32 %f611, %f610; fma.rn.f32 %f612, %f605, %f611, %f597; mov.f32 %f613, 0f3FC90FDB; sub.f32 %f614, %f613, %f612; setp.gt.f32 %p157, %f109, %f108; selp.f32 %f615, %f614, %f612, %p157; mov.b32 %r339, %f107; setp.lt.s32 %p158, %r339, 0; mov.f32 %f616, 0f40490FDB; sub.f32 %f617, %f616, %f615; selp.f32 %f618, %f617, %f615, %p158; mov.b32 %r340, %f618; mov.b32 %r341, %f106; and.b32 %r342, %r341, -2147483648; or.b32 %r343, %r342, %r340; mov.b32 %f619, %r343; add.f32 %f620, %f108, %f109; setp.le.f32 %p159, %f620, 0f7F800000; selp.f32 %f871, %f619, %f620, %p159; $L__BB0_134: add.f32 %f621, %f871, %f871; setp.eq.f32 %p161, %f105, 0f00000000; selp.f32 %f622, 0f3F800000, 0f3F000000, %p161; fma.rn.f32 %f868, %f622, %f621, %f868; $L__BB0_135: @%p88 bra $L__BB0_137; bra.uni $L__BB0_136; $L__BB0_137: add.s32 %r357, %r124, -1; min.s32 %r506, %r49, %r357; bra.uni $L__BB0_138; $L__BB0_136: rem.s32 %r355, %r49, %r124; add.s32 %r356, %r355, %r124; rem.s32 %r506, %r356, %r124; $L__BB0_138: @%p12 bra $L__BB0_140; bra.uni $L__BB0_139; $L__BB0_140: add.s32 %r360, %r125, -1; min.s32 %r507, %r6, %r360; bra.uni $L__BB0_141; $L__BB0_139: rem.s32 %r358, %r6, %r125; add.s32 %r359, %r358, %r125; rem.s32 %r507, %r359, %r125; $L__BB0_141: @%p88 bra $L__BB0_143; bra.uni $L__BB0_142; $L__BB0_143: max.s32 %r508, %r56, 0; bra.uni $L__BB0_144; $L__BB0_142: rem.s32 %r361, %r56, %r124; add.s32 %r362, %r361, %r124; rem.s32 %r508, %r362, %r124; $L__BB0_144: add.s32 %r94, %r508, %r5; add.s32 %r95, %r506, %r5; @%p12 bra $L__BB0_146; bra.uni $L__BB0_145; 
$L__BB0_146: max.s32 %r509, %r14, 0; bra.uni $L__BB0_147; $L__BB0_145: rem.s32 %r363, %r14, %r125; add.s32 %r364, %r363, %r125; rem.s32 %r509, %r364, %r125; $L__BB0_147: add.s32 %r365, %r509, %r4; mad.lo.s32 %r366, %r365, %r124, %r1; mul.wide.s32 %rd84, %r95, 4; add.s64 %rd85, %rd3, %rd84; ld.global.nc.f32 %f116, [%rd85]; add.s64 %rd86, %rd2, %rd84; ld.global.nc.f32 %f117, [%rd86]; add.s64 %rd87, %rd1, %rd84; ld.global.nc.f32 %f118, [%rd87]; add.s32 %r367, %r507, %r4; mad.lo.s32 %r368, %r367, %r124, %r1; mul.wide.s32 %rd88, %r368, 4; add.s64 %rd89, %rd3, %rd88; ld.global.nc.f32 %f119, [%rd89]; add.s64 %rd90, %rd2, %rd88; ld.global.nc.f32 %f120, [%rd90]; add.s64 %rd91, %rd1, %rd88; ld.global.nc.f32 %f121, [%rd91]; mul.wide.s32 %rd92, %r94, 4; add.s64 %rd93, %rd3, %rd92; ld.global.nc.f32 %f122, [%rd93]; add.s64 %rd94, %rd2, %rd92; ld.global.nc.f32 %f123, [%rd94]; add.s64 %rd95, %rd1, %rd92; ld.global.nc.f32 %f124, [%rd95]; mul.wide.s32 %rd96, %r366, 4; add.s64 %rd97, %rd3, %rd96; ld.global.nc.f32 %f125, [%rd97]; add.s64 %rd98, %rd2, %rd96; ld.global.nc.f32 %f126, [%rd98]; add.s64 %rd99, %rd1, %rd96; ld.global.nc.f32 %f127, [%rd99]; and.pred %p166, %p4, %p1; mov.f32 %f878, 0f00000000; not.pred %p167, %p166; @%p167 bra $L__BB0_160; @%p12 bra $L__BB0_150; bra.uni $L__BB0_149; $L__BB0_150: add.s32 %r371, %r125, -1; min.s32 %r510, %r6, %r371; bra.uni $L__BB0_151; $L__BB0_149: rem.s32 %r369, %r6, %r125; add.s32 %r370, %r369, %r125; rem.s32 %r510, %r370, %r125; $L__BB0_151: @%p88 bra $L__BB0_153; bra.uni $L__BB0_152; $L__BB0_153: add.s32 %r374, %r124, -1; min.s32 %r511, %r49, %r374; bra.uni $L__BB0_154; $L__BB0_152: rem.s32 %r372, %r49, %r124; add.s32 %r373, %r372, %r124; rem.s32 %r511, %r373, %r124; $L__BB0_154: add.s32 %r375, %r510, %r4; mad.lo.s32 %r376, %r375, %r124, %r511; mul.wide.s32 %rd100, %r376, 4; add.s64 %rd101, %rd3, %rd100; add.s64 %rd102, %rd2, %rd100; add.s64 %rd103, %rd1, %rd100; ld.global.nc.f32 %f624, [%rd101]; ld.global.nc.f32 %f625, [%rd102]; mul.f32 
%f626, %f625, %f625; fma.rn.f32 %f627, %f624, %f624, %f626; ld.global.nc.f32 %f628, [%rd103]; fma.rn.f32 %f128, %f628, %f628, %f627; mul.f32 %f629, %f118, %f120; mul.f32 %f630, %f117, %f121; sub.f32 %f631, %f630, %f629; mul.f32 %f632, %f116, %f121; mul.f32 %f633, %f118, %f119; sub.f32 %f634, %f633, %f632; mul.f32 %f635, %f117, %f119; mul.f32 %f636, %f116, %f120; sub.f32 %f637, %f636, %f635; mul.f32 %f638, %f2, %f634; fma.rn.f32 %f639, %f1, %f631, %f638; fma.rn.f32 %f129, %f3, %f637, %f639; mul.f32 %f640, %f2, %f117; fma.rn.f32 %f641, %f1, %f116, %f640; fma.rn.f32 %f642, %f3, %f118, %f641; add.f32 %f643, %f642, 0f3F800000; mul.f32 %f644, %f2, %f120; fma.rn.f32 %f645, %f1, %f119, %f644; fma.rn.f32 %f646, %f3, %f121, %f645; add.f32 %f647, %f643, %f646; mul.f32 %f648, %f117, %f120; fma.rn.f32 %f649, %f116, %f119, %f648; fma.rn.f32 %f650, %f118, %f121, %f649; add.f32 %f130, %f650, %f647; abs.f32 %f131, %f130; abs.f32 %f132, %f129; setp.eq.f32 %p170, %f131, 0f00000000; setp.eq.f32 %p171, %f132, 0f00000000; and.pred %p172, %p170, %p171; @%p172 bra $L__BB0_158; bra.uni $L__BB0_155; $L__BB0_158: mov.b32 %r387, %f130; shr.s32 %r388, %r387, 31; and.b32 %r389, %r388, 1078530011; mov.b32 %r390, %f129; and.b32 %r391, %r390, -2147483648; or.b32 %r392, %r391, %r389; mov.b32 %f873, %r392; bra.uni $L__BB0_159; $L__BB0_155: setp.eq.f32 %p173, %f131, 0f7F800000; setp.eq.f32 %p174, %f132, 0f7F800000; and.pred %p175, %p173, %p174; @%p175 bra $L__BB0_157; bra.uni $L__BB0_156; $L__BB0_157: mov.b32 %r382, %f130; setp.lt.s32 %p179, %r382, 0; selp.b32 %r383, 1075235812, 1061752795, %p179; mov.b32 %r384, %f129; and.b32 %r385, %r384, -2147483648; or.b32 %r386, %r385, %r383; mov.b32 %f873, %r386; bra.uni $L__BB0_159; $L__BB0_156: max.f32 %f651, %f132, %f131; min.f32 %f652, %f132, %f131; div.rn.f32 %f653, %f652, %f651; mul.rn.f32 %f654, %f653, %f653; mov.f32 %f655, 0fC0B59883; mov.f32 %f656, 0fBF52C7EA; fma.rn.f32 %f657, %f654, %f656, %f655; mov.f32 %f658, 0fC0D21907; fma.rn.f32 %f659, %f657, 
%f654, %f658; mul.f32 %f660, %f654, %f659; mul.f32 %f661, %f653, %f660; add.f32 %f662, %f654, 0f41355DC0; mov.f32 %f663, 0f41E6BD60; fma.rn.f32 %f664, %f662, %f654, %f663; mov.f32 %f665, 0f419D92C8; fma.rn.f32 %f666, %f664, %f654, %f665; rcp.rn.f32 %f667, %f666; fma.rn.f32 %f668, %f661, %f667, %f653; mov.f32 %f669, 0f3FC90FDB; sub.f32 %f670, %f669, %f668; setp.gt.f32 %p176, %f132, %f131; selp.f32 %f671, %f670, %f668, %p176; mov.b32 %r377, %f130; setp.lt.s32 %p177, %r377, 0; mov.f32 %f672, 0f40490FDB; sub.f32 %f673, %f672, %f671; selp.f32 %f674, %f673, %f671, %p177; mov.b32 %r378, %f674; mov.b32 %r379, %f129; and.b32 %r380, %r379, -2147483648; or.b32 %r381, %r380, %r378; mov.b32 %f675, %r381; add.f32 %f676, %f131, %f132; setp.le.f32 %p178, %f676, 0f7F800000; selp.f32 %f873, %f675, %f676, %p178; $L__BB0_159: add.f32 %f677, %f873, %f873; setp.eq.f32 %p180, %f128, 0f00000000; selp.f32 %f678, 0f3F800000, 0f3F000000, %p180; fma.rn.f32 %f878, %f678, %f677, 0f00000000; $L__BB0_160: not.pred %p181, %p3; @%p181 bra $L__BB0_187; @%p18 bra $L__BB0_174; @%p12 bra $L__BB0_164; bra.uni $L__BB0_163; $L__BB0_164: add.s32 %r395, %r125, -1; min.s32 %r512, %r6, %r395; bra.uni $L__BB0_165; $L__BB0_163: rem.s32 %r393, %r6, %r125; add.s32 %r394, %r393, %r125; rem.s32 %r512, %r394, %r125; $L__BB0_165: @%p88 bra $L__BB0_167; bra.uni $L__BB0_166; $L__BB0_167: max.s32 %r513, %r56, 0; bra.uni $L__BB0_168; $L__BB0_166: rem.s32 %r396, %r56, %r124; add.s32 %r397, %r396, %r124; rem.s32 %r513, %r397, %r124; $L__BB0_168: add.s32 %r398, %r512, %r4; mad.lo.s32 %r399, %r398, %r124, %r513; mul.wide.s32 %rd104, %r399, 4; add.s64 %rd105, %rd3, %rd104; add.s64 %rd106, %rd2, %rd104; add.s64 %rd107, %rd1, %rd104; ld.global.nc.f32 %f679, [%rd105]; ld.global.nc.f32 %f680, [%rd106]; mul.f32 %f681, %f680, %f680; fma.rn.f32 %f682, %f679, %f679, %f681; ld.global.nc.f32 %f683, [%rd107]; fma.rn.f32 %f139, %f683, %f683, %f682; mul.f32 %f684, %f121, %f123; mul.f32 %f685, %f120, %f124; sub.f32 %f686, %f685, %f684; 
mul.f32 %f687, %f119, %f124; mul.f32 %f688, %f121, %f122; sub.f32 %f689, %f688, %f687; mul.f32 %f690, %f120, %f122; mul.f32 %f691, %f119, %f123; sub.f32 %f692, %f691, %f690; mul.f32 %f693, %f2, %f689; fma.rn.f32 %f694, %f1, %f686, %f693; fma.rn.f32 %f140, %f3, %f692, %f694; mul.f32 %f695, %f2, %f120; fma.rn.f32 %f696, %f1, %f119, %f695; fma.rn.f32 %f697, %f3, %f121, %f696; add.f32 %f698, %f697, 0f3F800000; mul.f32 %f699, %f2, %f123; fma.rn.f32 %f700, %f1, %f122, %f699; fma.rn.f32 %f701, %f3, %f124, %f700; add.f32 %f702, %f698, %f701; mul.f32 %f703, %f120, %f123; fma.rn.f32 %f704, %f119, %f122, %f703; fma.rn.f32 %f705, %f121, %f124, %f704; add.f32 %f141, %f705, %f702; abs.f32 %f142, %f141; abs.f32 %f143, %f140; setp.eq.f32 %p185, %f142, 0f00000000; setp.eq.f32 %p186, %f143, 0f00000000; and.pred %p187, %p185, %p186; @%p187 bra $L__BB0_172; bra.uni $L__BB0_169; $L__BB0_172: mov.b32 %r410, %f141; shr.s32 %r411, %r410, 31; and.b32 %r412, %r411, 1078530011; mov.b32 %r413, %f140; and.b32 %r414, %r413, -2147483648; or.b32 %r415, %r414, %r412; mov.b32 %f875, %r415; bra.uni $L__BB0_173; $L__BB0_169: setp.eq.f32 %p188, %f142, 0f7F800000; setp.eq.f32 %p189, %f143, 0f7F800000; and.pred %p190, %p188, %p189; @%p190 bra $L__BB0_171; bra.uni $L__BB0_170; $L__BB0_171: mov.b32 %r405, %f141; setp.lt.s32 %p194, %r405, 0; selp.b32 %r406, 1075235812, 1061752795, %p194; mov.b32 %r407, %f140; and.b32 %r408, %r407, -2147483648; or.b32 %r409, %r408, %r406; mov.b32 %f875, %r409; bra.uni $L__BB0_173; $L__BB0_170: max.f32 %f706, %f143, %f142; min.f32 %f707, %f143, %f142; div.rn.f32 %f708, %f707, %f706; mul.rn.f32 %f709, %f708, %f708; mov.f32 %f710, 0fC0B59883; mov.f32 %f711, 0fBF52C7EA; fma.rn.f32 %f712, %f709, %f711, %f710; mov.f32 %f713, 0fC0D21907; fma.rn.f32 %f714, %f712, %f709, %f713; mul.f32 %f715, %f709, %f714; mul.f32 %f716, %f708, %f715; add.f32 %f717, %f709, 0f41355DC0; mov.f32 %f718, 0f41E6BD60; fma.rn.f32 %f719, %f717, %f709, %f718; mov.f32 %f720, 0f419D92C8; fma.rn.f32 %f721, 
%f719, %f709, %f720; rcp.rn.f32 %f722, %f721; fma.rn.f32 %f723, %f716, %f722, %f708; mov.f32 %f724, 0f3FC90FDB; sub.f32 %f725, %f724, %f723; setp.gt.f32 %p191, %f143, %f142; selp.f32 %f726, %f725, %f723, %p191; mov.b32 %r400, %f141; setp.lt.s32 %p192, %r400, 0; mov.f32 %f727, 0f40490FDB; sub.f32 %f728, %f727, %f726; selp.f32 %f729, %f728, %f726, %p192; mov.b32 %r401, %f729; mov.b32 %r402, %f140; and.b32 %r403, %r402, -2147483648; or.b32 %r404, %r403, %r401; mov.b32 %f730, %r404; add.f32 %f731, %f142, %f143; setp.le.f32 %p193, %f731, 0f7F800000; selp.f32 %f875, %f730, %f731, %p193; $L__BB0_173: add.f32 %f732, %f875, %f875; setp.eq.f32 %p195, %f139, 0f00000000; selp.f32 %f733, 0f3F800000, 0f3F000000, %p195; fma.rn.f32 %f878, %f733, %f732, %f878; $L__BB0_174: not.pred %p224, %p2; @%p224 bra $L__BB0_187; @%p12 bra $L__BB0_177; bra.uni $L__BB0_176; $L__BB0_177: max.s32 %r514, %r14, 0; bra.uni $L__BB0_178; $L__BB0_176: rem.s32 %r416, %r14, %r125; add.s32 %r417, %r416, %r125; rem.s32 %r514, %r417, %r125; $L__BB0_178: @%p88 bra $L__BB0_180; bra.uni $L__BB0_179; $L__BB0_180: max.s32 %r515, %r56, 0; bra.uni $L__BB0_181; $L__BB0_179: rem.s32 %r418, %r56, %r124; add.s32 %r419, %r418, %r124; rem.s32 %r515, %r419, %r124; $L__BB0_181: add.s32 %r420, %r514, %r4; mad.lo.s32 %r421, %r420, %r124, %r515; mul.wide.s32 %rd108, %r421, 4; add.s64 %rd109, %rd3, %rd108; add.s64 %rd110, %rd2, %rd108; add.s64 %rd111, %rd1, %rd108; ld.global.nc.f32 %f734, [%rd109]; ld.global.nc.f32 %f735, [%rd110]; mul.f32 %f736, %f735, %f735; fma.rn.f32 %f737, %f734, %f734, %f736; ld.global.nc.f32 %f738, [%rd111]; fma.rn.f32 %f150, %f738, %f738, %f737; mul.f32 %f739, %f124, %f126; mul.f32 %f740, %f123, %f127; sub.f32 %f741, %f740, %f739; mul.f32 %f742, %f122, %f127; mul.f32 %f743, %f124, %f125; sub.f32 %f744, %f743, %f742; mul.f32 %f745, %f123, %f125; mul.f32 %f746, %f122, %f126; sub.f32 %f747, %f746, %f745; mul.f32 %f748, %f2, %f744; fma.rn.f32 %f749, %f1, %f741, %f748; fma.rn.f32 %f151, %f3, %f747, %f749; 
mul.f32 %f750, %f2, %f123; fma.rn.f32 %f751, %f1, %f122, %f750; fma.rn.f32 %f752, %f3, %f124, %f751; add.f32 %f753, %f752, 0f3F800000; mul.f32 %f754, %f2, %f126; fma.rn.f32 %f755, %f1, %f125, %f754; fma.rn.f32 %f756, %f3, %f127, %f755; add.f32 %f757, %f753, %f756; mul.f32 %f758, %f123, %f126; fma.rn.f32 %f759, %f122, %f125, %f758; fma.rn.f32 %f760, %f124, %f127, %f759; add.f32 %f152, %f760, %f757; abs.f32 %f153, %f152; abs.f32 %f154, %f151; setp.eq.f32 %p199, %f153, 0f00000000; setp.eq.f32 %p200, %f154, 0f00000000; and.pred %p201, %p199, %p200; @%p201 bra $L__BB0_185; bra.uni $L__BB0_182; $L__BB0_185: mov.b32 %r432, %f152; shr.s32 %r433, %r432, 31; and.b32 %r434, %r433, 1078530011; mov.b32 %r435, %f151; and.b32 %r436, %r435, -2147483648; or.b32 %r437, %r436, %r434; mov.b32 %f877, %r437; bra.uni $L__BB0_186; $L__BB0_182: setp.eq.f32 %p202, %f153, 0f7F800000; setp.eq.f32 %p203, %f154, 0f7F800000; and.pred %p204, %p202, %p203; @%p204 bra $L__BB0_184; bra.uni $L__BB0_183; $L__BB0_184: mov.b32 %r427, %f152; setp.lt.s32 %p208, %r427, 0; selp.b32 %r428, 1075235812, 1061752795, %p208; mov.b32 %r429, %f151; and.b32 %r430, %r429, -2147483648; or.b32 %r431, %r430, %r428; mov.b32 %f877, %r431; bra.uni $L__BB0_186; $L__BB0_183: max.f32 %f761, %f154, %f153; min.f32 %f762, %f154, %f153; div.rn.f32 %f763, %f762, %f761; mul.rn.f32 %f764, %f763, %f763; mov.f32 %f765, 0fC0B59883; mov.f32 %f766, 0fBF52C7EA; fma.rn.f32 %f767, %f764, %f766, %f765; mov.f32 %f768, 0fC0D21907; fma.rn.f32 %f769, %f767, %f764, %f768; mul.f32 %f770, %f764, %f769; mul.f32 %f771, %f763, %f770; add.f32 %f772, %f764, 0f41355DC0; mov.f32 %f773, 0f41E6BD60; fma.rn.f32 %f774, %f772, %f764, %f773; mov.f32 %f775, 0f419D92C8; fma.rn.f32 %f776, %f774, %f764, %f775; rcp.rn.f32 %f777, %f776; fma.rn.f32 %f778, %f771, %f777, %f763; mov.f32 %f779, 0f3FC90FDB; sub.f32 %f780, %f779, %f778; setp.gt.f32 %p205, %f154, %f153; selp.f32 %f781, %f780, %f778, %p205; mov.b32 %r422, %f152; setp.lt.s32 %p206, %r422, 0; mov.f32 %f782, 
0f40490FDB; sub.f32 %f783, %f782, %f781; selp.f32 %f784, %f783, %f781, %p206; mov.b32 %r423, %f784; mov.b32 %r424, %f151; and.b32 %r425, %r424, -2147483648; or.b32 %r426, %r425, %r423; mov.b32 %f785, %r426; add.f32 %f786, %f153, %f154; setp.le.f32 %p207, %f786, 0f7F800000; selp.f32 %f877, %f785, %f786, %p207; $L__BB0_186: add.f32 %f787, %f877, %f877; setp.eq.f32 %p209, %f150, 0f00000000; selp.f32 %f788, 0f3F800000, 0f3F000000, %p209; fma.rn.f32 %f878, %f788, %f787, %f878; $L__BB0_187: @%p148 bra $L__BB0_200; @%p12 bra $L__BB0_190; bra.uni $L__BB0_189; $L__BB0_190: max.s32 %r516, %r14, 0; bra.uni $L__BB0_191; $L__BB0_189: rem.s32 %r438, %r14, %r125; add.s32 %r439, %r438, %r125; rem.s32 %r516, %r439, %r125; $L__BB0_191: add.s32 %r120, %r516, %r4; @%p88 bra $L__BB0_193; bra.uni $L__BB0_192; $L__BB0_193: add.s32 %r442, %r124, -1; min.s32 %r517, %r49, %r442; bra.uni $L__BB0_194; $L__BB0_192: rem.s32 %r440, %r49, %r124; add.s32 %r441, %r440, %r124; rem.s32 %r517, %r441, %r124; $L__BB0_194: mad.lo.s32 %r443, %r120, %r124, %r517; mul.wide.s32 %rd112, %r443, 4; add.s64 %rd113, %rd3, %rd112; add.s64 %rd114, %rd2, %rd112; add.s64 %rd115, %rd1, %rd112; ld.global.nc.f32 %f789, [%rd113]; ld.global.nc.f32 %f790, [%rd114]; mul.f32 %f791, %f790, %f790; fma.rn.f32 %f792, %f789, %f789, %f791; ld.global.nc.f32 %f793, [%rd115]; fma.rn.f32 %f161, %f793, %f793, %f792; mul.f32 %f794, %f117, %f127; mul.f32 %f795, %f118, %f126; sub.f32 %f796, %f795, %f794; mul.f32 %f797, %f118, %f125; mul.f32 %f798, %f116, %f127; sub.f32 %f799, %f798, %f797; mul.f32 %f800, %f116, %f126; mul.f32 %f801, %f117, %f125; sub.f32 %f802, %f801, %f800; mul.f32 %f803, %f2, %f799; fma.rn.f32 %f804, %f1, %f796, %f803; fma.rn.f32 %f162, %f3, %f802, %f804; mul.f32 %f805, %f2, %f126; fma.rn.f32 %f806, %f1, %f125, %f805; fma.rn.f32 %f807, %f3, %f127, %f806; add.f32 %f808, %f807, 0f3F800000; mul.f32 %f809, %f2, %f117; fma.rn.f32 %f810, %f1, %f116, %f809; fma.rn.f32 %f811, %f3, %f118, %f810; add.f32 %f812, %f811, %f808; 
mul.f32 %f813, %f117, %f126; fma.rn.f32 %f814, %f116, %f125, %f813; fma.rn.f32 %f815, %f118, %f127, %f814; add.f32 %f163, %f815, %f812; abs.f32 %f164, %f163; abs.f32 %f165, %f162; setp.eq.f32 %p213, %f164, 0f00000000; setp.eq.f32 %p214, %f165, 0f00000000; and.pred %p215, %p213, %p214; @%p215 bra $L__BB0_198; bra.uni $L__BB0_195; $L__BB0_198: mov.b32 %r454, %f163; shr.s32 %r455, %r454, 31; and.b32 %r456, %r455, 1078530011; mov.b32 %r457, %f162; and.b32 %r458, %r457, -2147483648; or.b32 %r459, %r456, %r458; mov.b32 %f879, %r459; bra.uni $L__BB0_199; $L__BB0_195: setp.eq.f32 %p216, %f164, 0f7F800000; setp.eq.f32 %p217, %f165, 0f7F800000; and.pred %p218, %p216, %p217; @%p218 bra $L__BB0_197; bra.uni $L__BB0_196; $L__BB0_197: mov.b32 %r449, %f163; setp.lt.s32 %p222, %r449, 0; selp.b32 %r450, 1075235812, 1061752795, %p222; mov.b32 %r451, %f162; and.b32 %r452, %r451, -2147483648; or.b32 %r453, %r450, %r452; mov.b32 %f879, %r453; bra.uni $L__BB0_199; $L__BB0_196: max.f32 %f816, %f165, %f164; min.f32 %f817, %f165, %f164; div.rn.f32 %f818, %f817, %f816; mul.rn.f32 %f819, %f818, %f818; mov.f32 %f820, 0fC0B59883; mov.f32 %f821, 0fBF52C7EA; fma.rn.f32 %f822, %f819, %f821, %f820; mov.f32 %f823, 0fC0D21907; fma.rn.f32 %f824, %f822, %f819, %f823; mul.f32 %f825, %f819, %f824; mul.f32 %f826, %f818, %f825; add.f32 %f827, %f819, 0f41355DC0; mov.f32 %f828, 0f41E6BD60; fma.rn.f32 %f829, %f827, %f819, %f828; mov.f32 %f830, 0f419D92C8; fma.rn.f32 %f831, %f829, %f819, %f830; rcp.rn.f32 %f832, %f831; fma.rn.f32 %f833, %f826, %f832, %f818; mov.f32 %f834, 0f3FC90FDB; sub.f32 %f835, %f834, %f833; setp.gt.f32 %p219, %f165, %f164; selp.f32 %f836, %f835, %f833, %p219; mov.b32 %r444, %f163; setp.lt.s32 %p220, %r444, 0; mov.f32 %f837, 0f40490FDB; sub.f32 %f838, %f837, %f836; selp.f32 %f839, %f838, %f836, %p220; mov.b32 %r445, %f839; mov.b32 %r446, %f162; and.b32 %r447, %r446, -2147483648; or.b32 %r448, %r447, %r445; mov.b32 %f840, %r448; add.f32 %f841, %f164, %f165; setp.le.f32 %p221, %f841, 
0f7F800000; selp.f32 %f879, %f840, %f841, %p221; $L__BB0_199: add.f32 %f842, %f879, %f879; setp.eq.f32 %p223, %f161, 0f00000000; selp.f32 %f843, 0f3F800000, 0f3F000000, %p223; fma.rn.f32 %f878, %f843, %f842, %f878; $L__BB0_200: ld.param.u64 %rd123, [setemergentmagneticfieldsolidangle_param_1]; mov.u32 %r481, %tid.x; mov.u32 %r480, %ntid.x; mov.u32 %r479, %ctaid.x; mad.lo.s32 %r478, %r479, %r480, %r481; add.s32 %r477, %r5, %r478; mul.wide.s32 %rd122, %r477, 4; cvta.to.global.u64 %rd121, %rd123; add.s64 %rd120, %rd121, %rd122; ld.param.f32 %f856, [setemergentmagneticfieldsolidangle_param_9]; ld.param.f32 %f855, [setemergentmagneticfieldsolidangle_param_8]; ld.param.u64 %rd119, [setemergentmagneticfieldsolidangle_param_0]; mov.u32 %r476, %tid.z; mov.u32 %r475, %ntid.z; mov.u32 %r474, %ctaid.z; mad.lo.s32 %r473, %r474, %r475, %r476; mov.u32 %r472, %tid.y; mov.u32 %r471, %ntid.y; mov.u32 %r470, %ctaid.y; mad.lo.s32 %r469, %r470, %r471, %r472; mul.lo.s32 %r468, %r473, %r125; add.s32 %r467, %r468, %r469; mov.u32 %r466, %tid.x; mov.u32 %r465, %ntid.x; mov.u32 %r464, %ctaid.x; mad.lo.s32 %r463, %r464, %r465, %r466; mul.lo.s32 %r462, %r467, %r124; add.s32 %r461, %r462, %r463; mul.wide.s32 %rd118, %r461, 4; cvta.to.global.u64 %rd117, %rd119; add.s64 %rd116, %rd117, %rd118; ld.param.f32 %f854, [setemergentmagneticfieldsolidangle_param_7]; ld.param.f32 %f853, [setemergentmagneticfieldsolidangle_param_6]; add.f32 %f844, %f853, %f853; mul.f32 %f845, %f844, %f854; mul.f32 %f846, %f845, %f862; st.global.f32 [%rd116], %f846; mul.f32 %f847, %f844, %f855; mul.f32 %f848, %f847, %f868; st.global.f32 [%rd120], %f848; mul.f32 %f849, %f844, %f856; mul.f32 %f850, %f849, %f878; st.global.f32 [%rd6], %f850; $L__BB0_202: ret; } ` setemergentmagneticfieldsolidangle_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl setemergentmagneticfieldsolidangle .visible .entry setemergentmagneticfieldsolidangle( .param .u64 setemergentmagneticfieldsolidangle_param_0, .param .u64 
setemergentmagneticfieldsolidangle_param_1, .param .u64 setemergentmagneticfieldsolidangle_param_2, .param .u64 setemergentmagneticfieldsolidangle_param_3, .param .u64 setemergentmagneticfieldsolidangle_param_4, .param .u64 setemergentmagneticfieldsolidangle_param_5, .param .f32 setemergentmagneticfieldsolidangle_param_6, .param .f32 setemergentmagneticfieldsolidangle_param_7, .param .f32 setemergentmagneticfieldsolidangle_param_8, .param .f32 setemergentmagneticfieldsolidangle_param_9, .param .u32 setemergentmagneticfieldsolidangle_param_10, .param .u32 setemergentmagneticfieldsolidangle_param_11, .param .u32 setemergentmagneticfieldsolidangle_param_12, .param .u8 setemergentmagneticfieldsolidangle_param_13 ) { .reg .pred %p<225>; .reg .b16 %rs<8>; .reg .f32 %f<881>; .reg .b32 %r<518>; .reg .b64 %rd<124>; ld.param.u8 %rs4, [setemergentmagneticfieldsolidangle_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldsolidangle_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldsolidangle_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldsolidangle_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldsolidangle_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldsolidangle_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldsolidangle_param_5]; ld.param.u32 %r124, [setemergentmagneticfieldsolidangle_param_10]; ld.param.u32 %r125, [setemergentmagneticfieldsolidangle_param_11]; ld.param.u32 %r126, [setemergentmagneticfieldsolidangle_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r127, %ntid.x; mov.u32 %r128, %ctaid.x; mov.u32 %r129, %tid.x; mad.lo.s32 %r1, %r128, %r127, %r129; mov.u32 %r130, %ntid.y; mov.u32 %r131, %ctaid.y; mov.u32 %r132, %tid.y; mad.lo.s32 %r2, %r131, %r130, %r132; mov.u32 %r133, %ntid.z; mov.u32 %r134, %ctaid.z; mov.u32 %r135, %tid.z; mad.lo.s32 %r3, %r134, %r133, %r135; setp.ge.s32 %p6, %r1, %r124; setp.ge.s32 %p7, %r2, %r125; or.pred %p8, %p6, %p7; setp.ge.s32 %p9, 
%r3, %r126; or.pred %p10, %p8, %p9; @%p10 bra $L__BB0_202; mul.lo.s32 %r4, %r3, %r125; add.s32 %r136, %r4, %r2; mul.lo.s32 %r5, %r136, %r124; add.s32 %r137, %r5, %r1; mul.wide.s32 %rd13, %r137, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f176, %f2, %f2; fma.rn.f32 %f177, %f1, %f1, %f176; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f178, %f3, %f3, %f177; setp.eq.f32 %p11, %f178, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p11 bra $L__BB0_201; bra.uni $L__BB0_2; $L__BB0_201: mov.u32 %r460, 0; st.global.u32 [%rd4], %r460; st.global.u32 [%rd5], %r460; st.global.u32 [%rd6], %r460; bra.uni $L__BB0_202; $L__BB0_2: and.b16 %rs1, %rs4, 2; setp.eq.s16 %p12, %rs1, 0; add.s32 %r6, %r2, 1; @%p12 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r140, %r125, -1; min.s32 %r482, %r6, %r140; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r138, %r6, %r125; add.s32 %r139, %r138, %r125; rem.s32 %r482, %r139, %r125; $L__BB0_5: and.b16 %rs2, %rs4, 4; setp.eq.s16 %p13, %rs2, 0; add.s32 %r10, %r3, 1; @%p13 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r143, %r126, -1; min.s32 %r483, %r10, %r143; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r141, %r10, %r126; add.s32 %r142, %r141, %r126; rem.s32 %r483, %r142, %r126; $L__BB0_8: add.s32 %r14, %r2, -1; @%p12 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r484, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r144, %r14, %r125; add.s32 %r145, %r144, %r125; rem.s32 %r484, %r145, %r125; $L__BB0_11: add.s32 %r18, %r3, -1; @%p13 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r485, %r18, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r146, %r18, %r126; add.s32 %r147, %r146, %r126; rem.s32 %r485, %r147, %r126; $L__BB0_14: mad.lo.s32 %r148, %r485, %r125, %r2; mad.lo.s32 %r149, 
%r148, %r124, %r1; add.s32 %r150, %r482, %r4; mad.lo.s32 %r151, %r150, %r124, %r1; mul.wide.s32 %rd20, %r151, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f4, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f5, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f6, [%rd23]; mad.lo.s32 %r152, %r483, %r125, %r2; mad.lo.s32 %r153, %r152, %r124, %r1; mul.wide.s32 %rd24, %r153, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f7, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f8, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f9, [%rd27]; add.s32 %r154, %r484, %r4; mad.lo.s32 %r155, %r154, %r124, %r1; mul.wide.s32 %rd28, %r155, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f10, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f12, [%rd31]; mul.wide.s32 %rd32, %r149, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f13, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f14, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f15, [%rd35]; setp.ne.s16 %p16, %rs1, 0; setp.lt.s32 %p17, %r6, %r125; or.pred %p1, %p17, %p16; not.pred %p18, %p1; mov.f32 %f862, 0f00000000; @%p18 bra $L__BB0_28; setp.ge.s32 %p19, %r10, %r126; and.pred %p21, %p19, %p13; @%p21 bra $L__BB0_28; @%p13 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r158, %r126, -1; min.s32 %r486, %r10, %r158; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r156, %r10, %r126; add.s32 %r157, %r156, %r126; rem.s32 %r486, %r157, %r126; $L__BB0_19: @%p12 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r161, %r125, -1; min.s32 %r487, %r6, %r161; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r159, %r6, %r125; add.s32 %r160, %r159, %r125; rem.s32 %r487, %r160, %r125; $L__BB0_22: mad.lo.s32 %r162, %r486, %r125, %r487; mad.lo.s32 %r163, %r162, %r124, %r1; mul.wide.s32 %rd36, %r163, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f181, [%rd37]; ld.global.nc.f32 
%f182, [%rd38]; mul.f32 %f183, %f182, %f182; fma.rn.f32 %f184, %f181, %f181, %f183; ld.global.nc.f32 %f185, [%rd39]; fma.rn.f32 %f16, %f185, %f185, %f184; mul.f32 %f186, %f6, %f8; mul.f32 %f187, %f5, %f9; sub.f32 %f188, %f187, %f186; mul.f32 %f189, %f4, %f9; mul.f32 %f190, %f6, %f7; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f5, %f7; mul.f32 %f193, %f4, %f8; sub.f32 %f194, %f193, %f192; mul.f32 %f195, %f2, %f191; fma.rn.f32 %f196, %f1, %f188, %f195; fma.rn.f32 %f17, %f3, %f194, %f196; mul.f32 %f197, %f2, %f5; fma.rn.f32 %f198, %f1, %f4, %f197; fma.rn.f32 %f199, %f3, %f6, %f198; add.f32 %f200, %f199, 0f3F800000; mul.f32 %f201, %f2, %f8; fma.rn.f32 %f202, %f1, %f7, %f201; fma.rn.f32 %f203, %f3, %f9, %f202; add.f32 %f204, %f200, %f203; mul.f32 %f205, %f5, %f8; fma.rn.f32 %f206, %f4, %f7, %f205; fma.rn.f32 %f207, %f6, %f9, %f206; add.f32 %f18, %f207, %f204; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p24, %f19, 0f00000000; setp.eq.f32 %p25, %f20, 0f00000000; and.pred %p26, %p24, %p25; @%p26 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r174, %f18; shr.s32 %r175, %r174, 31; and.b32 %r176, %r175, 1078530011; mov.b32 %r177, %f17; and.b32 %r178, %r177, -2147483648; or.b32 %r179, %r178, %r176; mov.b32 %f857, %r179; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p27, %f19, 0f7F800000; setp.eq.f32 %p28, %f20, 0f7F800000; and.pred %p29, %p27, %p28; @%p29 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r169, %f18; setp.lt.s32 %p33, %r169, 0; selp.b32 %r170, 1075235812, 1061752795, %p33; mov.b32 %r171, %f17; and.b32 %r172, %r171, -2147483648; or.b32 %r173, %r172, %r170; mov.b32 %f857, %r173; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f208, %f20, %f19; min.f32 %f209, %f20, %f19; div.rn.f32 %f210, %f209, %f208; mul.rn.f32 %f211, %f210, %f210; mov.f32 %f212, 0fC0B59883; mov.f32 %f213, 0fBF52C7EA; fma.rn.f32 %f214, %f211, %f213, %f212; mov.f32 %f215, 0fC0D21907; fma.rn.f32 %f216, %f214, %f211, %f215; mul.f32 %f217, %f211, %f216; mul.f32 %f218, %f210, 
%f217; add.f32 %f219, %f211, 0f41355DC0; mov.f32 %f220, 0f41E6BD60; fma.rn.f32 %f221, %f219, %f211, %f220; mov.f32 %f222, 0f419D92C8; fma.rn.f32 %f223, %f221, %f211, %f222; rcp.rn.f32 %f224, %f223; fma.rn.f32 %f225, %f218, %f224, %f210; mov.f32 %f226, 0f3FC90FDB; sub.f32 %f227, %f226, %f225; setp.gt.f32 %p30, %f20, %f19; selp.f32 %f228, %f227, %f225, %p30; mov.b32 %r164, %f18; setp.lt.s32 %p31, %r164, 0; mov.f32 %f229, 0f40490FDB; sub.f32 %f230, %f229, %f228; selp.f32 %f231, %f230, %f228, %p31; mov.b32 %r165, %f231; mov.b32 %r166, %f17; and.b32 %r167, %r166, -2147483648; or.b32 %r168, %r167, %r165; mov.b32 %f232, %r168; add.f32 %f233, %f19, %f20; setp.le.f32 %p32, %f233, 0f7F800000; selp.f32 %f857, %f232, %f233, %p32; $L__BB0_27: add.f32 %f234, %f857, %f857; setp.eq.f32 %p34, %f16, 0f00000000; selp.f32 %f235, 0f3F800000, 0f3F000000, %p34; fma.rn.f32 %f862, %f235, %f234, 0f00000000; $L__BB0_28: setp.gt.s32 %p35, %r2, 0; or.pred %p2, %p35, %p16; not.pred %p37, %p2; @%p37 bra $L__BB0_55; setp.ge.s32 %p38, %r10, %r126; and.pred %p40, %p38, %p13; @%p40 bra $L__BB0_42; @%p13 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r182, %r126, -1; min.s32 %r488, %r10, %r182; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r180, %r10, %r126; add.s32 %r181, %r180, %r126; rem.s32 %r488, %r181, %r126; $L__BB0_33: @%p12 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r489, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r183, %r14, %r125; add.s32 %r184, %r183, %r125; rem.s32 %r489, %r184, %r125; $L__BB0_36: mad.lo.s32 %r185, %r488, %r125, %r489; mad.lo.s32 %r186, %r185, %r124, %r1; mul.wide.s32 %rd40, %r186, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f236, [%rd41]; ld.global.nc.f32 %f237, [%rd42]; mul.f32 %f238, %f237, %f237; fma.rn.f32 %f239, %f236, %f236, %f238; ld.global.nc.f32 %f240, [%rd43]; fma.rn.f32 %f27, %f240, %f240, %f239; mul.f32 %f241, %f9, %f11; mul.f32 %f242, %f8, %f12; sub.f32 %f243, %f242, 
%f241; mul.f32 %f244, %f7, %f12; mul.f32 %f245, %f9, %f10; sub.f32 %f246, %f245, %f244; mul.f32 %f247, %f8, %f10; mul.f32 %f248, %f7, %f11; sub.f32 %f249, %f248, %f247; mul.f32 %f250, %f2, %f246; fma.rn.f32 %f251, %f1, %f243, %f250; fma.rn.f32 %f28, %f3, %f249, %f251; mul.f32 %f252, %f2, %f8; fma.rn.f32 %f253, %f1, %f7, %f252; fma.rn.f32 %f254, %f3, %f9, %f253; add.f32 %f255, %f254, 0f3F800000; mul.f32 %f256, %f2, %f11; fma.rn.f32 %f257, %f1, %f10, %f256; fma.rn.f32 %f258, %f3, %f12, %f257; add.f32 %f259, %f255, %f258; mul.f32 %f260, %f8, %f11; fma.rn.f32 %f261, %f7, %f10, %f260; fma.rn.f32 %f262, %f9, %f12, %f261; add.f32 %f29, %f262, %f259; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p43, %f30, 0f00000000; setp.eq.f32 %p44, %f31, 0f00000000; and.pred %p45, %p43, %p44; @%p45 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r197, %f29; shr.s32 %r198, %r197, 31; and.b32 %r199, %r198, 1078530011; mov.b32 %r200, %f28; and.b32 %r201, %r200, -2147483648; or.b32 %r202, %r201, %r199; mov.b32 %f859, %r202; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p46, %f30, 0f7F800000; setp.eq.f32 %p47, %f31, 0f7F800000; and.pred %p48, %p46, %p47; @%p48 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r192, %f29; setp.lt.s32 %p52, %r192, 0; selp.b32 %r193, 1075235812, 1061752795, %p52; mov.b32 %r194, %f28; and.b32 %r195, %r194, -2147483648; or.b32 %r196, %r195, %r193; mov.b32 %f859, %r196; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f263, %f31, %f30; min.f32 %f264, %f31, %f30; div.rn.f32 %f265, %f264, %f263; mul.rn.f32 %f266, %f265, %f265; mov.f32 %f267, 0fC0B59883; mov.f32 %f268, 0fBF52C7EA; fma.rn.f32 %f269, %f266, %f268, %f267; mov.f32 %f270, 0fC0D21907; fma.rn.f32 %f271, %f269, %f266, %f270; mul.f32 %f272, %f266, %f271; mul.f32 %f273, %f265, %f272; add.f32 %f274, %f266, 0f41355DC0; mov.f32 %f275, 0f41E6BD60; fma.rn.f32 %f276, %f274, %f266, %f275; mov.f32 %f277, 0f419D92C8; fma.rn.f32 %f278, %f276, %f266, %f277; rcp.rn.f32 %f279, %f278; fma.rn.f32 %f280, %f273, 
%f279, %f265; mov.f32 %f281, 0f3FC90FDB; sub.f32 %f282, %f281, %f280; setp.gt.f32 %p49, %f31, %f30; selp.f32 %f283, %f282, %f280, %p49; mov.b32 %r187, %f29; setp.lt.s32 %p50, %r187, 0; mov.f32 %f284, 0f40490FDB; sub.f32 %f285, %f284, %f283; selp.f32 %f286, %f285, %f283, %p50; mov.b32 %r188, %f286; mov.b32 %r189, %f28; and.b32 %r190, %r189, -2147483648; or.b32 %r191, %r190, %r188; mov.b32 %f287, %r191; add.f32 %f288, %f30, %f31; setp.le.f32 %p51, %f288, 0f7F800000; selp.f32 %f859, %f287, %f288, %p51; $L__BB0_41: add.f32 %f289, %f859, %f859; setp.eq.f32 %p53, %f27, 0f00000000; selp.f32 %f290, 0f3F800000, 0f3F000000, %p53; fma.rn.f32 %f862, %f290, %f289, %f862; $L__BB0_42: setp.lt.s32 %p54, %r3, 1; and.pred %p56, %p54, %p13; @%p56 bra $L__BB0_55; @%p13 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r490, %r18, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r203, %r18, %r126; add.s32 %r204, %r203, %r126; rem.s32 %r490, %r204, %r126; $L__BB0_46: @%p12 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r491, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r205, %r14, %r125; add.s32 %r206, %r205, %r125; rem.s32 %r491, %r206, %r125; $L__BB0_49: mad.lo.s32 %r207, %r490, %r125, %r491; mad.lo.s32 %r208, %r207, %r124, %r1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f291, [%rd45]; ld.global.nc.f32 %f292, [%rd46]; mul.f32 %f293, %f292, %f292; fma.rn.f32 %f294, %f291, %f291, %f293; ld.global.nc.f32 %f295, [%rd47]; fma.rn.f32 %f38, %f295, %f295, %f294; mul.f32 %f296, %f12, %f14; mul.f32 %f297, %f11, %f15; sub.f32 %f298, %f297, %f296; mul.f32 %f299, %f10, %f15; mul.f32 %f300, %f12, %f13; sub.f32 %f301, %f300, %f299; mul.f32 %f302, %f11, %f13; mul.f32 %f303, %f10, %f14; sub.f32 %f304, %f303, %f302; mul.f32 %f305, %f2, %f301; fma.rn.f32 %f306, %f1, %f298, %f305; fma.rn.f32 %f39, %f3, %f304, %f306; mul.f32 %f307, %f2, %f11; fma.rn.f32 %f308, %f1, %f10, %f307; fma.rn.f32 %f309, %f3, 
%f12, %f308; add.f32 %f310, %f309, 0f3F800000; mul.f32 %f311, %f2, %f14; fma.rn.f32 %f312, %f1, %f13, %f311; fma.rn.f32 %f313, %f3, %f15, %f312; add.f32 %f314, %f310, %f313; mul.f32 %f315, %f11, %f14; fma.rn.f32 %f316, %f10, %f13, %f315; fma.rn.f32 %f317, %f12, %f15, %f316; add.f32 %f40, %f317, %f314; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p59, %f41, 0f00000000; setp.eq.f32 %p60, %f42, 0f00000000; and.pred %p61, %p59, %p60; @%p61 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r219, %f40; shr.s32 %r220, %r219, 31; and.b32 %r221, %r220, 1078530011; mov.b32 %r222, %f39; and.b32 %r223, %r222, -2147483648; or.b32 %r224, %r223, %r221; mov.b32 %f861, %r224; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p62, %f41, 0f7F800000; setp.eq.f32 %p63, %f42, 0f7F800000; and.pred %p64, %p62, %p63; @%p64 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r214, %f40; setp.lt.s32 %p68, %r214, 0; selp.b32 %r215, 1075235812, 1061752795, %p68; mov.b32 %r216, %f39; and.b32 %r217, %r216, -2147483648; or.b32 %r218, %r217, %r215; mov.b32 %f861, %r218; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f318, %f42, %f41; min.f32 %f319, %f42, %f41; div.rn.f32 %f320, %f319, %f318; mul.rn.f32 %f321, %f320, %f320; mov.f32 %f322, 0fC0B59883; mov.f32 %f323, 0fBF52C7EA; fma.rn.f32 %f324, %f321, %f323, %f322; mov.f32 %f325, 0fC0D21907; fma.rn.f32 %f326, %f324, %f321, %f325; mul.f32 %f327, %f321, %f326; mul.f32 %f328, %f320, %f327; add.f32 %f329, %f321, 0f41355DC0; mov.f32 %f330, 0f41E6BD60; fma.rn.f32 %f331, %f329, %f321, %f330; mov.f32 %f332, 0f419D92C8; fma.rn.f32 %f333, %f331, %f321, %f332; rcp.rn.f32 %f334, %f333; fma.rn.f32 %f335, %f328, %f334, %f320; mov.f32 %f336, 0f3FC90FDB; sub.f32 %f337, %f336, %f335; setp.gt.f32 %p65, %f42, %f41; selp.f32 %f338, %f337, %f335, %p65; mov.b32 %r209, %f40; setp.lt.s32 %p66, %r209, 0; mov.f32 %f339, 0f40490FDB; sub.f32 %f340, %f339, %f338; selp.f32 %f341, %f340, %f338, %p66; mov.b32 %r210, %f341; mov.b32 %r211, %f39; and.b32 %r212, %r211, 
-2147483648; or.b32 %r213, %r212, %r210; mov.b32 %f342, %r213; add.f32 %f343, %f41, %f42; setp.le.f32 %p67, %f343, 0f7F800000; selp.f32 %f861, %f342, %f343, %p67; $L__BB0_54: add.f32 %f344, %f861, %f861; setp.eq.f32 %p69, %f38, 0f00000000; selp.f32 %f345, 0f3F800000, 0f3F000000, %p69; fma.rn.f32 %f862, %f345, %f344, %f862; $L__BB0_55: @%p18 bra $L__BB0_69; setp.lt.s32 %p71, %r3, 1; and.pred %p73, %p71, %p13; @%p73 bra $L__BB0_69; @%p13 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r492, %r18, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r225, %r18, %r126; add.s32 %r226, %r225, %r126; rem.s32 %r492, %r226, %r126; $L__BB0_60: @%p12 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r229, %r125, -1; min.s32 %r493, %r6, %r229; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r227, %r6, %r125; add.s32 %r228, %r227, %r125; rem.s32 %r493, %r228, %r125; $L__BB0_63: mad.lo.s32 %r230, %r492, %r125, %r493; mad.lo.s32 %r231, %r230, %r124, %r1; mul.wide.s32 %rd48, %r231, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f346, [%rd49]; ld.global.nc.f32 %f347, [%rd50]; mul.f32 %f348, %f347, %f347; fma.rn.f32 %f349, %f346, %f346, %f348; ld.global.nc.f32 %f350, [%rd51]; fma.rn.f32 %f49, %f350, %f350, %f349; mul.f32 %f351, %f5, %f15; mul.f32 %f352, %f6, %f14; sub.f32 %f353, %f352, %f351; mul.f32 %f354, %f6, %f13; mul.f32 %f355, %f4, %f15; sub.f32 %f356, %f355, %f354; mul.f32 %f357, %f4, %f14; mul.f32 %f358, %f5, %f13; sub.f32 %f359, %f358, %f357; mul.f32 %f360, %f2, %f356; fma.rn.f32 %f361, %f1, %f353, %f360; fma.rn.f32 %f50, %f3, %f359, %f361; mul.f32 %f362, %f2, %f14; fma.rn.f32 %f363, %f1, %f13, %f362; fma.rn.f32 %f364, %f3, %f15, %f363; add.f32 %f365, %f364, 0f3F800000; mul.f32 %f366, %f2, %f5; fma.rn.f32 %f367, %f1, %f4, %f366; fma.rn.f32 %f368, %f3, %f6, %f367; add.f32 %f369, %f368, %f365; mul.f32 %f370, %f5, %f14; fma.rn.f32 %f371, %f4, %f13, %f370; fma.rn.f32 %f372, %f6, %f15, %f371; add.f32 %f51, %f372, %f369; 
abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p76, %f52, 0f00000000; setp.eq.f32 %p77, %f53, 0f00000000; and.pred %p78, %p76, %p77; @%p78 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r242, %f51; shr.s32 %r243, %r242, 31; and.b32 %r244, %r243, 1078530011; mov.b32 %r245, %f50; and.b32 %r246, %r245, -2147483648; or.b32 %r247, %r244, %r246; mov.b32 %f863, %r247; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p79, %f52, 0f7F800000; setp.eq.f32 %p80, %f53, 0f7F800000; and.pred %p81, %p79, %p80; @%p81 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r237, %f51; setp.lt.s32 %p85, %r237, 0; selp.b32 %r238, 1075235812, 1061752795, %p85; mov.b32 %r239, %f50; and.b32 %r240, %r239, -2147483648; or.b32 %r241, %r238, %r240; mov.b32 %f863, %r241; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f373, %f53, %f52; min.f32 %f374, %f53, %f52; div.rn.f32 %f375, %f374, %f373; mul.rn.f32 %f376, %f375, %f375; mov.f32 %f377, 0fC0B59883; mov.f32 %f378, 0fBF52C7EA; fma.rn.f32 %f379, %f376, %f378, %f377; mov.f32 %f380, 0fC0D21907; fma.rn.f32 %f381, %f379, %f376, %f380; mul.f32 %f382, %f376, %f381; mul.f32 %f383, %f375, %f382; add.f32 %f384, %f376, 0f41355DC0; mov.f32 %f385, 0f41E6BD60; fma.rn.f32 %f386, %f384, %f376, %f385; mov.f32 %f387, 0f419D92C8; fma.rn.f32 %f388, %f386, %f376, %f387; rcp.rn.f32 %f389, %f388; fma.rn.f32 %f390, %f383, %f389, %f375; mov.f32 %f391, 0f3FC90FDB; sub.f32 %f392, %f391, %f390; setp.gt.f32 %p82, %f53, %f52; selp.f32 %f393, %f392, %f390, %p82; mov.b32 %r232, %f51; setp.lt.s32 %p83, %r232, 0; mov.f32 %f394, 0f40490FDB; sub.f32 %f395, %f394, %f393; selp.f32 %f396, %f395, %f393, %p83; mov.b32 %r233, %f396; mov.b32 %r234, %f50; and.b32 %r235, %r234, -2147483648; or.b32 %r236, %r235, %r233; mov.b32 %f397, %r236; add.f32 %f398, %f52, %f53; setp.le.f32 %p84, %f398, 0f7F800000; selp.f32 %f863, %f397, %f398, %p84; $L__BB0_68: add.f32 %f399, %f863, %f863; setp.eq.f32 %p86, %f49, 0f00000000; selp.f32 %f400, 0f3F800000, 0f3F000000, %p86; fma.rn.f32 %f862, 
%f400, %f399, %f862; $L__BB0_69: @%p13 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: add.s32 %r250, %r126, -1; min.s32 %r494, %r10, %r250; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r248, %r10, %r126; add.s32 %r249, %r248, %r126; rem.s32 %r494, %r249, %r126; $L__BB0_72: ld.param.u8 %rs7, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs3, %rs7, 1; setp.eq.s16 %p88, %rs3, 0; add.s32 %r49, %r1, 1; @%p88 bra $L__BB0_74; bra.uni $L__BB0_73; $L__BB0_74: add.s32 %r253, %r124, -1; min.s32 %r495, %r49, %r253; bra.uni $L__BB0_75; $L__BB0_73: rem.s32 %r251, %r49, %r124; add.s32 %r252, %r251, %r124; rem.s32 %r495, %r252, %r124; $L__BB0_75: @%p13 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r496, %r18, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r254, %r18, %r126; add.s32 %r255, %r254, %r126; rem.s32 %r496, %r255, %r126; $L__BB0_78: add.s32 %r56, %r1, -1; @%p88 bra $L__BB0_80; bra.uni $L__BB0_79; $L__BB0_80: max.s32 %r497, %r56, 0; bra.uni $L__BB0_81; $L__BB0_79: rem.s32 %r256, %r56, %r124; add.s32 %r257, %r256, %r124; rem.s32 %r497, %r257, %r124; $L__BB0_81: mad.lo.s32 %r258, %r494, %r125, %r2; mad.lo.s32 %r259, %r258, %r124, %r1; mul.wide.s32 %rd52, %r259, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f60, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f61, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f62, [%rd55]; add.s32 %r260, %r495, %r5; mul.wide.s32 %rd56, %r260, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f63, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f64, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f65, [%rd59]; mad.lo.s32 %r261, %r496, %r125, %r2; mad.lo.s32 %r262, %r261, %r124, %r1; mul.wide.s32 %rd60, %r262, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f66, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f67, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f68, [%rd63]; add.s32 %r263, %r497, %r5; mul.wide.s32 %rd64, %r263, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f69, 
[%rd65]; add.s64 %rd66, %rd2, %rd64; ld.global.nc.f32 %f70, [%rd66]; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f71, [%rd67]; setp.ge.s32 %p92, %r10, %r126; and.pred %p93, %p92, %p13; mov.f32 %f868, 0f00000000; @%p93 bra $L__BB0_95; setp.ge.s32 %p94, %r49, %r124; and.pred %p96, %p94, %p88; @%p96 bra $L__BB0_95; @%p13 bra $L__BB0_85; bra.uni $L__BB0_84; $L__BB0_85: add.s32 %r266, %r126, -1; min.s32 %r498, %r10, %r266; bra.uni $L__BB0_86; $L__BB0_84: rem.s32 %r264, %r10, %r126; add.s32 %r265, %r264, %r126; rem.s32 %r498, %r265, %r126; $L__BB0_86: @%p88 bra $L__BB0_88; bra.uni $L__BB0_87; $L__BB0_88: add.s32 %r269, %r124, -1; min.s32 %r499, %r49, %r269; bra.uni $L__BB0_89; $L__BB0_87: rem.s32 %r267, %r49, %r124; add.s32 %r268, %r267, %r124; rem.s32 %r499, %r268, %r124; $L__BB0_89: mad.lo.s32 %r270, %r498, %r125, %r2; mad.lo.s32 %r271, %r270, %r124, %r499; mul.wide.s32 %rd68, %r271, 4; add.s64 %rd69, %rd3, %rd68; add.s64 %rd70, %rd2, %rd68; add.s64 %rd71, %rd1, %rd68; ld.global.nc.f32 %f403, [%rd69]; ld.global.nc.f32 %f404, [%rd70]; mul.f32 %f405, %f404, %f404; fma.rn.f32 %f406, %f403, %f403, %f405; ld.global.nc.f32 %f407, [%rd71]; fma.rn.f32 %f72, %f407, %f407, %f406; mul.f32 %f408, %f62, %f64; mul.f32 %f409, %f61, %f65; sub.f32 %f410, %f409, %f408; mul.f32 %f411, %f60, %f65; mul.f32 %f412, %f62, %f63; sub.f32 %f413, %f412, %f411; mul.f32 %f414, %f61, %f63; mul.f32 %f415, %f60, %f64; sub.f32 %f416, %f415, %f414; mul.f32 %f417, %f2, %f413; fma.rn.f32 %f418, %f1, %f410, %f417; fma.rn.f32 %f73, %f3, %f416, %f418; mul.f32 %f419, %f2, %f61; fma.rn.f32 %f420, %f1, %f60, %f419; fma.rn.f32 %f421, %f3, %f62, %f420; add.f32 %f422, %f421, 0f3F800000; mul.f32 %f423, %f2, %f64; fma.rn.f32 %f424, %f1, %f63, %f423; fma.rn.f32 %f425, %f3, %f65, %f424; add.f32 %f426, %f422, %f425; mul.f32 %f427, %f61, %f64; fma.rn.f32 %f428, %f60, %f63, %f427; fma.rn.f32 %f429, %f62, %f65, %f428; add.f32 %f74, %f429, %f426; abs.f32 %f75, %f74; abs.f32 %f76, %f73; setp.eq.f32 %p99, %f75, 0f00000000; 
setp.eq.f32 %p100, %f76, 0f00000000; and.pred %p101, %p99, %p100; @%p101 bra $L__BB0_93; bra.uni $L__BB0_90; $L__BB0_93: mov.b32 %r282, %f74; shr.s32 %r283, %r282, 31; and.b32 %r284, %r283, 1078530011; mov.b32 %r285, %f73; and.b32 %r286, %r285, -2147483648; or.b32 %r287, %r286, %r284; mov.b32 %f865, %r287; bra.uni $L__BB0_94; $L__BB0_90: setp.eq.f32 %p102, %f75, 0f7F800000; setp.eq.f32 %p103, %f76, 0f7F800000; and.pred %p104, %p102, %p103; @%p104 bra $L__BB0_92; bra.uni $L__BB0_91; $L__BB0_92: mov.b32 %r277, %f74; setp.lt.s32 %p108, %r277, 0; selp.b32 %r278, 1075235812, 1061752795, %p108; mov.b32 %r279, %f73; and.b32 %r280, %r279, -2147483648; or.b32 %r281, %r280, %r278; mov.b32 %f865, %r281; bra.uni $L__BB0_94; $L__BB0_91: max.f32 %f430, %f76, %f75; min.f32 %f431, %f76, %f75; div.rn.f32 %f432, %f431, %f430; mul.rn.f32 %f433, %f432, %f432; mov.f32 %f434, 0fC0B59883; mov.f32 %f435, 0fBF52C7EA; fma.rn.f32 %f436, %f433, %f435, %f434; mov.f32 %f437, 0fC0D21907; fma.rn.f32 %f438, %f436, %f433, %f437; mul.f32 %f439, %f433, %f438; mul.f32 %f440, %f432, %f439; add.f32 %f441, %f433, 0f41355DC0; mov.f32 %f442, 0f41E6BD60; fma.rn.f32 %f443, %f441, %f433, %f442; mov.f32 %f444, 0f419D92C8; fma.rn.f32 %f445, %f443, %f433, %f444; rcp.rn.f32 %f446, %f445; fma.rn.f32 %f447, %f440, %f446, %f432; mov.f32 %f448, 0f3FC90FDB; sub.f32 %f449, %f448, %f447; setp.gt.f32 %p105, %f76, %f75; selp.f32 %f450, %f449, %f447, %p105; mov.b32 %r272, %f74; setp.lt.s32 %p106, %r272, 0; mov.f32 %f451, 0f40490FDB; sub.f32 %f452, %f451, %f450; selp.f32 %f453, %f452, %f450, %p106; mov.b32 %r273, %f453; mov.b32 %r274, %f73; and.b32 %r275, %r274, -2147483648; or.b32 %r276, %r275, %r273; mov.b32 %f454, %r276; add.f32 %f455, %f75, %f76; setp.le.f32 %p107, %f455, 0f7F800000; selp.f32 %f865, %f454, %f455, %p107; $L__BB0_94: add.f32 %f456, %f865, %f865; setp.eq.f32 %p109, %f72, 0f00000000; selp.f32 %f457, 0f3F800000, 0f3F000000, %p109; fma.rn.f32 %f868, %f457, %f456, 0f00000000; $L__BB0_95: setp.lt.s32 %p110, 
%r3, 1; and.pred %p112, %p110, %p13; @%p112 bra $L__BB0_109; setp.ge.s32 %p113, %r49, %r124; and.pred %p115, %p113, %p88; @%p115 bra $L__BB0_109; @%p13 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: max.s32 %r500, %r18, 0; bra.uni $L__BB0_100; $L__BB0_98: rem.s32 %r288, %r18, %r126; add.s32 %r289, %r288, %r126; rem.s32 %r500, %r289, %r126; $L__BB0_100: @%p88 bra $L__BB0_102; bra.uni $L__BB0_101; $L__BB0_102: add.s32 %r292, %r124, -1; min.s32 %r501, %r49, %r292; bra.uni $L__BB0_103; $L__BB0_101: rem.s32 %r290, %r49, %r124; add.s32 %r291, %r290, %r124; rem.s32 %r501, %r291, %r124; $L__BB0_103: mad.lo.s32 %r293, %r500, %r125, %r2; mad.lo.s32 %r294, %r293, %r124, %r501; mul.wide.s32 %rd72, %r294, 4; add.s64 %rd73, %rd3, %rd72; add.s64 %rd74, %rd2, %rd72; add.s64 %rd75, %rd1, %rd72; ld.global.nc.f32 %f458, [%rd73]; ld.global.nc.f32 %f459, [%rd74]; mul.f32 %f460, %f459, %f459; fma.rn.f32 %f461, %f458, %f458, %f460; ld.global.nc.f32 %f462, [%rd75]; fma.rn.f32 %f83, %f462, %f462, %f461; mul.f32 %f463, %f65, %f67; mul.f32 %f464, %f64, %f68; sub.f32 %f465, %f464, %f463; mul.f32 %f466, %f63, %f68; mul.f32 %f467, %f65, %f66; sub.f32 %f468, %f467, %f466; mul.f32 %f469, %f64, %f66; mul.f32 %f470, %f63, %f67; sub.f32 %f471, %f470, %f469; mul.f32 %f472, %f2, %f468; fma.rn.f32 %f473, %f1, %f465, %f472; fma.rn.f32 %f84, %f3, %f471, %f473; mul.f32 %f474, %f2, %f64; fma.rn.f32 %f475, %f1, %f63, %f474; fma.rn.f32 %f476, %f3, %f65, %f475; add.f32 %f477, %f476, 0f3F800000; mul.f32 %f478, %f2, %f67; fma.rn.f32 %f479, %f1, %f66, %f478; fma.rn.f32 %f480, %f3, %f68, %f479; add.f32 %f481, %f477, %f480; mul.f32 %f482, %f64, %f67; fma.rn.f32 %f483, %f63, %f66, %f482; fma.rn.f32 %f484, %f65, %f68, %f483; add.f32 %f85, %f484, %f481; abs.f32 %f86, %f85; abs.f32 %f87, %f84; setp.eq.f32 %p118, %f86, 0f00000000; setp.eq.f32 %p119, %f87, 0f00000000; and.pred %p120, %p118, %p119; @%p120 bra $L__BB0_107; bra.uni $L__BB0_104; $L__BB0_107: mov.b32 %r305, %f85; shr.s32 %r306, %r305, 31; and.b32 %r307, 
%r306, 1078530011; mov.b32 %r308, %f84; and.b32 %r309, %r308, -2147483648; or.b32 %r310, %r309, %r307; mov.b32 %f867, %r310; bra.uni $L__BB0_108; $L__BB0_104: setp.eq.f32 %p121, %f86, 0f7F800000; setp.eq.f32 %p122, %f87, 0f7F800000; and.pred %p123, %p121, %p122; @%p123 bra $L__BB0_106; bra.uni $L__BB0_105; $L__BB0_106: mov.b32 %r300, %f85; setp.lt.s32 %p127, %r300, 0; selp.b32 %r301, 1075235812, 1061752795, %p127; mov.b32 %r302, %f84; and.b32 %r303, %r302, -2147483648; or.b32 %r304, %r303, %r301; mov.b32 %f867, %r304; bra.uni $L__BB0_108; $L__BB0_105: max.f32 %f485, %f87, %f86; min.f32 %f486, %f87, %f86; div.rn.f32 %f487, %f486, %f485; mul.rn.f32 %f488, %f487, %f487; mov.f32 %f489, 0fC0B59883; mov.f32 %f490, 0fBF52C7EA; fma.rn.f32 %f491, %f488, %f490, %f489; mov.f32 %f492, 0fC0D21907; fma.rn.f32 %f493, %f491, %f488, %f492; mul.f32 %f494, %f488, %f493; mul.f32 %f495, %f487, %f494; add.f32 %f496, %f488, 0f41355DC0; mov.f32 %f497, 0f41E6BD60; fma.rn.f32 %f498, %f496, %f488, %f497; mov.f32 %f499, 0f419D92C8; fma.rn.f32 %f500, %f498, %f488, %f499; rcp.rn.f32 %f501, %f500; fma.rn.f32 %f502, %f495, %f501, %f487; mov.f32 %f503, 0f3FC90FDB; sub.f32 %f504, %f503, %f502; setp.gt.f32 %p124, %f87, %f86; selp.f32 %f505, %f504, %f502, %p124; mov.b32 %r295, %f85; setp.lt.s32 %p125, %r295, 0; mov.f32 %f506, 0f40490FDB; sub.f32 %f507, %f506, %f505; selp.f32 %f508, %f507, %f505, %p125; mov.b32 %r296, %f508; mov.b32 %r297, %f84; and.b32 %r298, %r297, -2147483648; or.b32 %r299, %r298, %r296; mov.b32 %f509, %r299; add.f32 %f510, %f86, %f87; setp.le.f32 %p126, %f510, 0f7F800000; selp.f32 %f867, %f509, %f510, %p126; $L__BB0_108: add.f32 %f511, %f867, %f867; setp.eq.f32 %p128, %f83, 0f00000000; selp.f32 %f512, 0f3F800000, 0f3F000000, %p128; fma.rn.f32 %f868, %f512, %f511, %f868; $L__BB0_109: ld.param.u8 %rs6, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs5, %rs6, 1; setp.ne.s16 %p129, %rs5, 0; setp.gt.s32 %p130, %r1, 0; or.pred %p3, %p130, %p129; and.pred %p131, %p3, %p2; 
not.pred %p132, %p131; @%p132 bra $L__BB0_122; @%p13 bra $L__BB0_112; bra.uni $L__BB0_111; $L__BB0_112: max.s32 %r502, %r18, 0; bra.uni $L__BB0_113; $L__BB0_111: rem.s32 %r311, %r18, %r126; add.s32 %r312, %r311, %r126; rem.s32 %r502, %r312, %r126; $L__BB0_113: @%p88 bra $L__BB0_115; bra.uni $L__BB0_114; $L__BB0_115: max.s32 %r503, %r56, 0; bra.uni $L__BB0_116; $L__BB0_114: rem.s32 %r313, %r56, %r124; add.s32 %r314, %r313, %r124; rem.s32 %r503, %r314, %r124; $L__BB0_116: mad.lo.s32 %r315, %r502, %r125, %r2; mad.lo.s32 %r316, %r315, %r124, %r503; mul.wide.s32 %rd76, %r316, 4; add.s64 %rd77, %rd3, %rd76; add.s64 %rd78, %rd2, %rd76; add.s64 %rd79, %rd1, %rd76; ld.global.nc.f32 %f513, [%rd77]; ld.global.nc.f32 %f514, [%rd78]; mul.f32 %f515, %f514, %f514; fma.rn.f32 %f516, %f513, %f513, %f515; ld.global.nc.f32 %f517, [%rd79]; fma.rn.f32 %f94, %f517, %f517, %f516; mul.f32 %f518, %f68, %f70; mul.f32 %f519, %f67, %f71; sub.f32 %f520, %f519, %f518; mul.f32 %f521, %f66, %f71; mul.f32 %f522, %f68, %f69; sub.f32 %f523, %f522, %f521; mul.f32 %f524, %f67, %f69; mul.f32 %f525, %f66, %f70; sub.f32 %f526, %f525, %f524; mul.f32 %f527, %f2, %f523; fma.rn.f32 %f528, %f1, %f520, %f527; fma.rn.f32 %f95, %f3, %f526, %f528; mul.f32 %f529, %f2, %f67; fma.rn.f32 %f530, %f1, %f66, %f529; fma.rn.f32 %f531, %f3, %f68, %f530; add.f32 %f532, %f531, 0f3F800000; mul.f32 %f533, %f2, %f70; fma.rn.f32 %f534, %f1, %f69, %f533; fma.rn.f32 %f535, %f3, %f71, %f534; add.f32 %f536, %f532, %f535; mul.f32 %f537, %f67, %f70; fma.rn.f32 %f538, %f66, %f69, %f537; fma.rn.f32 %f539, %f68, %f71, %f538; add.f32 %f96, %f539, %f536; abs.f32 %f97, %f96; abs.f32 %f98, %f95; setp.eq.f32 %p135, %f97, 0f00000000; setp.eq.f32 %p136, %f98, 0f00000000; and.pred %p137, %p135, %p136; @%p137 bra $L__BB0_120; bra.uni $L__BB0_117; $L__BB0_120: mov.b32 %r327, %f96; shr.s32 %r328, %r327, 31; and.b32 %r329, %r328, 1078530011; mov.b32 %r330, %f95; and.b32 %r331, %r330, -2147483648; or.b32 %r332, %r331, %r329; mov.b32 %f869, %r332; 
bra.uni $L__BB0_121; $L__BB0_117: setp.eq.f32 %p138, %f97, 0f7F800000; setp.eq.f32 %p139, %f98, 0f7F800000; and.pred %p140, %p138, %p139; @%p140 bra $L__BB0_119; bra.uni $L__BB0_118; $L__BB0_119: mov.b32 %r322, %f96; setp.lt.s32 %p144, %r322, 0; selp.b32 %r323, 1075235812, 1061752795, %p144; mov.b32 %r324, %f95; and.b32 %r325, %r324, -2147483648; or.b32 %r326, %r325, %r323; mov.b32 %f869, %r326; bra.uni $L__BB0_121; $L__BB0_118: max.f32 %f540, %f98, %f97; min.f32 %f541, %f98, %f97; div.rn.f32 %f542, %f541, %f540; mul.rn.f32 %f543, %f542, %f542; mov.f32 %f544, 0fC0B59883; mov.f32 %f545, 0fBF52C7EA; fma.rn.f32 %f546, %f543, %f545, %f544; mov.f32 %f547, 0fC0D21907; fma.rn.f32 %f548, %f546, %f543, %f547; mul.f32 %f549, %f543, %f548; mul.f32 %f550, %f542, %f549; add.f32 %f551, %f543, 0f41355DC0; mov.f32 %f552, 0f41E6BD60; fma.rn.f32 %f553, %f551, %f543, %f552; mov.f32 %f554, 0f419D92C8; fma.rn.f32 %f555, %f553, %f543, %f554; rcp.rn.f32 %f556, %f555; fma.rn.f32 %f557, %f550, %f556, %f542; mov.f32 %f558, 0f3FC90FDB; sub.f32 %f559, %f558, %f557; setp.gt.f32 %p141, %f98, %f97; selp.f32 %f560, %f559, %f557, %p141; mov.b32 %r317, %f96; setp.lt.s32 %p142, %r317, 0; mov.f32 %f561, 0f40490FDB; sub.f32 %f562, %f561, %f560; selp.f32 %f563, %f562, %f560, %p142; mov.b32 %r318, %f563; mov.b32 %r319, %f95; and.b32 %r320, %r319, -2147483648; or.b32 %r321, %r320, %r318; mov.b32 %f564, %r321; add.f32 %f565, %f97, %f98; setp.le.f32 %p143, %f565, 0f7F800000; selp.f32 %f869, %f564, %f565, %p143; $L__BB0_121: add.f32 %f566, %f869, %f869; setp.eq.f32 %p145, %f94, 0f00000000; selp.f32 %f567, 0f3F800000, 0f3F000000, %p145; fma.rn.f32 %f868, %f567, %f566, %f868; $L__BB0_122: setp.lt.s32 %p146, %r49, %r124; or.pred %p4, %p146, %p129; and.pred %p5, %p4, %p2; not.pred %p148, %p5; @%p148 bra $L__BB0_135; @%p13 bra $L__BB0_125; bra.uni $L__BB0_124; $L__BB0_125: add.s32 %r335, %r126, -1; min.s32 %r504, %r10, %r335; bra.uni $L__BB0_126; $L__BB0_124: rem.s32 %r333, %r10, %r126; add.s32 %r334, %r333, 
%r126; rem.s32 %r504, %r334, %r126; $L__BB0_126: mad.lo.s32 %r81, %r504, %r125, %r2; @%p88 bra $L__BB0_128; bra.uni $L__BB0_127; $L__BB0_128: max.s32 %r505, %r56, 0; bra.uni $L__BB0_129; $L__BB0_127: rem.s32 %r336, %r56, %r124; add.s32 %r337, %r336, %r124; rem.s32 %r505, %r337, %r124; $L__BB0_129: mad.lo.s32 %r338, %r81, %r124, %r505; mul.wide.s32 %rd80, %r338, 4; add.s64 %rd81, %rd3, %rd80; add.s64 %rd82, %rd2, %rd80; add.s64 %rd83, %rd1, %rd80; ld.global.nc.f32 %f568, [%rd81]; ld.global.nc.f32 %f569, [%rd82]; mul.f32 %f570, %f569, %f569; fma.rn.f32 %f571, %f568, %f568, %f570; ld.global.nc.f32 %f572, [%rd83]; fma.rn.f32 %f105, %f572, %f572, %f571; mul.f32 %f573, %f61, %f71; mul.f32 %f574, %f62, %f70; sub.f32 %f575, %f574, %f573; mul.f32 %f576, %f62, %f69; mul.f32 %f577, %f60, %f71; sub.f32 %f578, %f577, %f576; mul.f32 %f579, %f60, %f70; mul.f32 %f580, %f61, %f69; sub.f32 %f581, %f580, %f579; mul.f32 %f582, %f2, %f578; fma.rn.f32 %f583, %f1, %f575, %f582; fma.rn.f32 %f106, %f3, %f581, %f583; mul.f32 %f584, %f2, %f70; fma.rn.f32 %f585, %f1, %f69, %f584; fma.rn.f32 %f586, %f3, %f71, %f585; add.f32 %f587, %f586, 0f3F800000; mul.f32 %f588, %f2, %f61; fma.rn.f32 %f589, %f1, %f60, %f588; fma.rn.f32 %f590, %f3, %f62, %f589; add.f32 %f591, %f590, %f587; mul.f32 %f592, %f61, %f70; fma.rn.f32 %f593, %f60, %f69, %f592; fma.rn.f32 %f594, %f62, %f71, %f593; add.f32 %f107, %f594, %f591; abs.f32 %f108, %f107; abs.f32 %f109, %f106; setp.eq.f32 %p151, %f108, 0f00000000; setp.eq.f32 %p152, %f109, 0f00000000; and.pred %p153, %p151, %p152; @%p153 bra $L__BB0_133; bra.uni $L__BB0_130; $L__BB0_133: mov.b32 %r349, %f107; shr.s32 %r350, %r349, 31; and.b32 %r351, %r350, 1078530011; mov.b32 %r352, %f106; and.b32 %r353, %r352, -2147483648; or.b32 %r354, %r351, %r353; mov.b32 %f871, %r354; bra.uni $L__BB0_134; $L__BB0_130: setp.eq.f32 %p154, %f108, 0f7F800000; setp.eq.f32 %p155, %f109, 0f7F800000; and.pred %p156, %p154, %p155; @%p156 bra $L__BB0_132; bra.uni $L__BB0_131; $L__BB0_132: mov.b32 
%r344, %f107; setp.lt.s32 %p160, %r344, 0; selp.b32 %r345, 1075235812, 1061752795, %p160; mov.b32 %r346, %f106; and.b32 %r347, %r346, -2147483648; or.b32 %r348, %r345, %r347; mov.b32 %f871, %r348; bra.uni $L__BB0_134; $L__BB0_131: max.f32 %f595, %f109, %f108; min.f32 %f596, %f109, %f108; div.rn.f32 %f597, %f596, %f595; mul.rn.f32 %f598, %f597, %f597; mov.f32 %f599, 0fC0B59883; mov.f32 %f600, 0fBF52C7EA; fma.rn.f32 %f601, %f598, %f600, %f599; mov.f32 %f602, 0fC0D21907; fma.rn.f32 %f603, %f601, %f598, %f602; mul.f32 %f604, %f598, %f603; mul.f32 %f605, %f597, %f604; add.f32 %f606, %f598, 0f41355DC0; mov.f32 %f607, 0f41E6BD60; fma.rn.f32 %f608, %f606, %f598, %f607; mov.f32 %f609, 0f419D92C8; fma.rn.f32 %f610, %f608, %f598, %f609; rcp.rn.f32 %f611, %f610; fma.rn.f32 %f612, %f605, %f611, %f597; mov.f32 %f613, 0f3FC90FDB; sub.f32 %f614, %f613, %f612; setp.gt.f32 %p157, %f109, %f108; selp.f32 %f615, %f614, %f612, %p157; mov.b32 %r339, %f107; setp.lt.s32 %p158, %r339, 0; mov.f32 %f616, 0f40490FDB; sub.f32 %f617, %f616, %f615; selp.f32 %f618, %f617, %f615, %p158; mov.b32 %r340, %f618; mov.b32 %r341, %f106; and.b32 %r342, %r341, -2147483648; or.b32 %r343, %r342, %r340; mov.b32 %f619, %r343; add.f32 %f620, %f108, %f109; setp.le.f32 %p159, %f620, 0f7F800000; selp.f32 %f871, %f619, %f620, %p159; $L__BB0_134: add.f32 %f621, %f871, %f871; setp.eq.f32 %p161, %f105, 0f00000000; selp.f32 %f622, 0f3F800000, 0f3F000000, %p161; fma.rn.f32 %f868, %f622, %f621, %f868; $L__BB0_135: @%p88 bra $L__BB0_137; bra.uni $L__BB0_136; $L__BB0_137: add.s32 %r357, %r124, -1; min.s32 %r506, %r49, %r357; bra.uni $L__BB0_138; $L__BB0_136: rem.s32 %r355, %r49, %r124; add.s32 %r356, %r355, %r124; rem.s32 %r506, %r356, %r124; $L__BB0_138: @%p12 bra $L__BB0_140; bra.uni $L__BB0_139; $L__BB0_140: add.s32 %r360, %r125, -1; min.s32 %r507, %r6, %r360; bra.uni $L__BB0_141; $L__BB0_139: rem.s32 %r358, %r6, %r125; add.s32 %r359, %r358, %r125; rem.s32 %r507, %r359, %r125; $L__BB0_141: @%p88 bra $L__BB0_143; bra.uni 
$L__BB0_142; $L__BB0_143: max.s32 %r508, %r56, 0; bra.uni $L__BB0_144; $L__BB0_142: rem.s32 %r361, %r56, %r124; add.s32 %r362, %r361, %r124; rem.s32 %r508, %r362, %r124; $L__BB0_144: add.s32 %r94, %r508, %r5; add.s32 %r95, %r506, %r5; @%p12 bra $L__BB0_146; bra.uni $L__BB0_145; $L__BB0_146: max.s32 %r509, %r14, 0; bra.uni $L__BB0_147; $L__BB0_145: rem.s32 %r363, %r14, %r125; add.s32 %r364, %r363, %r125; rem.s32 %r509, %r364, %r125; $L__BB0_147: add.s32 %r365, %r509, %r4; mad.lo.s32 %r366, %r365, %r124, %r1; mul.wide.s32 %rd84, %r95, 4; add.s64 %rd85, %rd3, %rd84; ld.global.nc.f32 %f116, [%rd85]; add.s64 %rd86, %rd2, %rd84; ld.global.nc.f32 %f117, [%rd86]; add.s64 %rd87, %rd1, %rd84; ld.global.nc.f32 %f118, [%rd87]; add.s32 %r367, %r507, %r4; mad.lo.s32 %r368, %r367, %r124, %r1; mul.wide.s32 %rd88, %r368, 4; add.s64 %rd89, %rd3, %rd88; ld.global.nc.f32 %f119, [%rd89]; add.s64 %rd90, %rd2, %rd88; ld.global.nc.f32 %f120, [%rd90]; add.s64 %rd91, %rd1, %rd88; ld.global.nc.f32 %f121, [%rd91]; mul.wide.s32 %rd92, %r94, 4; add.s64 %rd93, %rd3, %rd92; ld.global.nc.f32 %f122, [%rd93]; add.s64 %rd94, %rd2, %rd92; ld.global.nc.f32 %f123, [%rd94]; add.s64 %rd95, %rd1, %rd92; ld.global.nc.f32 %f124, [%rd95]; mul.wide.s32 %rd96, %r366, 4; add.s64 %rd97, %rd3, %rd96; ld.global.nc.f32 %f125, [%rd97]; add.s64 %rd98, %rd2, %rd96; ld.global.nc.f32 %f126, [%rd98]; add.s64 %rd99, %rd1, %rd96; ld.global.nc.f32 %f127, [%rd99]; and.pred %p166, %p4, %p1; mov.f32 %f878, 0f00000000; not.pred %p167, %p166; @%p167 bra $L__BB0_160; @%p12 bra $L__BB0_150; bra.uni $L__BB0_149; $L__BB0_150: add.s32 %r371, %r125, -1; min.s32 %r510, %r6, %r371; bra.uni $L__BB0_151; $L__BB0_149: rem.s32 %r369, %r6, %r125; add.s32 %r370, %r369, %r125; rem.s32 %r510, %r370, %r125; $L__BB0_151: @%p88 bra $L__BB0_153; bra.uni $L__BB0_152; $L__BB0_153: add.s32 %r374, %r124, -1; min.s32 %r511, %r49, %r374; bra.uni $L__BB0_154; $L__BB0_152: rem.s32 %r372, %r49, %r124; add.s32 %r373, %r372, %r124; rem.s32 %r511, %r373, %r124; 
$L__BB0_154: add.s32 %r375, %r510, %r4; mad.lo.s32 %r376, %r375, %r124, %r511; mul.wide.s32 %rd100, %r376, 4; add.s64 %rd101, %rd3, %rd100; add.s64 %rd102, %rd2, %rd100; add.s64 %rd103, %rd1, %rd100; ld.global.nc.f32 %f624, [%rd101]; ld.global.nc.f32 %f625, [%rd102]; mul.f32 %f626, %f625, %f625; fma.rn.f32 %f627, %f624, %f624, %f626; ld.global.nc.f32 %f628, [%rd103]; fma.rn.f32 %f128, %f628, %f628, %f627; mul.f32 %f629, %f118, %f120; mul.f32 %f630, %f117, %f121; sub.f32 %f631, %f630, %f629; mul.f32 %f632, %f116, %f121; mul.f32 %f633, %f118, %f119; sub.f32 %f634, %f633, %f632; mul.f32 %f635, %f117, %f119; mul.f32 %f636, %f116, %f120; sub.f32 %f637, %f636, %f635; mul.f32 %f638, %f2, %f634; fma.rn.f32 %f639, %f1, %f631, %f638; fma.rn.f32 %f129, %f3, %f637, %f639; mul.f32 %f640, %f2, %f117; fma.rn.f32 %f641, %f1, %f116, %f640; fma.rn.f32 %f642, %f3, %f118, %f641; add.f32 %f643, %f642, 0f3F800000; mul.f32 %f644, %f2, %f120; fma.rn.f32 %f645, %f1, %f119, %f644; fma.rn.f32 %f646, %f3, %f121, %f645; add.f32 %f647, %f643, %f646; mul.f32 %f648, %f117, %f120; fma.rn.f32 %f649, %f116, %f119, %f648; fma.rn.f32 %f650, %f118, %f121, %f649; add.f32 %f130, %f650, %f647; abs.f32 %f131, %f130; abs.f32 %f132, %f129; setp.eq.f32 %p170, %f131, 0f00000000; setp.eq.f32 %p171, %f132, 0f00000000; and.pred %p172, %p170, %p171; @%p172 bra $L__BB0_158; bra.uni $L__BB0_155; $L__BB0_158: mov.b32 %r387, %f130; shr.s32 %r388, %r387, 31; and.b32 %r389, %r388, 1078530011; mov.b32 %r390, %f129; and.b32 %r391, %r390, -2147483648; or.b32 %r392, %r391, %r389; mov.b32 %f873, %r392; bra.uni $L__BB0_159; $L__BB0_155: setp.eq.f32 %p173, %f131, 0f7F800000; setp.eq.f32 %p174, %f132, 0f7F800000; and.pred %p175, %p173, %p174; @%p175 bra $L__BB0_157; bra.uni $L__BB0_156; $L__BB0_157: mov.b32 %r382, %f130; setp.lt.s32 %p179, %r382, 0; selp.b32 %r383, 1075235812, 1061752795, %p179; mov.b32 %r384, %f129; and.b32 %r385, %r384, -2147483648; or.b32 %r386, %r385, %r383; mov.b32 %f873, %r386; bra.uni $L__BB0_159; 
$L__BB0_156: max.f32 %f651, %f132, %f131; min.f32 %f652, %f132, %f131; div.rn.f32 %f653, %f652, %f651; mul.rn.f32 %f654, %f653, %f653; mov.f32 %f655, 0fC0B59883; mov.f32 %f656, 0fBF52C7EA; fma.rn.f32 %f657, %f654, %f656, %f655; mov.f32 %f658, 0fC0D21907; fma.rn.f32 %f659, %f657, %f654, %f658; mul.f32 %f660, %f654, %f659; mul.f32 %f661, %f653, %f660; add.f32 %f662, %f654, 0f41355DC0; mov.f32 %f663, 0f41E6BD60; fma.rn.f32 %f664, %f662, %f654, %f663; mov.f32 %f665, 0f419D92C8; fma.rn.f32 %f666, %f664, %f654, %f665; rcp.rn.f32 %f667, %f666; fma.rn.f32 %f668, %f661, %f667, %f653; mov.f32 %f669, 0f3FC90FDB; sub.f32 %f670, %f669, %f668; setp.gt.f32 %p176, %f132, %f131; selp.f32 %f671, %f670, %f668, %p176; mov.b32 %r377, %f130; setp.lt.s32 %p177, %r377, 0; mov.f32 %f672, 0f40490FDB; sub.f32 %f673, %f672, %f671; selp.f32 %f674, %f673, %f671, %p177; mov.b32 %r378, %f674; mov.b32 %r379, %f129; and.b32 %r380, %r379, -2147483648; or.b32 %r381, %r380, %r378; mov.b32 %f675, %r381; add.f32 %f676, %f131, %f132; setp.le.f32 %p178, %f676, 0f7F800000; selp.f32 %f873, %f675, %f676, %p178; $L__BB0_159: add.f32 %f677, %f873, %f873; setp.eq.f32 %p180, %f128, 0f00000000; selp.f32 %f678, 0f3F800000, 0f3F000000, %p180; fma.rn.f32 %f878, %f678, %f677, 0f00000000; $L__BB0_160: not.pred %p181, %p3; @%p181 bra $L__BB0_187; @%p18 bra $L__BB0_174; @%p12 bra $L__BB0_164; bra.uni $L__BB0_163; $L__BB0_164: add.s32 %r395, %r125, -1; min.s32 %r512, %r6, %r395; bra.uni $L__BB0_165; $L__BB0_163: rem.s32 %r393, %r6, %r125; add.s32 %r394, %r393, %r125; rem.s32 %r512, %r394, %r125; $L__BB0_165: @%p88 bra $L__BB0_167; bra.uni $L__BB0_166; $L__BB0_167: max.s32 %r513, %r56, 0; bra.uni $L__BB0_168; $L__BB0_166: rem.s32 %r396, %r56, %r124; add.s32 %r397, %r396, %r124; rem.s32 %r513, %r397, %r124; $L__BB0_168: add.s32 %r398, %r512, %r4; mad.lo.s32 %r399, %r398, %r124, %r513; mul.wide.s32 %rd104, %r399, 4; add.s64 %rd105, %rd3, %rd104; add.s64 %rd106, %rd2, %rd104; add.s64 %rd107, %rd1, %rd104; ld.global.nc.f32 
%f679, [%rd105]; ld.global.nc.f32 %f680, [%rd106]; mul.f32 %f681, %f680, %f680; fma.rn.f32 %f682, %f679, %f679, %f681; ld.global.nc.f32 %f683, [%rd107]; fma.rn.f32 %f139, %f683, %f683, %f682; mul.f32 %f684, %f121, %f123; mul.f32 %f685, %f120, %f124; sub.f32 %f686, %f685, %f684; mul.f32 %f687, %f119, %f124; mul.f32 %f688, %f121, %f122; sub.f32 %f689, %f688, %f687; mul.f32 %f690, %f120, %f122; mul.f32 %f691, %f119, %f123; sub.f32 %f692, %f691, %f690; mul.f32 %f693, %f2, %f689; fma.rn.f32 %f694, %f1, %f686, %f693; fma.rn.f32 %f140, %f3, %f692, %f694; mul.f32 %f695, %f2, %f120; fma.rn.f32 %f696, %f1, %f119, %f695; fma.rn.f32 %f697, %f3, %f121, %f696; add.f32 %f698, %f697, 0f3F800000; mul.f32 %f699, %f2, %f123; fma.rn.f32 %f700, %f1, %f122, %f699; fma.rn.f32 %f701, %f3, %f124, %f700; add.f32 %f702, %f698, %f701; mul.f32 %f703, %f120, %f123; fma.rn.f32 %f704, %f119, %f122, %f703; fma.rn.f32 %f705, %f121, %f124, %f704; add.f32 %f141, %f705, %f702; abs.f32 %f142, %f141; abs.f32 %f143, %f140; setp.eq.f32 %p185, %f142, 0f00000000; setp.eq.f32 %p186, %f143, 0f00000000; and.pred %p187, %p185, %p186; @%p187 bra $L__BB0_172; bra.uni $L__BB0_169; $L__BB0_172: mov.b32 %r410, %f141; shr.s32 %r411, %r410, 31; and.b32 %r412, %r411, 1078530011; mov.b32 %r413, %f140; and.b32 %r414, %r413, -2147483648; or.b32 %r415, %r414, %r412; mov.b32 %f875, %r415; bra.uni $L__BB0_173; $L__BB0_169: setp.eq.f32 %p188, %f142, 0f7F800000; setp.eq.f32 %p189, %f143, 0f7F800000; and.pred %p190, %p188, %p189; @%p190 bra $L__BB0_171; bra.uni $L__BB0_170; $L__BB0_171: mov.b32 %r405, %f141; setp.lt.s32 %p194, %r405, 0; selp.b32 %r406, 1075235812, 1061752795, %p194; mov.b32 %r407, %f140; and.b32 %r408, %r407, -2147483648; or.b32 %r409, %r408, %r406; mov.b32 %f875, %r409; bra.uni $L__BB0_173; $L__BB0_170: max.f32 %f706, %f143, %f142; min.f32 %f707, %f143, %f142; div.rn.f32 %f708, %f707, %f706; mul.rn.f32 %f709, %f708, %f708; mov.f32 %f710, 0fC0B59883; mov.f32 %f711, 0fBF52C7EA; fma.rn.f32 %f712, %f709, %f711, 
%f710; mov.f32 %f713, 0fC0D21907; fma.rn.f32 %f714, %f712, %f709, %f713; mul.f32 %f715, %f709, %f714; mul.f32 %f716, %f708, %f715; add.f32 %f717, %f709, 0f41355DC0; mov.f32 %f718, 0f41E6BD60; fma.rn.f32 %f719, %f717, %f709, %f718; mov.f32 %f720, 0f419D92C8; fma.rn.f32 %f721, %f719, %f709, %f720; rcp.rn.f32 %f722, %f721; fma.rn.f32 %f723, %f716, %f722, %f708; mov.f32 %f724, 0f3FC90FDB; sub.f32 %f725, %f724, %f723; setp.gt.f32 %p191, %f143, %f142; selp.f32 %f726, %f725, %f723, %p191; mov.b32 %r400, %f141; setp.lt.s32 %p192, %r400, 0; mov.f32 %f727, 0f40490FDB; sub.f32 %f728, %f727, %f726; selp.f32 %f729, %f728, %f726, %p192; mov.b32 %r401, %f729; mov.b32 %r402, %f140; and.b32 %r403, %r402, -2147483648; or.b32 %r404, %r403, %r401; mov.b32 %f730, %r404; add.f32 %f731, %f142, %f143; setp.le.f32 %p193, %f731, 0f7F800000; selp.f32 %f875, %f730, %f731, %p193; $L__BB0_173: add.f32 %f732, %f875, %f875; setp.eq.f32 %p195, %f139, 0f00000000; selp.f32 %f733, 0f3F800000, 0f3F000000, %p195; fma.rn.f32 %f878, %f733, %f732, %f878; $L__BB0_174: not.pred %p224, %p2; @%p224 bra $L__BB0_187; @%p12 bra $L__BB0_177; bra.uni $L__BB0_176; $L__BB0_177: max.s32 %r514, %r14, 0; bra.uni $L__BB0_178; $L__BB0_176: rem.s32 %r416, %r14, %r125; add.s32 %r417, %r416, %r125; rem.s32 %r514, %r417, %r125; $L__BB0_178: @%p88 bra $L__BB0_180; bra.uni $L__BB0_179; $L__BB0_180: max.s32 %r515, %r56, 0; bra.uni $L__BB0_181; $L__BB0_179: rem.s32 %r418, %r56, %r124; add.s32 %r419, %r418, %r124; rem.s32 %r515, %r419, %r124; $L__BB0_181: add.s32 %r420, %r514, %r4; mad.lo.s32 %r421, %r420, %r124, %r515; mul.wide.s32 %rd108, %r421, 4; add.s64 %rd109, %rd3, %rd108; add.s64 %rd110, %rd2, %rd108; add.s64 %rd111, %rd1, %rd108; ld.global.nc.f32 %f734, [%rd109]; ld.global.nc.f32 %f735, [%rd110]; mul.f32 %f736, %f735, %f735; fma.rn.f32 %f737, %f734, %f734, %f736; ld.global.nc.f32 %f738, [%rd111]; fma.rn.f32 %f150, %f738, %f738, %f737; mul.f32 %f739, %f124, %f126; mul.f32 %f740, %f123, %f127; sub.f32 %f741, %f740, %f739; 
mul.f32 %f742, %f122, %f127; mul.f32 %f743, %f124, %f125; sub.f32 %f744, %f743, %f742; mul.f32 %f745, %f123, %f125; mul.f32 %f746, %f122, %f126; sub.f32 %f747, %f746, %f745; mul.f32 %f748, %f2, %f744; fma.rn.f32 %f749, %f1, %f741, %f748; fma.rn.f32 %f151, %f3, %f747, %f749; mul.f32 %f750, %f2, %f123; fma.rn.f32 %f751, %f1, %f122, %f750; fma.rn.f32 %f752, %f3, %f124, %f751; add.f32 %f753, %f752, 0f3F800000; mul.f32 %f754, %f2, %f126; fma.rn.f32 %f755, %f1, %f125, %f754; fma.rn.f32 %f756, %f3, %f127, %f755; add.f32 %f757, %f753, %f756; mul.f32 %f758, %f123, %f126; fma.rn.f32 %f759, %f122, %f125, %f758; fma.rn.f32 %f760, %f124, %f127, %f759; add.f32 %f152, %f760, %f757; abs.f32 %f153, %f152; abs.f32 %f154, %f151; setp.eq.f32 %p199, %f153, 0f00000000; setp.eq.f32 %p200, %f154, 0f00000000; and.pred %p201, %p199, %p200; @%p201 bra $L__BB0_185; bra.uni $L__BB0_182; $L__BB0_185: mov.b32 %r432, %f152; shr.s32 %r433, %r432, 31; and.b32 %r434, %r433, 1078530011; mov.b32 %r435, %f151; and.b32 %r436, %r435, -2147483648; or.b32 %r437, %r436, %r434; mov.b32 %f877, %r437; bra.uni $L__BB0_186; $L__BB0_182: setp.eq.f32 %p202, %f153, 0f7F800000; setp.eq.f32 %p203, %f154, 0f7F800000; and.pred %p204, %p202, %p203; @%p204 bra $L__BB0_184; bra.uni $L__BB0_183; $L__BB0_184: mov.b32 %r427, %f152; setp.lt.s32 %p208, %r427, 0; selp.b32 %r428, 1075235812, 1061752795, %p208; mov.b32 %r429, %f151; and.b32 %r430, %r429, -2147483648; or.b32 %r431, %r430, %r428; mov.b32 %f877, %r431; bra.uni $L__BB0_186; $L__BB0_183: max.f32 %f761, %f154, %f153; min.f32 %f762, %f154, %f153; div.rn.f32 %f763, %f762, %f761; mul.rn.f32 %f764, %f763, %f763; mov.f32 %f765, 0fC0B59883; mov.f32 %f766, 0fBF52C7EA; fma.rn.f32 %f767, %f764, %f766, %f765; mov.f32 %f768, 0fC0D21907; fma.rn.f32 %f769, %f767, %f764, %f768; mul.f32 %f770, %f764, %f769; mul.f32 %f771, %f763, %f770; add.f32 %f772, %f764, 0f41355DC0; mov.f32 %f773, 0f41E6BD60; fma.rn.f32 %f774, %f772, %f764, %f773; mov.f32 %f775, 0f419D92C8; fma.rn.f32 %f776, 
%f774, %f764, %f775; rcp.rn.f32 %f777, %f776; fma.rn.f32 %f778, %f771, %f777, %f763; mov.f32 %f779, 0f3FC90FDB; sub.f32 %f780, %f779, %f778; setp.gt.f32 %p205, %f154, %f153; selp.f32 %f781, %f780, %f778, %p205; mov.b32 %r422, %f152; setp.lt.s32 %p206, %r422, 0; mov.f32 %f782, 0f40490FDB; sub.f32 %f783, %f782, %f781; selp.f32 %f784, %f783, %f781, %p206; mov.b32 %r423, %f784; mov.b32 %r424, %f151; and.b32 %r425, %r424, -2147483648; or.b32 %r426, %r425, %r423; mov.b32 %f785, %r426; add.f32 %f786, %f153, %f154; setp.le.f32 %p207, %f786, 0f7F800000; selp.f32 %f877, %f785, %f786, %p207; $L__BB0_186: add.f32 %f787, %f877, %f877; setp.eq.f32 %p209, %f150, 0f00000000; selp.f32 %f788, 0f3F800000, 0f3F000000, %p209; fma.rn.f32 %f878, %f788, %f787, %f878; $L__BB0_187: @%p148 bra $L__BB0_200; @%p12 bra $L__BB0_190; bra.uni $L__BB0_189; $L__BB0_190: max.s32 %r516, %r14, 0; bra.uni $L__BB0_191; $L__BB0_189: rem.s32 %r438, %r14, %r125; add.s32 %r439, %r438, %r125; rem.s32 %r516, %r439, %r125; $L__BB0_191: add.s32 %r120, %r516, %r4; @%p88 bra $L__BB0_193; bra.uni $L__BB0_192; $L__BB0_193: add.s32 %r442, %r124, -1; min.s32 %r517, %r49, %r442; bra.uni $L__BB0_194; $L__BB0_192: rem.s32 %r440, %r49, %r124; add.s32 %r441, %r440, %r124; rem.s32 %r517, %r441, %r124; $L__BB0_194: mad.lo.s32 %r443, %r120, %r124, %r517; mul.wide.s32 %rd112, %r443, 4; add.s64 %rd113, %rd3, %rd112; add.s64 %rd114, %rd2, %rd112; add.s64 %rd115, %rd1, %rd112; ld.global.nc.f32 %f789, [%rd113]; ld.global.nc.f32 %f790, [%rd114]; mul.f32 %f791, %f790, %f790; fma.rn.f32 %f792, %f789, %f789, %f791; ld.global.nc.f32 %f793, [%rd115]; fma.rn.f32 %f161, %f793, %f793, %f792; mul.f32 %f794, %f117, %f127; mul.f32 %f795, %f118, %f126; sub.f32 %f796, %f795, %f794; mul.f32 %f797, %f118, %f125; mul.f32 %f798, %f116, %f127; sub.f32 %f799, %f798, %f797; mul.f32 %f800, %f116, %f126; mul.f32 %f801, %f117, %f125; sub.f32 %f802, %f801, %f800; mul.f32 %f803, %f2, %f799; fma.rn.f32 %f804, %f1, %f796, %f803; fma.rn.f32 %f162, %f3, %f802, 
%f804; mul.f32 %f805, %f2, %f126; fma.rn.f32 %f806, %f1, %f125, %f805; fma.rn.f32 %f807, %f3, %f127, %f806; add.f32 %f808, %f807, 0f3F800000; mul.f32 %f809, %f2, %f117; fma.rn.f32 %f810, %f1, %f116, %f809; fma.rn.f32 %f811, %f3, %f118, %f810; add.f32 %f812, %f811, %f808; mul.f32 %f813, %f117, %f126; fma.rn.f32 %f814, %f116, %f125, %f813; fma.rn.f32 %f815, %f118, %f127, %f814; add.f32 %f163, %f815, %f812; abs.f32 %f164, %f163; abs.f32 %f165, %f162; setp.eq.f32 %p213, %f164, 0f00000000; setp.eq.f32 %p214, %f165, 0f00000000; and.pred %p215, %p213, %p214; @%p215 bra $L__BB0_198; bra.uni $L__BB0_195; $L__BB0_198: mov.b32 %r454, %f163; shr.s32 %r455, %r454, 31; and.b32 %r456, %r455, 1078530011; mov.b32 %r457, %f162; and.b32 %r458, %r457, -2147483648; or.b32 %r459, %r456, %r458; mov.b32 %f879, %r459; bra.uni $L__BB0_199; $L__BB0_195: setp.eq.f32 %p216, %f164, 0f7F800000; setp.eq.f32 %p217, %f165, 0f7F800000; and.pred %p218, %p216, %p217; @%p218 bra $L__BB0_197; bra.uni $L__BB0_196; $L__BB0_197: mov.b32 %r449, %f163; setp.lt.s32 %p222, %r449, 0; selp.b32 %r450, 1075235812, 1061752795, %p222; mov.b32 %r451, %f162; and.b32 %r452, %r451, -2147483648; or.b32 %r453, %r450, %r452; mov.b32 %f879, %r453; bra.uni $L__BB0_199; $L__BB0_196: max.f32 %f816, %f165, %f164; min.f32 %f817, %f165, %f164; div.rn.f32 %f818, %f817, %f816; mul.rn.f32 %f819, %f818, %f818; mov.f32 %f820, 0fC0B59883; mov.f32 %f821, 0fBF52C7EA; fma.rn.f32 %f822, %f819, %f821, %f820; mov.f32 %f823, 0fC0D21907; fma.rn.f32 %f824, %f822, %f819, %f823; mul.f32 %f825, %f819, %f824; mul.f32 %f826, %f818, %f825; add.f32 %f827, %f819, 0f41355DC0; mov.f32 %f828, 0f41E6BD60; fma.rn.f32 %f829, %f827, %f819, %f828; mov.f32 %f830, 0f419D92C8; fma.rn.f32 %f831, %f829, %f819, %f830; rcp.rn.f32 %f832, %f831; fma.rn.f32 %f833, %f826, %f832, %f818; mov.f32 %f834, 0f3FC90FDB; sub.f32 %f835, %f834, %f833; setp.gt.f32 %p219, %f165, %f164; selp.f32 %f836, %f835, %f833, %p219; mov.b32 %r444, %f163; setp.lt.s32 %p220, %r444, 0; mov.f32 
%f837, 0f40490FDB; sub.f32 %f838, %f837, %f836; selp.f32 %f839, %f838, %f836, %p220; mov.b32 %r445, %f839; mov.b32 %r446, %f162; and.b32 %r447, %r446, -2147483648; or.b32 %r448, %r447, %r445; mov.b32 %f840, %r448; add.f32 %f841, %f164, %f165; setp.le.f32 %p221, %f841, 0f7F800000; selp.f32 %f879, %f840, %f841, %p221; $L__BB0_199: add.f32 %f842, %f879, %f879; setp.eq.f32 %p223, %f161, 0f00000000; selp.f32 %f843, 0f3F800000, 0f3F000000, %p223; fma.rn.f32 %f878, %f843, %f842, %f878; $L__BB0_200: ld.param.u64 %rd123, [setemergentmagneticfieldsolidangle_param_1]; mov.u32 %r481, %tid.x; mov.u32 %r480, %ntid.x; mov.u32 %r479, %ctaid.x; mad.lo.s32 %r478, %r479, %r480, %r481; add.s32 %r477, %r5, %r478; mul.wide.s32 %rd122, %r477, 4; cvta.to.global.u64 %rd121, %rd123; add.s64 %rd120, %rd121, %rd122; ld.param.f32 %f856, [setemergentmagneticfieldsolidangle_param_9]; ld.param.f32 %f855, [setemergentmagneticfieldsolidangle_param_8]; ld.param.u64 %rd119, [setemergentmagneticfieldsolidangle_param_0]; mov.u32 %r476, %tid.z; mov.u32 %r475, %ntid.z; mov.u32 %r474, %ctaid.z; mad.lo.s32 %r473, %r474, %r475, %r476; mov.u32 %r472, %tid.y; mov.u32 %r471, %ntid.y; mov.u32 %r470, %ctaid.y; mad.lo.s32 %r469, %r470, %r471, %r472; mul.lo.s32 %r468, %r473, %r125; add.s32 %r467, %r468, %r469; mov.u32 %r466, %tid.x; mov.u32 %r465, %ntid.x; mov.u32 %r464, %ctaid.x; mad.lo.s32 %r463, %r464, %r465, %r466; mul.lo.s32 %r462, %r467, %r124; add.s32 %r461, %r462, %r463; mul.wide.s32 %rd118, %r461, 4; cvta.to.global.u64 %rd117, %rd119; add.s64 %rd116, %rd117, %rd118; ld.param.f32 %f854, [setemergentmagneticfieldsolidangle_param_7]; ld.param.f32 %f853, [setemergentmagneticfieldsolidangle_param_6]; add.f32 %f844, %f853, %f853; mul.f32 %f845, %f844, %f854; mul.f32 %f846, %f845, %f862; st.global.f32 [%rd116], %f846; mul.f32 %f847, %f844, %f855; mul.f32 %f848, %f847, %f868; st.global.f32 [%rd120], %f848; mul.f32 %f849, %f844, %f856; mul.f32 %f850, %f849, %f878; st.global.f32 [%rd6], %f850; $L__BB0_202: ret; } ` 
setemergentmagneticfieldsolidangle_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl setemergentmagneticfieldsolidangle .visible .entry setemergentmagneticfieldsolidangle( .param .u64 setemergentmagneticfieldsolidangle_param_0, .param .u64 setemergentmagneticfieldsolidangle_param_1, .param .u64 setemergentmagneticfieldsolidangle_param_2, .param .u64 setemergentmagneticfieldsolidangle_param_3, .param .u64 setemergentmagneticfieldsolidangle_param_4, .param .u64 setemergentmagneticfieldsolidangle_param_5, .param .f32 setemergentmagneticfieldsolidangle_param_6, .param .f32 setemergentmagneticfieldsolidangle_param_7, .param .f32 setemergentmagneticfieldsolidangle_param_8, .param .f32 setemergentmagneticfieldsolidangle_param_9, .param .u32 setemergentmagneticfieldsolidangle_param_10, .param .u32 setemergentmagneticfieldsolidangle_param_11, .param .u32 setemergentmagneticfieldsolidangle_param_12, .param .u8 setemergentmagneticfieldsolidangle_param_13 ) { .reg .pred %p<225>; .reg .b16 %rs<8>; .reg .f32 %f<881>; .reg .b32 %r<518>; .reg .b64 %rd<124>; ld.param.u8 %rs4, [setemergentmagneticfieldsolidangle_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldsolidangle_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldsolidangle_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldsolidangle_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldsolidangle_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldsolidangle_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldsolidangle_param_5]; ld.param.u32 %r124, [setemergentmagneticfieldsolidangle_param_10]; ld.param.u32 %r125, [setemergentmagneticfieldsolidangle_param_11]; ld.param.u32 %r126, [setemergentmagneticfieldsolidangle_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r127, %ntid.x; mov.u32 %r128, %ctaid.x; mov.u32 %r129, %tid.x; mad.lo.s32 %r1, %r128, %r127, %r129; mov.u32 %r130, %ntid.y; mov.u32 %r131, %ctaid.y; mov.u32 
%r132, %tid.y; mad.lo.s32 %r2, %r131, %r130, %r132; mov.u32 %r133, %ntid.z; mov.u32 %r134, %ctaid.z; mov.u32 %r135, %tid.z; mad.lo.s32 %r3, %r134, %r133, %r135; setp.ge.s32 %p6, %r1, %r124; setp.ge.s32 %p7, %r2, %r125; or.pred %p8, %p6, %p7; setp.ge.s32 %p9, %r3, %r126; or.pred %p10, %p8, %p9; @%p10 bra $L__BB0_202; mul.lo.s32 %r4, %r3, %r125; add.s32 %r136, %r4, %r2; mul.lo.s32 %r5, %r136, %r124; add.s32 %r137, %r5, %r1; mul.wide.s32 %rd13, %r137, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f176, %f2, %f2; fma.rn.f32 %f177, %f1, %f1, %f176; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f178, %f3, %f3, %f177; setp.eq.f32 %p11, %f178, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p11 bra $L__BB0_201; bra.uni $L__BB0_2; $L__BB0_201: mov.u32 %r460, 0; st.global.u32 [%rd4], %r460; st.global.u32 [%rd5], %r460; st.global.u32 [%rd6], %r460; bra.uni $L__BB0_202; $L__BB0_2: and.b16 %rs1, %rs4, 2; setp.eq.s16 %p12, %rs1, 0; add.s32 %r6, %r2, 1; @%p12 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r140, %r125, -1; min.s32 %r482, %r6, %r140; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r138, %r6, %r125; add.s32 %r139, %r138, %r125; rem.s32 %r482, %r139, %r125; $L__BB0_5: and.b16 %rs2, %rs4, 4; setp.eq.s16 %p13, %rs2, 0; add.s32 %r10, %r3, 1; @%p13 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r143, %r126, -1; min.s32 %r483, %r10, %r143; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r141, %r10, %r126; add.s32 %r142, %r141, %r126; rem.s32 %r483, %r142, %r126; $L__BB0_8: add.s32 %r14, %r2, -1; @%p12 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r484, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r144, %r14, %r125; add.s32 %r145, %r144, %r125; rem.s32 %r484, %r145, %r125; $L__BB0_11: add.s32 %r18, %r3, -1; @%p13 
bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r485, %r18, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r146, %r18, %r126; add.s32 %r147, %r146, %r126; rem.s32 %r485, %r147, %r126; $L__BB0_14: mad.lo.s32 %r148, %r485, %r125, %r2; mad.lo.s32 %r149, %r148, %r124, %r1; add.s32 %r150, %r482, %r4; mad.lo.s32 %r151, %r150, %r124, %r1; mul.wide.s32 %rd20, %r151, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f4, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f5, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f6, [%rd23]; mad.lo.s32 %r152, %r483, %r125, %r2; mad.lo.s32 %r153, %r152, %r124, %r1; mul.wide.s32 %rd24, %r153, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f7, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f8, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f9, [%rd27]; add.s32 %r154, %r484, %r4; mad.lo.s32 %r155, %r154, %r124, %r1; mul.wide.s32 %rd28, %r155, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f10, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f12, [%rd31]; mul.wide.s32 %rd32, %r149, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f13, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f14, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f15, [%rd35]; setp.ne.s16 %p16, %rs1, 0; setp.lt.s32 %p17, %r6, %r125; or.pred %p1, %p17, %p16; not.pred %p18, %p1; mov.f32 %f862, 0f00000000; @%p18 bra $L__BB0_28; setp.ge.s32 %p19, %r10, %r126; and.pred %p21, %p19, %p13; @%p21 bra $L__BB0_28; @%p13 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r158, %r126, -1; min.s32 %r486, %r10, %r158; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r156, %r10, %r126; add.s32 %r157, %r156, %r126; rem.s32 %r486, %r157, %r126; $L__BB0_19: @%p12 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r161, %r125, -1; min.s32 %r487, %r6, %r161; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r159, %r6, %r125; add.s32 %r160, %r159, %r125; rem.s32 %r487, %r160, 
%r125; $L__BB0_22: mad.lo.s32 %r162, %r486, %r125, %r487; mad.lo.s32 %r163, %r162, %r124, %r1; mul.wide.s32 %rd36, %r163, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f181, [%rd37]; ld.global.nc.f32 %f182, [%rd38]; mul.f32 %f183, %f182, %f182; fma.rn.f32 %f184, %f181, %f181, %f183; ld.global.nc.f32 %f185, [%rd39]; fma.rn.f32 %f16, %f185, %f185, %f184; mul.f32 %f186, %f6, %f8; mul.f32 %f187, %f5, %f9; sub.f32 %f188, %f187, %f186; mul.f32 %f189, %f4, %f9; mul.f32 %f190, %f6, %f7; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f5, %f7; mul.f32 %f193, %f4, %f8; sub.f32 %f194, %f193, %f192; mul.f32 %f195, %f2, %f191; fma.rn.f32 %f196, %f1, %f188, %f195; fma.rn.f32 %f17, %f3, %f194, %f196; mul.f32 %f197, %f2, %f5; fma.rn.f32 %f198, %f1, %f4, %f197; fma.rn.f32 %f199, %f3, %f6, %f198; add.f32 %f200, %f199, 0f3F800000; mul.f32 %f201, %f2, %f8; fma.rn.f32 %f202, %f1, %f7, %f201; fma.rn.f32 %f203, %f3, %f9, %f202; add.f32 %f204, %f200, %f203; mul.f32 %f205, %f5, %f8; fma.rn.f32 %f206, %f4, %f7, %f205; fma.rn.f32 %f207, %f6, %f9, %f206; add.f32 %f18, %f207, %f204; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p24, %f19, 0f00000000; setp.eq.f32 %p25, %f20, 0f00000000; and.pred %p26, %p24, %p25; @%p26 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r174, %f18; shr.s32 %r175, %r174, 31; and.b32 %r176, %r175, 1078530011; mov.b32 %r177, %f17; and.b32 %r178, %r177, -2147483648; or.b32 %r179, %r178, %r176; mov.b32 %f857, %r179; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p27, %f19, 0f7F800000; setp.eq.f32 %p28, %f20, 0f7F800000; and.pred %p29, %p27, %p28; @%p29 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r169, %f18; setp.lt.s32 %p33, %r169, 0; selp.b32 %r170, 1075235812, 1061752795, %p33; mov.b32 %r171, %f17; and.b32 %r172, %r171, -2147483648; or.b32 %r173, %r172, %r170; mov.b32 %f857, %r173; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f208, %f20, %f19; min.f32 %f209, %f20, %f19; div.rn.f32 %f210, 
%f209, %f208; mul.rn.f32 %f211, %f210, %f210; mov.f32 %f212, 0fC0B59883; mov.f32 %f213, 0fBF52C7EA; fma.rn.f32 %f214, %f211, %f213, %f212; mov.f32 %f215, 0fC0D21907; fma.rn.f32 %f216, %f214, %f211, %f215; mul.f32 %f217, %f211, %f216; mul.f32 %f218, %f210, %f217; add.f32 %f219, %f211, 0f41355DC0; mov.f32 %f220, 0f41E6BD60; fma.rn.f32 %f221, %f219, %f211, %f220; mov.f32 %f222, 0f419D92C8; fma.rn.f32 %f223, %f221, %f211, %f222; rcp.rn.f32 %f224, %f223; fma.rn.f32 %f225, %f218, %f224, %f210; mov.f32 %f226, 0f3FC90FDB; sub.f32 %f227, %f226, %f225; setp.gt.f32 %p30, %f20, %f19; selp.f32 %f228, %f227, %f225, %p30; mov.b32 %r164, %f18; setp.lt.s32 %p31, %r164, 0; mov.f32 %f229, 0f40490FDB; sub.f32 %f230, %f229, %f228; selp.f32 %f231, %f230, %f228, %p31; mov.b32 %r165, %f231; mov.b32 %r166, %f17; and.b32 %r167, %r166, -2147483648; or.b32 %r168, %r167, %r165; mov.b32 %f232, %r168; add.f32 %f233, %f19, %f20; setp.le.f32 %p32, %f233, 0f7F800000; selp.f32 %f857, %f232, %f233, %p32; $L__BB0_27: add.f32 %f234, %f857, %f857; setp.eq.f32 %p34, %f16, 0f00000000; selp.f32 %f235, 0f3F800000, 0f3F000000, %p34; fma.rn.f32 %f862, %f235, %f234, 0f00000000; $L__BB0_28: setp.gt.s32 %p35, %r2, 0; or.pred %p2, %p35, %p16; not.pred %p37, %p2; @%p37 bra $L__BB0_55; setp.ge.s32 %p38, %r10, %r126; and.pred %p40, %p38, %p13; @%p40 bra $L__BB0_42; @%p13 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r182, %r126, -1; min.s32 %r488, %r10, %r182; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r180, %r10, %r126; add.s32 %r181, %r180, %r126; rem.s32 %r488, %r181, %r126; $L__BB0_33: @%p12 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r489, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r183, %r14, %r125; add.s32 %r184, %r183, %r125; rem.s32 %r489, %r184, %r125; $L__BB0_36: mad.lo.s32 %r185, %r488, %r125, %r489; mad.lo.s32 %r186, %r185, %r124, %r1; mul.wide.s32 %rd40, %r186, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f236, 
[%rd41]; ld.global.nc.f32 %f237, [%rd42]; mul.f32 %f238, %f237, %f237; fma.rn.f32 %f239, %f236, %f236, %f238; ld.global.nc.f32 %f240, [%rd43]; fma.rn.f32 %f27, %f240, %f240, %f239; mul.f32 %f241, %f9, %f11; mul.f32 %f242, %f8, %f12; sub.f32 %f243, %f242, %f241; mul.f32 %f244, %f7, %f12; mul.f32 %f245, %f9, %f10; sub.f32 %f246, %f245, %f244; mul.f32 %f247, %f8, %f10; mul.f32 %f248, %f7, %f11; sub.f32 %f249, %f248, %f247; mul.f32 %f250, %f2, %f246; fma.rn.f32 %f251, %f1, %f243, %f250; fma.rn.f32 %f28, %f3, %f249, %f251; mul.f32 %f252, %f2, %f8; fma.rn.f32 %f253, %f1, %f7, %f252; fma.rn.f32 %f254, %f3, %f9, %f253; add.f32 %f255, %f254, 0f3F800000; mul.f32 %f256, %f2, %f11; fma.rn.f32 %f257, %f1, %f10, %f256; fma.rn.f32 %f258, %f3, %f12, %f257; add.f32 %f259, %f255, %f258; mul.f32 %f260, %f8, %f11; fma.rn.f32 %f261, %f7, %f10, %f260; fma.rn.f32 %f262, %f9, %f12, %f261; add.f32 %f29, %f262, %f259; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p43, %f30, 0f00000000; setp.eq.f32 %p44, %f31, 0f00000000; and.pred %p45, %p43, %p44; @%p45 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r197, %f29; shr.s32 %r198, %r197, 31; and.b32 %r199, %r198, 1078530011; mov.b32 %r200, %f28; and.b32 %r201, %r200, -2147483648; or.b32 %r202, %r201, %r199; mov.b32 %f859, %r202; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p46, %f30, 0f7F800000; setp.eq.f32 %p47, %f31, 0f7F800000; and.pred %p48, %p46, %p47; @%p48 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r192, %f29; setp.lt.s32 %p52, %r192, 0; selp.b32 %r193, 1075235812, 1061752795, %p52; mov.b32 %r194, %f28; and.b32 %r195, %r194, -2147483648; or.b32 %r196, %r195, %r193; mov.b32 %f859, %r196; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f263, %f31, %f30; min.f32 %f264, %f31, %f30; div.rn.f32 %f265, %f264, %f263; mul.rn.f32 %f266, %f265, %f265; mov.f32 %f267, 0fC0B59883; mov.f32 %f268, 0fBF52C7EA; fma.rn.f32 %f269, %f266, %f268, %f267; mov.f32 %f270, 0fC0D21907; fma.rn.f32 %f271, %f269, %f266, %f270; mul.f32 %f272, 
%f266, %f271; mul.f32 %f273, %f265, %f272; add.f32 %f274, %f266, 0f41355DC0; mov.f32 %f275, 0f41E6BD60; fma.rn.f32 %f276, %f274, %f266, %f275; mov.f32 %f277, 0f419D92C8; fma.rn.f32 %f278, %f276, %f266, %f277; rcp.rn.f32 %f279, %f278; fma.rn.f32 %f280, %f273, %f279, %f265; mov.f32 %f281, 0f3FC90FDB; sub.f32 %f282, %f281, %f280; setp.gt.f32 %p49, %f31, %f30; selp.f32 %f283, %f282, %f280, %p49; mov.b32 %r187, %f29; setp.lt.s32 %p50, %r187, 0; mov.f32 %f284, 0f40490FDB; sub.f32 %f285, %f284, %f283; selp.f32 %f286, %f285, %f283, %p50; mov.b32 %r188, %f286; mov.b32 %r189, %f28; and.b32 %r190, %r189, -2147483648; or.b32 %r191, %r190, %r188; mov.b32 %f287, %r191; add.f32 %f288, %f30, %f31; setp.le.f32 %p51, %f288, 0f7F800000; selp.f32 %f859, %f287, %f288, %p51; $L__BB0_41: add.f32 %f289, %f859, %f859; setp.eq.f32 %p53, %f27, 0f00000000; selp.f32 %f290, 0f3F800000, 0f3F000000, %p53; fma.rn.f32 %f862, %f290, %f289, %f862; $L__BB0_42: setp.lt.s32 %p54, %r3, 1; and.pred %p56, %p54, %p13; @%p56 bra $L__BB0_55; @%p13 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r490, %r18, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r203, %r18, %r126; add.s32 %r204, %r203, %r126; rem.s32 %r490, %r204, %r126; $L__BB0_46: @%p12 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r491, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r205, %r14, %r125; add.s32 %r206, %r205, %r125; rem.s32 %r491, %r206, %r125; $L__BB0_49: mad.lo.s32 %r207, %r490, %r125, %r491; mad.lo.s32 %r208, %r207, %r124, %r1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f291, [%rd45]; ld.global.nc.f32 %f292, [%rd46]; mul.f32 %f293, %f292, %f292; fma.rn.f32 %f294, %f291, %f291, %f293; ld.global.nc.f32 %f295, [%rd47]; fma.rn.f32 %f38, %f295, %f295, %f294; mul.f32 %f296, %f12, %f14; mul.f32 %f297, %f11, %f15; sub.f32 %f298, %f297, %f296; mul.f32 %f299, %f10, %f15; mul.f32 %f300, %f12, %f13; sub.f32 %f301, %f300, %f299; mul.f32 
%f302, %f11, %f13; mul.f32 %f303, %f10, %f14; sub.f32 %f304, %f303, %f302; mul.f32 %f305, %f2, %f301; fma.rn.f32 %f306, %f1, %f298, %f305; fma.rn.f32 %f39, %f3, %f304, %f306; mul.f32 %f307, %f2, %f11; fma.rn.f32 %f308, %f1, %f10, %f307; fma.rn.f32 %f309, %f3, %f12, %f308; add.f32 %f310, %f309, 0f3F800000; mul.f32 %f311, %f2, %f14; fma.rn.f32 %f312, %f1, %f13, %f311; fma.rn.f32 %f313, %f3, %f15, %f312; add.f32 %f314, %f310, %f313; mul.f32 %f315, %f11, %f14; fma.rn.f32 %f316, %f10, %f13, %f315; fma.rn.f32 %f317, %f12, %f15, %f316; add.f32 %f40, %f317, %f314; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p59, %f41, 0f00000000; setp.eq.f32 %p60, %f42, 0f00000000; and.pred %p61, %p59, %p60; @%p61 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r219, %f40; shr.s32 %r220, %r219, 31; and.b32 %r221, %r220, 1078530011; mov.b32 %r222, %f39; and.b32 %r223, %r222, -2147483648; or.b32 %r224, %r223, %r221; mov.b32 %f861, %r224; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p62, %f41, 0f7F800000; setp.eq.f32 %p63, %f42, 0f7F800000; and.pred %p64, %p62, %p63; @%p64 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r214, %f40; setp.lt.s32 %p68, %r214, 0; selp.b32 %r215, 1075235812, 1061752795, %p68; mov.b32 %r216, %f39; and.b32 %r217, %r216, -2147483648; or.b32 %r218, %r217, %r215; mov.b32 %f861, %r218; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f318, %f42, %f41; min.f32 %f319, %f42, %f41; div.rn.f32 %f320, %f319, %f318; mul.rn.f32 %f321, %f320, %f320; mov.f32 %f322, 0fC0B59883; mov.f32 %f323, 0fBF52C7EA; fma.rn.f32 %f324, %f321, %f323, %f322; mov.f32 %f325, 0fC0D21907; fma.rn.f32 %f326, %f324, %f321, %f325; mul.f32 %f327, %f321, %f326; mul.f32 %f328, %f320, %f327; add.f32 %f329, %f321, 0f41355DC0; mov.f32 %f330, 0f41E6BD60; fma.rn.f32 %f331, %f329, %f321, %f330; mov.f32 %f332, 0f419D92C8; fma.rn.f32 %f333, %f331, %f321, %f332; rcp.rn.f32 %f334, %f333; fma.rn.f32 %f335, %f328, %f334, %f320; mov.f32 %f336, 0f3FC90FDB; sub.f32 %f337, %f336, %f335; setp.gt.f32 %p65, 
%f42, %f41; selp.f32 %f338, %f337, %f335, %p65; mov.b32 %r209, %f40; setp.lt.s32 %p66, %r209, 0; mov.f32 %f339, 0f40490FDB; sub.f32 %f340, %f339, %f338; selp.f32 %f341, %f340, %f338, %p66; mov.b32 %r210, %f341; mov.b32 %r211, %f39; and.b32 %r212, %r211, -2147483648; or.b32 %r213, %r212, %r210; mov.b32 %f342, %r213; add.f32 %f343, %f41, %f42; setp.le.f32 %p67, %f343, 0f7F800000; selp.f32 %f861, %f342, %f343, %p67; $L__BB0_54: add.f32 %f344, %f861, %f861; setp.eq.f32 %p69, %f38, 0f00000000; selp.f32 %f345, 0f3F800000, 0f3F000000, %p69; fma.rn.f32 %f862, %f345, %f344, %f862; $L__BB0_55: @%p18 bra $L__BB0_69; setp.lt.s32 %p71, %r3, 1; and.pred %p73, %p71, %p13; @%p73 bra $L__BB0_69; @%p13 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r492, %r18, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r225, %r18, %r126; add.s32 %r226, %r225, %r126; rem.s32 %r492, %r226, %r126; $L__BB0_60: @%p12 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r229, %r125, -1; min.s32 %r493, %r6, %r229; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r227, %r6, %r125; add.s32 %r228, %r227, %r125; rem.s32 %r493, %r228, %r125; $L__BB0_63: mad.lo.s32 %r230, %r492, %r125, %r493; mad.lo.s32 %r231, %r230, %r124, %r1; mul.wide.s32 %rd48, %r231, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f346, [%rd49]; ld.global.nc.f32 %f347, [%rd50]; mul.f32 %f348, %f347, %f347; fma.rn.f32 %f349, %f346, %f346, %f348; ld.global.nc.f32 %f350, [%rd51]; fma.rn.f32 %f49, %f350, %f350, %f349; mul.f32 %f351, %f5, %f15; mul.f32 %f352, %f6, %f14; sub.f32 %f353, %f352, %f351; mul.f32 %f354, %f6, %f13; mul.f32 %f355, %f4, %f15; sub.f32 %f356, %f355, %f354; mul.f32 %f357, %f4, %f14; mul.f32 %f358, %f5, %f13; sub.f32 %f359, %f358, %f357; mul.f32 %f360, %f2, %f356; fma.rn.f32 %f361, %f1, %f353, %f360; fma.rn.f32 %f50, %f3, %f359, %f361; mul.f32 %f362, %f2, %f14; fma.rn.f32 %f363, %f1, %f13, %f362; fma.rn.f32 %f364, %f3, %f15, %f363; add.f32 %f365, %f364, 
0f3F800000; mul.f32 %f366, %f2, %f5; fma.rn.f32 %f367, %f1, %f4, %f366; fma.rn.f32 %f368, %f3, %f6, %f367; add.f32 %f369, %f368, %f365; mul.f32 %f370, %f5, %f14; fma.rn.f32 %f371, %f4, %f13, %f370; fma.rn.f32 %f372, %f6, %f15, %f371; add.f32 %f51, %f372, %f369; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p76, %f52, 0f00000000; setp.eq.f32 %p77, %f53, 0f00000000; and.pred %p78, %p76, %p77; @%p78 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r242, %f51; shr.s32 %r243, %r242, 31; and.b32 %r244, %r243, 1078530011; mov.b32 %r245, %f50; and.b32 %r246, %r245, -2147483648; or.b32 %r247, %r244, %r246; mov.b32 %f863, %r247; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p79, %f52, 0f7F800000; setp.eq.f32 %p80, %f53, 0f7F800000; and.pred %p81, %p79, %p80; @%p81 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r237, %f51; setp.lt.s32 %p85, %r237, 0; selp.b32 %r238, 1075235812, 1061752795, %p85; mov.b32 %r239, %f50; and.b32 %r240, %r239, -2147483648; or.b32 %r241, %r238, %r240; mov.b32 %f863, %r241; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f373, %f53, %f52; min.f32 %f374, %f53, %f52; div.rn.f32 %f375, %f374, %f373; mul.rn.f32 %f376, %f375, %f375; mov.f32 %f377, 0fC0B59883; mov.f32 %f378, 0fBF52C7EA; fma.rn.f32 %f379, %f376, %f378, %f377; mov.f32 %f380, 0fC0D21907; fma.rn.f32 %f381, %f379, %f376, %f380; mul.f32 %f382, %f376, %f381; mul.f32 %f383, %f375, %f382; add.f32 %f384, %f376, 0f41355DC0; mov.f32 %f385, 0f41E6BD60; fma.rn.f32 %f386, %f384, %f376, %f385; mov.f32 %f387, 0f419D92C8; fma.rn.f32 %f388, %f386, %f376, %f387; rcp.rn.f32 %f389, %f388; fma.rn.f32 %f390, %f383, %f389, %f375; mov.f32 %f391, 0f3FC90FDB; sub.f32 %f392, %f391, %f390; setp.gt.f32 %p82, %f53, %f52; selp.f32 %f393, %f392, %f390, %p82; mov.b32 %r232, %f51; setp.lt.s32 %p83, %r232, 0; mov.f32 %f394, 0f40490FDB; sub.f32 %f395, %f394, %f393; selp.f32 %f396, %f395, %f393, %p83; mov.b32 %r233, %f396; mov.b32 %r234, %f50; and.b32 %r235, %r234, -2147483648; or.b32 %r236, %r235, %r233; 
mov.b32 %f397, %r236; add.f32 %f398, %f52, %f53; setp.le.f32 %p84, %f398, 0f7F800000; selp.f32 %f863, %f397, %f398, %p84; $L__BB0_68: add.f32 %f399, %f863, %f863; setp.eq.f32 %p86, %f49, 0f00000000; selp.f32 %f400, 0f3F800000, 0f3F000000, %p86; fma.rn.f32 %f862, %f400, %f399, %f862; $L__BB0_69: @%p13 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: add.s32 %r250, %r126, -1; min.s32 %r494, %r10, %r250; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r248, %r10, %r126; add.s32 %r249, %r248, %r126; rem.s32 %r494, %r249, %r126; $L__BB0_72: ld.param.u8 %rs7, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs3, %rs7, 1; setp.eq.s16 %p88, %rs3, 0; add.s32 %r49, %r1, 1; @%p88 bra $L__BB0_74; bra.uni $L__BB0_73; $L__BB0_74: add.s32 %r253, %r124, -1; min.s32 %r495, %r49, %r253; bra.uni $L__BB0_75; $L__BB0_73: rem.s32 %r251, %r49, %r124; add.s32 %r252, %r251, %r124; rem.s32 %r495, %r252, %r124; $L__BB0_75: @%p13 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r496, %r18, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r254, %r18, %r126; add.s32 %r255, %r254, %r126; rem.s32 %r496, %r255, %r126; $L__BB0_78: add.s32 %r56, %r1, -1; @%p88 bra $L__BB0_80; bra.uni $L__BB0_79; $L__BB0_80: max.s32 %r497, %r56, 0; bra.uni $L__BB0_81; $L__BB0_79: rem.s32 %r256, %r56, %r124; add.s32 %r257, %r256, %r124; rem.s32 %r497, %r257, %r124; $L__BB0_81: mad.lo.s32 %r258, %r494, %r125, %r2; mad.lo.s32 %r259, %r258, %r124, %r1; mul.wide.s32 %rd52, %r259, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f60, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f61, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f62, [%rd55]; add.s32 %r260, %r495, %r5; mul.wide.s32 %rd56, %r260, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f63, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f64, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f65, [%rd59]; mad.lo.s32 %r261, %r496, %r125, %r2; mad.lo.s32 %r262, %r261, %r124, %r1; mul.wide.s32 %rd60, %r262, 4; add.s64 %rd61, %rd3, 
%rd60; ld.global.nc.f32 %f66, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f67, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f68, [%rd63]; add.s32 %r263, %r497, %r5; mul.wide.s32 %rd64, %r263, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f69, [%rd65]; add.s64 %rd66, %rd2, %rd64; ld.global.nc.f32 %f70, [%rd66]; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f71, [%rd67]; setp.ge.s32 %p92, %r10, %r126; and.pred %p93, %p92, %p13; mov.f32 %f868, 0f00000000; @%p93 bra $L__BB0_95; setp.ge.s32 %p94, %r49, %r124; and.pred %p96, %p94, %p88; @%p96 bra $L__BB0_95; @%p13 bra $L__BB0_85; bra.uni $L__BB0_84; $L__BB0_85: add.s32 %r266, %r126, -1; min.s32 %r498, %r10, %r266; bra.uni $L__BB0_86; $L__BB0_84: rem.s32 %r264, %r10, %r126; add.s32 %r265, %r264, %r126; rem.s32 %r498, %r265, %r126; $L__BB0_86: @%p88 bra $L__BB0_88; bra.uni $L__BB0_87; $L__BB0_88: add.s32 %r269, %r124, -1; min.s32 %r499, %r49, %r269; bra.uni $L__BB0_89; $L__BB0_87: rem.s32 %r267, %r49, %r124; add.s32 %r268, %r267, %r124; rem.s32 %r499, %r268, %r124; $L__BB0_89: mad.lo.s32 %r270, %r498, %r125, %r2; mad.lo.s32 %r271, %r270, %r124, %r499; mul.wide.s32 %rd68, %r271, 4; add.s64 %rd69, %rd3, %rd68; add.s64 %rd70, %rd2, %rd68; add.s64 %rd71, %rd1, %rd68; ld.global.nc.f32 %f403, [%rd69]; ld.global.nc.f32 %f404, [%rd70]; mul.f32 %f405, %f404, %f404; fma.rn.f32 %f406, %f403, %f403, %f405; ld.global.nc.f32 %f407, [%rd71]; fma.rn.f32 %f72, %f407, %f407, %f406; mul.f32 %f408, %f62, %f64; mul.f32 %f409, %f61, %f65; sub.f32 %f410, %f409, %f408; mul.f32 %f411, %f60, %f65; mul.f32 %f412, %f62, %f63; sub.f32 %f413, %f412, %f411; mul.f32 %f414, %f61, %f63; mul.f32 %f415, %f60, %f64; sub.f32 %f416, %f415, %f414; mul.f32 %f417, %f2, %f413; fma.rn.f32 %f418, %f1, %f410, %f417; fma.rn.f32 %f73, %f3, %f416, %f418; mul.f32 %f419, %f2, %f61; fma.rn.f32 %f420, %f1, %f60, %f419; fma.rn.f32 %f421, %f3, %f62, %f420; add.f32 %f422, %f421, 0f3F800000; mul.f32 %f423, %f2, %f64; fma.rn.f32 %f424, %f1, %f63, %f423; 
fma.rn.f32 %f425, %f3, %f65, %f424; add.f32 %f426, %f422, %f425; mul.f32 %f427, %f61, %f64; fma.rn.f32 %f428, %f60, %f63, %f427; fma.rn.f32 %f429, %f62, %f65, %f428; add.f32 %f74, %f429, %f426; abs.f32 %f75, %f74; abs.f32 %f76, %f73; setp.eq.f32 %p99, %f75, 0f00000000; setp.eq.f32 %p100, %f76, 0f00000000; and.pred %p101, %p99, %p100; @%p101 bra $L__BB0_93; bra.uni $L__BB0_90; $L__BB0_93: mov.b32 %r282, %f74; shr.s32 %r283, %r282, 31; and.b32 %r284, %r283, 1078530011; mov.b32 %r285, %f73; and.b32 %r286, %r285, -2147483648; or.b32 %r287, %r286, %r284; mov.b32 %f865, %r287; bra.uni $L__BB0_94; $L__BB0_90: setp.eq.f32 %p102, %f75, 0f7F800000; setp.eq.f32 %p103, %f76, 0f7F800000; and.pred %p104, %p102, %p103; @%p104 bra $L__BB0_92; bra.uni $L__BB0_91; $L__BB0_92: mov.b32 %r277, %f74; setp.lt.s32 %p108, %r277, 0; selp.b32 %r278, 1075235812, 1061752795, %p108; mov.b32 %r279, %f73; and.b32 %r280, %r279, -2147483648; or.b32 %r281, %r280, %r278; mov.b32 %f865, %r281; bra.uni $L__BB0_94; $L__BB0_91: max.f32 %f430, %f76, %f75; min.f32 %f431, %f76, %f75; div.rn.f32 %f432, %f431, %f430; mul.rn.f32 %f433, %f432, %f432; mov.f32 %f434, 0fC0B59883; mov.f32 %f435, 0fBF52C7EA; fma.rn.f32 %f436, %f433, %f435, %f434; mov.f32 %f437, 0fC0D21907; fma.rn.f32 %f438, %f436, %f433, %f437; mul.f32 %f439, %f433, %f438; mul.f32 %f440, %f432, %f439; add.f32 %f441, %f433, 0f41355DC0; mov.f32 %f442, 0f41E6BD60; fma.rn.f32 %f443, %f441, %f433, %f442; mov.f32 %f444, 0f419D92C8; fma.rn.f32 %f445, %f443, %f433, %f444; rcp.rn.f32 %f446, %f445; fma.rn.f32 %f447, %f440, %f446, %f432; mov.f32 %f448, 0f3FC90FDB; sub.f32 %f449, %f448, %f447; setp.gt.f32 %p105, %f76, %f75; selp.f32 %f450, %f449, %f447, %p105; mov.b32 %r272, %f74; setp.lt.s32 %p106, %r272, 0; mov.f32 %f451, 0f40490FDB; sub.f32 %f452, %f451, %f450; selp.f32 %f453, %f452, %f450, %p106; mov.b32 %r273, %f453; mov.b32 %r274, %f73; and.b32 %r275, %r274, -2147483648; or.b32 %r276, %r275, %r273; mov.b32 %f454, %r276; add.f32 %f455, %f75, %f76; 
setp.le.f32 %p107, %f455, 0f7F800000; selp.f32 %f865, %f454, %f455, %p107; $L__BB0_94: add.f32 %f456, %f865, %f865; setp.eq.f32 %p109, %f72, 0f00000000; selp.f32 %f457, 0f3F800000, 0f3F000000, %p109; fma.rn.f32 %f868, %f457, %f456, 0f00000000; $L__BB0_95: setp.lt.s32 %p110, %r3, 1; and.pred %p112, %p110, %p13; @%p112 bra $L__BB0_109; setp.ge.s32 %p113, %r49, %r124; and.pred %p115, %p113, %p88; @%p115 bra $L__BB0_109; @%p13 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: max.s32 %r500, %r18, 0; bra.uni $L__BB0_100; $L__BB0_98: rem.s32 %r288, %r18, %r126; add.s32 %r289, %r288, %r126; rem.s32 %r500, %r289, %r126; $L__BB0_100: @%p88 bra $L__BB0_102; bra.uni $L__BB0_101; $L__BB0_102: add.s32 %r292, %r124, -1; min.s32 %r501, %r49, %r292; bra.uni $L__BB0_103; $L__BB0_101: rem.s32 %r290, %r49, %r124; add.s32 %r291, %r290, %r124; rem.s32 %r501, %r291, %r124; $L__BB0_103: mad.lo.s32 %r293, %r500, %r125, %r2; mad.lo.s32 %r294, %r293, %r124, %r501; mul.wide.s32 %rd72, %r294, 4; add.s64 %rd73, %rd3, %rd72; add.s64 %rd74, %rd2, %rd72; add.s64 %rd75, %rd1, %rd72; ld.global.nc.f32 %f458, [%rd73]; ld.global.nc.f32 %f459, [%rd74]; mul.f32 %f460, %f459, %f459; fma.rn.f32 %f461, %f458, %f458, %f460; ld.global.nc.f32 %f462, [%rd75]; fma.rn.f32 %f83, %f462, %f462, %f461; mul.f32 %f463, %f65, %f67; mul.f32 %f464, %f64, %f68; sub.f32 %f465, %f464, %f463; mul.f32 %f466, %f63, %f68; mul.f32 %f467, %f65, %f66; sub.f32 %f468, %f467, %f466; mul.f32 %f469, %f64, %f66; mul.f32 %f470, %f63, %f67; sub.f32 %f471, %f470, %f469; mul.f32 %f472, %f2, %f468; fma.rn.f32 %f473, %f1, %f465, %f472; fma.rn.f32 %f84, %f3, %f471, %f473; mul.f32 %f474, %f2, %f64; fma.rn.f32 %f475, %f1, %f63, %f474; fma.rn.f32 %f476, %f3, %f65, %f475; add.f32 %f477, %f476, 0f3F800000; mul.f32 %f478, %f2, %f67; fma.rn.f32 %f479, %f1, %f66, %f478; fma.rn.f32 %f480, %f3, %f68, %f479; add.f32 %f481, %f477, %f480; mul.f32 %f482, %f64, %f67; fma.rn.f32 %f483, %f63, %f66, %f482; fma.rn.f32 %f484, %f65, %f68, %f483; add.f32 %f85, %f484, 
%f481; abs.f32 %f86, %f85; abs.f32 %f87, %f84; setp.eq.f32 %p118, %f86, 0f00000000; setp.eq.f32 %p119, %f87, 0f00000000; and.pred %p120, %p118, %p119; @%p120 bra $L__BB0_107; bra.uni $L__BB0_104; $L__BB0_107: mov.b32 %r305, %f85; shr.s32 %r306, %r305, 31; and.b32 %r307, %r306, 1078530011; mov.b32 %r308, %f84; and.b32 %r309, %r308, -2147483648; or.b32 %r310, %r309, %r307; mov.b32 %f867, %r310; bra.uni $L__BB0_108; $L__BB0_104: setp.eq.f32 %p121, %f86, 0f7F800000; setp.eq.f32 %p122, %f87, 0f7F800000; and.pred %p123, %p121, %p122; @%p123 bra $L__BB0_106; bra.uni $L__BB0_105; $L__BB0_106: mov.b32 %r300, %f85; setp.lt.s32 %p127, %r300, 0; selp.b32 %r301, 1075235812, 1061752795, %p127; mov.b32 %r302, %f84; and.b32 %r303, %r302, -2147483648; or.b32 %r304, %r303, %r301; mov.b32 %f867, %r304; bra.uni $L__BB0_108; $L__BB0_105: max.f32 %f485, %f87, %f86; min.f32 %f486, %f87, %f86; div.rn.f32 %f487, %f486, %f485; mul.rn.f32 %f488, %f487, %f487; mov.f32 %f489, 0fC0B59883; mov.f32 %f490, 0fBF52C7EA; fma.rn.f32 %f491, %f488, %f490, %f489; mov.f32 %f492, 0fC0D21907; fma.rn.f32 %f493, %f491, %f488, %f492; mul.f32 %f494, %f488, %f493; mul.f32 %f495, %f487, %f494; add.f32 %f496, %f488, 0f41355DC0; mov.f32 %f497, 0f41E6BD60; fma.rn.f32 %f498, %f496, %f488, %f497; mov.f32 %f499, 0f419D92C8; fma.rn.f32 %f500, %f498, %f488, %f499; rcp.rn.f32 %f501, %f500; fma.rn.f32 %f502, %f495, %f501, %f487; mov.f32 %f503, 0f3FC90FDB; sub.f32 %f504, %f503, %f502; setp.gt.f32 %p124, %f87, %f86; selp.f32 %f505, %f504, %f502, %p124; mov.b32 %r295, %f85; setp.lt.s32 %p125, %r295, 0; mov.f32 %f506, 0f40490FDB; sub.f32 %f507, %f506, %f505; selp.f32 %f508, %f507, %f505, %p125; mov.b32 %r296, %f508; mov.b32 %r297, %f84; and.b32 %r298, %r297, -2147483648; or.b32 %r299, %r298, %r296; mov.b32 %f509, %r299; add.f32 %f510, %f86, %f87; setp.le.f32 %p126, %f510, 0f7F800000; selp.f32 %f867, %f509, %f510, %p126; $L__BB0_108: add.f32 %f511, %f867, %f867; setp.eq.f32 %p128, %f83, 0f00000000; selp.f32 %f512, 0f3F800000, 
0f3F000000, %p128; fma.rn.f32 %f868, %f512, %f511, %f868; $L__BB0_109: ld.param.u8 %rs6, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs5, %rs6, 1; setp.ne.s16 %p129, %rs5, 0; setp.gt.s32 %p130, %r1, 0; or.pred %p3, %p130, %p129; and.pred %p131, %p3, %p2; not.pred %p132, %p131; @%p132 bra $L__BB0_122; @%p13 bra $L__BB0_112; bra.uni $L__BB0_111; $L__BB0_112: max.s32 %r502, %r18, 0; bra.uni $L__BB0_113; $L__BB0_111: rem.s32 %r311, %r18, %r126; add.s32 %r312, %r311, %r126; rem.s32 %r502, %r312, %r126; $L__BB0_113: @%p88 bra $L__BB0_115; bra.uni $L__BB0_114; $L__BB0_115: max.s32 %r503, %r56, 0; bra.uni $L__BB0_116; $L__BB0_114: rem.s32 %r313, %r56, %r124; add.s32 %r314, %r313, %r124; rem.s32 %r503, %r314, %r124; $L__BB0_116: mad.lo.s32 %r315, %r502, %r125, %r2; mad.lo.s32 %r316, %r315, %r124, %r503; mul.wide.s32 %rd76, %r316, 4; add.s64 %rd77, %rd3, %rd76; add.s64 %rd78, %rd2, %rd76; add.s64 %rd79, %rd1, %rd76; ld.global.nc.f32 %f513, [%rd77]; ld.global.nc.f32 %f514, [%rd78]; mul.f32 %f515, %f514, %f514; fma.rn.f32 %f516, %f513, %f513, %f515; ld.global.nc.f32 %f517, [%rd79]; fma.rn.f32 %f94, %f517, %f517, %f516; mul.f32 %f518, %f68, %f70; mul.f32 %f519, %f67, %f71; sub.f32 %f520, %f519, %f518; mul.f32 %f521, %f66, %f71; mul.f32 %f522, %f68, %f69; sub.f32 %f523, %f522, %f521; mul.f32 %f524, %f67, %f69; mul.f32 %f525, %f66, %f70; sub.f32 %f526, %f525, %f524; mul.f32 %f527, %f2, %f523; fma.rn.f32 %f528, %f1, %f520, %f527; fma.rn.f32 %f95, %f3, %f526, %f528; mul.f32 %f529, %f2, %f67; fma.rn.f32 %f530, %f1, %f66, %f529; fma.rn.f32 %f531, %f3, %f68, %f530; add.f32 %f532, %f531, 0f3F800000; mul.f32 %f533, %f2, %f70; fma.rn.f32 %f534, %f1, %f69, %f533; fma.rn.f32 %f535, %f3, %f71, %f534; add.f32 %f536, %f532, %f535; mul.f32 %f537, %f67, %f70; fma.rn.f32 %f538, %f66, %f69, %f537; fma.rn.f32 %f539, %f68, %f71, %f538; add.f32 %f96, %f539, %f536; abs.f32 %f97, %f96; abs.f32 %f98, %f95; setp.eq.f32 %p135, %f97, 0f00000000; setp.eq.f32 %p136, %f98, 0f00000000; and.pred 
%p137, %p135, %p136; @%p137 bra $L__BB0_120; bra.uni $L__BB0_117; $L__BB0_120: mov.b32 %r327, %f96; shr.s32 %r328, %r327, 31; and.b32 %r329, %r328, 1078530011; mov.b32 %r330, %f95; and.b32 %r331, %r330, -2147483648; or.b32 %r332, %r331, %r329; mov.b32 %f869, %r332; bra.uni $L__BB0_121; $L__BB0_117: setp.eq.f32 %p138, %f97, 0f7F800000; setp.eq.f32 %p139, %f98, 0f7F800000; and.pred %p140, %p138, %p139; @%p140 bra $L__BB0_119; bra.uni $L__BB0_118; $L__BB0_119: mov.b32 %r322, %f96; setp.lt.s32 %p144, %r322, 0; selp.b32 %r323, 1075235812, 1061752795, %p144; mov.b32 %r324, %f95; and.b32 %r325, %r324, -2147483648; or.b32 %r326, %r325, %r323; mov.b32 %f869, %r326; bra.uni $L__BB0_121; $L__BB0_118: max.f32 %f540, %f98, %f97; min.f32 %f541, %f98, %f97; div.rn.f32 %f542, %f541, %f540; mul.rn.f32 %f543, %f542, %f542; mov.f32 %f544, 0fC0B59883; mov.f32 %f545, 0fBF52C7EA; fma.rn.f32 %f546, %f543, %f545, %f544; mov.f32 %f547, 0fC0D21907; fma.rn.f32 %f548, %f546, %f543, %f547; mul.f32 %f549, %f543, %f548; mul.f32 %f550, %f542, %f549; add.f32 %f551, %f543, 0f41355DC0; mov.f32 %f552, 0f41E6BD60; fma.rn.f32 %f553, %f551, %f543, %f552; mov.f32 %f554, 0f419D92C8; fma.rn.f32 %f555, %f553, %f543, %f554; rcp.rn.f32 %f556, %f555; fma.rn.f32 %f557, %f550, %f556, %f542; mov.f32 %f558, 0f3FC90FDB; sub.f32 %f559, %f558, %f557; setp.gt.f32 %p141, %f98, %f97; selp.f32 %f560, %f559, %f557, %p141; mov.b32 %r317, %f96; setp.lt.s32 %p142, %r317, 0; mov.f32 %f561, 0f40490FDB; sub.f32 %f562, %f561, %f560; selp.f32 %f563, %f562, %f560, %p142; mov.b32 %r318, %f563; mov.b32 %r319, %f95; and.b32 %r320, %r319, -2147483648; or.b32 %r321, %r320, %r318; mov.b32 %f564, %r321; add.f32 %f565, %f97, %f98; setp.le.f32 %p143, %f565, 0f7F800000; selp.f32 %f869, %f564, %f565, %p143; $L__BB0_121: add.f32 %f566, %f869, %f869; setp.eq.f32 %p145, %f94, 0f00000000; selp.f32 %f567, 0f3F800000, 0f3F000000, %p145; fma.rn.f32 %f868, %f567, %f566, %f868; $L__BB0_122: setp.lt.s32 %p146, %r49, %r124; or.pred %p4, %p146, %p129; 
and.pred %p5, %p4, %p2; not.pred %p148, %p5; @%p148 bra $L__BB0_135; @%p13 bra $L__BB0_125; bra.uni $L__BB0_124; $L__BB0_125: add.s32 %r335, %r126, -1; min.s32 %r504, %r10, %r335; bra.uni $L__BB0_126; $L__BB0_124: rem.s32 %r333, %r10, %r126; add.s32 %r334, %r333, %r126; rem.s32 %r504, %r334, %r126; $L__BB0_126: mad.lo.s32 %r81, %r504, %r125, %r2; @%p88 bra $L__BB0_128; bra.uni $L__BB0_127; $L__BB0_128: max.s32 %r505, %r56, 0; bra.uni $L__BB0_129; $L__BB0_127: rem.s32 %r336, %r56, %r124; add.s32 %r337, %r336, %r124; rem.s32 %r505, %r337, %r124; $L__BB0_129: mad.lo.s32 %r338, %r81, %r124, %r505; mul.wide.s32 %rd80, %r338, 4; add.s64 %rd81, %rd3, %rd80; add.s64 %rd82, %rd2, %rd80; add.s64 %rd83, %rd1, %rd80; ld.global.nc.f32 %f568, [%rd81]; ld.global.nc.f32 %f569, [%rd82]; mul.f32 %f570, %f569, %f569; fma.rn.f32 %f571, %f568, %f568, %f570; ld.global.nc.f32 %f572, [%rd83]; fma.rn.f32 %f105, %f572, %f572, %f571; mul.f32 %f573, %f61, %f71; mul.f32 %f574, %f62, %f70; sub.f32 %f575, %f574, %f573; mul.f32 %f576, %f62, %f69; mul.f32 %f577, %f60, %f71; sub.f32 %f578, %f577, %f576; mul.f32 %f579, %f60, %f70; mul.f32 %f580, %f61, %f69; sub.f32 %f581, %f580, %f579; mul.f32 %f582, %f2, %f578; fma.rn.f32 %f583, %f1, %f575, %f582; fma.rn.f32 %f106, %f3, %f581, %f583; mul.f32 %f584, %f2, %f70; fma.rn.f32 %f585, %f1, %f69, %f584; fma.rn.f32 %f586, %f3, %f71, %f585; add.f32 %f587, %f586, 0f3F800000; mul.f32 %f588, %f2, %f61; fma.rn.f32 %f589, %f1, %f60, %f588; fma.rn.f32 %f590, %f3, %f62, %f589; add.f32 %f591, %f590, %f587; mul.f32 %f592, %f61, %f70; fma.rn.f32 %f593, %f60, %f69, %f592; fma.rn.f32 %f594, %f62, %f71, %f593; add.f32 %f107, %f594, %f591; abs.f32 %f108, %f107; abs.f32 %f109, %f106; setp.eq.f32 %p151, %f108, 0f00000000; setp.eq.f32 %p152, %f109, 0f00000000; and.pred %p153, %p151, %p152; @%p153 bra $L__BB0_133; bra.uni $L__BB0_130; $L__BB0_133: mov.b32 %r349, %f107; shr.s32 %r350, %r349, 31; and.b32 %r351, %r350, 1078530011; mov.b32 %r352, %f106; and.b32 %r353, %r352, 
-2147483648; or.b32 %r354, %r351, %r353; mov.b32 %f871, %r354; bra.uni $L__BB0_134; $L__BB0_130: setp.eq.f32 %p154, %f108, 0f7F800000; setp.eq.f32 %p155, %f109, 0f7F800000; and.pred %p156, %p154, %p155; @%p156 bra $L__BB0_132; bra.uni $L__BB0_131; $L__BB0_132: mov.b32 %r344, %f107; setp.lt.s32 %p160, %r344, 0; selp.b32 %r345, 1075235812, 1061752795, %p160; mov.b32 %r346, %f106; and.b32 %r347, %r346, -2147483648; or.b32 %r348, %r345, %r347; mov.b32 %f871, %r348; bra.uni $L__BB0_134; $L__BB0_131: max.f32 %f595, %f109, %f108; min.f32 %f596, %f109, %f108; div.rn.f32 %f597, %f596, %f595; mul.rn.f32 %f598, %f597, %f597; mov.f32 %f599, 0fC0B59883; mov.f32 %f600, 0fBF52C7EA; fma.rn.f32 %f601, %f598, %f600, %f599; mov.f32 %f602, 0fC0D21907; fma.rn.f32 %f603, %f601, %f598, %f602; mul.f32 %f604, %f598, %f603; mul.f32 %f605, %f597, %f604; add.f32 %f606, %f598, 0f41355DC0; mov.f32 %f607, 0f41E6BD60; fma.rn.f32 %f608, %f606, %f598, %f607; mov.f32 %f609, 0f419D92C8; fma.rn.f32 %f610, %f608, %f598, %f609; rcp.rn.f32 %f611, %f610; fma.rn.f32 %f612, %f605, %f611, %f597; mov.f32 %f613, 0f3FC90FDB; sub.f32 %f614, %f613, %f612; setp.gt.f32 %p157, %f109, %f108; selp.f32 %f615, %f614, %f612, %p157; mov.b32 %r339, %f107; setp.lt.s32 %p158, %r339, 0; mov.f32 %f616, 0f40490FDB; sub.f32 %f617, %f616, %f615; selp.f32 %f618, %f617, %f615, %p158; mov.b32 %r340, %f618; mov.b32 %r341, %f106; and.b32 %r342, %r341, -2147483648; or.b32 %r343, %r342, %r340; mov.b32 %f619, %r343; add.f32 %f620, %f108, %f109; setp.le.f32 %p159, %f620, 0f7F800000; selp.f32 %f871, %f619, %f620, %p159; $L__BB0_134: add.f32 %f621, %f871, %f871; setp.eq.f32 %p161, %f105, 0f00000000; selp.f32 %f622, 0f3F800000, 0f3F000000, %p161; fma.rn.f32 %f868, %f622, %f621, %f868; $L__BB0_135: @%p88 bra $L__BB0_137; bra.uni $L__BB0_136; $L__BB0_137: add.s32 %r357, %r124, -1; min.s32 %r506, %r49, %r357; bra.uni $L__BB0_138; $L__BB0_136: rem.s32 %r355, %r49, %r124; add.s32 %r356, %r355, %r124; rem.s32 %r506, %r356, %r124; $L__BB0_138: 
@%p12 bra $L__BB0_140; bra.uni $L__BB0_139; $L__BB0_140: add.s32 %r360, %r125, -1; min.s32 %r507, %r6, %r360; bra.uni $L__BB0_141; $L__BB0_139: rem.s32 %r358, %r6, %r125; add.s32 %r359, %r358, %r125; rem.s32 %r507, %r359, %r125; $L__BB0_141: @%p88 bra $L__BB0_143; bra.uni $L__BB0_142; $L__BB0_143: max.s32 %r508, %r56, 0; bra.uni $L__BB0_144; $L__BB0_142: rem.s32 %r361, %r56, %r124; add.s32 %r362, %r361, %r124; rem.s32 %r508, %r362, %r124; $L__BB0_144: add.s32 %r94, %r508, %r5; add.s32 %r95, %r506, %r5; @%p12 bra $L__BB0_146; bra.uni $L__BB0_145; $L__BB0_146: max.s32 %r509, %r14, 0; bra.uni $L__BB0_147; $L__BB0_145: rem.s32 %r363, %r14, %r125; add.s32 %r364, %r363, %r125; rem.s32 %r509, %r364, %r125; $L__BB0_147: add.s32 %r365, %r509, %r4; mad.lo.s32 %r366, %r365, %r124, %r1; mul.wide.s32 %rd84, %r95, 4; add.s64 %rd85, %rd3, %rd84; ld.global.nc.f32 %f116, [%rd85]; add.s64 %rd86, %rd2, %rd84; ld.global.nc.f32 %f117, [%rd86]; add.s64 %rd87, %rd1, %rd84; ld.global.nc.f32 %f118, [%rd87]; add.s32 %r367, %r507, %r4; mad.lo.s32 %r368, %r367, %r124, %r1; mul.wide.s32 %rd88, %r368, 4; add.s64 %rd89, %rd3, %rd88; ld.global.nc.f32 %f119, [%rd89]; add.s64 %rd90, %rd2, %rd88; ld.global.nc.f32 %f120, [%rd90]; add.s64 %rd91, %rd1, %rd88; ld.global.nc.f32 %f121, [%rd91]; mul.wide.s32 %rd92, %r94, 4; add.s64 %rd93, %rd3, %rd92; ld.global.nc.f32 %f122, [%rd93]; add.s64 %rd94, %rd2, %rd92; ld.global.nc.f32 %f123, [%rd94]; add.s64 %rd95, %rd1, %rd92; ld.global.nc.f32 %f124, [%rd95]; mul.wide.s32 %rd96, %r366, 4; add.s64 %rd97, %rd3, %rd96; ld.global.nc.f32 %f125, [%rd97]; add.s64 %rd98, %rd2, %rd96; ld.global.nc.f32 %f126, [%rd98]; add.s64 %rd99, %rd1, %rd96; ld.global.nc.f32 %f127, [%rd99]; and.pred %p166, %p4, %p1; mov.f32 %f878, 0f00000000; not.pred %p167, %p166; @%p167 bra $L__BB0_160; @%p12 bra $L__BB0_150; bra.uni $L__BB0_149; $L__BB0_150: add.s32 %r371, %r125, -1; min.s32 %r510, %r6, %r371; bra.uni $L__BB0_151; $L__BB0_149: rem.s32 %r369, %r6, %r125; add.s32 %r370, %r369, %r125; 
rem.s32 %r510, %r370, %r125; $L__BB0_151: @%p88 bra $L__BB0_153; bra.uni $L__BB0_152; $L__BB0_153: add.s32 %r374, %r124, -1; min.s32 %r511, %r49, %r374; bra.uni $L__BB0_154; $L__BB0_152: rem.s32 %r372, %r49, %r124; add.s32 %r373, %r372, %r124; rem.s32 %r511, %r373, %r124; $L__BB0_154: add.s32 %r375, %r510, %r4; mad.lo.s32 %r376, %r375, %r124, %r511; mul.wide.s32 %rd100, %r376, 4; add.s64 %rd101, %rd3, %rd100; add.s64 %rd102, %rd2, %rd100; add.s64 %rd103, %rd1, %rd100; ld.global.nc.f32 %f624, [%rd101]; ld.global.nc.f32 %f625, [%rd102]; mul.f32 %f626, %f625, %f625; fma.rn.f32 %f627, %f624, %f624, %f626; ld.global.nc.f32 %f628, [%rd103]; fma.rn.f32 %f128, %f628, %f628, %f627; mul.f32 %f629, %f118, %f120; mul.f32 %f630, %f117, %f121; sub.f32 %f631, %f630, %f629; mul.f32 %f632, %f116, %f121; mul.f32 %f633, %f118, %f119; sub.f32 %f634, %f633, %f632; mul.f32 %f635, %f117, %f119; mul.f32 %f636, %f116, %f120; sub.f32 %f637, %f636, %f635; mul.f32 %f638, %f2, %f634; fma.rn.f32 %f639, %f1, %f631, %f638; fma.rn.f32 %f129, %f3, %f637, %f639; mul.f32 %f640, %f2, %f117; fma.rn.f32 %f641, %f1, %f116, %f640; fma.rn.f32 %f642, %f3, %f118, %f641; add.f32 %f643, %f642, 0f3F800000; mul.f32 %f644, %f2, %f120; fma.rn.f32 %f645, %f1, %f119, %f644; fma.rn.f32 %f646, %f3, %f121, %f645; add.f32 %f647, %f643, %f646; mul.f32 %f648, %f117, %f120; fma.rn.f32 %f649, %f116, %f119, %f648; fma.rn.f32 %f650, %f118, %f121, %f649; add.f32 %f130, %f650, %f647; abs.f32 %f131, %f130; abs.f32 %f132, %f129; setp.eq.f32 %p170, %f131, 0f00000000; setp.eq.f32 %p171, %f132, 0f00000000; and.pred %p172, %p170, %p171; @%p172 bra $L__BB0_158; bra.uni $L__BB0_155; $L__BB0_158: mov.b32 %r387, %f130; shr.s32 %r388, %r387, 31; and.b32 %r389, %r388, 1078530011; mov.b32 %r390, %f129; and.b32 %r391, %r390, -2147483648; or.b32 %r392, %r391, %r389; mov.b32 %f873, %r392; bra.uni $L__BB0_159; $L__BB0_155: setp.eq.f32 %p173, %f131, 0f7F800000; setp.eq.f32 %p174, %f132, 0f7F800000; and.pred %p175, %p173, %p174; @%p175 bra 
$L__BB0_157; bra.uni $L__BB0_156; $L__BB0_157: mov.b32 %r382, %f130; setp.lt.s32 %p179, %r382, 0; selp.b32 %r383, 1075235812, 1061752795, %p179; mov.b32 %r384, %f129; and.b32 %r385, %r384, -2147483648; or.b32 %r386, %r385, %r383; mov.b32 %f873, %r386; bra.uni $L__BB0_159; $L__BB0_156: max.f32 %f651, %f132, %f131; min.f32 %f652, %f132, %f131; div.rn.f32 %f653, %f652, %f651; mul.rn.f32 %f654, %f653, %f653; mov.f32 %f655, 0fC0B59883; mov.f32 %f656, 0fBF52C7EA; fma.rn.f32 %f657, %f654, %f656, %f655; mov.f32 %f658, 0fC0D21907; fma.rn.f32 %f659, %f657, %f654, %f658; mul.f32 %f660, %f654, %f659; mul.f32 %f661, %f653, %f660; add.f32 %f662, %f654, 0f41355DC0; mov.f32 %f663, 0f41E6BD60; fma.rn.f32 %f664, %f662, %f654, %f663; mov.f32 %f665, 0f419D92C8; fma.rn.f32 %f666, %f664, %f654, %f665; rcp.rn.f32 %f667, %f666; fma.rn.f32 %f668, %f661, %f667, %f653; mov.f32 %f669, 0f3FC90FDB; sub.f32 %f670, %f669, %f668; setp.gt.f32 %p176, %f132, %f131; selp.f32 %f671, %f670, %f668, %p176; mov.b32 %r377, %f130; setp.lt.s32 %p177, %r377, 0; mov.f32 %f672, 0f40490FDB; sub.f32 %f673, %f672, %f671; selp.f32 %f674, %f673, %f671, %p177; mov.b32 %r378, %f674; mov.b32 %r379, %f129; and.b32 %r380, %r379, -2147483648; or.b32 %r381, %r380, %r378; mov.b32 %f675, %r381; add.f32 %f676, %f131, %f132; setp.le.f32 %p178, %f676, 0f7F800000; selp.f32 %f873, %f675, %f676, %p178; $L__BB0_159: add.f32 %f677, %f873, %f873; setp.eq.f32 %p180, %f128, 0f00000000; selp.f32 %f678, 0f3F800000, 0f3F000000, %p180; fma.rn.f32 %f878, %f678, %f677, 0f00000000; $L__BB0_160: not.pred %p181, %p3; @%p181 bra $L__BB0_187; @%p18 bra $L__BB0_174; @%p12 bra $L__BB0_164; bra.uni $L__BB0_163; $L__BB0_164: add.s32 %r395, %r125, -1; min.s32 %r512, %r6, %r395; bra.uni $L__BB0_165; $L__BB0_163: rem.s32 %r393, %r6, %r125; add.s32 %r394, %r393, %r125; rem.s32 %r512, %r394, %r125; $L__BB0_165: @%p88 bra $L__BB0_167; bra.uni $L__BB0_166; $L__BB0_167: max.s32 %r513, %r56, 0; bra.uni $L__BB0_168; $L__BB0_166: rem.s32 %r396, %r56, %r124; 
add.s32 %r397, %r396, %r124; rem.s32 %r513, %r397, %r124; $L__BB0_168: add.s32 %r398, %r512, %r4; mad.lo.s32 %r399, %r398, %r124, %r513; mul.wide.s32 %rd104, %r399, 4; add.s64 %rd105, %rd3, %rd104; add.s64 %rd106, %rd2, %rd104; add.s64 %rd107, %rd1, %rd104; ld.global.nc.f32 %f679, [%rd105]; ld.global.nc.f32 %f680, [%rd106]; mul.f32 %f681, %f680, %f680; fma.rn.f32 %f682, %f679, %f679, %f681; ld.global.nc.f32 %f683, [%rd107]; fma.rn.f32 %f139, %f683, %f683, %f682; mul.f32 %f684, %f121, %f123; mul.f32 %f685, %f120, %f124; sub.f32 %f686, %f685, %f684; mul.f32 %f687, %f119, %f124; mul.f32 %f688, %f121, %f122; sub.f32 %f689, %f688, %f687; mul.f32 %f690, %f120, %f122; mul.f32 %f691, %f119, %f123; sub.f32 %f692, %f691, %f690; mul.f32 %f693, %f2, %f689; fma.rn.f32 %f694, %f1, %f686, %f693; fma.rn.f32 %f140, %f3, %f692, %f694; mul.f32 %f695, %f2, %f120; fma.rn.f32 %f696, %f1, %f119, %f695; fma.rn.f32 %f697, %f3, %f121, %f696; add.f32 %f698, %f697, 0f3F800000; mul.f32 %f699, %f2, %f123; fma.rn.f32 %f700, %f1, %f122, %f699; fma.rn.f32 %f701, %f3, %f124, %f700; add.f32 %f702, %f698, %f701; mul.f32 %f703, %f120, %f123; fma.rn.f32 %f704, %f119, %f122, %f703; fma.rn.f32 %f705, %f121, %f124, %f704; add.f32 %f141, %f705, %f702; abs.f32 %f142, %f141; abs.f32 %f143, %f140; setp.eq.f32 %p185, %f142, 0f00000000; setp.eq.f32 %p186, %f143, 0f00000000; and.pred %p187, %p185, %p186; @%p187 bra $L__BB0_172; bra.uni $L__BB0_169; $L__BB0_172: mov.b32 %r410, %f141; shr.s32 %r411, %r410, 31; and.b32 %r412, %r411, 1078530011; mov.b32 %r413, %f140; and.b32 %r414, %r413, -2147483648; or.b32 %r415, %r414, %r412; mov.b32 %f875, %r415; bra.uni $L__BB0_173; $L__BB0_169: setp.eq.f32 %p188, %f142, 0f7F800000; setp.eq.f32 %p189, %f143, 0f7F800000; and.pred %p190, %p188, %p189; @%p190 bra $L__BB0_171; bra.uni $L__BB0_170; $L__BB0_171: mov.b32 %r405, %f141; setp.lt.s32 %p194, %r405, 0; selp.b32 %r406, 1075235812, 1061752795, %p194; mov.b32 %r407, %f140; and.b32 %r408, %r407, -2147483648; or.b32 %r409, 
%r408, %r406; mov.b32 %f875, %r409; bra.uni $L__BB0_173; $L__BB0_170: max.f32 %f706, %f143, %f142; min.f32 %f707, %f143, %f142; div.rn.f32 %f708, %f707, %f706; mul.rn.f32 %f709, %f708, %f708; mov.f32 %f710, 0fC0B59883; mov.f32 %f711, 0fBF52C7EA; fma.rn.f32 %f712, %f709, %f711, %f710; mov.f32 %f713, 0fC0D21907; fma.rn.f32 %f714, %f712, %f709, %f713; mul.f32 %f715, %f709, %f714; mul.f32 %f716, %f708, %f715; add.f32 %f717, %f709, 0f41355DC0; mov.f32 %f718, 0f41E6BD60; fma.rn.f32 %f719, %f717, %f709, %f718; mov.f32 %f720, 0f419D92C8; fma.rn.f32 %f721, %f719, %f709, %f720; rcp.rn.f32 %f722, %f721; fma.rn.f32 %f723, %f716, %f722, %f708; mov.f32 %f724, 0f3FC90FDB; sub.f32 %f725, %f724, %f723; setp.gt.f32 %p191, %f143, %f142; selp.f32 %f726, %f725, %f723, %p191; mov.b32 %r400, %f141; setp.lt.s32 %p192, %r400, 0; mov.f32 %f727, 0f40490FDB; sub.f32 %f728, %f727, %f726; selp.f32 %f729, %f728, %f726, %p192; mov.b32 %r401, %f729; mov.b32 %r402, %f140; and.b32 %r403, %r402, -2147483648; or.b32 %r404, %r403, %r401; mov.b32 %f730, %r404; add.f32 %f731, %f142, %f143; setp.le.f32 %p193, %f731, 0f7F800000; selp.f32 %f875, %f730, %f731, %p193; $L__BB0_173: add.f32 %f732, %f875, %f875; setp.eq.f32 %p195, %f139, 0f00000000; selp.f32 %f733, 0f3F800000, 0f3F000000, %p195; fma.rn.f32 %f878, %f733, %f732, %f878; $L__BB0_174: not.pred %p224, %p2; @%p224 bra $L__BB0_187; @%p12 bra $L__BB0_177; bra.uni $L__BB0_176; $L__BB0_177: max.s32 %r514, %r14, 0; bra.uni $L__BB0_178; $L__BB0_176: rem.s32 %r416, %r14, %r125; add.s32 %r417, %r416, %r125; rem.s32 %r514, %r417, %r125; $L__BB0_178: @%p88 bra $L__BB0_180; bra.uni $L__BB0_179; $L__BB0_180: max.s32 %r515, %r56, 0; bra.uni $L__BB0_181; $L__BB0_179: rem.s32 %r418, %r56, %r124; add.s32 %r419, %r418, %r124; rem.s32 %r515, %r419, %r124; $L__BB0_181: add.s32 %r420, %r514, %r4; mad.lo.s32 %r421, %r420, %r124, %r515; mul.wide.s32 %rd108, %r421, 4; add.s64 %rd109, %rd3, %rd108; add.s64 %rd110, %rd2, %rd108; add.s64 %rd111, %rd1, %rd108; ld.global.nc.f32 
%f734, [%rd109]; ld.global.nc.f32 %f735, [%rd110]; mul.f32 %f736, %f735, %f735; fma.rn.f32 %f737, %f734, %f734, %f736; ld.global.nc.f32 %f738, [%rd111]; fma.rn.f32 %f150, %f738, %f738, %f737; mul.f32 %f739, %f124, %f126; mul.f32 %f740, %f123, %f127; sub.f32 %f741, %f740, %f739; mul.f32 %f742, %f122, %f127; mul.f32 %f743, %f124, %f125; sub.f32 %f744, %f743, %f742; mul.f32 %f745, %f123, %f125; mul.f32 %f746, %f122, %f126; sub.f32 %f747, %f746, %f745; mul.f32 %f748, %f2, %f744; fma.rn.f32 %f749, %f1, %f741, %f748; fma.rn.f32 %f151, %f3, %f747, %f749; mul.f32 %f750, %f2, %f123; fma.rn.f32 %f751, %f1, %f122, %f750; fma.rn.f32 %f752, %f3, %f124, %f751; add.f32 %f753, %f752, 0f3F800000; mul.f32 %f754, %f2, %f126; fma.rn.f32 %f755, %f1, %f125, %f754; fma.rn.f32 %f756, %f3, %f127, %f755; add.f32 %f757, %f753, %f756; mul.f32 %f758, %f123, %f126; fma.rn.f32 %f759, %f122, %f125, %f758; fma.rn.f32 %f760, %f124, %f127, %f759; add.f32 %f152, %f760, %f757; abs.f32 %f153, %f152; abs.f32 %f154, %f151; setp.eq.f32 %p199, %f153, 0f00000000; setp.eq.f32 %p200, %f154, 0f00000000; and.pred %p201, %p199, %p200; @%p201 bra $L__BB0_185; bra.uni $L__BB0_182; $L__BB0_185: mov.b32 %r432, %f152; shr.s32 %r433, %r432, 31; and.b32 %r434, %r433, 1078530011; mov.b32 %r435, %f151; and.b32 %r436, %r435, -2147483648; or.b32 %r437, %r436, %r434; mov.b32 %f877, %r437; bra.uni $L__BB0_186; $L__BB0_182: setp.eq.f32 %p202, %f153, 0f7F800000; setp.eq.f32 %p203, %f154, 0f7F800000; and.pred %p204, %p202, %p203; @%p204 bra $L__BB0_184; bra.uni $L__BB0_183; $L__BB0_184: mov.b32 %r427, %f152; setp.lt.s32 %p208, %r427, 0; selp.b32 %r428, 1075235812, 1061752795, %p208; mov.b32 %r429, %f151; and.b32 %r430, %r429, -2147483648; or.b32 %r431, %r430, %r428; mov.b32 %f877, %r431; bra.uni $L__BB0_186; $L__BB0_183: max.f32 %f761, %f154, %f153; min.f32 %f762, %f154, %f153; div.rn.f32 %f763, %f762, %f761; mul.rn.f32 %f764, %f763, %f763; mov.f32 %f765, 0fC0B59883; mov.f32 %f766, 0fBF52C7EA; fma.rn.f32 %f767, %f764, %f766, 
%f765; mov.f32 %f768, 0fC0D21907; fma.rn.f32 %f769, %f767, %f764, %f768; mul.f32 %f770, %f764, %f769; mul.f32 %f771, %f763, %f770; add.f32 %f772, %f764, 0f41355DC0; mov.f32 %f773, 0f41E6BD60; fma.rn.f32 %f774, %f772, %f764, %f773; mov.f32 %f775, 0f419D92C8; fma.rn.f32 %f776, %f774, %f764, %f775; rcp.rn.f32 %f777, %f776; fma.rn.f32 %f778, %f771, %f777, %f763; mov.f32 %f779, 0f3FC90FDB; sub.f32 %f780, %f779, %f778; setp.gt.f32 %p205, %f154, %f153; selp.f32 %f781, %f780, %f778, %p205; mov.b32 %r422, %f152; setp.lt.s32 %p206, %r422, 0; mov.f32 %f782, 0f40490FDB; sub.f32 %f783, %f782, %f781; selp.f32 %f784, %f783, %f781, %p206; mov.b32 %r423, %f784; mov.b32 %r424, %f151; and.b32 %r425, %r424, -2147483648; or.b32 %r426, %r425, %r423; mov.b32 %f785, %r426; add.f32 %f786, %f153, %f154; setp.le.f32 %p207, %f786, 0f7F800000; selp.f32 %f877, %f785, %f786, %p207; $L__BB0_186: add.f32 %f787, %f877, %f877; setp.eq.f32 %p209, %f150, 0f00000000; selp.f32 %f788, 0f3F800000, 0f3F000000, %p209; fma.rn.f32 %f878, %f788, %f787, %f878; $L__BB0_187: @%p148 bra $L__BB0_200; @%p12 bra $L__BB0_190; bra.uni $L__BB0_189; $L__BB0_190: max.s32 %r516, %r14, 0; bra.uni $L__BB0_191; $L__BB0_189: rem.s32 %r438, %r14, %r125; add.s32 %r439, %r438, %r125; rem.s32 %r516, %r439, %r125; $L__BB0_191: add.s32 %r120, %r516, %r4; @%p88 bra $L__BB0_193; bra.uni $L__BB0_192; $L__BB0_193: add.s32 %r442, %r124, -1; min.s32 %r517, %r49, %r442; bra.uni $L__BB0_194; $L__BB0_192: rem.s32 %r440, %r49, %r124; add.s32 %r441, %r440, %r124; rem.s32 %r517, %r441, %r124; $L__BB0_194: mad.lo.s32 %r443, %r120, %r124, %r517; mul.wide.s32 %rd112, %r443, 4; add.s64 %rd113, %rd3, %rd112; add.s64 %rd114, %rd2, %rd112; add.s64 %rd115, %rd1, %rd112; ld.global.nc.f32 %f789, [%rd113]; ld.global.nc.f32 %f790, [%rd114]; mul.f32 %f791, %f790, %f790; fma.rn.f32 %f792, %f789, %f789, %f791; ld.global.nc.f32 %f793, [%rd115]; fma.rn.f32 %f161, %f793, %f793, %f792; mul.f32 %f794, %f117, %f127; mul.f32 %f795, %f118, %f126; sub.f32 %f796, 
%f795, %f794; mul.f32 %f797, %f118, %f125; mul.f32 %f798, %f116, %f127; sub.f32 %f799, %f798, %f797; mul.f32 %f800, %f116, %f126; mul.f32 %f801, %f117, %f125; sub.f32 %f802, %f801, %f800; mul.f32 %f803, %f2, %f799; fma.rn.f32 %f804, %f1, %f796, %f803; fma.rn.f32 %f162, %f3, %f802, %f804; mul.f32 %f805, %f2, %f126; fma.rn.f32 %f806, %f1, %f125, %f805; fma.rn.f32 %f807, %f3, %f127, %f806; add.f32 %f808, %f807, 0f3F800000; mul.f32 %f809, %f2, %f117; fma.rn.f32 %f810, %f1, %f116, %f809; fma.rn.f32 %f811, %f3, %f118, %f810; add.f32 %f812, %f811, %f808; mul.f32 %f813, %f117, %f126; fma.rn.f32 %f814, %f116, %f125, %f813; fma.rn.f32 %f815, %f118, %f127, %f814; add.f32 %f163, %f815, %f812; abs.f32 %f164, %f163; abs.f32 %f165, %f162; setp.eq.f32 %p213, %f164, 0f00000000; setp.eq.f32 %p214, %f165, 0f00000000; and.pred %p215, %p213, %p214; @%p215 bra $L__BB0_198; bra.uni $L__BB0_195; $L__BB0_198: mov.b32 %r454, %f163; shr.s32 %r455, %r454, 31; and.b32 %r456, %r455, 1078530011; mov.b32 %r457, %f162; and.b32 %r458, %r457, -2147483648; or.b32 %r459, %r456, %r458; mov.b32 %f879, %r459; bra.uni $L__BB0_199; $L__BB0_195: setp.eq.f32 %p216, %f164, 0f7F800000; setp.eq.f32 %p217, %f165, 0f7F800000; and.pred %p218, %p216, %p217; @%p218 bra $L__BB0_197; bra.uni $L__BB0_196; $L__BB0_197: mov.b32 %r449, %f163; setp.lt.s32 %p222, %r449, 0; selp.b32 %r450, 1075235812, 1061752795, %p222; mov.b32 %r451, %f162; and.b32 %r452, %r451, -2147483648; or.b32 %r453, %r450, %r452; mov.b32 %f879, %r453; bra.uni $L__BB0_199; $L__BB0_196: max.f32 %f816, %f165, %f164; min.f32 %f817, %f165, %f164; div.rn.f32 %f818, %f817, %f816; mul.rn.f32 %f819, %f818, %f818; mov.f32 %f820, 0fC0B59883; mov.f32 %f821, 0fBF52C7EA; fma.rn.f32 %f822, %f819, %f821, %f820; mov.f32 %f823, 0fC0D21907; fma.rn.f32 %f824, %f822, %f819, %f823; mul.f32 %f825, %f819, %f824; mul.f32 %f826, %f818, %f825; add.f32 %f827, %f819, 0f41355DC0; mov.f32 %f828, 0f41E6BD60; fma.rn.f32 %f829, %f827, %f819, %f828; mov.f32 %f830, 0f419D92C8; 
fma.rn.f32 %f831, %f829, %f819, %f830; rcp.rn.f32 %f832, %f831; fma.rn.f32 %f833, %f826, %f832, %f818; mov.f32 %f834, 0f3FC90FDB; sub.f32 %f835, %f834, %f833; setp.gt.f32 %p219, %f165, %f164; selp.f32 %f836, %f835, %f833, %p219; mov.b32 %r444, %f163; setp.lt.s32 %p220, %r444, 0; mov.f32 %f837, 0f40490FDB; sub.f32 %f838, %f837, %f836; selp.f32 %f839, %f838, %f836, %p220; mov.b32 %r445, %f839; mov.b32 %r446, %f162; and.b32 %r447, %r446, -2147483648; or.b32 %r448, %r447, %r445; mov.b32 %f840, %r448; add.f32 %f841, %f164, %f165; setp.le.f32 %p221, %f841, 0f7F800000; selp.f32 %f879, %f840, %f841, %p221; $L__BB0_199: add.f32 %f842, %f879, %f879; setp.eq.f32 %p223, %f161, 0f00000000; selp.f32 %f843, 0f3F800000, 0f3F000000, %p223; fma.rn.f32 %f878, %f843, %f842, %f878; $L__BB0_200: ld.param.u64 %rd123, [setemergentmagneticfieldsolidangle_param_1]; mov.u32 %r481, %tid.x; mov.u32 %r480, %ntid.x; mov.u32 %r479, %ctaid.x; mad.lo.s32 %r478, %r479, %r480, %r481; add.s32 %r477, %r5, %r478; mul.wide.s32 %rd122, %r477, 4; cvta.to.global.u64 %rd121, %rd123; add.s64 %rd120, %rd121, %rd122; ld.param.f32 %f856, [setemergentmagneticfieldsolidangle_param_9]; ld.param.f32 %f855, [setemergentmagneticfieldsolidangle_param_8]; ld.param.u64 %rd119, [setemergentmagneticfieldsolidangle_param_0]; mov.u32 %r476, %tid.z; mov.u32 %r475, %ntid.z; mov.u32 %r474, %ctaid.z; mad.lo.s32 %r473, %r474, %r475, %r476; mov.u32 %r472, %tid.y; mov.u32 %r471, %ntid.y; mov.u32 %r470, %ctaid.y; mad.lo.s32 %r469, %r470, %r471, %r472; mul.lo.s32 %r468, %r473, %r125; add.s32 %r467, %r468, %r469; mov.u32 %r466, %tid.x; mov.u32 %r465, %ntid.x; mov.u32 %r464, %ctaid.x; mad.lo.s32 %r463, %r464, %r465, %r466; mul.lo.s32 %r462, %r467, %r124; add.s32 %r461, %r462, %r463; mul.wide.s32 %rd118, %r461, 4; cvta.to.global.u64 %rd117, %rd119; add.s64 %rd116, %rd117, %rd118; ld.param.f32 %f854, [setemergentmagneticfieldsolidangle_param_7]; ld.param.f32 %f853, [setemergentmagneticfieldsolidangle_param_6]; add.f32 %f844, %f853, 
%f853; mul.f32 %f845, %f844, %f854; mul.f32 %f846, %f845, %f862; st.global.f32 [%rd116], %f846; mul.f32 %f847, %f844, %f855; mul.f32 %f848, %f847, %f868; st.global.f32 [%rd120], %f848; mul.f32 %f849, %f844, %f856; mul.f32 %f850, %f849, %f878; st.global.f32 [%rd6], %f850; $L__BB0_202: ret; } ` setemergentmagneticfieldsolidangle_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl setemergentmagneticfieldsolidangle .visible .entry setemergentmagneticfieldsolidangle( .param .u64 setemergentmagneticfieldsolidangle_param_0, .param .u64 setemergentmagneticfieldsolidangle_param_1, .param .u64 setemergentmagneticfieldsolidangle_param_2, .param .u64 setemergentmagneticfieldsolidangle_param_3, .param .u64 setemergentmagneticfieldsolidangle_param_4, .param .u64 setemergentmagneticfieldsolidangle_param_5, .param .f32 setemergentmagneticfieldsolidangle_param_6, .param .f32 setemergentmagneticfieldsolidangle_param_7, .param .f32 setemergentmagneticfieldsolidangle_param_8, .param .f32 setemergentmagneticfieldsolidangle_param_9, .param .u32 setemergentmagneticfieldsolidangle_param_10, .param .u32 setemergentmagneticfieldsolidangle_param_11, .param .u32 setemergentmagneticfieldsolidangle_param_12, .param .u8 setemergentmagneticfieldsolidangle_param_13 ) { .reg .pred %p<225>; .reg .b16 %rs<8>; .reg .f32 %f<881>; .reg .b32 %r<518>; .reg .b64 %rd<124>; ld.param.u8 %rs4, [setemergentmagneticfieldsolidangle_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldsolidangle_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldsolidangle_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldsolidangle_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldsolidangle_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldsolidangle_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldsolidangle_param_5]; ld.param.u32 %r124, [setemergentmagneticfieldsolidangle_param_10]; ld.param.u32 %r125, [setemergentmagneticfieldsolidangle_param_11]; ld.param.u32 %r126, 
[setemergentmagneticfieldsolidangle_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r127, %ntid.x; mov.u32 %r128, %ctaid.x; mov.u32 %r129, %tid.x; mad.lo.s32 %r1, %r128, %r127, %r129; mov.u32 %r130, %ntid.y; mov.u32 %r131, %ctaid.y; mov.u32 %r132, %tid.y; mad.lo.s32 %r2, %r131, %r130, %r132; mov.u32 %r133, %ntid.z; mov.u32 %r134, %ctaid.z; mov.u32 %r135, %tid.z; mad.lo.s32 %r3, %r134, %r133, %r135; setp.ge.s32 %p6, %r1, %r124; setp.ge.s32 %p7, %r2, %r125; or.pred %p8, %p6, %p7; setp.ge.s32 %p9, %r3, %r126; or.pred %p10, %p8, %p9; @%p10 bra $L__BB0_202; mul.lo.s32 %r4, %r3, %r125; add.s32 %r136, %r4, %r2; mul.lo.s32 %r5, %r136, %r124; add.s32 %r137, %r5, %r1; mul.wide.s32 %rd13, %r137, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f176, %f2, %f2; fma.rn.f32 %f177, %f1, %f1, %f176; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f178, %f3, %f3, %f177; setp.eq.f32 %p11, %f178, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p11 bra $L__BB0_201; bra.uni $L__BB0_2; $L__BB0_201: mov.u32 %r460, 0; st.global.u32 [%rd4], %r460; st.global.u32 [%rd5], %r460; st.global.u32 [%rd6], %r460; bra.uni $L__BB0_202; $L__BB0_2: and.b16 %rs1, %rs4, 2; setp.eq.s16 %p12, %rs1, 0; add.s32 %r6, %r2, 1; @%p12 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r140, %r125, -1; min.s32 %r482, %r6, %r140; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r138, %r6, %r125; add.s32 %r139, %r138, %r125; rem.s32 %r482, %r139, %r125; $L__BB0_5: and.b16 %rs2, %rs4, 4; setp.eq.s16 %p13, %rs2, 0; add.s32 %r10, %r3, 1; @%p13 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r143, %r126, -1; min.s32 %r483, %r10, %r143; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r141, %r10, %r126; add.s32 %r142, 
%r141, %r126; rem.s32 %r483, %r142, %r126; $L__BB0_8: add.s32 %r14, %r2, -1; @%p12 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r484, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r144, %r14, %r125; add.s32 %r145, %r144, %r125; rem.s32 %r484, %r145, %r125; $L__BB0_11: add.s32 %r18, %r3, -1; @%p13 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r485, %r18, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r146, %r18, %r126; add.s32 %r147, %r146, %r126; rem.s32 %r485, %r147, %r126; $L__BB0_14: mad.lo.s32 %r148, %r485, %r125, %r2; mad.lo.s32 %r149, %r148, %r124, %r1; add.s32 %r150, %r482, %r4; mad.lo.s32 %r151, %r150, %r124, %r1; mul.wide.s32 %rd20, %r151, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f4, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f5, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f6, [%rd23]; mad.lo.s32 %r152, %r483, %r125, %r2; mad.lo.s32 %r153, %r152, %r124, %r1; mul.wide.s32 %rd24, %r153, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f7, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f8, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f9, [%rd27]; add.s32 %r154, %r484, %r4; mad.lo.s32 %r155, %r154, %r124, %r1; mul.wide.s32 %rd28, %r155, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f10, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f12, [%rd31]; mul.wide.s32 %rd32, %r149, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f13, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f14, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f15, [%rd35]; setp.ne.s16 %p16, %rs1, 0; setp.lt.s32 %p17, %r6, %r125; or.pred %p1, %p17, %p16; not.pred %p18, %p1; mov.f32 %f862, 0f00000000; @%p18 bra $L__BB0_28; setp.ge.s32 %p19, %r10, %r126; and.pred %p21, %p19, %p13; @%p21 bra $L__BB0_28; @%p13 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r158, %r126, -1; min.s32 %r486, %r10, %r158; bra.uni $L__BB0_19; $L__BB0_17: 
rem.s32 %r156, %r10, %r126; add.s32 %r157, %r156, %r126; rem.s32 %r486, %r157, %r126; $L__BB0_19: @%p12 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r161, %r125, -1; min.s32 %r487, %r6, %r161; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r159, %r6, %r125; add.s32 %r160, %r159, %r125; rem.s32 %r487, %r160, %r125; $L__BB0_22: mad.lo.s32 %r162, %r486, %r125, %r487; mad.lo.s32 %r163, %r162, %r124, %r1; mul.wide.s32 %rd36, %r163, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f181, [%rd37]; ld.global.nc.f32 %f182, [%rd38]; mul.f32 %f183, %f182, %f182; fma.rn.f32 %f184, %f181, %f181, %f183; ld.global.nc.f32 %f185, [%rd39]; fma.rn.f32 %f16, %f185, %f185, %f184; mul.f32 %f186, %f6, %f8; mul.f32 %f187, %f5, %f9; sub.f32 %f188, %f187, %f186; mul.f32 %f189, %f4, %f9; mul.f32 %f190, %f6, %f7; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f5, %f7; mul.f32 %f193, %f4, %f8; sub.f32 %f194, %f193, %f192; mul.f32 %f195, %f2, %f191; fma.rn.f32 %f196, %f1, %f188, %f195; fma.rn.f32 %f17, %f3, %f194, %f196; mul.f32 %f197, %f2, %f5; fma.rn.f32 %f198, %f1, %f4, %f197; fma.rn.f32 %f199, %f3, %f6, %f198; add.f32 %f200, %f199, 0f3F800000; mul.f32 %f201, %f2, %f8; fma.rn.f32 %f202, %f1, %f7, %f201; fma.rn.f32 %f203, %f3, %f9, %f202; add.f32 %f204, %f200, %f203; mul.f32 %f205, %f5, %f8; fma.rn.f32 %f206, %f4, %f7, %f205; fma.rn.f32 %f207, %f6, %f9, %f206; add.f32 %f18, %f207, %f204; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p24, %f19, 0f00000000; setp.eq.f32 %p25, %f20, 0f00000000; and.pred %p26, %p24, %p25; @%p26 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r174, %f18; shr.s32 %r175, %r174, 31; and.b32 %r176, %r175, 1078530011; mov.b32 %r177, %f17; and.b32 %r178, %r177, -2147483648; or.b32 %r179, %r178, %r176; mov.b32 %f857, %r179; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p27, %f19, 0f7F800000; setp.eq.f32 %p28, %f20, 0f7F800000; and.pred %p29, %p27, %p28; @%p29 bra $L__BB0_25; bra.uni $L__BB0_24; 
$L__BB0_25: mov.b32 %r169, %f18; setp.lt.s32 %p33, %r169, 0; selp.b32 %r170, 1075235812, 1061752795, %p33; mov.b32 %r171, %f17; and.b32 %r172, %r171, -2147483648; or.b32 %r173, %r172, %r170; mov.b32 %f857, %r173; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f208, %f20, %f19; min.f32 %f209, %f20, %f19; div.rn.f32 %f210, %f209, %f208; mul.rn.f32 %f211, %f210, %f210; mov.f32 %f212, 0fC0B59883; mov.f32 %f213, 0fBF52C7EA; fma.rn.f32 %f214, %f211, %f213, %f212; mov.f32 %f215, 0fC0D21907; fma.rn.f32 %f216, %f214, %f211, %f215; mul.f32 %f217, %f211, %f216; mul.f32 %f218, %f210, %f217; add.f32 %f219, %f211, 0f41355DC0; mov.f32 %f220, 0f41E6BD60; fma.rn.f32 %f221, %f219, %f211, %f220; mov.f32 %f222, 0f419D92C8; fma.rn.f32 %f223, %f221, %f211, %f222; rcp.rn.f32 %f224, %f223; fma.rn.f32 %f225, %f218, %f224, %f210; mov.f32 %f226, 0f3FC90FDB; sub.f32 %f227, %f226, %f225; setp.gt.f32 %p30, %f20, %f19; selp.f32 %f228, %f227, %f225, %p30; mov.b32 %r164, %f18; setp.lt.s32 %p31, %r164, 0; mov.f32 %f229, 0f40490FDB; sub.f32 %f230, %f229, %f228; selp.f32 %f231, %f230, %f228, %p31; mov.b32 %r165, %f231; mov.b32 %r166, %f17; and.b32 %r167, %r166, -2147483648; or.b32 %r168, %r167, %r165; mov.b32 %f232, %r168; add.f32 %f233, %f19, %f20; setp.le.f32 %p32, %f233, 0f7F800000; selp.f32 %f857, %f232, %f233, %p32; $L__BB0_27: add.f32 %f234, %f857, %f857; setp.eq.f32 %p34, %f16, 0f00000000; selp.f32 %f235, 0f3F800000, 0f3F000000, %p34; fma.rn.f32 %f862, %f235, %f234, 0f00000000; $L__BB0_28: setp.gt.s32 %p35, %r2, 0; or.pred %p2, %p35, %p16; not.pred %p37, %p2; @%p37 bra $L__BB0_55; setp.ge.s32 %p38, %r10, %r126; and.pred %p40, %p38, %p13; @%p40 bra $L__BB0_42; @%p13 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r182, %r126, -1; min.s32 %r488, %r10, %r182; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r180, %r10, %r126; add.s32 %r181, %r180, %r126; rem.s32 %r488, %r181, %r126; $L__BB0_33: @%p12 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r489, %r14, 0; bra.uni $L__BB0_36; 
$L__BB0_34: rem.s32 %r183, %r14, %r125; add.s32 %r184, %r183, %r125; rem.s32 %r489, %r184, %r125; $L__BB0_36: mad.lo.s32 %r185, %r488, %r125, %r489; mad.lo.s32 %r186, %r185, %r124, %r1; mul.wide.s32 %rd40, %r186, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f236, [%rd41]; ld.global.nc.f32 %f237, [%rd42]; mul.f32 %f238, %f237, %f237; fma.rn.f32 %f239, %f236, %f236, %f238; ld.global.nc.f32 %f240, [%rd43]; fma.rn.f32 %f27, %f240, %f240, %f239; mul.f32 %f241, %f9, %f11; mul.f32 %f242, %f8, %f12; sub.f32 %f243, %f242, %f241; mul.f32 %f244, %f7, %f12; mul.f32 %f245, %f9, %f10; sub.f32 %f246, %f245, %f244; mul.f32 %f247, %f8, %f10; mul.f32 %f248, %f7, %f11; sub.f32 %f249, %f248, %f247; mul.f32 %f250, %f2, %f246; fma.rn.f32 %f251, %f1, %f243, %f250; fma.rn.f32 %f28, %f3, %f249, %f251; mul.f32 %f252, %f2, %f8; fma.rn.f32 %f253, %f1, %f7, %f252; fma.rn.f32 %f254, %f3, %f9, %f253; add.f32 %f255, %f254, 0f3F800000; mul.f32 %f256, %f2, %f11; fma.rn.f32 %f257, %f1, %f10, %f256; fma.rn.f32 %f258, %f3, %f12, %f257; add.f32 %f259, %f255, %f258; mul.f32 %f260, %f8, %f11; fma.rn.f32 %f261, %f7, %f10, %f260; fma.rn.f32 %f262, %f9, %f12, %f261; add.f32 %f29, %f262, %f259; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p43, %f30, 0f00000000; setp.eq.f32 %p44, %f31, 0f00000000; and.pred %p45, %p43, %p44; @%p45 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r197, %f29; shr.s32 %r198, %r197, 31; and.b32 %r199, %r198, 1078530011; mov.b32 %r200, %f28; and.b32 %r201, %r200, -2147483648; or.b32 %r202, %r201, %r199; mov.b32 %f859, %r202; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p46, %f30, 0f7F800000; setp.eq.f32 %p47, %f31, 0f7F800000; and.pred %p48, %p46, %p47; @%p48 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r192, %f29; setp.lt.s32 %p52, %r192, 0; selp.b32 %r193, 1075235812, 1061752795, %p52; mov.b32 %r194, %f28; and.b32 %r195, %r194, -2147483648; or.b32 %r196, %r195, %r193; mov.b32 %f859, %r196; 
bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f263, %f31, %f30; min.f32 %f264, %f31, %f30; div.rn.f32 %f265, %f264, %f263; mul.rn.f32 %f266, %f265, %f265; mov.f32 %f267, 0fC0B59883; mov.f32 %f268, 0fBF52C7EA; fma.rn.f32 %f269, %f266, %f268, %f267; mov.f32 %f270, 0fC0D21907; fma.rn.f32 %f271, %f269, %f266, %f270; mul.f32 %f272, %f266, %f271; mul.f32 %f273, %f265, %f272; add.f32 %f274, %f266, 0f41355DC0; mov.f32 %f275, 0f41E6BD60; fma.rn.f32 %f276, %f274, %f266, %f275; mov.f32 %f277, 0f419D92C8; fma.rn.f32 %f278, %f276, %f266, %f277; rcp.rn.f32 %f279, %f278; fma.rn.f32 %f280, %f273, %f279, %f265; mov.f32 %f281, 0f3FC90FDB; sub.f32 %f282, %f281, %f280; setp.gt.f32 %p49, %f31, %f30; selp.f32 %f283, %f282, %f280, %p49; mov.b32 %r187, %f29; setp.lt.s32 %p50, %r187, 0; mov.f32 %f284, 0f40490FDB; sub.f32 %f285, %f284, %f283; selp.f32 %f286, %f285, %f283, %p50; mov.b32 %r188, %f286; mov.b32 %r189, %f28; and.b32 %r190, %r189, -2147483648; or.b32 %r191, %r190, %r188; mov.b32 %f287, %r191; add.f32 %f288, %f30, %f31; setp.le.f32 %p51, %f288, 0f7F800000; selp.f32 %f859, %f287, %f288, %p51; $L__BB0_41: add.f32 %f289, %f859, %f859; setp.eq.f32 %p53, %f27, 0f00000000; selp.f32 %f290, 0f3F800000, 0f3F000000, %p53; fma.rn.f32 %f862, %f290, %f289, %f862; $L__BB0_42: setp.lt.s32 %p54, %r3, 1; and.pred %p56, %p54, %p13; @%p56 bra $L__BB0_55; @%p13 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r490, %r18, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r203, %r18, %r126; add.s32 %r204, %r203, %r126; rem.s32 %r490, %r204, %r126; $L__BB0_46: @%p12 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r491, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r205, %r14, %r125; add.s32 %r206, %r205, %r125; rem.s32 %r491, %r206, %r125; $L__BB0_49: mad.lo.s32 %r207, %r490, %r125, %r491; mad.lo.s32 %r208, %r207, %r124, %r1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f291, [%rd45]; ld.global.nc.f32 %f292, 
[%rd46]; mul.f32 %f293, %f292, %f292; fma.rn.f32 %f294, %f291, %f291, %f293; ld.global.nc.f32 %f295, [%rd47]; fma.rn.f32 %f38, %f295, %f295, %f294; mul.f32 %f296, %f12, %f14; mul.f32 %f297, %f11, %f15; sub.f32 %f298, %f297, %f296; mul.f32 %f299, %f10, %f15; mul.f32 %f300, %f12, %f13; sub.f32 %f301, %f300, %f299; mul.f32 %f302, %f11, %f13; mul.f32 %f303, %f10, %f14; sub.f32 %f304, %f303, %f302; mul.f32 %f305, %f2, %f301; fma.rn.f32 %f306, %f1, %f298, %f305; fma.rn.f32 %f39, %f3, %f304, %f306; mul.f32 %f307, %f2, %f11; fma.rn.f32 %f308, %f1, %f10, %f307; fma.rn.f32 %f309, %f3, %f12, %f308; add.f32 %f310, %f309, 0f3F800000; mul.f32 %f311, %f2, %f14; fma.rn.f32 %f312, %f1, %f13, %f311; fma.rn.f32 %f313, %f3, %f15, %f312; add.f32 %f314, %f310, %f313; mul.f32 %f315, %f11, %f14; fma.rn.f32 %f316, %f10, %f13, %f315; fma.rn.f32 %f317, %f12, %f15, %f316; add.f32 %f40, %f317, %f314; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p59, %f41, 0f00000000; setp.eq.f32 %p60, %f42, 0f00000000; and.pred %p61, %p59, %p60; @%p61 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r219, %f40; shr.s32 %r220, %r219, 31; and.b32 %r221, %r220, 1078530011; mov.b32 %r222, %f39; and.b32 %r223, %r222, -2147483648; or.b32 %r224, %r223, %r221; mov.b32 %f861, %r224; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p62, %f41, 0f7F800000; setp.eq.f32 %p63, %f42, 0f7F800000; and.pred %p64, %p62, %p63; @%p64 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r214, %f40; setp.lt.s32 %p68, %r214, 0; selp.b32 %r215, 1075235812, 1061752795, %p68; mov.b32 %r216, %f39; and.b32 %r217, %r216, -2147483648; or.b32 %r218, %r217, %r215; mov.b32 %f861, %r218; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f318, %f42, %f41; min.f32 %f319, %f42, %f41; div.rn.f32 %f320, %f319, %f318; mul.rn.f32 %f321, %f320, %f320; mov.f32 %f322, 0fC0B59883; mov.f32 %f323, 0fBF52C7EA; fma.rn.f32 %f324, %f321, %f323, %f322; mov.f32 %f325, 0fC0D21907; fma.rn.f32 %f326, %f324, %f321, %f325; mul.f32 %f327, %f321, %f326; mul.f32 
%f328, %f320, %f327; add.f32 %f329, %f321, 0f41355DC0; mov.f32 %f330, 0f41E6BD60; fma.rn.f32 %f331, %f329, %f321, %f330; mov.f32 %f332, 0f419D92C8; fma.rn.f32 %f333, %f331, %f321, %f332; rcp.rn.f32 %f334, %f333; fma.rn.f32 %f335, %f328, %f334, %f320; mov.f32 %f336, 0f3FC90FDB; sub.f32 %f337, %f336, %f335; setp.gt.f32 %p65, %f42, %f41; selp.f32 %f338, %f337, %f335, %p65; mov.b32 %r209, %f40; setp.lt.s32 %p66, %r209, 0; mov.f32 %f339, 0f40490FDB; sub.f32 %f340, %f339, %f338; selp.f32 %f341, %f340, %f338, %p66; mov.b32 %r210, %f341; mov.b32 %r211, %f39; and.b32 %r212, %r211, -2147483648; or.b32 %r213, %r212, %r210; mov.b32 %f342, %r213; add.f32 %f343, %f41, %f42; setp.le.f32 %p67, %f343, 0f7F800000; selp.f32 %f861, %f342, %f343, %p67; $L__BB0_54: add.f32 %f344, %f861, %f861; setp.eq.f32 %p69, %f38, 0f00000000; selp.f32 %f345, 0f3F800000, 0f3F000000, %p69; fma.rn.f32 %f862, %f345, %f344, %f862; $L__BB0_55: @%p18 bra $L__BB0_69; setp.lt.s32 %p71, %r3, 1; and.pred %p73, %p71, %p13; @%p73 bra $L__BB0_69; @%p13 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r492, %r18, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r225, %r18, %r126; add.s32 %r226, %r225, %r126; rem.s32 %r492, %r226, %r126; $L__BB0_60: @%p12 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r229, %r125, -1; min.s32 %r493, %r6, %r229; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r227, %r6, %r125; add.s32 %r228, %r227, %r125; rem.s32 %r493, %r228, %r125; $L__BB0_63: mad.lo.s32 %r230, %r492, %r125, %r493; mad.lo.s32 %r231, %r230, %r124, %r1; mul.wide.s32 %rd48, %r231, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f346, [%rd49]; ld.global.nc.f32 %f347, [%rd50]; mul.f32 %f348, %f347, %f347; fma.rn.f32 %f349, %f346, %f346, %f348; ld.global.nc.f32 %f350, [%rd51]; fma.rn.f32 %f49, %f350, %f350, %f349; mul.f32 %f351, %f5, %f15; mul.f32 %f352, %f6, %f14; sub.f32 %f353, %f352, %f351; mul.f32 %f354, %f6, %f13; mul.f32 %f355, %f4, %f15; sub.f32 %f356, 
%f355, %f354; mul.f32 %f357, %f4, %f14; mul.f32 %f358, %f5, %f13; sub.f32 %f359, %f358, %f357; mul.f32 %f360, %f2, %f356; fma.rn.f32 %f361, %f1, %f353, %f360; fma.rn.f32 %f50, %f3, %f359, %f361; mul.f32 %f362, %f2, %f14; fma.rn.f32 %f363, %f1, %f13, %f362; fma.rn.f32 %f364, %f3, %f15, %f363; add.f32 %f365, %f364, 0f3F800000; mul.f32 %f366, %f2, %f5; fma.rn.f32 %f367, %f1, %f4, %f366; fma.rn.f32 %f368, %f3, %f6, %f367; add.f32 %f369, %f368, %f365; mul.f32 %f370, %f5, %f14; fma.rn.f32 %f371, %f4, %f13, %f370; fma.rn.f32 %f372, %f6, %f15, %f371; add.f32 %f51, %f372, %f369; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p76, %f52, 0f00000000; setp.eq.f32 %p77, %f53, 0f00000000; and.pred %p78, %p76, %p77; @%p78 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r242, %f51; shr.s32 %r243, %r242, 31; and.b32 %r244, %r243, 1078530011; mov.b32 %r245, %f50; and.b32 %r246, %r245, -2147483648; or.b32 %r247, %r244, %r246; mov.b32 %f863, %r247; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p79, %f52, 0f7F800000; setp.eq.f32 %p80, %f53, 0f7F800000; and.pred %p81, %p79, %p80; @%p81 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r237, %f51; setp.lt.s32 %p85, %r237, 0; selp.b32 %r238, 1075235812, 1061752795, %p85; mov.b32 %r239, %f50; and.b32 %r240, %r239, -2147483648; or.b32 %r241, %r238, %r240; mov.b32 %f863, %r241; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f373, %f53, %f52; min.f32 %f374, %f53, %f52; div.rn.f32 %f375, %f374, %f373; mul.rn.f32 %f376, %f375, %f375; mov.f32 %f377, 0fC0B59883; mov.f32 %f378, 0fBF52C7EA; fma.rn.f32 %f379, %f376, %f378, %f377; mov.f32 %f380, 0fC0D21907; fma.rn.f32 %f381, %f379, %f376, %f380; mul.f32 %f382, %f376, %f381; mul.f32 %f383, %f375, %f382; add.f32 %f384, %f376, 0f41355DC0; mov.f32 %f385, 0f41E6BD60; fma.rn.f32 %f386, %f384, %f376, %f385; mov.f32 %f387, 0f419D92C8; fma.rn.f32 %f388, %f386, %f376, %f387; rcp.rn.f32 %f389, %f388; fma.rn.f32 %f390, %f383, %f389, %f375; mov.f32 %f391, 0f3FC90FDB; sub.f32 %f392, %f391, %f390; 
setp.gt.f32 %p82, %f53, %f52; selp.f32 %f393, %f392, %f390, %p82; mov.b32 %r232, %f51; setp.lt.s32 %p83, %r232, 0; mov.f32 %f394, 0f40490FDB; sub.f32 %f395, %f394, %f393; selp.f32 %f396, %f395, %f393, %p83; mov.b32 %r233, %f396; mov.b32 %r234, %f50; and.b32 %r235, %r234, -2147483648; or.b32 %r236, %r235, %r233; mov.b32 %f397, %r236; add.f32 %f398, %f52, %f53; setp.le.f32 %p84, %f398, 0f7F800000; selp.f32 %f863, %f397, %f398, %p84; $L__BB0_68: add.f32 %f399, %f863, %f863; setp.eq.f32 %p86, %f49, 0f00000000; selp.f32 %f400, 0f3F800000, 0f3F000000, %p86; fma.rn.f32 %f862, %f400, %f399, %f862; $L__BB0_69: @%p13 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: add.s32 %r250, %r126, -1; min.s32 %r494, %r10, %r250; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r248, %r10, %r126; add.s32 %r249, %r248, %r126; rem.s32 %r494, %r249, %r126; $L__BB0_72: ld.param.u8 %rs7, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs3, %rs7, 1; setp.eq.s16 %p88, %rs3, 0; add.s32 %r49, %r1, 1; @%p88 bra $L__BB0_74; bra.uni $L__BB0_73; $L__BB0_74: add.s32 %r253, %r124, -1; min.s32 %r495, %r49, %r253; bra.uni $L__BB0_75; $L__BB0_73: rem.s32 %r251, %r49, %r124; add.s32 %r252, %r251, %r124; rem.s32 %r495, %r252, %r124; $L__BB0_75: @%p13 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r496, %r18, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r254, %r18, %r126; add.s32 %r255, %r254, %r126; rem.s32 %r496, %r255, %r126; $L__BB0_78: add.s32 %r56, %r1, -1; @%p88 bra $L__BB0_80; bra.uni $L__BB0_79; $L__BB0_80: max.s32 %r497, %r56, 0; bra.uni $L__BB0_81; $L__BB0_79: rem.s32 %r256, %r56, %r124; add.s32 %r257, %r256, %r124; rem.s32 %r497, %r257, %r124; $L__BB0_81: mad.lo.s32 %r258, %r494, %r125, %r2; mad.lo.s32 %r259, %r258, %r124, %r1; mul.wide.s32 %rd52, %r259, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f60, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f61, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f62, [%rd55]; add.s32 %r260, %r495, %r5; mul.wide.s32 %rd56, %r260, 
4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f63, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f64, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f65, [%rd59]; mad.lo.s32 %r261, %r496, %r125, %r2; mad.lo.s32 %r262, %r261, %r124, %r1; mul.wide.s32 %rd60, %r262, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f66, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f67, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f68, [%rd63]; add.s32 %r263, %r497, %r5; mul.wide.s32 %rd64, %r263, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f69, [%rd65]; add.s64 %rd66, %rd2, %rd64; ld.global.nc.f32 %f70, [%rd66]; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f71, [%rd67]; setp.ge.s32 %p92, %r10, %r126; and.pred %p93, %p92, %p13; mov.f32 %f868, 0f00000000; @%p93 bra $L__BB0_95; setp.ge.s32 %p94, %r49, %r124; and.pred %p96, %p94, %p88; @%p96 bra $L__BB0_95; @%p13 bra $L__BB0_85; bra.uni $L__BB0_84; $L__BB0_85: add.s32 %r266, %r126, -1; min.s32 %r498, %r10, %r266; bra.uni $L__BB0_86; $L__BB0_84: rem.s32 %r264, %r10, %r126; add.s32 %r265, %r264, %r126; rem.s32 %r498, %r265, %r126; $L__BB0_86: @%p88 bra $L__BB0_88; bra.uni $L__BB0_87; $L__BB0_88: add.s32 %r269, %r124, -1; min.s32 %r499, %r49, %r269; bra.uni $L__BB0_89; $L__BB0_87: rem.s32 %r267, %r49, %r124; add.s32 %r268, %r267, %r124; rem.s32 %r499, %r268, %r124; $L__BB0_89: mad.lo.s32 %r270, %r498, %r125, %r2; mad.lo.s32 %r271, %r270, %r124, %r499; mul.wide.s32 %rd68, %r271, 4; add.s64 %rd69, %rd3, %rd68; add.s64 %rd70, %rd2, %rd68; add.s64 %rd71, %rd1, %rd68; ld.global.nc.f32 %f403, [%rd69]; ld.global.nc.f32 %f404, [%rd70]; mul.f32 %f405, %f404, %f404; fma.rn.f32 %f406, %f403, %f403, %f405; ld.global.nc.f32 %f407, [%rd71]; fma.rn.f32 %f72, %f407, %f407, %f406; mul.f32 %f408, %f62, %f64; mul.f32 %f409, %f61, %f65; sub.f32 %f410, %f409, %f408; mul.f32 %f411, %f60, %f65; mul.f32 %f412, %f62, %f63; sub.f32 %f413, %f412, %f411; mul.f32 %f414, %f61, %f63; mul.f32 %f415, %f60, %f64; sub.f32 %f416, 
%f415, %f414; mul.f32 %f417, %f2, %f413; fma.rn.f32 %f418, %f1, %f410, %f417; fma.rn.f32 %f73, %f3, %f416, %f418; mul.f32 %f419, %f2, %f61; fma.rn.f32 %f420, %f1, %f60, %f419; fma.rn.f32 %f421, %f3, %f62, %f420; add.f32 %f422, %f421, 0f3F800000; mul.f32 %f423, %f2, %f64; fma.rn.f32 %f424, %f1, %f63, %f423; fma.rn.f32 %f425, %f3, %f65, %f424; add.f32 %f426, %f422, %f425; mul.f32 %f427, %f61, %f64; fma.rn.f32 %f428, %f60, %f63, %f427; fma.rn.f32 %f429, %f62, %f65, %f428; add.f32 %f74, %f429, %f426; abs.f32 %f75, %f74; abs.f32 %f76, %f73; setp.eq.f32 %p99, %f75, 0f00000000; setp.eq.f32 %p100, %f76, 0f00000000; and.pred %p101, %p99, %p100; @%p101 bra $L__BB0_93; bra.uni $L__BB0_90; $L__BB0_93: mov.b32 %r282, %f74; shr.s32 %r283, %r282, 31; and.b32 %r284, %r283, 1078530011; mov.b32 %r285, %f73; and.b32 %r286, %r285, -2147483648; or.b32 %r287, %r286, %r284; mov.b32 %f865, %r287; bra.uni $L__BB0_94; $L__BB0_90: setp.eq.f32 %p102, %f75, 0f7F800000; setp.eq.f32 %p103, %f76, 0f7F800000; and.pred %p104, %p102, %p103; @%p104 bra $L__BB0_92; bra.uni $L__BB0_91; $L__BB0_92: mov.b32 %r277, %f74; setp.lt.s32 %p108, %r277, 0; selp.b32 %r278, 1075235812, 1061752795, %p108; mov.b32 %r279, %f73; and.b32 %r280, %r279, -2147483648; or.b32 %r281, %r280, %r278; mov.b32 %f865, %r281; bra.uni $L__BB0_94; $L__BB0_91: max.f32 %f430, %f76, %f75; min.f32 %f431, %f76, %f75; div.rn.f32 %f432, %f431, %f430; mul.rn.f32 %f433, %f432, %f432; mov.f32 %f434, 0fC0B59883; mov.f32 %f435, 0fBF52C7EA; fma.rn.f32 %f436, %f433, %f435, %f434; mov.f32 %f437, 0fC0D21907; fma.rn.f32 %f438, %f436, %f433, %f437; mul.f32 %f439, %f433, %f438; mul.f32 %f440, %f432, %f439; add.f32 %f441, %f433, 0f41355DC0; mov.f32 %f442, 0f41E6BD60; fma.rn.f32 %f443, %f441, %f433, %f442; mov.f32 %f444, 0f419D92C8; fma.rn.f32 %f445, %f443, %f433, %f444; rcp.rn.f32 %f446, %f445; fma.rn.f32 %f447, %f440, %f446, %f432; mov.f32 %f448, 0f3FC90FDB; sub.f32 %f449, %f448, %f447; setp.gt.f32 %p105, %f76, %f75; selp.f32 %f450, %f449, %f447, 
%p105; mov.b32 %r272, %f74; setp.lt.s32 %p106, %r272, 0; mov.f32 %f451, 0f40490FDB; sub.f32 %f452, %f451, %f450; selp.f32 %f453, %f452, %f450, %p106; mov.b32 %r273, %f453; mov.b32 %r274, %f73; and.b32 %r275, %r274, -2147483648; or.b32 %r276, %r275, %r273; mov.b32 %f454, %r276; add.f32 %f455, %f75, %f76; setp.le.f32 %p107, %f455, 0f7F800000; selp.f32 %f865, %f454, %f455, %p107; $L__BB0_94: add.f32 %f456, %f865, %f865; setp.eq.f32 %p109, %f72, 0f00000000; selp.f32 %f457, 0f3F800000, 0f3F000000, %p109; fma.rn.f32 %f868, %f457, %f456, 0f00000000; $L__BB0_95: setp.lt.s32 %p110, %r3, 1; and.pred %p112, %p110, %p13; @%p112 bra $L__BB0_109; setp.ge.s32 %p113, %r49, %r124; and.pred %p115, %p113, %p88; @%p115 bra $L__BB0_109; @%p13 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: max.s32 %r500, %r18, 0; bra.uni $L__BB0_100; $L__BB0_98: rem.s32 %r288, %r18, %r126; add.s32 %r289, %r288, %r126; rem.s32 %r500, %r289, %r126; $L__BB0_100: @%p88 bra $L__BB0_102; bra.uni $L__BB0_101; $L__BB0_102: add.s32 %r292, %r124, -1; min.s32 %r501, %r49, %r292; bra.uni $L__BB0_103; $L__BB0_101: rem.s32 %r290, %r49, %r124; add.s32 %r291, %r290, %r124; rem.s32 %r501, %r291, %r124; $L__BB0_103: mad.lo.s32 %r293, %r500, %r125, %r2; mad.lo.s32 %r294, %r293, %r124, %r501; mul.wide.s32 %rd72, %r294, 4; add.s64 %rd73, %rd3, %rd72; add.s64 %rd74, %rd2, %rd72; add.s64 %rd75, %rd1, %rd72; ld.global.nc.f32 %f458, [%rd73]; ld.global.nc.f32 %f459, [%rd74]; mul.f32 %f460, %f459, %f459; fma.rn.f32 %f461, %f458, %f458, %f460; ld.global.nc.f32 %f462, [%rd75]; fma.rn.f32 %f83, %f462, %f462, %f461; mul.f32 %f463, %f65, %f67; mul.f32 %f464, %f64, %f68; sub.f32 %f465, %f464, %f463; mul.f32 %f466, %f63, %f68; mul.f32 %f467, %f65, %f66; sub.f32 %f468, %f467, %f466; mul.f32 %f469, %f64, %f66; mul.f32 %f470, %f63, %f67; sub.f32 %f471, %f470, %f469; mul.f32 %f472, %f2, %f468; fma.rn.f32 %f473, %f1, %f465, %f472; fma.rn.f32 %f84, %f3, %f471, %f473; mul.f32 %f474, %f2, %f64; fma.rn.f32 %f475, %f1, %f63, %f474; fma.rn.f32 
%f476, %f3, %f65, %f475; add.f32 %f477, %f476, 0f3F800000; mul.f32 %f478, %f2, %f67; fma.rn.f32 %f479, %f1, %f66, %f478; fma.rn.f32 %f480, %f3, %f68, %f479; add.f32 %f481, %f477, %f480; mul.f32 %f482, %f64, %f67; fma.rn.f32 %f483, %f63, %f66, %f482; fma.rn.f32 %f484, %f65, %f68, %f483; add.f32 %f85, %f484, %f481; abs.f32 %f86, %f85; abs.f32 %f87, %f84; setp.eq.f32 %p118, %f86, 0f00000000; setp.eq.f32 %p119, %f87, 0f00000000; and.pred %p120, %p118, %p119; @%p120 bra $L__BB0_107; bra.uni $L__BB0_104; $L__BB0_107: mov.b32 %r305, %f85; shr.s32 %r306, %r305, 31; and.b32 %r307, %r306, 1078530011; mov.b32 %r308, %f84; and.b32 %r309, %r308, -2147483648; or.b32 %r310, %r309, %r307; mov.b32 %f867, %r310; bra.uni $L__BB0_108; $L__BB0_104: setp.eq.f32 %p121, %f86, 0f7F800000; setp.eq.f32 %p122, %f87, 0f7F800000; and.pred %p123, %p121, %p122; @%p123 bra $L__BB0_106; bra.uni $L__BB0_105; $L__BB0_106: mov.b32 %r300, %f85; setp.lt.s32 %p127, %r300, 0; selp.b32 %r301, 1075235812, 1061752795, %p127; mov.b32 %r302, %f84; and.b32 %r303, %r302, -2147483648; or.b32 %r304, %r303, %r301; mov.b32 %f867, %r304; bra.uni $L__BB0_108; $L__BB0_105: max.f32 %f485, %f87, %f86; min.f32 %f486, %f87, %f86; div.rn.f32 %f487, %f486, %f485; mul.rn.f32 %f488, %f487, %f487; mov.f32 %f489, 0fC0B59883; mov.f32 %f490, 0fBF52C7EA; fma.rn.f32 %f491, %f488, %f490, %f489; mov.f32 %f492, 0fC0D21907; fma.rn.f32 %f493, %f491, %f488, %f492; mul.f32 %f494, %f488, %f493; mul.f32 %f495, %f487, %f494; add.f32 %f496, %f488, 0f41355DC0; mov.f32 %f497, 0f41E6BD60; fma.rn.f32 %f498, %f496, %f488, %f497; mov.f32 %f499, 0f419D92C8; fma.rn.f32 %f500, %f498, %f488, %f499; rcp.rn.f32 %f501, %f500; fma.rn.f32 %f502, %f495, %f501, %f487; mov.f32 %f503, 0f3FC90FDB; sub.f32 %f504, %f503, %f502; setp.gt.f32 %p124, %f87, %f86; selp.f32 %f505, %f504, %f502, %p124; mov.b32 %r295, %f85; setp.lt.s32 %p125, %r295, 0; mov.f32 %f506, 0f40490FDB; sub.f32 %f507, %f506, %f505; selp.f32 %f508, %f507, %f505, %p125; mov.b32 %r296, %f508; mov.b32 
%r297, %f84; and.b32 %r298, %r297, -2147483648; or.b32 %r299, %r298, %r296; mov.b32 %f509, %r299; add.f32 %f510, %f86, %f87; setp.le.f32 %p126, %f510, 0f7F800000; selp.f32 %f867, %f509, %f510, %p126; $L__BB0_108: add.f32 %f511, %f867, %f867; setp.eq.f32 %p128, %f83, 0f00000000; selp.f32 %f512, 0f3F800000, 0f3F000000, %p128; fma.rn.f32 %f868, %f512, %f511, %f868; $L__BB0_109: ld.param.u8 %rs6, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs5, %rs6, 1; setp.ne.s16 %p129, %rs5, 0; setp.gt.s32 %p130, %r1, 0; or.pred %p3, %p130, %p129; and.pred %p131, %p3, %p2; not.pred %p132, %p131; @%p132 bra $L__BB0_122; @%p13 bra $L__BB0_112; bra.uni $L__BB0_111; $L__BB0_112: max.s32 %r502, %r18, 0; bra.uni $L__BB0_113; $L__BB0_111: rem.s32 %r311, %r18, %r126; add.s32 %r312, %r311, %r126; rem.s32 %r502, %r312, %r126; $L__BB0_113: @%p88 bra $L__BB0_115; bra.uni $L__BB0_114; $L__BB0_115: max.s32 %r503, %r56, 0; bra.uni $L__BB0_116; $L__BB0_114: rem.s32 %r313, %r56, %r124; add.s32 %r314, %r313, %r124; rem.s32 %r503, %r314, %r124; $L__BB0_116: mad.lo.s32 %r315, %r502, %r125, %r2; mad.lo.s32 %r316, %r315, %r124, %r503; mul.wide.s32 %rd76, %r316, 4; add.s64 %rd77, %rd3, %rd76; add.s64 %rd78, %rd2, %rd76; add.s64 %rd79, %rd1, %rd76; ld.global.nc.f32 %f513, [%rd77]; ld.global.nc.f32 %f514, [%rd78]; mul.f32 %f515, %f514, %f514; fma.rn.f32 %f516, %f513, %f513, %f515; ld.global.nc.f32 %f517, [%rd79]; fma.rn.f32 %f94, %f517, %f517, %f516; mul.f32 %f518, %f68, %f70; mul.f32 %f519, %f67, %f71; sub.f32 %f520, %f519, %f518; mul.f32 %f521, %f66, %f71; mul.f32 %f522, %f68, %f69; sub.f32 %f523, %f522, %f521; mul.f32 %f524, %f67, %f69; mul.f32 %f525, %f66, %f70; sub.f32 %f526, %f525, %f524; mul.f32 %f527, %f2, %f523; fma.rn.f32 %f528, %f1, %f520, %f527; fma.rn.f32 %f95, %f3, %f526, %f528; mul.f32 %f529, %f2, %f67; fma.rn.f32 %f530, %f1, %f66, %f529; fma.rn.f32 %f531, %f3, %f68, %f530; add.f32 %f532, %f531, 0f3F800000; mul.f32 %f533, %f2, %f70; fma.rn.f32 %f534, %f1, %f69, %f533; fma.rn.f32 
%f535, %f3, %f71, %f534; add.f32 %f536, %f532, %f535; mul.f32 %f537, %f67, %f70; fma.rn.f32 %f538, %f66, %f69, %f537; fma.rn.f32 %f539, %f68, %f71, %f538; add.f32 %f96, %f539, %f536; abs.f32 %f97, %f96; abs.f32 %f98, %f95; setp.eq.f32 %p135, %f97, 0f00000000; setp.eq.f32 %p136, %f98, 0f00000000; and.pred %p137, %p135, %p136; @%p137 bra $L__BB0_120; bra.uni $L__BB0_117; $L__BB0_120: mov.b32 %r327, %f96; shr.s32 %r328, %r327, 31; and.b32 %r329, %r328, 1078530011; mov.b32 %r330, %f95; and.b32 %r331, %r330, -2147483648; or.b32 %r332, %r331, %r329; mov.b32 %f869, %r332; bra.uni $L__BB0_121; $L__BB0_117: setp.eq.f32 %p138, %f97, 0f7F800000; setp.eq.f32 %p139, %f98, 0f7F800000; and.pred %p140, %p138, %p139; @%p140 bra $L__BB0_119; bra.uni $L__BB0_118; $L__BB0_119: mov.b32 %r322, %f96; setp.lt.s32 %p144, %r322, 0; selp.b32 %r323, 1075235812, 1061752795, %p144; mov.b32 %r324, %f95; and.b32 %r325, %r324, -2147483648; or.b32 %r326, %r325, %r323; mov.b32 %f869, %r326; bra.uni $L__BB0_121; $L__BB0_118: max.f32 %f540, %f98, %f97; min.f32 %f541, %f98, %f97; div.rn.f32 %f542, %f541, %f540; mul.rn.f32 %f543, %f542, %f542; mov.f32 %f544, 0fC0B59883; mov.f32 %f545, 0fBF52C7EA; fma.rn.f32 %f546, %f543, %f545, %f544; mov.f32 %f547, 0fC0D21907; fma.rn.f32 %f548, %f546, %f543, %f547; mul.f32 %f549, %f543, %f548; mul.f32 %f550, %f542, %f549; add.f32 %f551, %f543, 0f41355DC0; mov.f32 %f552, 0f41E6BD60; fma.rn.f32 %f553, %f551, %f543, %f552; mov.f32 %f554, 0f419D92C8; fma.rn.f32 %f555, %f553, %f543, %f554; rcp.rn.f32 %f556, %f555; fma.rn.f32 %f557, %f550, %f556, %f542; mov.f32 %f558, 0f3FC90FDB; sub.f32 %f559, %f558, %f557; setp.gt.f32 %p141, %f98, %f97; selp.f32 %f560, %f559, %f557, %p141; mov.b32 %r317, %f96; setp.lt.s32 %p142, %r317, 0; mov.f32 %f561, 0f40490FDB; sub.f32 %f562, %f561, %f560; selp.f32 %f563, %f562, %f560, %p142; mov.b32 %r318, %f563; mov.b32 %r319, %f95; and.b32 %r320, %r319, -2147483648; or.b32 %r321, %r320, %r318; mov.b32 %f564, %r321; add.f32 %f565, %f97, %f98; 
setp.le.f32 %p143, %f565, 0f7F800000; selp.f32 %f869, %f564, %f565, %p143; $L__BB0_121: add.f32 %f566, %f869, %f869; setp.eq.f32 %p145, %f94, 0f00000000; selp.f32 %f567, 0f3F800000, 0f3F000000, %p145; fma.rn.f32 %f868, %f567, %f566, %f868; $L__BB0_122: setp.lt.s32 %p146, %r49, %r124; or.pred %p4, %p146, %p129; and.pred %p5, %p4, %p2; not.pred %p148, %p5; @%p148 bra $L__BB0_135; @%p13 bra $L__BB0_125; bra.uni $L__BB0_124; $L__BB0_125: add.s32 %r335, %r126, -1; min.s32 %r504, %r10, %r335; bra.uni $L__BB0_126; $L__BB0_124: rem.s32 %r333, %r10, %r126; add.s32 %r334, %r333, %r126; rem.s32 %r504, %r334, %r126; $L__BB0_126: mad.lo.s32 %r81, %r504, %r125, %r2; @%p88 bra $L__BB0_128; bra.uni $L__BB0_127; $L__BB0_128: max.s32 %r505, %r56, 0; bra.uni $L__BB0_129; $L__BB0_127: rem.s32 %r336, %r56, %r124; add.s32 %r337, %r336, %r124; rem.s32 %r505, %r337, %r124; $L__BB0_129: mad.lo.s32 %r338, %r81, %r124, %r505; mul.wide.s32 %rd80, %r338, 4; add.s64 %rd81, %rd3, %rd80; add.s64 %rd82, %rd2, %rd80; add.s64 %rd83, %rd1, %rd80; ld.global.nc.f32 %f568, [%rd81]; ld.global.nc.f32 %f569, [%rd82]; mul.f32 %f570, %f569, %f569; fma.rn.f32 %f571, %f568, %f568, %f570; ld.global.nc.f32 %f572, [%rd83]; fma.rn.f32 %f105, %f572, %f572, %f571; mul.f32 %f573, %f61, %f71; mul.f32 %f574, %f62, %f70; sub.f32 %f575, %f574, %f573; mul.f32 %f576, %f62, %f69; mul.f32 %f577, %f60, %f71; sub.f32 %f578, %f577, %f576; mul.f32 %f579, %f60, %f70; mul.f32 %f580, %f61, %f69; sub.f32 %f581, %f580, %f579; mul.f32 %f582, %f2, %f578; fma.rn.f32 %f583, %f1, %f575, %f582; fma.rn.f32 %f106, %f3, %f581, %f583; mul.f32 %f584, %f2, %f70; fma.rn.f32 %f585, %f1, %f69, %f584; fma.rn.f32 %f586, %f3, %f71, %f585; add.f32 %f587, %f586, 0f3F800000; mul.f32 %f588, %f2, %f61; fma.rn.f32 %f589, %f1, %f60, %f588; fma.rn.f32 %f590, %f3, %f62, %f589; add.f32 %f591, %f590, %f587; mul.f32 %f592, %f61, %f70; fma.rn.f32 %f593, %f60, %f69, %f592; fma.rn.f32 %f594, %f62, %f71, %f593; add.f32 %f107, %f594, %f591; abs.f32 %f108, %f107; 
abs.f32 %f109, %f106; setp.eq.f32 %p151, %f108, 0f00000000; setp.eq.f32 %p152, %f109, 0f00000000; and.pred %p153, %p151, %p152; @%p153 bra $L__BB0_133; bra.uni $L__BB0_130; $L__BB0_133: mov.b32 %r349, %f107; shr.s32 %r350, %r349, 31; and.b32 %r351, %r350, 1078530011; mov.b32 %r352, %f106; and.b32 %r353, %r352, -2147483648; or.b32 %r354, %r351, %r353; mov.b32 %f871, %r354; bra.uni $L__BB0_134; $L__BB0_130: setp.eq.f32 %p154, %f108, 0f7F800000; setp.eq.f32 %p155, %f109, 0f7F800000; and.pred %p156, %p154, %p155; @%p156 bra $L__BB0_132; bra.uni $L__BB0_131; $L__BB0_132: mov.b32 %r344, %f107; setp.lt.s32 %p160, %r344, 0; selp.b32 %r345, 1075235812, 1061752795, %p160; mov.b32 %r346, %f106; and.b32 %r347, %r346, -2147483648; or.b32 %r348, %r345, %r347; mov.b32 %f871, %r348; bra.uni $L__BB0_134; $L__BB0_131: max.f32 %f595, %f109, %f108; min.f32 %f596, %f109, %f108; div.rn.f32 %f597, %f596, %f595; mul.rn.f32 %f598, %f597, %f597; mov.f32 %f599, 0fC0B59883; mov.f32 %f600, 0fBF52C7EA; fma.rn.f32 %f601, %f598, %f600, %f599; mov.f32 %f602, 0fC0D21907; fma.rn.f32 %f603, %f601, %f598, %f602; mul.f32 %f604, %f598, %f603; mul.f32 %f605, %f597, %f604; add.f32 %f606, %f598, 0f41355DC0; mov.f32 %f607, 0f41E6BD60; fma.rn.f32 %f608, %f606, %f598, %f607; mov.f32 %f609, 0f419D92C8; fma.rn.f32 %f610, %f608, %f598, %f609; rcp.rn.f32 %f611, %f610; fma.rn.f32 %f612, %f605, %f611, %f597; mov.f32 %f613, 0f3FC90FDB; sub.f32 %f614, %f613, %f612; setp.gt.f32 %p157, %f109, %f108; selp.f32 %f615, %f614, %f612, %p157; mov.b32 %r339, %f107; setp.lt.s32 %p158, %r339, 0; mov.f32 %f616, 0f40490FDB; sub.f32 %f617, %f616, %f615; selp.f32 %f618, %f617, %f615, %p158; mov.b32 %r340, %f618; mov.b32 %r341, %f106; and.b32 %r342, %r341, -2147483648; or.b32 %r343, %r342, %r340; mov.b32 %f619, %r343; add.f32 %f620, %f108, %f109; setp.le.f32 %p159, %f620, 0f7F800000; selp.f32 %f871, %f619, %f620, %p159; $L__BB0_134: add.f32 %f621, %f871, %f871; setp.eq.f32 %p161, %f105, 0f00000000; selp.f32 %f622, 0f3F800000, 
0f3F000000, %p161; fma.rn.f32 %f868, %f622, %f621, %f868; $L__BB0_135: @%p88 bra $L__BB0_137; bra.uni $L__BB0_136; $L__BB0_137: add.s32 %r357, %r124, -1; min.s32 %r506, %r49, %r357; bra.uni $L__BB0_138; $L__BB0_136: rem.s32 %r355, %r49, %r124; add.s32 %r356, %r355, %r124; rem.s32 %r506, %r356, %r124; $L__BB0_138: @%p12 bra $L__BB0_140; bra.uni $L__BB0_139; $L__BB0_140: add.s32 %r360, %r125, -1; min.s32 %r507, %r6, %r360; bra.uni $L__BB0_141; $L__BB0_139: rem.s32 %r358, %r6, %r125; add.s32 %r359, %r358, %r125; rem.s32 %r507, %r359, %r125; $L__BB0_141: @%p88 bra $L__BB0_143; bra.uni $L__BB0_142; $L__BB0_143: max.s32 %r508, %r56, 0; bra.uni $L__BB0_144; $L__BB0_142: rem.s32 %r361, %r56, %r124; add.s32 %r362, %r361, %r124; rem.s32 %r508, %r362, %r124; $L__BB0_144: add.s32 %r94, %r508, %r5; add.s32 %r95, %r506, %r5; @%p12 bra $L__BB0_146; bra.uni $L__BB0_145; $L__BB0_146: max.s32 %r509, %r14, 0; bra.uni $L__BB0_147; $L__BB0_145: rem.s32 %r363, %r14, %r125; add.s32 %r364, %r363, %r125; rem.s32 %r509, %r364, %r125; $L__BB0_147: add.s32 %r365, %r509, %r4; mad.lo.s32 %r366, %r365, %r124, %r1; mul.wide.s32 %rd84, %r95, 4; add.s64 %rd85, %rd3, %rd84; ld.global.nc.f32 %f116, [%rd85]; add.s64 %rd86, %rd2, %rd84; ld.global.nc.f32 %f117, [%rd86]; add.s64 %rd87, %rd1, %rd84; ld.global.nc.f32 %f118, [%rd87]; add.s32 %r367, %r507, %r4; mad.lo.s32 %r368, %r367, %r124, %r1; mul.wide.s32 %rd88, %r368, 4; add.s64 %rd89, %rd3, %rd88; ld.global.nc.f32 %f119, [%rd89]; add.s64 %rd90, %rd2, %rd88; ld.global.nc.f32 %f120, [%rd90]; add.s64 %rd91, %rd1, %rd88; ld.global.nc.f32 %f121, [%rd91]; mul.wide.s32 %rd92, %r94, 4; add.s64 %rd93, %rd3, %rd92; ld.global.nc.f32 %f122, [%rd93]; add.s64 %rd94, %rd2, %rd92; ld.global.nc.f32 %f123, [%rd94]; add.s64 %rd95, %rd1, %rd92; ld.global.nc.f32 %f124, [%rd95]; mul.wide.s32 %rd96, %r366, 4; add.s64 %rd97, %rd3, %rd96; ld.global.nc.f32 %f125, [%rd97]; add.s64 %rd98, %rd2, %rd96; ld.global.nc.f32 %f126, [%rd98]; add.s64 %rd99, %rd1, %rd96; ld.global.nc.f32 
%f127, [%rd99]; and.pred %p166, %p4, %p1; mov.f32 %f878, 0f00000000; not.pred %p167, %p166; @%p167 bra $L__BB0_160; @%p12 bra $L__BB0_150; bra.uni $L__BB0_149; $L__BB0_150: add.s32 %r371, %r125, -1; min.s32 %r510, %r6, %r371; bra.uni $L__BB0_151; $L__BB0_149: rem.s32 %r369, %r6, %r125; add.s32 %r370, %r369, %r125; rem.s32 %r510, %r370, %r125; $L__BB0_151: @%p88 bra $L__BB0_153; bra.uni $L__BB0_152; $L__BB0_153: add.s32 %r374, %r124, -1; min.s32 %r511, %r49, %r374; bra.uni $L__BB0_154; $L__BB0_152: rem.s32 %r372, %r49, %r124; add.s32 %r373, %r372, %r124; rem.s32 %r511, %r373, %r124; $L__BB0_154: add.s32 %r375, %r510, %r4; mad.lo.s32 %r376, %r375, %r124, %r511; mul.wide.s32 %rd100, %r376, 4; add.s64 %rd101, %rd3, %rd100; add.s64 %rd102, %rd2, %rd100; add.s64 %rd103, %rd1, %rd100; ld.global.nc.f32 %f624, [%rd101]; ld.global.nc.f32 %f625, [%rd102]; mul.f32 %f626, %f625, %f625; fma.rn.f32 %f627, %f624, %f624, %f626; ld.global.nc.f32 %f628, [%rd103]; fma.rn.f32 %f128, %f628, %f628, %f627; mul.f32 %f629, %f118, %f120; mul.f32 %f630, %f117, %f121; sub.f32 %f631, %f630, %f629; mul.f32 %f632, %f116, %f121; mul.f32 %f633, %f118, %f119; sub.f32 %f634, %f633, %f632; mul.f32 %f635, %f117, %f119; mul.f32 %f636, %f116, %f120; sub.f32 %f637, %f636, %f635; mul.f32 %f638, %f2, %f634; fma.rn.f32 %f639, %f1, %f631, %f638; fma.rn.f32 %f129, %f3, %f637, %f639; mul.f32 %f640, %f2, %f117; fma.rn.f32 %f641, %f1, %f116, %f640; fma.rn.f32 %f642, %f3, %f118, %f641; add.f32 %f643, %f642, 0f3F800000; mul.f32 %f644, %f2, %f120; fma.rn.f32 %f645, %f1, %f119, %f644; fma.rn.f32 %f646, %f3, %f121, %f645; add.f32 %f647, %f643, %f646; mul.f32 %f648, %f117, %f120; fma.rn.f32 %f649, %f116, %f119, %f648; fma.rn.f32 %f650, %f118, %f121, %f649; add.f32 %f130, %f650, %f647; abs.f32 %f131, %f130; abs.f32 %f132, %f129; setp.eq.f32 %p170, %f131, 0f00000000; setp.eq.f32 %p171, %f132, 0f00000000; and.pred %p172, %p170, %p171; @%p172 bra $L__BB0_158; bra.uni $L__BB0_155; $L__BB0_158: mov.b32 %r387, %f130; shr.s32 
%r388, %r387, 31; and.b32 %r389, %r388, 1078530011; mov.b32 %r390, %f129; and.b32 %r391, %r390, -2147483648; or.b32 %r392, %r391, %r389; mov.b32 %f873, %r392; bra.uni $L__BB0_159; $L__BB0_155: setp.eq.f32 %p173, %f131, 0f7F800000; setp.eq.f32 %p174, %f132, 0f7F800000; and.pred %p175, %p173, %p174; @%p175 bra $L__BB0_157; bra.uni $L__BB0_156; $L__BB0_157: mov.b32 %r382, %f130; setp.lt.s32 %p179, %r382, 0; selp.b32 %r383, 1075235812, 1061752795, %p179; mov.b32 %r384, %f129; and.b32 %r385, %r384, -2147483648; or.b32 %r386, %r385, %r383; mov.b32 %f873, %r386; bra.uni $L__BB0_159; $L__BB0_156: max.f32 %f651, %f132, %f131; min.f32 %f652, %f132, %f131; div.rn.f32 %f653, %f652, %f651; mul.rn.f32 %f654, %f653, %f653; mov.f32 %f655, 0fC0B59883; mov.f32 %f656, 0fBF52C7EA; fma.rn.f32 %f657, %f654, %f656, %f655; mov.f32 %f658, 0fC0D21907; fma.rn.f32 %f659, %f657, %f654, %f658; mul.f32 %f660, %f654, %f659; mul.f32 %f661, %f653, %f660; add.f32 %f662, %f654, 0f41355DC0; mov.f32 %f663, 0f41E6BD60; fma.rn.f32 %f664, %f662, %f654, %f663; mov.f32 %f665, 0f419D92C8; fma.rn.f32 %f666, %f664, %f654, %f665; rcp.rn.f32 %f667, %f666; fma.rn.f32 %f668, %f661, %f667, %f653; mov.f32 %f669, 0f3FC90FDB; sub.f32 %f670, %f669, %f668; setp.gt.f32 %p176, %f132, %f131; selp.f32 %f671, %f670, %f668, %p176; mov.b32 %r377, %f130; setp.lt.s32 %p177, %r377, 0; mov.f32 %f672, 0f40490FDB; sub.f32 %f673, %f672, %f671; selp.f32 %f674, %f673, %f671, %p177; mov.b32 %r378, %f674; mov.b32 %r379, %f129; and.b32 %r380, %r379, -2147483648; or.b32 %r381, %r380, %r378; mov.b32 %f675, %r381; add.f32 %f676, %f131, %f132; setp.le.f32 %p178, %f676, 0f7F800000; selp.f32 %f873, %f675, %f676, %p178; $L__BB0_159: add.f32 %f677, %f873, %f873; setp.eq.f32 %p180, %f128, 0f00000000; selp.f32 %f678, 0f3F800000, 0f3F000000, %p180; fma.rn.f32 %f878, %f678, %f677, 0f00000000; $L__BB0_160: not.pred %p181, %p3; @%p181 bra $L__BB0_187; @%p18 bra $L__BB0_174; @%p12 bra $L__BB0_164; bra.uni $L__BB0_163; $L__BB0_164: add.s32 %r395, %r125, 
-1; min.s32 %r512, %r6, %r395; bra.uni $L__BB0_165; $L__BB0_163: rem.s32 %r393, %r6, %r125; add.s32 %r394, %r393, %r125; rem.s32 %r512, %r394, %r125; $L__BB0_165: @%p88 bra $L__BB0_167; bra.uni $L__BB0_166; $L__BB0_167: max.s32 %r513, %r56, 0; bra.uni $L__BB0_168; $L__BB0_166: rem.s32 %r396, %r56, %r124; add.s32 %r397, %r396, %r124; rem.s32 %r513, %r397, %r124; $L__BB0_168: add.s32 %r398, %r512, %r4; mad.lo.s32 %r399, %r398, %r124, %r513; mul.wide.s32 %rd104, %r399, 4; add.s64 %rd105, %rd3, %rd104; add.s64 %rd106, %rd2, %rd104; add.s64 %rd107, %rd1, %rd104; ld.global.nc.f32 %f679, [%rd105]; ld.global.nc.f32 %f680, [%rd106]; mul.f32 %f681, %f680, %f680; fma.rn.f32 %f682, %f679, %f679, %f681; ld.global.nc.f32 %f683, [%rd107]; fma.rn.f32 %f139, %f683, %f683, %f682; mul.f32 %f684, %f121, %f123; mul.f32 %f685, %f120, %f124; sub.f32 %f686, %f685, %f684; mul.f32 %f687, %f119, %f124; mul.f32 %f688, %f121, %f122; sub.f32 %f689, %f688, %f687; mul.f32 %f690, %f120, %f122; mul.f32 %f691, %f119, %f123; sub.f32 %f692, %f691, %f690; mul.f32 %f693, %f2, %f689; fma.rn.f32 %f694, %f1, %f686, %f693; fma.rn.f32 %f140, %f3, %f692, %f694; mul.f32 %f695, %f2, %f120; fma.rn.f32 %f696, %f1, %f119, %f695; fma.rn.f32 %f697, %f3, %f121, %f696; add.f32 %f698, %f697, 0f3F800000; mul.f32 %f699, %f2, %f123; fma.rn.f32 %f700, %f1, %f122, %f699; fma.rn.f32 %f701, %f3, %f124, %f700; add.f32 %f702, %f698, %f701; mul.f32 %f703, %f120, %f123; fma.rn.f32 %f704, %f119, %f122, %f703; fma.rn.f32 %f705, %f121, %f124, %f704; add.f32 %f141, %f705, %f702; abs.f32 %f142, %f141; abs.f32 %f143, %f140; setp.eq.f32 %p185, %f142, 0f00000000; setp.eq.f32 %p186, %f143, 0f00000000; and.pred %p187, %p185, %p186; @%p187 bra $L__BB0_172; bra.uni $L__BB0_169; $L__BB0_172: mov.b32 %r410, %f141; shr.s32 %r411, %r410, 31; and.b32 %r412, %r411, 1078530011; mov.b32 %r413, %f140; and.b32 %r414, %r413, -2147483648; or.b32 %r415, %r414, %r412; mov.b32 %f875, %r415; bra.uni $L__BB0_173; $L__BB0_169: setp.eq.f32 %p188, %f142, 
0f7F800000; setp.eq.f32 %p189, %f143, 0f7F800000; and.pred %p190, %p188, %p189; @%p190 bra $L__BB0_171; bra.uni $L__BB0_170; $L__BB0_171: mov.b32 %r405, %f141; setp.lt.s32 %p194, %r405, 0; selp.b32 %r406, 1075235812, 1061752795, %p194; mov.b32 %r407, %f140; and.b32 %r408, %r407, -2147483648; or.b32 %r409, %r408, %r406; mov.b32 %f875, %r409; bra.uni $L__BB0_173; $L__BB0_170: max.f32 %f706, %f143, %f142; min.f32 %f707, %f143, %f142; div.rn.f32 %f708, %f707, %f706; mul.rn.f32 %f709, %f708, %f708; mov.f32 %f710, 0fC0B59883; mov.f32 %f711, 0fBF52C7EA; fma.rn.f32 %f712, %f709, %f711, %f710; mov.f32 %f713, 0fC0D21907; fma.rn.f32 %f714, %f712, %f709, %f713; mul.f32 %f715, %f709, %f714; mul.f32 %f716, %f708, %f715; add.f32 %f717, %f709, 0f41355DC0; mov.f32 %f718, 0f41E6BD60; fma.rn.f32 %f719, %f717, %f709, %f718; mov.f32 %f720, 0f419D92C8; fma.rn.f32 %f721, %f719, %f709, %f720; rcp.rn.f32 %f722, %f721; fma.rn.f32 %f723, %f716, %f722, %f708; mov.f32 %f724, 0f3FC90FDB; sub.f32 %f725, %f724, %f723; setp.gt.f32 %p191, %f143, %f142; selp.f32 %f726, %f725, %f723, %p191; mov.b32 %r400, %f141; setp.lt.s32 %p192, %r400, 0; mov.f32 %f727, 0f40490FDB; sub.f32 %f728, %f727, %f726; selp.f32 %f729, %f728, %f726, %p192; mov.b32 %r401, %f729; mov.b32 %r402, %f140; and.b32 %r403, %r402, -2147483648; or.b32 %r404, %r403, %r401; mov.b32 %f730, %r404; add.f32 %f731, %f142, %f143; setp.le.f32 %p193, %f731, 0f7F800000; selp.f32 %f875, %f730, %f731, %p193; $L__BB0_173: add.f32 %f732, %f875, %f875; setp.eq.f32 %p195, %f139, 0f00000000; selp.f32 %f733, 0f3F800000, 0f3F000000, %p195; fma.rn.f32 %f878, %f733, %f732, %f878; $L__BB0_174: not.pred %p224, %p2; @%p224 bra $L__BB0_187; @%p12 bra $L__BB0_177; bra.uni $L__BB0_176; $L__BB0_177: max.s32 %r514, %r14, 0; bra.uni $L__BB0_178; $L__BB0_176: rem.s32 %r416, %r14, %r125; add.s32 %r417, %r416, %r125; rem.s32 %r514, %r417, %r125; $L__BB0_178: @%p88 bra $L__BB0_180; bra.uni $L__BB0_179; $L__BB0_180: max.s32 %r515, %r56, 0; bra.uni $L__BB0_181; 
$L__BB0_179: rem.s32 %r418, %r56, %r124; add.s32 %r419, %r418, %r124; rem.s32 %r515, %r419, %r124; $L__BB0_181: add.s32 %r420, %r514, %r4; mad.lo.s32 %r421, %r420, %r124, %r515; mul.wide.s32 %rd108, %r421, 4; add.s64 %rd109, %rd3, %rd108; add.s64 %rd110, %rd2, %rd108; add.s64 %rd111, %rd1, %rd108; ld.global.nc.f32 %f734, [%rd109]; ld.global.nc.f32 %f735, [%rd110]; mul.f32 %f736, %f735, %f735; fma.rn.f32 %f737, %f734, %f734, %f736; ld.global.nc.f32 %f738, [%rd111]; fma.rn.f32 %f150, %f738, %f738, %f737; mul.f32 %f739, %f124, %f126; mul.f32 %f740, %f123, %f127; sub.f32 %f741, %f740, %f739; mul.f32 %f742, %f122, %f127; mul.f32 %f743, %f124, %f125; sub.f32 %f744, %f743, %f742; mul.f32 %f745, %f123, %f125; mul.f32 %f746, %f122, %f126; sub.f32 %f747, %f746, %f745; mul.f32 %f748, %f2, %f744; fma.rn.f32 %f749, %f1, %f741, %f748; fma.rn.f32 %f151, %f3, %f747, %f749; mul.f32 %f750, %f2, %f123; fma.rn.f32 %f751, %f1, %f122, %f750; fma.rn.f32 %f752, %f3, %f124, %f751; add.f32 %f753, %f752, 0f3F800000; mul.f32 %f754, %f2, %f126; fma.rn.f32 %f755, %f1, %f125, %f754; fma.rn.f32 %f756, %f3, %f127, %f755; add.f32 %f757, %f753, %f756; mul.f32 %f758, %f123, %f126; fma.rn.f32 %f759, %f122, %f125, %f758; fma.rn.f32 %f760, %f124, %f127, %f759; add.f32 %f152, %f760, %f757; abs.f32 %f153, %f152; abs.f32 %f154, %f151; setp.eq.f32 %p199, %f153, 0f00000000; setp.eq.f32 %p200, %f154, 0f00000000; and.pred %p201, %p199, %p200; @%p201 bra $L__BB0_185; bra.uni $L__BB0_182; $L__BB0_185: mov.b32 %r432, %f152; shr.s32 %r433, %r432, 31; and.b32 %r434, %r433, 1078530011; mov.b32 %r435, %f151; and.b32 %r436, %r435, -2147483648; or.b32 %r437, %r436, %r434; mov.b32 %f877, %r437; bra.uni $L__BB0_186; $L__BB0_182: setp.eq.f32 %p202, %f153, 0f7F800000; setp.eq.f32 %p203, %f154, 0f7F800000; and.pred %p204, %p202, %p203; @%p204 bra $L__BB0_184; bra.uni $L__BB0_183; $L__BB0_184: mov.b32 %r427, %f152; setp.lt.s32 %p208, %r427, 0; selp.b32 %r428, 1075235812, 1061752795, %p208; mov.b32 %r429, %f151; and.b32 
%r430, %r429, -2147483648; or.b32 %r431, %r430, %r428; mov.b32 %f877, %r431; bra.uni $L__BB0_186; $L__BB0_183: max.f32 %f761, %f154, %f153; min.f32 %f762, %f154, %f153; div.rn.f32 %f763, %f762, %f761; mul.rn.f32 %f764, %f763, %f763; mov.f32 %f765, 0fC0B59883; mov.f32 %f766, 0fBF52C7EA; fma.rn.f32 %f767, %f764, %f766, %f765; mov.f32 %f768, 0fC0D21907; fma.rn.f32 %f769, %f767, %f764, %f768; mul.f32 %f770, %f764, %f769; mul.f32 %f771, %f763, %f770; add.f32 %f772, %f764, 0f41355DC0; mov.f32 %f773, 0f41E6BD60; fma.rn.f32 %f774, %f772, %f764, %f773; mov.f32 %f775, 0f419D92C8; fma.rn.f32 %f776, %f774, %f764, %f775; rcp.rn.f32 %f777, %f776; fma.rn.f32 %f778, %f771, %f777, %f763; mov.f32 %f779, 0f3FC90FDB; sub.f32 %f780, %f779, %f778; setp.gt.f32 %p205, %f154, %f153; selp.f32 %f781, %f780, %f778, %p205; mov.b32 %r422, %f152; setp.lt.s32 %p206, %r422, 0; mov.f32 %f782, 0f40490FDB; sub.f32 %f783, %f782, %f781; selp.f32 %f784, %f783, %f781, %p206; mov.b32 %r423, %f784; mov.b32 %r424, %f151; and.b32 %r425, %r424, -2147483648; or.b32 %r426, %r425, %r423; mov.b32 %f785, %r426; add.f32 %f786, %f153, %f154; setp.le.f32 %p207, %f786, 0f7F800000; selp.f32 %f877, %f785, %f786, %p207; $L__BB0_186: add.f32 %f787, %f877, %f877; setp.eq.f32 %p209, %f150, 0f00000000; selp.f32 %f788, 0f3F800000, 0f3F000000, %p209; fma.rn.f32 %f878, %f788, %f787, %f878; $L__BB0_187: @%p148 bra $L__BB0_200; @%p12 bra $L__BB0_190; bra.uni $L__BB0_189; $L__BB0_190: max.s32 %r516, %r14, 0; bra.uni $L__BB0_191; $L__BB0_189: rem.s32 %r438, %r14, %r125; add.s32 %r439, %r438, %r125; rem.s32 %r516, %r439, %r125; $L__BB0_191: add.s32 %r120, %r516, %r4; @%p88 bra $L__BB0_193; bra.uni $L__BB0_192; $L__BB0_193: add.s32 %r442, %r124, -1; min.s32 %r517, %r49, %r442; bra.uni $L__BB0_194; $L__BB0_192: rem.s32 %r440, %r49, %r124; add.s32 %r441, %r440, %r124; rem.s32 %r517, %r441, %r124; $L__BB0_194: mad.lo.s32 %r443, %r120, %r124, %r517; mul.wide.s32 %rd112, %r443, 4; add.s64 %rd113, %rd3, %rd112; add.s64 %rd114, %rd2, 
%rd112; add.s64 %rd115, %rd1, %rd112; ld.global.nc.f32 %f789, [%rd113]; ld.global.nc.f32 %f790, [%rd114]; mul.f32 %f791, %f790, %f790; fma.rn.f32 %f792, %f789, %f789, %f791; ld.global.nc.f32 %f793, [%rd115]; fma.rn.f32 %f161, %f793, %f793, %f792; mul.f32 %f794, %f117, %f127; mul.f32 %f795, %f118, %f126; sub.f32 %f796, %f795, %f794; mul.f32 %f797, %f118, %f125; mul.f32 %f798, %f116, %f127; sub.f32 %f799, %f798, %f797; mul.f32 %f800, %f116, %f126; mul.f32 %f801, %f117, %f125; sub.f32 %f802, %f801, %f800; mul.f32 %f803, %f2, %f799; fma.rn.f32 %f804, %f1, %f796, %f803; fma.rn.f32 %f162, %f3, %f802, %f804; mul.f32 %f805, %f2, %f126; fma.rn.f32 %f806, %f1, %f125, %f805; fma.rn.f32 %f807, %f3, %f127, %f806; add.f32 %f808, %f807, 0f3F800000; mul.f32 %f809, %f2, %f117; fma.rn.f32 %f810, %f1, %f116, %f809; fma.rn.f32 %f811, %f3, %f118, %f810; add.f32 %f812, %f811, %f808; mul.f32 %f813, %f117, %f126; fma.rn.f32 %f814, %f116, %f125, %f813; fma.rn.f32 %f815, %f118, %f127, %f814; add.f32 %f163, %f815, %f812; abs.f32 %f164, %f163; abs.f32 %f165, %f162; setp.eq.f32 %p213, %f164, 0f00000000; setp.eq.f32 %p214, %f165, 0f00000000; and.pred %p215, %p213, %p214; @%p215 bra $L__BB0_198; bra.uni $L__BB0_195; $L__BB0_198: mov.b32 %r454, %f163; shr.s32 %r455, %r454, 31; and.b32 %r456, %r455, 1078530011; mov.b32 %r457, %f162; and.b32 %r458, %r457, -2147483648; or.b32 %r459, %r456, %r458; mov.b32 %f879, %r459; bra.uni $L__BB0_199; $L__BB0_195: setp.eq.f32 %p216, %f164, 0f7F800000; setp.eq.f32 %p217, %f165, 0f7F800000; and.pred %p218, %p216, %p217; @%p218 bra $L__BB0_197; bra.uni $L__BB0_196; $L__BB0_197: mov.b32 %r449, %f163; setp.lt.s32 %p222, %r449, 0; selp.b32 %r450, 1075235812, 1061752795, %p222; mov.b32 %r451, %f162; and.b32 %r452, %r451, -2147483648; or.b32 %r453, %r450, %r452; mov.b32 %f879, %r453; bra.uni $L__BB0_199; $L__BB0_196: max.f32 %f816, %f165, %f164; min.f32 %f817, %f165, %f164; div.rn.f32 %f818, %f817, %f816; mul.rn.f32 %f819, %f818, %f818; mov.f32 %f820, 0fC0B59883; 
mov.f32 %f821, 0fBF52C7EA; fma.rn.f32 %f822, %f819, %f821, %f820; mov.f32 %f823, 0fC0D21907; fma.rn.f32 %f824, %f822, %f819, %f823; mul.f32 %f825, %f819, %f824; mul.f32 %f826, %f818, %f825; add.f32 %f827, %f819, 0f41355DC0; mov.f32 %f828, 0f41E6BD60; fma.rn.f32 %f829, %f827, %f819, %f828; mov.f32 %f830, 0f419D92C8; fma.rn.f32 %f831, %f829, %f819, %f830; rcp.rn.f32 %f832, %f831; fma.rn.f32 %f833, %f826, %f832, %f818; mov.f32 %f834, 0f3FC90FDB; sub.f32 %f835, %f834, %f833; setp.gt.f32 %p219, %f165, %f164; selp.f32 %f836, %f835, %f833, %p219; mov.b32 %r444, %f163; setp.lt.s32 %p220, %r444, 0; mov.f32 %f837, 0f40490FDB; sub.f32 %f838, %f837, %f836; selp.f32 %f839, %f838, %f836, %p220; mov.b32 %r445, %f839; mov.b32 %r446, %f162; and.b32 %r447, %r446, -2147483648; or.b32 %r448, %r447, %r445; mov.b32 %f840, %r448; add.f32 %f841, %f164, %f165; setp.le.f32 %p221, %f841, 0f7F800000; selp.f32 %f879, %f840, %f841, %p221; $L__BB0_199: add.f32 %f842, %f879, %f879; setp.eq.f32 %p223, %f161, 0f00000000; selp.f32 %f843, 0f3F800000, 0f3F000000, %p223; fma.rn.f32 %f878, %f843, %f842, %f878; $L__BB0_200: ld.param.u64 %rd123, [setemergentmagneticfieldsolidangle_param_1]; mov.u32 %r481, %tid.x; mov.u32 %r480, %ntid.x; mov.u32 %r479, %ctaid.x; mad.lo.s32 %r478, %r479, %r480, %r481; add.s32 %r477, %r5, %r478; mul.wide.s32 %rd122, %r477, 4; cvta.to.global.u64 %rd121, %rd123; add.s64 %rd120, %rd121, %rd122; ld.param.f32 %f856, [setemergentmagneticfieldsolidangle_param_9]; ld.param.f32 %f855, [setemergentmagneticfieldsolidangle_param_8]; ld.param.u64 %rd119, [setemergentmagneticfieldsolidangle_param_0]; mov.u32 %r476, %tid.z; mov.u32 %r475, %ntid.z; mov.u32 %r474, %ctaid.z; mad.lo.s32 %r473, %r474, %r475, %r476; mov.u32 %r472, %tid.y; mov.u32 %r471, %ntid.y; mov.u32 %r470, %ctaid.y; mad.lo.s32 %r469, %r470, %r471, %r472; mul.lo.s32 %r468, %r473, %r125; add.s32 %r467, %r468, %r469; mov.u32 %r466, %tid.x; mov.u32 %r465, %ntid.x; mov.u32 %r464, %ctaid.x; mad.lo.s32 %r463, %r464, %r465, %r466; 
mul.lo.s32 %r462, %r467, %r124; add.s32 %r461, %r462, %r463; mul.wide.s32 %rd118, %r461, 4; cvta.to.global.u64 %rd117, %rd119; add.s64 %rd116, %rd117, %rd118; ld.param.f32 %f854, [setemergentmagneticfieldsolidangle_param_7]; ld.param.f32 %f853, [setemergentmagneticfieldsolidangle_param_6]; add.f32 %f844, %f853, %f853; mul.f32 %f845, %f844, %f854; mul.f32 %f846, %f845, %f862; st.global.f32 [%rd116], %f846; mul.f32 %f847, %f844, %f855; mul.f32 %f848, %f847, %f868; st.global.f32 [%rd120], %f848; mul.f32 %f849, %f844, %f856; mul.f32 %f850, %f849, %f878; st.global.f32 [%rd6], %f850; $L__BB0_202: ret; } ` setemergentmagneticfieldsolidangle_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl setemergentmagneticfieldsolidangle .visible .entry setemergentmagneticfieldsolidangle( .param .u64 setemergentmagneticfieldsolidangle_param_0, .param .u64 setemergentmagneticfieldsolidangle_param_1, .param .u64 setemergentmagneticfieldsolidangle_param_2, .param .u64 setemergentmagneticfieldsolidangle_param_3, .param .u64 setemergentmagneticfieldsolidangle_param_4, .param .u64 setemergentmagneticfieldsolidangle_param_5, .param .f32 setemergentmagneticfieldsolidangle_param_6, .param .f32 setemergentmagneticfieldsolidangle_param_7, .param .f32 setemergentmagneticfieldsolidangle_param_8, .param .f32 setemergentmagneticfieldsolidangle_param_9, .param .u32 setemergentmagneticfieldsolidangle_param_10, .param .u32 setemergentmagneticfieldsolidangle_param_11, .param .u32 setemergentmagneticfieldsolidangle_param_12, .param .u8 setemergentmagneticfieldsolidangle_param_13 ) { .reg .pred %p<225>; .reg .b16 %rs<8>; .reg .f32 %f<881>; .reg .b32 %r<518>; .reg .b64 %rd<124>; ld.param.u8 %rs4, [setemergentmagneticfieldsolidangle_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldsolidangle_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldsolidangle_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldsolidangle_param_2]; ld.param.u64 %rd10, 
[setemergentmagneticfieldsolidangle_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldsolidangle_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldsolidangle_param_5]; ld.param.u32 %r124, [setemergentmagneticfieldsolidangle_param_10]; ld.param.u32 %r125, [setemergentmagneticfieldsolidangle_param_11]; ld.param.u32 %r126, [setemergentmagneticfieldsolidangle_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r127, %ntid.x; mov.u32 %r128, %ctaid.x; mov.u32 %r129, %tid.x; mad.lo.s32 %r1, %r128, %r127, %r129; mov.u32 %r130, %ntid.y; mov.u32 %r131, %ctaid.y; mov.u32 %r132, %tid.y; mad.lo.s32 %r2, %r131, %r130, %r132; mov.u32 %r133, %ntid.z; mov.u32 %r134, %ctaid.z; mov.u32 %r135, %tid.z; mad.lo.s32 %r3, %r134, %r133, %r135; setp.ge.s32 %p6, %r1, %r124; setp.ge.s32 %p7, %r2, %r125; or.pred %p8, %p6, %p7; setp.ge.s32 %p9, %r3, %r126; or.pred %p10, %p8, %p9; @%p10 bra $L__BB0_202; mul.lo.s32 %r4, %r3, %r125; add.s32 %r136, %r4, %r2; mul.lo.s32 %r5, %r136, %r124; add.s32 %r137, %r5, %r1; mul.wide.s32 %rd13, %r137, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f176, %f2, %f2; fma.rn.f32 %f177, %f1, %f1, %f176; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f178, %f3, %f3, %f177; setp.eq.f32 %p11, %f178, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p11 bra $L__BB0_201; bra.uni $L__BB0_2; $L__BB0_201: mov.u32 %r460, 0; st.global.u32 [%rd4], %r460; st.global.u32 [%rd5], %r460; st.global.u32 [%rd6], %r460; bra.uni $L__BB0_202; $L__BB0_2: and.b16 %rs1, %rs4, 2; setp.eq.s16 %p12, %rs1, 0; add.s32 %r6, %r2, 1; @%p12 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r140, %r125, -1; min.s32 %r482, %r6, %r140; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r138, 
%r6, %r125; add.s32 %r139, %r138, %r125; rem.s32 %r482, %r139, %r125; $L__BB0_5: and.b16 %rs2, %rs4, 4; setp.eq.s16 %p13, %rs2, 0; add.s32 %r10, %r3, 1; @%p13 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r143, %r126, -1; min.s32 %r483, %r10, %r143; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r141, %r10, %r126; add.s32 %r142, %r141, %r126; rem.s32 %r483, %r142, %r126; $L__BB0_8: add.s32 %r14, %r2, -1; @%p12 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r484, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r144, %r14, %r125; add.s32 %r145, %r144, %r125; rem.s32 %r484, %r145, %r125; $L__BB0_11: add.s32 %r18, %r3, -1; @%p13 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r485, %r18, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r146, %r18, %r126; add.s32 %r147, %r146, %r126; rem.s32 %r485, %r147, %r126; $L__BB0_14: mad.lo.s32 %r148, %r485, %r125, %r2; mad.lo.s32 %r149, %r148, %r124, %r1; add.s32 %r150, %r482, %r4; mad.lo.s32 %r151, %r150, %r124, %r1; mul.wide.s32 %rd20, %r151, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f4, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f5, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f6, [%rd23]; mad.lo.s32 %r152, %r483, %r125, %r2; mad.lo.s32 %r153, %r152, %r124, %r1; mul.wide.s32 %rd24, %r153, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f7, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f8, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f9, [%rd27]; add.s32 %r154, %r484, %r4; mad.lo.s32 %r155, %r154, %r124, %r1; mul.wide.s32 %rd28, %r155, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f10, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f12, [%rd31]; mul.wide.s32 %rd32, %r149, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f13, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f14, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f15, [%rd35]; setp.ne.s16 %p16, %rs1, 0; setp.lt.s32 %p17, 
%r6, %r125; or.pred %p1, %p17, %p16; not.pred %p18, %p1; mov.f32 %f862, 0f00000000; @%p18 bra $L__BB0_28; setp.ge.s32 %p19, %r10, %r126; and.pred %p21, %p19, %p13; @%p21 bra $L__BB0_28; @%p13 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r158, %r126, -1; min.s32 %r486, %r10, %r158; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r156, %r10, %r126; add.s32 %r157, %r156, %r126; rem.s32 %r486, %r157, %r126; $L__BB0_19: @%p12 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r161, %r125, -1; min.s32 %r487, %r6, %r161; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r159, %r6, %r125; add.s32 %r160, %r159, %r125; rem.s32 %r487, %r160, %r125; $L__BB0_22: mad.lo.s32 %r162, %r486, %r125, %r487; mad.lo.s32 %r163, %r162, %r124, %r1; mul.wide.s32 %rd36, %r163, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f181, [%rd37]; ld.global.nc.f32 %f182, [%rd38]; mul.f32 %f183, %f182, %f182; fma.rn.f32 %f184, %f181, %f181, %f183; ld.global.nc.f32 %f185, [%rd39]; fma.rn.f32 %f16, %f185, %f185, %f184; mul.f32 %f186, %f6, %f8; mul.f32 %f187, %f5, %f9; sub.f32 %f188, %f187, %f186; mul.f32 %f189, %f4, %f9; mul.f32 %f190, %f6, %f7; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f5, %f7; mul.f32 %f193, %f4, %f8; sub.f32 %f194, %f193, %f192; mul.f32 %f195, %f2, %f191; fma.rn.f32 %f196, %f1, %f188, %f195; fma.rn.f32 %f17, %f3, %f194, %f196; mul.f32 %f197, %f2, %f5; fma.rn.f32 %f198, %f1, %f4, %f197; fma.rn.f32 %f199, %f3, %f6, %f198; add.f32 %f200, %f199, 0f3F800000; mul.f32 %f201, %f2, %f8; fma.rn.f32 %f202, %f1, %f7, %f201; fma.rn.f32 %f203, %f3, %f9, %f202; add.f32 %f204, %f200, %f203; mul.f32 %f205, %f5, %f8; fma.rn.f32 %f206, %f4, %f7, %f205; fma.rn.f32 %f207, %f6, %f9, %f206; add.f32 %f18, %f207, %f204; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p24, %f19, 0f00000000; setp.eq.f32 %p25, %f20, 0f00000000; and.pred %p26, %p24, %p25; @%p26 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r174, %f18; shr.s32 %r175, 
%r174, 31; and.b32 %r176, %r175, 1078530011; mov.b32 %r177, %f17; and.b32 %r178, %r177, -2147483648; or.b32 %r179, %r178, %r176; mov.b32 %f857, %r179; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p27, %f19, 0f7F800000; setp.eq.f32 %p28, %f20, 0f7F800000; and.pred %p29, %p27, %p28; @%p29 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r169, %f18; setp.lt.s32 %p33, %r169, 0; selp.b32 %r170, 1075235812, 1061752795, %p33; mov.b32 %r171, %f17; and.b32 %r172, %r171, -2147483648; or.b32 %r173, %r172, %r170; mov.b32 %f857, %r173; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f208, %f20, %f19; min.f32 %f209, %f20, %f19; div.rn.f32 %f210, %f209, %f208; mul.rn.f32 %f211, %f210, %f210; mov.f32 %f212, 0fC0B59883; mov.f32 %f213, 0fBF52C7EA; fma.rn.f32 %f214, %f211, %f213, %f212; mov.f32 %f215, 0fC0D21907; fma.rn.f32 %f216, %f214, %f211, %f215; mul.f32 %f217, %f211, %f216; mul.f32 %f218, %f210, %f217; add.f32 %f219, %f211, 0f41355DC0; mov.f32 %f220, 0f41E6BD60; fma.rn.f32 %f221, %f219, %f211, %f220; mov.f32 %f222, 0f419D92C8; fma.rn.f32 %f223, %f221, %f211, %f222; rcp.rn.f32 %f224, %f223; fma.rn.f32 %f225, %f218, %f224, %f210; mov.f32 %f226, 0f3FC90FDB; sub.f32 %f227, %f226, %f225; setp.gt.f32 %p30, %f20, %f19; selp.f32 %f228, %f227, %f225, %p30; mov.b32 %r164, %f18; setp.lt.s32 %p31, %r164, 0; mov.f32 %f229, 0f40490FDB; sub.f32 %f230, %f229, %f228; selp.f32 %f231, %f230, %f228, %p31; mov.b32 %r165, %f231; mov.b32 %r166, %f17; and.b32 %r167, %r166, -2147483648; or.b32 %r168, %r167, %r165; mov.b32 %f232, %r168; add.f32 %f233, %f19, %f20; setp.le.f32 %p32, %f233, 0f7F800000; selp.f32 %f857, %f232, %f233, %p32; $L__BB0_27: add.f32 %f234, %f857, %f857; setp.eq.f32 %p34, %f16, 0f00000000; selp.f32 %f235, 0f3F800000, 0f3F000000, %p34; fma.rn.f32 %f862, %f235, %f234, 0f00000000; $L__BB0_28: setp.gt.s32 %p35, %r2, 0; or.pred %p2, %p35, %p16; not.pred %p37, %p2; @%p37 bra $L__BB0_55; setp.ge.s32 %p38, %r10, %r126; and.pred %p40, %p38, %p13; @%p40 bra $L__BB0_42; @%p13 bra $L__BB0_32; 
bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r182, %r126, -1; min.s32 %r488, %r10, %r182; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r180, %r10, %r126; add.s32 %r181, %r180, %r126; rem.s32 %r488, %r181, %r126; $L__BB0_33: @%p12 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r489, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r183, %r14, %r125; add.s32 %r184, %r183, %r125; rem.s32 %r489, %r184, %r125; $L__BB0_36: mad.lo.s32 %r185, %r488, %r125, %r489; mad.lo.s32 %r186, %r185, %r124, %r1; mul.wide.s32 %rd40, %r186, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f236, [%rd41]; ld.global.nc.f32 %f237, [%rd42]; mul.f32 %f238, %f237, %f237; fma.rn.f32 %f239, %f236, %f236, %f238; ld.global.nc.f32 %f240, [%rd43]; fma.rn.f32 %f27, %f240, %f240, %f239; mul.f32 %f241, %f9, %f11; mul.f32 %f242, %f8, %f12; sub.f32 %f243, %f242, %f241; mul.f32 %f244, %f7, %f12; mul.f32 %f245, %f9, %f10; sub.f32 %f246, %f245, %f244; mul.f32 %f247, %f8, %f10; mul.f32 %f248, %f7, %f11; sub.f32 %f249, %f248, %f247; mul.f32 %f250, %f2, %f246; fma.rn.f32 %f251, %f1, %f243, %f250; fma.rn.f32 %f28, %f3, %f249, %f251; mul.f32 %f252, %f2, %f8; fma.rn.f32 %f253, %f1, %f7, %f252; fma.rn.f32 %f254, %f3, %f9, %f253; add.f32 %f255, %f254, 0f3F800000; mul.f32 %f256, %f2, %f11; fma.rn.f32 %f257, %f1, %f10, %f256; fma.rn.f32 %f258, %f3, %f12, %f257; add.f32 %f259, %f255, %f258; mul.f32 %f260, %f8, %f11; fma.rn.f32 %f261, %f7, %f10, %f260; fma.rn.f32 %f262, %f9, %f12, %f261; add.f32 %f29, %f262, %f259; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p43, %f30, 0f00000000; setp.eq.f32 %p44, %f31, 0f00000000; and.pred %p45, %p43, %p44; @%p45 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r197, %f29; shr.s32 %r198, %r197, 31; and.b32 %r199, %r198, 1078530011; mov.b32 %r200, %f28; and.b32 %r201, %r200, -2147483648; or.b32 %r202, %r201, %r199; mov.b32 %f859, %r202; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p46, %f30, 0f7F800000; 
setp.eq.f32 %p47, %f31, 0f7F800000; and.pred %p48, %p46, %p47; @%p48 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r192, %f29; setp.lt.s32 %p52, %r192, 0; selp.b32 %r193, 1075235812, 1061752795, %p52; mov.b32 %r194, %f28; and.b32 %r195, %r194, -2147483648; or.b32 %r196, %r195, %r193; mov.b32 %f859, %r196; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f263, %f31, %f30; min.f32 %f264, %f31, %f30; div.rn.f32 %f265, %f264, %f263; mul.rn.f32 %f266, %f265, %f265; mov.f32 %f267, 0fC0B59883; mov.f32 %f268, 0fBF52C7EA; fma.rn.f32 %f269, %f266, %f268, %f267; mov.f32 %f270, 0fC0D21907; fma.rn.f32 %f271, %f269, %f266, %f270; mul.f32 %f272, %f266, %f271; mul.f32 %f273, %f265, %f272; add.f32 %f274, %f266, 0f41355DC0; mov.f32 %f275, 0f41E6BD60; fma.rn.f32 %f276, %f274, %f266, %f275; mov.f32 %f277, 0f419D92C8; fma.rn.f32 %f278, %f276, %f266, %f277; rcp.rn.f32 %f279, %f278; fma.rn.f32 %f280, %f273, %f279, %f265; mov.f32 %f281, 0f3FC90FDB; sub.f32 %f282, %f281, %f280; setp.gt.f32 %p49, %f31, %f30; selp.f32 %f283, %f282, %f280, %p49; mov.b32 %r187, %f29; setp.lt.s32 %p50, %r187, 0; mov.f32 %f284, 0f40490FDB; sub.f32 %f285, %f284, %f283; selp.f32 %f286, %f285, %f283, %p50; mov.b32 %r188, %f286; mov.b32 %r189, %f28; and.b32 %r190, %r189, -2147483648; or.b32 %r191, %r190, %r188; mov.b32 %f287, %r191; add.f32 %f288, %f30, %f31; setp.le.f32 %p51, %f288, 0f7F800000; selp.f32 %f859, %f287, %f288, %p51; $L__BB0_41: add.f32 %f289, %f859, %f859; setp.eq.f32 %p53, %f27, 0f00000000; selp.f32 %f290, 0f3F800000, 0f3F000000, %p53; fma.rn.f32 %f862, %f290, %f289, %f862; $L__BB0_42: setp.lt.s32 %p54, %r3, 1; and.pred %p56, %p54, %p13; @%p56 bra $L__BB0_55; @%p13 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r490, %r18, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r203, %r18, %r126; add.s32 %r204, %r203, %r126; rem.s32 %r490, %r204, %r126; $L__BB0_46: @%p12 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r491, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r205, %r14, 
%r125; add.s32 %r206, %r205, %r125; rem.s32 %r491, %r206, %r125; $L__BB0_49: mad.lo.s32 %r207, %r490, %r125, %r491; mad.lo.s32 %r208, %r207, %r124, %r1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f291, [%rd45]; ld.global.nc.f32 %f292, [%rd46]; mul.f32 %f293, %f292, %f292; fma.rn.f32 %f294, %f291, %f291, %f293; ld.global.nc.f32 %f295, [%rd47]; fma.rn.f32 %f38, %f295, %f295, %f294; mul.f32 %f296, %f12, %f14; mul.f32 %f297, %f11, %f15; sub.f32 %f298, %f297, %f296; mul.f32 %f299, %f10, %f15; mul.f32 %f300, %f12, %f13; sub.f32 %f301, %f300, %f299; mul.f32 %f302, %f11, %f13; mul.f32 %f303, %f10, %f14; sub.f32 %f304, %f303, %f302; mul.f32 %f305, %f2, %f301; fma.rn.f32 %f306, %f1, %f298, %f305; fma.rn.f32 %f39, %f3, %f304, %f306; mul.f32 %f307, %f2, %f11; fma.rn.f32 %f308, %f1, %f10, %f307; fma.rn.f32 %f309, %f3, %f12, %f308; add.f32 %f310, %f309, 0f3F800000; mul.f32 %f311, %f2, %f14; fma.rn.f32 %f312, %f1, %f13, %f311; fma.rn.f32 %f313, %f3, %f15, %f312; add.f32 %f314, %f310, %f313; mul.f32 %f315, %f11, %f14; fma.rn.f32 %f316, %f10, %f13, %f315; fma.rn.f32 %f317, %f12, %f15, %f316; add.f32 %f40, %f317, %f314; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p59, %f41, 0f00000000; setp.eq.f32 %p60, %f42, 0f00000000; and.pred %p61, %p59, %p60; @%p61 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r219, %f40; shr.s32 %r220, %r219, 31; and.b32 %r221, %r220, 1078530011; mov.b32 %r222, %f39; and.b32 %r223, %r222, -2147483648; or.b32 %r224, %r223, %r221; mov.b32 %f861, %r224; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p62, %f41, 0f7F800000; setp.eq.f32 %p63, %f42, 0f7F800000; and.pred %p64, %p62, %p63; @%p64 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r214, %f40; setp.lt.s32 %p68, %r214, 0; selp.b32 %r215, 1075235812, 1061752795, %p68; mov.b32 %r216, %f39; and.b32 %r217, %r216, -2147483648; or.b32 %r218, %r217, %r215; mov.b32 %f861, %r218; bra.uni $L__BB0_54; 
$L__BB0_51: max.f32 %f318, %f42, %f41; min.f32 %f319, %f42, %f41; div.rn.f32 %f320, %f319, %f318; mul.rn.f32 %f321, %f320, %f320; mov.f32 %f322, 0fC0B59883; mov.f32 %f323, 0fBF52C7EA; fma.rn.f32 %f324, %f321, %f323, %f322; mov.f32 %f325, 0fC0D21907; fma.rn.f32 %f326, %f324, %f321, %f325; mul.f32 %f327, %f321, %f326; mul.f32 %f328, %f320, %f327; add.f32 %f329, %f321, 0f41355DC0; mov.f32 %f330, 0f41E6BD60; fma.rn.f32 %f331, %f329, %f321, %f330; mov.f32 %f332, 0f419D92C8; fma.rn.f32 %f333, %f331, %f321, %f332; rcp.rn.f32 %f334, %f333; fma.rn.f32 %f335, %f328, %f334, %f320; mov.f32 %f336, 0f3FC90FDB; sub.f32 %f337, %f336, %f335; setp.gt.f32 %p65, %f42, %f41; selp.f32 %f338, %f337, %f335, %p65; mov.b32 %r209, %f40; setp.lt.s32 %p66, %r209, 0; mov.f32 %f339, 0f40490FDB; sub.f32 %f340, %f339, %f338; selp.f32 %f341, %f340, %f338, %p66; mov.b32 %r210, %f341; mov.b32 %r211, %f39; and.b32 %r212, %r211, -2147483648; or.b32 %r213, %r212, %r210; mov.b32 %f342, %r213; add.f32 %f343, %f41, %f42; setp.le.f32 %p67, %f343, 0f7F800000; selp.f32 %f861, %f342, %f343, %p67; $L__BB0_54: add.f32 %f344, %f861, %f861; setp.eq.f32 %p69, %f38, 0f00000000; selp.f32 %f345, 0f3F800000, 0f3F000000, %p69; fma.rn.f32 %f862, %f345, %f344, %f862; $L__BB0_55: @%p18 bra $L__BB0_69; setp.lt.s32 %p71, %r3, 1; and.pred %p73, %p71, %p13; @%p73 bra $L__BB0_69; @%p13 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r492, %r18, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r225, %r18, %r126; add.s32 %r226, %r225, %r126; rem.s32 %r492, %r226, %r126; $L__BB0_60: @%p12 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r229, %r125, -1; min.s32 %r493, %r6, %r229; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r227, %r6, %r125; add.s32 %r228, %r227, %r125; rem.s32 %r493, %r228, %r125; $L__BB0_63: mad.lo.s32 %r230, %r492, %r125, %r493; mad.lo.s32 %r231, %r230, %r124, %r1; mul.wide.s32 %rd48, %r231, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f346, 
[%rd49]; ld.global.nc.f32 %f347, [%rd50]; mul.f32 %f348, %f347, %f347; fma.rn.f32 %f349, %f346, %f346, %f348; ld.global.nc.f32 %f350, [%rd51]; fma.rn.f32 %f49, %f350, %f350, %f349; mul.f32 %f351, %f5, %f15; mul.f32 %f352, %f6, %f14; sub.f32 %f353, %f352, %f351; mul.f32 %f354, %f6, %f13; mul.f32 %f355, %f4, %f15; sub.f32 %f356, %f355, %f354; mul.f32 %f357, %f4, %f14; mul.f32 %f358, %f5, %f13; sub.f32 %f359, %f358, %f357; mul.f32 %f360, %f2, %f356; fma.rn.f32 %f361, %f1, %f353, %f360; fma.rn.f32 %f50, %f3, %f359, %f361; mul.f32 %f362, %f2, %f14; fma.rn.f32 %f363, %f1, %f13, %f362; fma.rn.f32 %f364, %f3, %f15, %f363; add.f32 %f365, %f364, 0f3F800000; mul.f32 %f366, %f2, %f5; fma.rn.f32 %f367, %f1, %f4, %f366; fma.rn.f32 %f368, %f3, %f6, %f367; add.f32 %f369, %f368, %f365; mul.f32 %f370, %f5, %f14; fma.rn.f32 %f371, %f4, %f13, %f370; fma.rn.f32 %f372, %f6, %f15, %f371; add.f32 %f51, %f372, %f369; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p76, %f52, 0f00000000; setp.eq.f32 %p77, %f53, 0f00000000; and.pred %p78, %p76, %p77; @%p78 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r242, %f51; shr.s32 %r243, %r242, 31; and.b32 %r244, %r243, 1078530011; mov.b32 %r245, %f50; and.b32 %r246, %r245, -2147483648; or.b32 %r247, %r244, %r246; mov.b32 %f863, %r247; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p79, %f52, 0f7F800000; setp.eq.f32 %p80, %f53, 0f7F800000; and.pred %p81, %p79, %p80; @%p81 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r237, %f51; setp.lt.s32 %p85, %r237, 0; selp.b32 %r238, 1075235812, 1061752795, %p85; mov.b32 %r239, %f50; and.b32 %r240, %r239, -2147483648; or.b32 %r241, %r238, %r240; mov.b32 %f863, %r241; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f373, %f53, %f52; min.f32 %f374, %f53, %f52; div.rn.f32 %f375, %f374, %f373; mul.rn.f32 %f376, %f375, %f375; mov.f32 %f377, 0fC0B59883; mov.f32 %f378, 0fBF52C7EA; fma.rn.f32 %f379, %f376, %f378, %f377; mov.f32 %f380, 0fC0D21907; fma.rn.f32 %f381, %f379, %f376, %f380; mul.f32 %f382, 
%f376, %f381; mul.f32 %f383, %f375, %f382; add.f32 %f384, %f376, 0f41355DC0; mov.f32 %f385, 0f41E6BD60; fma.rn.f32 %f386, %f384, %f376, %f385; mov.f32 %f387, 0f419D92C8; fma.rn.f32 %f388, %f386, %f376, %f387; rcp.rn.f32 %f389, %f388; fma.rn.f32 %f390, %f383, %f389, %f375; mov.f32 %f391, 0f3FC90FDB; sub.f32 %f392, %f391, %f390; setp.gt.f32 %p82, %f53, %f52; selp.f32 %f393, %f392, %f390, %p82; mov.b32 %r232, %f51; setp.lt.s32 %p83, %r232, 0; mov.f32 %f394, 0f40490FDB; sub.f32 %f395, %f394, %f393; selp.f32 %f396, %f395, %f393, %p83; mov.b32 %r233, %f396; mov.b32 %r234, %f50; and.b32 %r235, %r234, -2147483648; or.b32 %r236, %r235, %r233; mov.b32 %f397, %r236; add.f32 %f398, %f52, %f53; setp.le.f32 %p84, %f398, 0f7F800000; selp.f32 %f863, %f397, %f398, %p84; $L__BB0_68: add.f32 %f399, %f863, %f863; setp.eq.f32 %p86, %f49, 0f00000000; selp.f32 %f400, 0f3F800000, 0f3F000000, %p86; fma.rn.f32 %f862, %f400, %f399, %f862; $L__BB0_69: @%p13 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: add.s32 %r250, %r126, -1; min.s32 %r494, %r10, %r250; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r248, %r10, %r126; add.s32 %r249, %r248, %r126; rem.s32 %r494, %r249, %r126; $L__BB0_72: ld.param.u8 %rs7, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs3, %rs7, 1; setp.eq.s16 %p88, %rs3, 0; add.s32 %r49, %r1, 1; @%p88 bra $L__BB0_74; bra.uni $L__BB0_73; $L__BB0_74: add.s32 %r253, %r124, -1; min.s32 %r495, %r49, %r253; bra.uni $L__BB0_75; $L__BB0_73: rem.s32 %r251, %r49, %r124; add.s32 %r252, %r251, %r124; rem.s32 %r495, %r252, %r124; $L__BB0_75: @%p13 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r496, %r18, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r254, %r18, %r126; add.s32 %r255, %r254, %r126; rem.s32 %r496, %r255, %r126; $L__BB0_78: add.s32 %r56, %r1, -1; @%p88 bra $L__BB0_80; bra.uni $L__BB0_79; $L__BB0_80: max.s32 %r497, %r56, 0; bra.uni $L__BB0_81; $L__BB0_79: rem.s32 %r256, %r56, %r124; add.s32 %r257, %r256, %r124; rem.s32 %r497, %r257, %r124; $L__BB0_81: 
mad.lo.s32 %r258, %r494, %r125, %r2; mad.lo.s32 %r259, %r258, %r124, %r1; mul.wide.s32 %rd52, %r259, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f60, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f61, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f62, [%rd55]; add.s32 %r260, %r495, %r5; mul.wide.s32 %rd56, %r260, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f63, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f64, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f65, [%rd59]; mad.lo.s32 %r261, %r496, %r125, %r2; mad.lo.s32 %r262, %r261, %r124, %r1; mul.wide.s32 %rd60, %r262, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f66, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f67, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f68, [%rd63]; add.s32 %r263, %r497, %r5; mul.wide.s32 %rd64, %r263, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f69, [%rd65]; add.s64 %rd66, %rd2, %rd64; ld.global.nc.f32 %f70, [%rd66]; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f71, [%rd67]; setp.ge.s32 %p92, %r10, %r126; and.pred %p93, %p92, %p13; mov.f32 %f868, 0f00000000; @%p93 bra $L__BB0_95; setp.ge.s32 %p94, %r49, %r124; and.pred %p96, %p94, %p88; @%p96 bra $L__BB0_95; @%p13 bra $L__BB0_85; bra.uni $L__BB0_84; $L__BB0_85: add.s32 %r266, %r126, -1; min.s32 %r498, %r10, %r266; bra.uni $L__BB0_86; $L__BB0_84: rem.s32 %r264, %r10, %r126; add.s32 %r265, %r264, %r126; rem.s32 %r498, %r265, %r126; $L__BB0_86: @%p88 bra $L__BB0_88; bra.uni $L__BB0_87; $L__BB0_88: add.s32 %r269, %r124, -1; min.s32 %r499, %r49, %r269; bra.uni $L__BB0_89; $L__BB0_87: rem.s32 %r267, %r49, %r124; add.s32 %r268, %r267, %r124; rem.s32 %r499, %r268, %r124; $L__BB0_89: mad.lo.s32 %r270, %r498, %r125, %r2; mad.lo.s32 %r271, %r270, %r124, %r499; mul.wide.s32 %rd68, %r271, 4; add.s64 %rd69, %rd3, %rd68; add.s64 %rd70, %rd2, %rd68; add.s64 %rd71, %rd1, %rd68; ld.global.nc.f32 %f403, [%rd69]; ld.global.nc.f32 %f404, [%rd70]; mul.f32 %f405, %f404, %f404; 
fma.rn.f32 %f406, %f403, %f403, %f405; ld.global.nc.f32 %f407, [%rd71]; fma.rn.f32 %f72, %f407, %f407, %f406; mul.f32 %f408, %f62, %f64; mul.f32 %f409, %f61, %f65; sub.f32 %f410, %f409, %f408; mul.f32 %f411, %f60, %f65; mul.f32 %f412, %f62, %f63; sub.f32 %f413, %f412, %f411; mul.f32 %f414, %f61, %f63; mul.f32 %f415, %f60, %f64; sub.f32 %f416, %f415, %f414; mul.f32 %f417, %f2, %f413; fma.rn.f32 %f418, %f1, %f410, %f417; fma.rn.f32 %f73, %f3, %f416, %f418; mul.f32 %f419, %f2, %f61; fma.rn.f32 %f420, %f1, %f60, %f419; fma.rn.f32 %f421, %f3, %f62, %f420; add.f32 %f422, %f421, 0f3F800000; mul.f32 %f423, %f2, %f64; fma.rn.f32 %f424, %f1, %f63, %f423; fma.rn.f32 %f425, %f3, %f65, %f424; add.f32 %f426, %f422, %f425; mul.f32 %f427, %f61, %f64; fma.rn.f32 %f428, %f60, %f63, %f427; fma.rn.f32 %f429, %f62, %f65, %f428; add.f32 %f74, %f429, %f426; abs.f32 %f75, %f74; abs.f32 %f76, %f73; setp.eq.f32 %p99, %f75, 0f00000000; setp.eq.f32 %p100, %f76, 0f00000000; and.pred %p101, %p99, %p100; @%p101 bra $L__BB0_93; bra.uni $L__BB0_90; $L__BB0_93: mov.b32 %r282, %f74; shr.s32 %r283, %r282, 31; and.b32 %r284, %r283, 1078530011; mov.b32 %r285, %f73; and.b32 %r286, %r285, -2147483648; or.b32 %r287, %r286, %r284; mov.b32 %f865, %r287; bra.uni $L__BB0_94; $L__BB0_90: setp.eq.f32 %p102, %f75, 0f7F800000; setp.eq.f32 %p103, %f76, 0f7F800000; and.pred %p104, %p102, %p103; @%p104 bra $L__BB0_92; bra.uni $L__BB0_91; $L__BB0_92: mov.b32 %r277, %f74; setp.lt.s32 %p108, %r277, 0; selp.b32 %r278, 1075235812, 1061752795, %p108; mov.b32 %r279, %f73; and.b32 %r280, %r279, -2147483648; or.b32 %r281, %r280, %r278; mov.b32 %f865, %r281; bra.uni $L__BB0_94; $L__BB0_91: max.f32 %f430, %f76, %f75; min.f32 %f431, %f76, %f75; div.rn.f32 %f432, %f431, %f430; mul.rn.f32 %f433, %f432, %f432; mov.f32 %f434, 0fC0B59883; mov.f32 %f435, 0fBF52C7EA; fma.rn.f32 %f436, %f433, %f435, %f434; mov.f32 %f437, 0fC0D21907; fma.rn.f32 %f438, %f436, %f433, %f437; mul.f32 %f439, %f433, %f438; mul.f32 %f440, %f432, %f439; add.f32 
%f441, %f433, 0f41355DC0; mov.f32 %f442, 0f41E6BD60; fma.rn.f32 %f443, %f441, %f433, %f442; mov.f32 %f444, 0f419D92C8; fma.rn.f32 %f445, %f443, %f433, %f444; rcp.rn.f32 %f446, %f445; fma.rn.f32 %f447, %f440, %f446, %f432; mov.f32 %f448, 0f3FC90FDB; sub.f32 %f449, %f448, %f447; setp.gt.f32 %p105, %f76, %f75; selp.f32 %f450, %f449, %f447, %p105; mov.b32 %r272, %f74; setp.lt.s32 %p106, %r272, 0; mov.f32 %f451, 0f40490FDB; sub.f32 %f452, %f451, %f450; selp.f32 %f453, %f452, %f450, %p106; mov.b32 %r273, %f453; mov.b32 %r274, %f73; and.b32 %r275, %r274, -2147483648; or.b32 %r276, %r275, %r273; mov.b32 %f454, %r276; add.f32 %f455, %f75, %f76; setp.le.f32 %p107, %f455, 0f7F800000; selp.f32 %f865, %f454, %f455, %p107; $L__BB0_94: add.f32 %f456, %f865, %f865; setp.eq.f32 %p109, %f72, 0f00000000; selp.f32 %f457, 0f3F800000, 0f3F000000, %p109; fma.rn.f32 %f868, %f457, %f456, 0f00000000; $L__BB0_95: setp.lt.s32 %p110, %r3, 1; and.pred %p112, %p110, %p13; @%p112 bra $L__BB0_109; setp.ge.s32 %p113, %r49, %r124; and.pred %p115, %p113, %p88; @%p115 bra $L__BB0_109; @%p13 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: max.s32 %r500, %r18, 0; bra.uni $L__BB0_100; $L__BB0_98: rem.s32 %r288, %r18, %r126; add.s32 %r289, %r288, %r126; rem.s32 %r500, %r289, %r126; $L__BB0_100: @%p88 bra $L__BB0_102; bra.uni $L__BB0_101; $L__BB0_102: add.s32 %r292, %r124, -1; min.s32 %r501, %r49, %r292; bra.uni $L__BB0_103; $L__BB0_101: rem.s32 %r290, %r49, %r124; add.s32 %r291, %r290, %r124; rem.s32 %r501, %r291, %r124; $L__BB0_103: mad.lo.s32 %r293, %r500, %r125, %r2; mad.lo.s32 %r294, %r293, %r124, %r501; mul.wide.s32 %rd72, %r294, 4; add.s64 %rd73, %rd3, %rd72; add.s64 %rd74, %rd2, %rd72; add.s64 %rd75, %rd1, %rd72; ld.global.nc.f32 %f458, [%rd73]; ld.global.nc.f32 %f459, [%rd74]; mul.f32 %f460, %f459, %f459; fma.rn.f32 %f461, %f458, %f458, %f460; ld.global.nc.f32 %f462, [%rd75]; fma.rn.f32 %f83, %f462, %f462, %f461; mul.f32 %f463, %f65, %f67; mul.f32 %f464, %f64, %f68; sub.f32 %f465, %f464, %f463; 
mul.f32 %f466, %f63, %f68; mul.f32 %f467, %f65, %f66; sub.f32 %f468, %f467, %f466; mul.f32 %f469, %f64, %f66; mul.f32 %f470, %f63, %f67; sub.f32 %f471, %f470, %f469; mul.f32 %f472, %f2, %f468; fma.rn.f32 %f473, %f1, %f465, %f472; fma.rn.f32 %f84, %f3, %f471, %f473; mul.f32 %f474, %f2, %f64; fma.rn.f32 %f475, %f1, %f63, %f474; fma.rn.f32 %f476, %f3, %f65, %f475; add.f32 %f477, %f476, 0f3F800000; mul.f32 %f478, %f2, %f67; fma.rn.f32 %f479, %f1, %f66, %f478; fma.rn.f32 %f480, %f3, %f68, %f479; add.f32 %f481, %f477, %f480; mul.f32 %f482, %f64, %f67; fma.rn.f32 %f483, %f63, %f66, %f482; fma.rn.f32 %f484, %f65, %f68, %f483; add.f32 %f85, %f484, %f481; abs.f32 %f86, %f85; abs.f32 %f87, %f84; setp.eq.f32 %p118, %f86, 0f00000000; setp.eq.f32 %p119, %f87, 0f00000000; and.pred %p120, %p118, %p119; @%p120 bra $L__BB0_107; bra.uni $L__BB0_104; $L__BB0_107: mov.b32 %r305, %f85; shr.s32 %r306, %r305, 31; and.b32 %r307, %r306, 1078530011; mov.b32 %r308, %f84; and.b32 %r309, %r308, -2147483648; or.b32 %r310, %r309, %r307; mov.b32 %f867, %r310; bra.uni $L__BB0_108; $L__BB0_104: setp.eq.f32 %p121, %f86, 0f7F800000; setp.eq.f32 %p122, %f87, 0f7F800000; and.pred %p123, %p121, %p122; @%p123 bra $L__BB0_106; bra.uni $L__BB0_105; $L__BB0_106: mov.b32 %r300, %f85; setp.lt.s32 %p127, %r300, 0; selp.b32 %r301, 1075235812, 1061752795, %p127; mov.b32 %r302, %f84; and.b32 %r303, %r302, -2147483648; or.b32 %r304, %r303, %r301; mov.b32 %f867, %r304; bra.uni $L__BB0_108; $L__BB0_105: max.f32 %f485, %f87, %f86; min.f32 %f486, %f87, %f86; div.rn.f32 %f487, %f486, %f485; mul.rn.f32 %f488, %f487, %f487; mov.f32 %f489, 0fC0B59883; mov.f32 %f490, 0fBF52C7EA; fma.rn.f32 %f491, %f488, %f490, %f489; mov.f32 %f492, 0fC0D21907; fma.rn.f32 %f493, %f491, %f488, %f492; mul.f32 %f494, %f488, %f493; mul.f32 %f495, %f487, %f494; add.f32 %f496, %f488, 0f41355DC0; mov.f32 %f497, 0f41E6BD60; fma.rn.f32 %f498, %f496, %f488, %f497; mov.f32 %f499, 0f419D92C8; fma.rn.f32 %f500, %f498, %f488, %f499; rcp.rn.f32 %f501, 
%f500; fma.rn.f32 %f502, %f495, %f501, %f487; mov.f32 %f503, 0f3FC90FDB; sub.f32 %f504, %f503, %f502; setp.gt.f32 %p124, %f87, %f86; selp.f32 %f505, %f504, %f502, %p124; mov.b32 %r295, %f85; setp.lt.s32 %p125, %r295, 0; mov.f32 %f506, 0f40490FDB; sub.f32 %f507, %f506, %f505; selp.f32 %f508, %f507, %f505, %p125; mov.b32 %r296, %f508; mov.b32 %r297, %f84; and.b32 %r298, %r297, -2147483648; or.b32 %r299, %r298, %r296; mov.b32 %f509, %r299; add.f32 %f510, %f86, %f87; setp.le.f32 %p126, %f510, 0f7F800000; selp.f32 %f867, %f509, %f510, %p126; $L__BB0_108: add.f32 %f511, %f867, %f867; setp.eq.f32 %p128, %f83, 0f00000000; selp.f32 %f512, 0f3F800000, 0f3F000000, %p128; fma.rn.f32 %f868, %f512, %f511, %f868; $L__BB0_109: ld.param.u8 %rs6, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs5, %rs6, 1; setp.ne.s16 %p129, %rs5, 0; setp.gt.s32 %p130, %r1, 0; or.pred %p3, %p130, %p129; and.pred %p131, %p3, %p2; not.pred %p132, %p131; @%p132 bra $L__BB0_122; @%p13 bra $L__BB0_112; bra.uni $L__BB0_111; $L__BB0_112: max.s32 %r502, %r18, 0; bra.uni $L__BB0_113; $L__BB0_111: rem.s32 %r311, %r18, %r126; add.s32 %r312, %r311, %r126; rem.s32 %r502, %r312, %r126; $L__BB0_113: @%p88 bra $L__BB0_115; bra.uni $L__BB0_114; $L__BB0_115: max.s32 %r503, %r56, 0; bra.uni $L__BB0_116; $L__BB0_114: rem.s32 %r313, %r56, %r124; add.s32 %r314, %r313, %r124; rem.s32 %r503, %r314, %r124; $L__BB0_116: mad.lo.s32 %r315, %r502, %r125, %r2; mad.lo.s32 %r316, %r315, %r124, %r503; mul.wide.s32 %rd76, %r316, 4; add.s64 %rd77, %rd3, %rd76; add.s64 %rd78, %rd2, %rd76; add.s64 %rd79, %rd1, %rd76; ld.global.nc.f32 %f513, [%rd77]; ld.global.nc.f32 %f514, [%rd78]; mul.f32 %f515, %f514, %f514; fma.rn.f32 %f516, %f513, %f513, %f515; ld.global.nc.f32 %f517, [%rd79]; fma.rn.f32 %f94, %f517, %f517, %f516; mul.f32 %f518, %f68, %f70; mul.f32 %f519, %f67, %f71; sub.f32 %f520, %f519, %f518; mul.f32 %f521, %f66, %f71; mul.f32 %f522, %f68, %f69; sub.f32 %f523, %f522, %f521; mul.f32 %f524, %f67, %f69; mul.f32 %f525, 
%f66, %f70; sub.f32 %f526, %f525, %f524; mul.f32 %f527, %f2, %f523; fma.rn.f32 %f528, %f1, %f520, %f527; fma.rn.f32 %f95, %f3, %f526, %f528; mul.f32 %f529, %f2, %f67; fma.rn.f32 %f530, %f1, %f66, %f529; fma.rn.f32 %f531, %f3, %f68, %f530; add.f32 %f532, %f531, 0f3F800000; mul.f32 %f533, %f2, %f70; fma.rn.f32 %f534, %f1, %f69, %f533; fma.rn.f32 %f535, %f3, %f71, %f534; add.f32 %f536, %f532, %f535; mul.f32 %f537, %f67, %f70; fma.rn.f32 %f538, %f66, %f69, %f537; fma.rn.f32 %f539, %f68, %f71, %f538; add.f32 %f96, %f539, %f536; abs.f32 %f97, %f96; abs.f32 %f98, %f95; setp.eq.f32 %p135, %f97, 0f00000000; setp.eq.f32 %p136, %f98, 0f00000000; and.pred %p137, %p135, %p136; @%p137 bra $L__BB0_120; bra.uni $L__BB0_117; $L__BB0_120: mov.b32 %r327, %f96; shr.s32 %r328, %r327, 31; and.b32 %r329, %r328, 1078530011; mov.b32 %r330, %f95; and.b32 %r331, %r330, -2147483648; or.b32 %r332, %r331, %r329; mov.b32 %f869, %r332; bra.uni $L__BB0_121; $L__BB0_117: setp.eq.f32 %p138, %f97, 0f7F800000; setp.eq.f32 %p139, %f98, 0f7F800000; and.pred %p140, %p138, %p139; @%p140 bra $L__BB0_119; bra.uni $L__BB0_118; $L__BB0_119: mov.b32 %r322, %f96; setp.lt.s32 %p144, %r322, 0; selp.b32 %r323, 1075235812, 1061752795, %p144; mov.b32 %r324, %f95; and.b32 %r325, %r324, -2147483648; or.b32 %r326, %r325, %r323; mov.b32 %f869, %r326; bra.uni $L__BB0_121; $L__BB0_118: max.f32 %f540, %f98, %f97; min.f32 %f541, %f98, %f97; div.rn.f32 %f542, %f541, %f540; mul.rn.f32 %f543, %f542, %f542; mov.f32 %f544, 0fC0B59883; mov.f32 %f545, 0fBF52C7EA; fma.rn.f32 %f546, %f543, %f545, %f544; mov.f32 %f547, 0fC0D21907; fma.rn.f32 %f548, %f546, %f543, %f547; mul.f32 %f549, %f543, %f548; mul.f32 %f550, %f542, %f549; add.f32 %f551, %f543, 0f41355DC0; mov.f32 %f552, 0f41E6BD60; fma.rn.f32 %f553, %f551, %f543, %f552; mov.f32 %f554, 0f419D92C8; fma.rn.f32 %f555, %f553, %f543, %f554; rcp.rn.f32 %f556, %f555; fma.rn.f32 %f557, %f550, %f556, %f542; mov.f32 %f558, 0f3FC90FDB; sub.f32 %f559, %f558, %f557; setp.gt.f32 %p141, %f98, 
%f97; selp.f32 %f560, %f559, %f557, %p141; mov.b32 %r317, %f96; setp.lt.s32 %p142, %r317, 0; mov.f32 %f561, 0f40490FDB; sub.f32 %f562, %f561, %f560; selp.f32 %f563, %f562, %f560, %p142; mov.b32 %r318, %f563; mov.b32 %r319, %f95; and.b32 %r320, %r319, -2147483648; or.b32 %r321, %r320, %r318; mov.b32 %f564, %r321; add.f32 %f565, %f97, %f98; setp.le.f32 %p143, %f565, 0f7F800000; selp.f32 %f869, %f564, %f565, %p143; $L__BB0_121: add.f32 %f566, %f869, %f869; setp.eq.f32 %p145, %f94, 0f00000000; selp.f32 %f567, 0f3F800000, 0f3F000000, %p145; fma.rn.f32 %f868, %f567, %f566, %f868; $L__BB0_122: setp.lt.s32 %p146, %r49, %r124; or.pred %p4, %p146, %p129; and.pred %p5, %p4, %p2; not.pred %p148, %p5; @%p148 bra $L__BB0_135; @%p13 bra $L__BB0_125; bra.uni $L__BB0_124; $L__BB0_125: add.s32 %r335, %r126, -1; min.s32 %r504, %r10, %r335; bra.uni $L__BB0_126; $L__BB0_124: rem.s32 %r333, %r10, %r126; add.s32 %r334, %r333, %r126; rem.s32 %r504, %r334, %r126; $L__BB0_126: mad.lo.s32 %r81, %r504, %r125, %r2; @%p88 bra $L__BB0_128; bra.uni $L__BB0_127; $L__BB0_128: max.s32 %r505, %r56, 0; bra.uni $L__BB0_129; $L__BB0_127: rem.s32 %r336, %r56, %r124; add.s32 %r337, %r336, %r124; rem.s32 %r505, %r337, %r124; $L__BB0_129: mad.lo.s32 %r338, %r81, %r124, %r505; mul.wide.s32 %rd80, %r338, 4; add.s64 %rd81, %rd3, %rd80; add.s64 %rd82, %rd2, %rd80; add.s64 %rd83, %rd1, %rd80; ld.global.nc.f32 %f568, [%rd81]; ld.global.nc.f32 %f569, [%rd82]; mul.f32 %f570, %f569, %f569; fma.rn.f32 %f571, %f568, %f568, %f570; ld.global.nc.f32 %f572, [%rd83]; fma.rn.f32 %f105, %f572, %f572, %f571; mul.f32 %f573, %f61, %f71; mul.f32 %f574, %f62, %f70; sub.f32 %f575, %f574, %f573; mul.f32 %f576, %f62, %f69; mul.f32 %f577, %f60, %f71; sub.f32 %f578, %f577, %f576; mul.f32 %f579, %f60, %f70; mul.f32 %f580, %f61, %f69; sub.f32 %f581, %f580, %f579; mul.f32 %f582, %f2, %f578; fma.rn.f32 %f583, %f1, %f575, %f582; fma.rn.f32 %f106, %f3, %f581, %f583; mul.f32 %f584, %f2, %f70; fma.rn.f32 %f585, %f1, %f69, %f584; fma.rn.f32 
%f586, %f3, %f71, %f585; add.f32 %f587, %f586, 0f3F800000; mul.f32 %f588, %f2, %f61; fma.rn.f32 %f589, %f1, %f60, %f588; fma.rn.f32 %f590, %f3, %f62, %f589; add.f32 %f591, %f590, %f587; mul.f32 %f592, %f61, %f70; fma.rn.f32 %f593, %f60, %f69, %f592; fma.rn.f32 %f594, %f62, %f71, %f593; add.f32 %f107, %f594, %f591; abs.f32 %f108, %f107; abs.f32 %f109, %f106; setp.eq.f32 %p151, %f108, 0f00000000; setp.eq.f32 %p152, %f109, 0f00000000; and.pred %p153, %p151, %p152; @%p153 bra $L__BB0_133; bra.uni $L__BB0_130; $L__BB0_133: mov.b32 %r349, %f107; shr.s32 %r350, %r349, 31; and.b32 %r351, %r350, 1078530011; mov.b32 %r352, %f106; and.b32 %r353, %r352, -2147483648; or.b32 %r354, %r351, %r353; mov.b32 %f871, %r354; bra.uni $L__BB0_134; $L__BB0_130: setp.eq.f32 %p154, %f108, 0f7F800000; setp.eq.f32 %p155, %f109, 0f7F800000; and.pred %p156, %p154, %p155; @%p156 bra $L__BB0_132; bra.uni $L__BB0_131; $L__BB0_132: mov.b32 %r344, %f107; setp.lt.s32 %p160, %r344, 0; selp.b32 %r345, 1075235812, 1061752795, %p160; mov.b32 %r346, %f106; and.b32 %r347, %r346, -2147483648; or.b32 %r348, %r345, %r347; mov.b32 %f871, %r348; bra.uni $L__BB0_134; $L__BB0_131: max.f32 %f595, %f109, %f108; min.f32 %f596, %f109, %f108; div.rn.f32 %f597, %f596, %f595; mul.rn.f32 %f598, %f597, %f597; mov.f32 %f599, 0fC0B59883; mov.f32 %f600, 0fBF52C7EA; fma.rn.f32 %f601, %f598, %f600, %f599; mov.f32 %f602, 0fC0D21907; fma.rn.f32 %f603, %f601, %f598, %f602; mul.f32 %f604, %f598, %f603; mul.f32 %f605, %f597, %f604; add.f32 %f606, %f598, 0f41355DC0; mov.f32 %f607, 0f41E6BD60; fma.rn.f32 %f608, %f606, %f598, %f607; mov.f32 %f609, 0f419D92C8; fma.rn.f32 %f610, %f608, %f598, %f609; rcp.rn.f32 %f611, %f610; fma.rn.f32 %f612, %f605, %f611, %f597; mov.f32 %f613, 0f3FC90FDB; sub.f32 %f614, %f613, %f612; setp.gt.f32 %p157, %f109, %f108; selp.f32 %f615, %f614, %f612, %p157; mov.b32 %r339, %f107; setp.lt.s32 %p158, %r339, 0; mov.f32 %f616, 0f40490FDB; sub.f32 %f617, %f616, %f615; selp.f32 %f618, %f617, %f615, %p158; mov.b32 
%r340, %f618; mov.b32 %r341, %f106; and.b32 %r342, %r341, -2147483648; or.b32 %r343, %r342, %r340; mov.b32 %f619, %r343; add.f32 %f620, %f108, %f109; setp.le.f32 %p159, %f620, 0f7F800000; selp.f32 %f871, %f619, %f620, %p159; $L__BB0_134: add.f32 %f621, %f871, %f871; setp.eq.f32 %p161, %f105, 0f00000000; selp.f32 %f622, 0f3F800000, 0f3F000000, %p161; fma.rn.f32 %f868, %f622, %f621, %f868; $L__BB0_135: @%p88 bra $L__BB0_137; bra.uni $L__BB0_136; $L__BB0_137: add.s32 %r357, %r124, -1; min.s32 %r506, %r49, %r357; bra.uni $L__BB0_138; $L__BB0_136: rem.s32 %r355, %r49, %r124; add.s32 %r356, %r355, %r124; rem.s32 %r506, %r356, %r124; $L__BB0_138: @%p12 bra $L__BB0_140; bra.uni $L__BB0_139; $L__BB0_140: add.s32 %r360, %r125, -1; min.s32 %r507, %r6, %r360; bra.uni $L__BB0_141; $L__BB0_139: rem.s32 %r358, %r6, %r125; add.s32 %r359, %r358, %r125; rem.s32 %r507, %r359, %r125; $L__BB0_141: @%p88 bra $L__BB0_143; bra.uni $L__BB0_142; $L__BB0_143: max.s32 %r508, %r56, 0; bra.uni $L__BB0_144; $L__BB0_142: rem.s32 %r361, %r56, %r124; add.s32 %r362, %r361, %r124; rem.s32 %r508, %r362, %r124; $L__BB0_144: add.s32 %r94, %r508, %r5; add.s32 %r95, %r506, %r5; @%p12 bra $L__BB0_146; bra.uni $L__BB0_145; $L__BB0_146: max.s32 %r509, %r14, 0; bra.uni $L__BB0_147; $L__BB0_145: rem.s32 %r363, %r14, %r125; add.s32 %r364, %r363, %r125; rem.s32 %r509, %r364, %r125; $L__BB0_147: add.s32 %r365, %r509, %r4; mad.lo.s32 %r366, %r365, %r124, %r1; mul.wide.s32 %rd84, %r95, 4; add.s64 %rd85, %rd3, %rd84; ld.global.nc.f32 %f116, [%rd85]; add.s64 %rd86, %rd2, %rd84; ld.global.nc.f32 %f117, [%rd86]; add.s64 %rd87, %rd1, %rd84; ld.global.nc.f32 %f118, [%rd87]; add.s32 %r367, %r507, %r4; mad.lo.s32 %r368, %r367, %r124, %r1; mul.wide.s32 %rd88, %r368, 4; add.s64 %rd89, %rd3, %rd88; ld.global.nc.f32 %f119, [%rd89]; add.s64 %rd90, %rd2, %rd88; ld.global.nc.f32 %f120, [%rd90]; add.s64 %rd91, %rd1, %rd88; ld.global.nc.f32 %f121, [%rd91]; mul.wide.s32 %rd92, %r94, 4; add.s64 %rd93, %rd3, %rd92; ld.global.nc.f32 
%f122, [%rd93]; add.s64 %rd94, %rd2, %rd92; ld.global.nc.f32 %f123, [%rd94]; add.s64 %rd95, %rd1, %rd92; ld.global.nc.f32 %f124, [%rd95]; mul.wide.s32 %rd96, %r366, 4; add.s64 %rd97, %rd3, %rd96; ld.global.nc.f32 %f125, [%rd97]; add.s64 %rd98, %rd2, %rd96; ld.global.nc.f32 %f126, [%rd98]; add.s64 %rd99, %rd1, %rd96; ld.global.nc.f32 %f127, [%rd99]; and.pred %p166, %p4, %p1; mov.f32 %f878, 0f00000000; not.pred %p167, %p166; @%p167 bra $L__BB0_160; @%p12 bra $L__BB0_150; bra.uni $L__BB0_149; $L__BB0_150: add.s32 %r371, %r125, -1; min.s32 %r510, %r6, %r371; bra.uni $L__BB0_151; $L__BB0_149: rem.s32 %r369, %r6, %r125; add.s32 %r370, %r369, %r125; rem.s32 %r510, %r370, %r125; $L__BB0_151: @%p88 bra $L__BB0_153; bra.uni $L__BB0_152; $L__BB0_153: add.s32 %r374, %r124, -1; min.s32 %r511, %r49, %r374; bra.uni $L__BB0_154; $L__BB0_152: rem.s32 %r372, %r49, %r124; add.s32 %r373, %r372, %r124; rem.s32 %r511, %r373, %r124; $L__BB0_154: add.s32 %r375, %r510, %r4; mad.lo.s32 %r376, %r375, %r124, %r511; mul.wide.s32 %rd100, %r376, 4; add.s64 %rd101, %rd3, %rd100; add.s64 %rd102, %rd2, %rd100; add.s64 %rd103, %rd1, %rd100; ld.global.nc.f32 %f624, [%rd101]; ld.global.nc.f32 %f625, [%rd102]; mul.f32 %f626, %f625, %f625; fma.rn.f32 %f627, %f624, %f624, %f626; ld.global.nc.f32 %f628, [%rd103]; fma.rn.f32 %f128, %f628, %f628, %f627; mul.f32 %f629, %f118, %f120; mul.f32 %f630, %f117, %f121; sub.f32 %f631, %f630, %f629; mul.f32 %f632, %f116, %f121; mul.f32 %f633, %f118, %f119; sub.f32 %f634, %f633, %f632; mul.f32 %f635, %f117, %f119; mul.f32 %f636, %f116, %f120; sub.f32 %f637, %f636, %f635; mul.f32 %f638, %f2, %f634; fma.rn.f32 %f639, %f1, %f631, %f638; fma.rn.f32 %f129, %f3, %f637, %f639; mul.f32 %f640, %f2, %f117; fma.rn.f32 %f641, %f1, %f116, %f640; fma.rn.f32 %f642, %f3, %f118, %f641; add.f32 %f643, %f642, 0f3F800000; mul.f32 %f644, %f2, %f120; fma.rn.f32 %f645, %f1, %f119, %f644; fma.rn.f32 %f646, %f3, %f121, %f645; add.f32 %f647, %f643, %f646; mul.f32 %f648, %f117, %f120; fma.rn.f32 
%f649, %f116, %f119, %f648; fma.rn.f32 %f650, %f118, %f121, %f649; add.f32 %f130, %f650, %f647; abs.f32 %f131, %f130; abs.f32 %f132, %f129; setp.eq.f32 %p170, %f131, 0f00000000; setp.eq.f32 %p171, %f132, 0f00000000; and.pred %p172, %p170, %p171; @%p172 bra $L__BB0_158; bra.uni $L__BB0_155; $L__BB0_158: mov.b32 %r387, %f130; shr.s32 %r388, %r387, 31; and.b32 %r389, %r388, 1078530011; mov.b32 %r390, %f129; and.b32 %r391, %r390, -2147483648; or.b32 %r392, %r391, %r389; mov.b32 %f873, %r392; bra.uni $L__BB0_159; $L__BB0_155: setp.eq.f32 %p173, %f131, 0f7F800000; setp.eq.f32 %p174, %f132, 0f7F800000; and.pred %p175, %p173, %p174; @%p175 bra $L__BB0_157; bra.uni $L__BB0_156; $L__BB0_157: mov.b32 %r382, %f130; setp.lt.s32 %p179, %r382, 0; selp.b32 %r383, 1075235812, 1061752795, %p179; mov.b32 %r384, %f129; and.b32 %r385, %r384, -2147483648; or.b32 %r386, %r385, %r383; mov.b32 %f873, %r386; bra.uni $L__BB0_159; $L__BB0_156: max.f32 %f651, %f132, %f131; min.f32 %f652, %f132, %f131; div.rn.f32 %f653, %f652, %f651; mul.rn.f32 %f654, %f653, %f653; mov.f32 %f655, 0fC0B59883; mov.f32 %f656, 0fBF52C7EA; fma.rn.f32 %f657, %f654, %f656, %f655; mov.f32 %f658, 0fC0D21907; fma.rn.f32 %f659, %f657, %f654, %f658; mul.f32 %f660, %f654, %f659; mul.f32 %f661, %f653, %f660; add.f32 %f662, %f654, 0f41355DC0; mov.f32 %f663, 0f41E6BD60; fma.rn.f32 %f664, %f662, %f654, %f663; mov.f32 %f665, 0f419D92C8; fma.rn.f32 %f666, %f664, %f654, %f665; rcp.rn.f32 %f667, %f666; fma.rn.f32 %f668, %f661, %f667, %f653; mov.f32 %f669, 0f3FC90FDB; sub.f32 %f670, %f669, %f668; setp.gt.f32 %p176, %f132, %f131; selp.f32 %f671, %f670, %f668, %p176; mov.b32 %r377, %f130; setp.lt.s32 %p177, %r377, 0; mov.f32 %f672, 0f40490FDB; sub.f32 %f673, %f672, %f671; selp.f32 %f674, %f673, %f671, %p177; mov.b32 %r378, %f674; mov.b32 %r379, %f129; and.b32 %r380, %r379, -2147483648; or.b32 %r381, %r380, %r378; mov.b32 %f675, %r381; add.f32 %f676, %f131, %f132; setp.le.f32 %p178, %f676, 0f7F800000; selp.f32 %f873, %f675, %f676, 
%p178; $L__BB0_159: add.f32 %f677, %f873, %f873; setp.eq.f32 %p180, %f128, 0f00000000; selp.f32 %f678, 0f3F800000, 0f3F000000, %p180; fma.rn.f32 %f878, %f678, %f677, 0f00000000; $L__BB0_160: not.pred %p181, %p3; @%p181 bra $L__BB0_187; @%p18 bra $L__BB0_174; @%p12 bra $L__BB0_164; bra.uni $L__BB0_163; $L__BB0_164: add.s32 %r395, %r125, -1; min.s32 %r512, %r6, %r395; bra.uni $L__BB0_165; $L__BB0_163: rem.s32 %r393, %r6, %r125; add.s32 %r394, %r393, %r125; rem.s32 %r512, %r394, %r125; $L__BB0_165: @%p88 bra $L__BB0_167; bra.uni $L__BB0_166; $L__BB0_167: max.s32 %r513, %r56, 0; bra.uni $L__BB0_168; $L__BB0_166: rem.s32 %r396, %r56, %r124; add.s32 %r397, %r396, %r124; rem.s32 %r513, %r397, %r124; $L__BB0_168: add.s32 %r398, %r512, %r4; mad.lo.s32 %r399, %r398, %r124, %r513; mul.wide.s32 %rd104, %r399, 4; add.s64 %rd105, %rd3, %rd104; add.s64 %rd106, %rd2, %rd104; add.s64 %rd107, %rd1, %rd104; ld.global.nc.f32 %f679, [%rd105]; ld.global.nc.f32 %f680, [%rd106]; mul.f32 %f681, %f680, %f680; fma.rn.f32 %f682, %f679, %f679, %f681; ld.global.nc.f32 %f683, [%rd107]; fma.rn.f32 %f139, %f683, %f683, %f682; mul.f32 %f684, %f121, %f123; mul.f32 %f685, %f120, %f124; sub.f32 %f686, %f685, %f684; mul.f32 %f687, %f119, %f124; mul.f32 %f688, %f121, %f122; sub.f32 %f689, %f688, %f687; mul.f32 %f690, %f120, %f122; mul.f32 %f691, %f119, %f123; sub.f32 %f692, %f691, %f690; mul.f32 %f693, %f2, %f689; fma.rn.f32 %f694, %f1, %f686, %f693; fma.rn.f32 %f140, %f3, %f692, %f694; mul.f32 %f695, %f2, %f120; fma.rn.f32 %f696, %f1, %f119, %f695; fma.rn.f32 %f697, %f3, %f121, %f696; add.f32 %f698, %f697, 0f3F800000; mul.f32 %f699, %f2, %f123; fma.rn.f32 %f700, %f1, %f122, %f699; fma.rn.f32 %f701, %f3, %f124, %f700; add.f32 %f702, %f698, %f701; mul.f32 %f703, %f120, %f123; fma.rn.f32 %f704, %f119, %f122, %f703; fma.rn.f32 %f705, %f121, %f124, %f704; add.f32 %f141, %f705, %f702; abs.f32 %f142, %f141; abs.f32 %f143, %f140; setp.eq.f32 %p185, %f142, 0f00000000; setp.eq.f32 %p186, %f143, 0f00000000; 
and.pred %p187, %p185, %p186; @%p187 bra $L__BB0_172; bra.uni $L__BB0_169; $L__BB0_172: mov.b32 %r410, %f141; shr.s32 %r411, %r410, 31; and.b32 %r412, %r411, 1078530011; mov.b32 %r413, %f140; and.b32 %r414, %r413, -2147483648; or.b32 %r415, %r414, %r412; mov.b32 %f875, %r415; bra.uni $L__BB0_173; $L__BB0_169: setp.eq.f32 %p188, %f142, 0f7F800000; setp.eq.f32 %p189, %f143, 0f7F800000; and.pred %p190, %p188, %p189; @%p190 bra $L__BB0_171; bra.uni $L__BB0_170; $L__BB0_171: mov.b32 %r405, %f141; setp.lt.s32 %p194, %r405, 0; selp.b32 %r406, 1075235812, 1061752795, %p194; mov.b32 %r407, %f140; and.b32 %r408, %r407, -2147483648; or.b32 %r409, %r408, %r406; mov.b32 %f875, %r409; bra.uni $L__BB0_173; $L__BB0_170: max.f32 %f706, %f143, %f142; min.f32 %f707, %f143, %f142; div.rn.f32 %f708, %f707, %f706; mul.rn.f32 %f709, %f708, %f708; mov.f32 %f710, 0fC0B59883; mov.f32 %f711, 0fBF52C7EA; fma.rn.f32 %f712, %f709, %f711, %f710; mov.f32 %f713, 0fC0D21907; fma.rn.f32 %f714, %f712, %f709, %f713; mul.f32 %f715, %f709, %f714; mul.f32 %f716, %f708, %f715; add.f32 %f717, %f709, 0f41355DC0; mov.f32 %f718, 0f41E6BD60; fma.rn.f32 %f719, %f717, %f709, %f718; mov.f32 %f720, 0f419D92C8; fma.rn.f32 %f721, %f719, %f709, %f720; rcp.rn.f32 %f722, %f721; fma.rn.f32 %f723, %f716, %f722, %f708; mov.f32 %f724, 0f3FC90FDB; sub.f32 %f725, %f724, %f723; setp.gt.f32 %p191, %f143, %f142; selp.f32 %f726, %f725, %f723, %p191; mov.b32 %r400, %f141; setp.lt.s32 %p192, %r400, 0; mov.f32 %f727, 0f40490FDB; sub.f32 %f728, %f727, %f726; selp.f32 %f729, %f728, %f726, %p192; mov.b32 %r401, %f729; mov.b32 %r402, %f140; and.b32 %r403, %r402, -2147483648; or.b32 %r404, %r403, %r401; mov.b32 %f730, %r404; add.f32 %f731, %f142, %f143; setp.le.f32 %p193, %f731, 0f7F800000; selp.f32 %f875, %f730, %f731, %p193; $L__BB0_173: add.f32 %f732, %f875, %f875; setp.eq.f32 %p195, %f139, 0f00000000; selp.f32 %f733, 0f3F800000, 0f3F000000, %p195; fma.rn.f32 %f878, %f733, %f732, %f878; $L__BB0_174: not.pred %p224, %p2; @%p224 bra 
$L__BB0_187; @%p12 bra $L__BB0_177; bra.uni $L__BB0_176; $L__BB0_177: max.s32 %r514, %r14, 0; bra.uni $L__BB0_178; $L__BB0_176: rem.s32 %r416, %r14, %r125; add.s32 %r417, %r416, %r125; rem.s32 %r514, %r417, %r125; $L__BB0_178: @%p88 bra $L__BB0_180; bra.uni $L__BB0_179; $L__BB0_180: max.s32 %r515, %r56, 0; bra.uni $L__BB0_181; $L__BB0_179: rem.s32 %r418, %r56, %r124; add.s32 %r419, %r418, %r124; rem.s32 %r515, %r419, %r124; $L__BB0_181: add.s32 %r420, %r514, %r4; mad.lo.s32 %r421, %r420, %r124, %r515; mul.wide.s32 %rd108, %r421, 4; add.s64 %rd109, %rd3, %rd108; add.s64 %rd110, %rd2, %rd108; add.s64 %rd111, %rd1, %rd108; ld.global.nc.f32 %f734, [%rd109]; ld.global.nc.f32 %f735, [%rd110]; mul.f32 %f736, %f735, %f735; fma.rn.f32 %f737, %f734, %f734, %f736; ld.global.nc.f32 %f738, [%rd111]; fma.rn.f32 %f150, %f738, %f738, %f737; mul.f32 %f739, %f124, %f126; mul.f32 %f740, %f123, %f127; sub.f32 %f741, %f740, %f739; mul.f32 %f742, %f122, %f127; mul.f32 %f743, %f124, %f125; sub.f32 %f744, %f743, %f742; mul.f32 %f745, %f123, %f125; mul.f32 %f746, %f122, %f126; sub.f32 %f747, %f746, %f745; mul.f32 %f748, %f2, %f744; fma.rn.f32 %f749, %f1, %f741, %f748; fma.rn.f32 %f151, %f3, %f747, %f749; mul.f32 %f750, %f2, %f123; fma.rn.f32 %f751, %f1, %f122, %f750; fma.rn.f32 %f752, %f3, %f124, %f751; add.f32 %f753, %f752, 0f3F800000; mul.f32 %f754, %f2, %f126; fma.rn.f32 %f755, %f1, %f125, %f754; fma.rn.f32 %f756, %f3, %f127, %f755; add.f32 %f757, %f753, %f756; mul.f32 %f758, %f123, %f126; fma.rn.f32 %f759, %f122, %f125, %f758; fma.rn.f32 %f760, %f124, %f127, %f759; add.f32 %f152, %f760, %f757; abs.f32 %f153, %f152; abs.f32 %f154, %f151; setp.eq.f32 %p199, %f153, 0f00000000; setp.eq.f32 %p200, %f154, 0f00000000; and.pred %p201, %p199, %p200; @%p201 bra $L__BB0_185; bra.uni $L__BB0_182; $L__BB0_185: mov.b32 %r432, %f152; shr.s32 %r433, %r432, 31; and.b32 %r434, %r433, 1078530011; mov.b32 %r435, %f151; and.b32 %r436, %r435, -2147483648; or.b32 %r437, %r436, %r434; mov.b32 %f877, %r437; 
bra.uni $L__BB0_186; $L__BB0_182: setp.eq.f32 %p202, %f153, 0f7F800000; setp.eq.f32 %p203, %f154, 0f7F800000; and.pred %p204, %p202, %p203; @%p204 bra $L__BB0_184; bra.uni $L__BB0_183; $L__BB0_184: mov.b32 %r427, %f152; setp.lt.s32 %p208, %r427, 0; selp.b32 %r428, 1075235812, 1061752795, %p208; mov.b32 %r429, %f151; and.b32 %r430, %r429, -2147483648; or.b32 %r431, %r430, %r428; mov.b32 %f877, %r431; bra.uni $L__BB0_186; $L__BB0_183: max.f32 %f761, %f154, %f153; min.f32 %f762, %f154, %f153; div.rn.f32 %f763, %f762, %f761; mul.rn.f32 %f764, %f763, %f763; mov.f32 %f765, 0fC0B59883; mov.f32 %f766, 0fBF52C7EA; fma.rn.f32 %f767, %f764, %f766, %f765; mov.f32 %f768, 0fC0D21907; fma.rn.f32 %f769, %f767, %f764, %f768; mul.f32 %f770, %f764, %f769; mul.f32 %f771, %f763, %f770; add.f32 %f772, %f764, 0f41355DC0; mov.f32 %f773, 0f41E6BD60; fma.rn.f32 %f774, %f772, %f764, %f773; mov.f32 %f775, 0f419D92C8; fma.rn.f32 %f776, %f774, %f764, %f775; rcp.rn.f32 %f777, %f776; fma.rn.f32 %f778, %f771, %f777, %f763; mov.f32 %f779, 0f3FC90FDB; sub.f32 %f780, %f779, %f778; setp.gt.f32 %p205, %f154, %f153; selp.f32 %f781, %f780, %f778, %p205; mov.b32 %r422, %f152; setp.lt.s32 %p206, %r422, 0; mov.f32 %f782, 0f40490FDB; sub.f32 %f783, %f782, %f781; selp.f32 %f784, %f783, %f781, %p206; mov.b32 %r423, %f784; mov.b32 %r424, %f151; and.b32 %r425, %r424, -2147483648; or.b32 %r426, %r425, %r423; mov.b32 %f785, %r426; add.f32 %f786, %f153, %f154; setp.le.f32 %p207, %f786, 0f7F800000; selp.f32 %f877, %f785, %f786, %p207; $L__BB0_186: add.f32 %f787, %f877, %f877; setp.eq.f32 %p209, %f150, 0f00000000; selp.f32 %f788, 0f3F800000, 0f3F000000, %p209; fma.rn.f32 %f878, %f788, %f787, %f878; $L__BB0_187: @%p148 bra $L__BB0_200; @%p12 bra $L__BB0_190; bra.uni $L__BB0_189; $L__BB0_190: max.s32 %r516, %r14, 0; bra.uni $L__BB0_191; $L__BB0_189: rem.s32 %r438, %r14, %r125; add.s32 %r439, %r438, %r125; rem.s32 %r516, %r439, %r125; $L__BB0_191: add.s32 %r120, %r516, %r4; @%p88 bra $L__BB0_193; bra.uni $L__BB0_192; 
$L__BB0_193: add.s32 %r442, %r124, -1; min.s32 %r517, %r49, %r442; bra.uni $L__BB0_194; $L__BB0_192: rem.s32 %r440, %r49, %r124; add.s32 %r441, %r440, %r124; rem.s32 %r517, %r441, %r124; $L__BB0_194: mad.lo.s32 %r443, %r120, %r124, %r517; mul.wide.s32 %rd112, %r443, 4; add.s64 %rd113, %rd3, %rd112; add.s64 %rd114, %rd2, %rd112; add.s64 %rd115, %rd1, %rd112; ld.global.nc.f32 %f789, [%rd113]; ld.global.nc.f32 %f790, [%rd114]; mul.f32 %f791, %f790, %f790; fma.rn.f32 %f792, %f789, %f789, %f791; ld.global.nc.f32 %f793, [%rd115]; fma.rn.f32 %f161, %f793, %f793, %f792; mul.f32 %f794, %f117, %f127; mul.f32 %f795, %f118, %f126; sub.f32 %f796, %f795, %f794; mul.f32 %f797, %f118, %f125; mul.f32 %f798, %f116, %f127; sub.f32 %f799, %f798, %f797; mul.f32 %f800, %f116, %f126; mul.f32 %f801, %f117, %f125; sub.f32 %f802, %f801, %f800; mul.f32 %f803, %f2, %f799; fma.rn.f32 %f804, %f1, %f796, %f803; fma.rn.f32 %f162, %f3, %f802, %f804; mul.f32 %f805, %f2, %f126; fma.rn.f32 %f806, %f1, %f125, %f805; fma.rn.f32 %f807, %f3, %f127, %f806; add.f32 %f808, %f807, 0f3F800000; mul.f32 %f809, %f2, %f117; fma.rn.f32 %f810, %f1, %f116, %f809; fma.rn.f32 %f811, %f3, %f118, %f810; add.f32 %f812, %f811, %f808; mul.f32 %f813, %f117, %f126; fma.rn.f32 %f814, %f116, %f125, %f813; fma.rn.f32 %f815, %f118, %f127, %f814; add.f32 %f163, %f815, %f812; abs.f32 %f164, %f163; abs.f32 %f165, %f162; setp.eq.f32 %p213, %f164, 0f00000000; setp.eq.f32 %p214, %f165, 0f00000000; and.pred %p215, %p213, %p214; @%p215 bra $L__BB0_198; bra.uni $L__BB0_195; $L__BB0_198: mov.b32 %r454, %f163; shr.s32 %r455, %r454, 31; and.b32 %r456, %r455, 1078530011; mov.b32 %r457, %f162; and.b32 %r458, %r457, -2147483648; or.b32 %r459, %r456, %r458; mov.b32 %f879, %r459; bra.uni $L__BB0_199; $L__BB0_195: setp.eq.f32 %p216, %f164, 0f7F800000; setp.eq.f32 %p217, %f165, 0f7F800000; and.pred %p218, %p216, %p217; @%p218 bra $L__BB0_197; bra.uni $L__BB0_196; $L__BB0_197: mov.b32 %r449, %f163; setp.lt.s32 %p222, %r449, 0; selp.b32 %r450, 
1075235812, 1061752795, %p222; mov.b32 %r451, %f162; and.b32 %r452, %r451, -2147483648; or.b32 %r453, %r450, %r452; mov.b32 %f879, %r453; bra.uni $L__BB0_199; $L__BB0_196: max.f32 %f816, %f165, %f164; min.f32 %f817, %f165, %f164; div.rn.f32 %f818, %f817, %f816; mul.rn.f32 %f819, %f818, %f818; mov.f32 %f820, 0fC0B59883; mov.f32 %f821, 0fBF52C7EA; fma.rn.f32 %f822, %f819, %f821, %f820; mov.f32 %f823, 0fC0D21907; fma.rn.f32 %f824, %f822, %f819, %f823; mul.f32 %f825, %f819, %f824; mul.f32 %f826, %f818, %f825; add.f32 %f827, %f819, 0f41355DC0; mov.f32 %f828, 0f41E6BD60; fma.rn.f32 %f829, %f827, %f819, %f828; mov.f32 %f830, 0f419D92C8; fma.rn.f32 %f831, %f829, %f819, %f830; rcp.rn.f32 %f832, %f831; fma.rn.f32 %f833, %f826, %f832, %f818; mov.f32 %f834, 0f3FC90FDB; sub.f32 %f835, %f834, %f833; setp.gt.f32 %p219, %f165, %f164; selp.f32 %f836, %f835, %f833, %p219; mov.b32 %r444, %f163; setp.lt.s32 %p220, %r444, 0; mov.f32 %f837, 0f40490FDB; sub.f32 %f838, %f837, %f836; selp.f32 %f839, %f838, %f836, %p220; mov.b32 %r445, %f839; mov.b32 %r446, %f162; and.b32 %r447, %r446, -2147483648; or.b32 %r448, %r447, %r445; mov.b32 %f840, %r448; add.f32 %f841, %f164, %f165; setp.le.f32 %p221, %f841, 0f7F800000; selp.f32 %f879, %f840, %f841, %p221; $L__BB0_199: add.f32 %f842, %f879, %f879; setp.eq.f32 %p223, %f161, 0f00000000; selp.f32 %f843, 0f3F800000, 0f3F000000, %p223; fma.rn.f32 %f878, %f843, %f842, %f878; $L__BB0_200: ld.param.u64 %rd123, [setemergentmagneticfieldsolidangle_param_1]; mov.u32 %r481, %tid.x; mov.u32 %r480, %ntid.x; mov.u32 %r479, %ctaid.x; mad.lo.s32 %r478, %r479, %r480, %r481; add.s32 %r477, %r5, %r478; mul.wide.s32 %rd122, %r477, 4; cvta.to.global.u64 %rd121, %rd123; add.s64 %rd120, %rd121, %rd122; ld.param.f32 %f856, [setemergentmagneticfieldsolidangle_param_9]; ld.param.f32 %f855, [setemergentmagneticfieldsolidangle_param_8]; ld.param.u64 %rd119, [setemergentmagneticfieldsolidangle_param_0]; mov.u32 %r476, %tid.z; mov.u32 %r475, %ntid.z; mov.u32 %r474, %ctaid.z; 
mad.lo.s32 %r473, %r474, %r475, %r476; mov.u32 %r472, %tid.y; mov.u32 %r471, %ntid.y; mov.u32 %r470, %ctaid.y; mad.lo.s32 %r469, %r470, %r471, %r472; mul.lo.s32 %r468, %r473, %r125; add.s32 %r467, %r468, %r469; mov.u32 %r466, %tid.x; mov.u32 %r465, %ntid.x; mov.u32 %r464, %ctaid.x; mad.lo.s32 %r463, %r464, %r465, %r466; mul.lo.s32 %r462, %r467, %r124; add.s32 %r461, %r462, %r463; mul.wide.s32 %rd118, %r461, 4; cvta.to.global.u64 %rd117, %rd119; add.s64 %rd116, %rd117, %rd118; ld.param.f32 %f854, [setemergentmagneticfieldsolidangle_param_7]; ld.param.f32 %f853, [setemergentmagneticfieldsolidangle_param_6]; add.f32 %f844, %f853, %f853; mul.f32 %f845, %f844, %f854; mul.f32 %f846, %f845, %f862; st.global.f32 [%rd116], %f846; mul.f32 %f847, %f844, %f855; mul.f32 %f848, %f847, %f868; st.global.f32 [%rd120], %f848; mul.f32 %f849, %f844, %f856; mul.f32 %f850, %f849, %f878; st.global.f32 [%rd6], %f850; $L__BB0_202: ret; } ` setemergentmagneticfieldsolidangle_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl setemergentmagneticfieldsolidangle .visible .entry setemergentmagneticfieldsolidangle( .param .u64 setemergentmagneticfieldsolidangle_param_0, .param .u64 setemergentmagneticfieldsolidangle_param_1, .param .u64 setemergentmagneticfieldsolidangle_param_2, .param .u64 setemergentmagneticfieldsolidangle_param_3, .param .u64 setemergentmagneticfieldsolidangle_param_4, .param .u64 setemergentmagneticfieldsolidangle_param_5, .param .f32 setemergentmagneticfieldsolidangle_param_6, .param .f32 setemergentmagneticfieldsolidangle_param_7, .param .f32 setemergentmagneticfieldsolidangle_param_8, .param .f32 setemergentmagneticfieldsolidangle_param_9, .param .u32 setemergentmagneticfieldsolidangle_param_10, .param .u32 setemergentmagneticfieldsolidangle_param_11, .param .u32 setemergentmagneticfieldsolidangle_param_12, .param .u8 setemergentmagneticfieldsolidangle_param_13 ) { .reg .pred %p<225>; .reg .b16 %rs<8>; .reg .f32 %f<881>; .reg .b32 %r<518>; .reg .b64 
%rd<124>; ld.param.u8 %rs4, [setemergentmagneticfieldsolidangle_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldsolidangle_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldsolidangle_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldsolidangle_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldsolidangle_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldsolidangle_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldsolidangle_param_5]; ld.param.u32 %r124, [setemergentmagneticfieldsolidangle_param_10]; ld.param.u32 %r125, [setemergentmagneticfieldsolidangle_param_11]; ld.param.u32 %r126, [setemergentmagneticfieldsolidangle_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r127, %ntid.x; mov.u32 %r128, %ctaid.x; mov.u32 %r129, %tid.x; mad.lo.s32 %r1, %r128, %r127, %r129; mov.u32 %r130, %ntid.y; mov.u32 %r131, %ctaid.y; mov.u32 %r132, %tid.y; mad.lo.s32 %r2, %r131, %r130, %r132; mov.u32 %r133, %ntid.z; mov.u32 %r134, %ctaid.z; mov.u32 %r135, %tid.z; mad.lo.s32 %r3, %r134, %r133, %r135; setp.ge.s32 %p6, %r1, %r124; setp.ge.s32 %p7, %r2, %r125; or.pred %p8, %p6, %p7; setp.ge.s32 %p9, %r3, %r126; or.pred %p10, %p8, %p9; @%p10 bra $L__BB0_202; mul.lo.s32 %r4, %r3, %r125; add.s32 %r136, %r4, %r2; mul.lo.s32 %r5, %r136, %r124; add.s32 %r137, %r5, %r1; mul.wide.s32 %rd13, %r137, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f176, %f2, %f2; fma.rn.f32 %f177, %f1, %f1, %f176; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f178, %f3, %f3, %f177; setp.eq.f32 %p11, %f178, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p11 bra $L__BB0_201; bra.uni $L__BB0_2; $L__BB0_201: mov.u32 %r460, 0; st.global.u32 [%rd4], %r460; st.global.u32 [%rd5], 
%r460; st.global.u32 [%rd6], %r460; bra.uni $L__BB0_202; $L__BB0_2: and.b16 %rs1, %rs4, 2; setp.eq.s16 %p12, %rs1, 0; add.s32 %r6, %r2, 1; @%p12 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r140, %r125, -1; min.s32 %r482, %r6, %r140; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r138, %r6, %r125; add.s32 %r139, %r138, %r125; rem.s32 %r482, %r139, %r125; $L__BB0_5: and.b16 %rs2, %rs4, 4; setp.eq.s16 %p13, %rs2, 0; add.s32 %r10, %r3, 1; @%p13 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r143, %r126, -1; min.s32 %r483, %r10, %r143; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r141, %r10, %r126; add.s32 %r142, %r141, %r126; rem.s32 %r483, %r142, %r126; $L__BB0_8: add.s32 %r14, %r2, -1; @%p12 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r484, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r144, %r14, %r125; add.s32 %r145, %r144, %r125; rem.s32 %r484, %r145, %r125; $L__BB0_11: add.s32 %r18, %r3, -1; @%p13 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r485, %r18, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r146, %r18, %r126; add.s32 %r147, %r146, %r126; rem.s32 %r485, %r147, %r126; $L__BB0_14: mad.lo.s32 %r148, %r485, %r125, %r2; mad.lo.s32 %r149, %r148, %r124, %r1; add.s32 %r150, %r482, %r4; mad.lo.s32 %r151, %r150, %r124, %r1; mul.wide.s32 %rd20, %r151, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f4, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f5, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f6, [%rd23]; mad.lo.s32 %r152, %r483, %r125, %r2; mad.lo.s32 %r153, %r152, %r124, %r1; mul.wide.s32 %rd24, %r153, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f7, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f8, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f9, [%rd27]; add.s32 %r154, %r484, %r4; mad.lo.s32 %r155, %r154, %r124, %r1; mul.wide.s32 %rd28, %r155, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f10, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, 
%rd28; ld.global.nc.f32 %f12, [%rd31]; mul.wide.s32 %rd32, %r149, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f13, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f14, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f15, [%rd35]; setp.ne.s16 %p16, %rs1, 0; setp.lt.s32 %p17, %r6, %r125; or.pred %p1, %p17, %p16; not.pred %p18, %p1; mov.f32 %f862, 0f00000000; @%p18 bra $L__BB0_28; setp.ge.s32 %p19, %r10, %r126; and.pred %p21, %p19, %p13; @%p21 bra $L__BB0_28; @%p13 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r158, %r126, -1; min.s32 %r486, %r10, %r158; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r156, %r10, %r126; add.s32 %r157, %r156, %r126; rem.s32 %r486, %r157, %r126; $L__BB0_19: @%p12 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r161, %r125, -1; min.s32 %r487, %r6, %r161; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r159, %r6, %r125; add.s32 %r160, %r159, %r125; rem.s32 %r487, %r160, %r125; $L__BB0_22: mad.lo.s32 %r162, %r486, %r125, %r487; mad.lo.s32 %r163, %r162, %r124, %r1; mul.wide.s32 %rd36, %r163, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f181, [%rd37]; ld.global.nc.f32 %f182, [%rd38]; mul.f32 %f183, %f182, %f182; fma.rn.f32 %f184, %f181, %f181, %f183; ld.global.nc.f32 %f185, [%rd39]; fma.rn.f32 %f16, %f185, %f185, %f184; mul.f32 %f186, %f6, %f8; mul.f32 %f187, %f5, %f9; sub.f32 %f188, %f187, %f186; mul.f32 %f189, %f4, %f9; mul.f32 %f190, %f6, %f7; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f5, %f7; mul.f32 %f193, %f4, %f8; sub.f32 %f194, %f193, %f192; mul.f32 %f195, %f2, %f191; fma.rn.f32 %f196, %f1, %f188, %f195; fma.rn.f32 %f17, %f3, %f194, %f196; mul.f32 %f197, %f2, %f5; fma.rn.f32 %f198, %f1, %f4, %f197; fma.rn.f32 %f199, %f3, %f6, %f198; add.f32 %f200, %f199, 0f3F800000; mul.f32 %f201, %f2, %f8; fma.rn.f32 %f202, %f1, %f7, %f201; fma.rn.f32 %f203, %f3, %f9, %f202; add.f32 %f204, %f200, %f203; mul.f32 %f205, %f5, %f8; fma.rn.f32 %f206, %f4, %f7, %f205; 
fma.rn.f32 %f207, %f6, %f9, %f206; add.f32 %f18, %f207, %f204; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p24, %f19, 0f00000000; setp.eq.f32 %p25, %f20, 0f00000000; and.pred %p26, %p24, %p25; @%p26 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r174, %f18; shr.s32 %r175, %r174, 31; and.b32 %r176, %r175, 1078530011; mov.b32 %r177, %f17; and.b32 %r178, %r177, -2147483648; or.b32 %r179, %r178, %r176; mov.b32 %f857, %r179; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p27, %f19, 0f7F800000; setp.eq.f32 %p28, %f20, 0f7F800000; and.pred %p29, %p27, %p28; @%p29 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r169, %f18; setp.lt.s32 %p33, %r169, 0; selp.b32 %r170, 1075235812, 1061752795, %p33; mov.b32 %r171, %f17; and.b32 %r172, %r171, -2147483648; or.b32 %r173, %r172, %r170; mov.b32 %f857, %r173; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f208, %f20, %f19; min.f32 %f209, %f20, %f19; div.rn.f32 %f210, %f209, %f208; mul.rn.f32 %f211, %f210, %f210; mov.f32 %f212, 0fC0B59883; mov.f32 %f213, 0fBF52C7EA; fma.rn.f32 %f214, %f211, %f213, %f212; mov.f32 %f215, 0fC0D21907; fma.rn.f32 %f216, %f214, %f211, %f215; mul.f32 %f217, %f211, %f216; mul.f32 %f218, %f210, %f217; add.f32 %f219, %f211, 0f41355DC0; mov.f32 %f220, 0f41E6BD60; fma.rn.f32 %f221, %f219, %f211, %f220; mov.f32 %f222, 0f419D92C8; fma.rn.f32 %f223, %f221, %f211, %f222; rcp.rn.f32 %f224, %f223; fma.rn.f32 %f225, %f218, %f224, %f210; mov.f32 %f226, 0f3FC90FDB; sub.f32 %f227, %f226, %f225; setp.gt.f32 %p30, %f20, %f19; selp.f32 %f228, %f227, %f225, %p30; mov.b32 %r164, %f18; setp.lt.s32 %p31, %r164, 0; mov.f32 %f229, 0f40490FDB; sub.f32 %f230, %f229, %f228; selp.f32 %f231, %f230, %f228, %p31; mov.b32 %r165, %f231; mov.b32 %r166, %f17; and.b32 %r167, %r166, -2147483648; or.b32 %r168, %r167, %r165; mov.b32 %f232, %r168; add.f32 %f233, %f19, %f20; setp.le.f32 %p32, %f233, 0f7F800000; selp.f32 %f857, %f232, %f233, %p32; $L__BB0_27: add.f32 %f234, %f857, %f857; setp.eq.f32 %p34, %f16, 0f00000000; 
selp.f32 %f235, 0f3F800000, 0f3F000000, %p34; fma.rn.f32 %f862, %f235, %f234, 0f00000000; $L__BB0_28: setp.gt.s32 %p35, %r2, 0; or.pred %p2, %p35, %p16; not.pred %p37, %p2; @%p37 bra $L__BB0_55; setp.ge.s32 %p38, %r10, %r126; and.pred %p40, %p38, %p13; @%p40 bra $L__BB0_42; @%p13 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r182, %r126, -1; min.s32 %r488, %r10, %r182; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r180, %r10, %r126; add.s32 %r181, %r180, %r126; rem.s32 %r488, %r181, %r126; $L__BB0_33: @%p12 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r489, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r183, %r14, %r125; add.s32 %r184, %r183, %r125; rem.s32 %r489, %r184, %r125; $L__BB0_36: mad.lo.s32 %r185, %r488, %r125, %r489; mad.lo.s32 %r186, %r185, %r124, %r1; mul.wide.s32 %rd40, %r186, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f236, [%rd41]; ld.global.nc.f32 %f237, [%rd42]; mul.f32 %f238, %f237, %f237; fma.rn.f32 %f239, %f236, %f236, %f238; ld.global.nc.f32 %f240, [%rd43]; fma.rn.f32 %f27, %f240, %f240, %f239; mul.f32 %f241, %f9, %f11; mul.f32 %f242, %f8, %f12; sub.f32 %f243, %f242, %f241; mul.f32 %f244, %f7, %f12; mul.f32 %f245, %f9, %f10; sub.f32 %f246, %f245, %f244; mul.f32 %f247, %f8, %f10; mul.f32 %f248, %f7, %f11; sub.f32 %f249, %f248, %f247; mul.f32 %f250, %f2, %f246; fma.rn.f32 %f251, %f1, %f243, %f250; fma.rn.f32 %f28, %f3, %f249, %f251; mul.f32 %f252, %f2, %f8; fma.rn.f32 %f253, %f1, %f7, %f252; fma.rn.f32 %f254, %f3, %f9, %f253; add.f32 %f255, %f254, 0f3F800000; mul.f32 %f256, %f2, %f11; fma.rn.f32 %f257, %f1, %f10, %f256; fma.rn.f32 %f258, %f3, %f12, %f257; add.f32 %f259, %f255, %f258; mul.f32 %f260, %f8, %f11; fma.rn.f32 %f261, %f7, %f10, %f260; fma.rn.f32 %f262, %f9, %f12, %f261; add.f32 %f29, %f262, %f259; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p43, %f30, 0f00000000; setp.eq.f32 %p44, %f31, 0f00000000; and.pred %p45, %p43, %p44; @%p45 bra 
$L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r197, %f29; shr.s32 %r198, %r197, 31; and.b32 %r199, %r198, 1078530011; mov.b32 %r200, %f28; and.b32 %r201, %r200, -2147483648; or.b32 %r202, %r201, %r199; mov.b32 %f859, %r202; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p46, %f30, 0f7F800000; setp.eq.f32 %p47, %f31, 0f7F800000; and.pred %p48, %p46, %p47; @%p48 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r192, %f29; setp.lt.s32 %p52, %r192, 0; selp.b32 %r193, 1075235812, 1061752795, %p52; mov.b32 %r194, %f28; and.b32 %r195, %r194, -2147483648; or.b32 %r196, %r195, %r193; mov.b32 %f859, %r196; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f263, %f31, %f30; min.f32 %f264, %f31, %f30; div.rn.f32 %f265, %f264, %f263; mul.rn.f32 %f266, %f265, %f265; mov.f32 %f267, 0fC0B59883; mov.f32 %f268, 0fBF52C7EA; fma.rn.f32 %f269, %f266, %f268, %f267; mov.f32 %f270, 0fC0D21907; fma.rn.f32 %f271, %f269, %f266, %f270; mul.f32 %f272, %f266, %f271; mul.f32 %f273, %f265, %f272; add.f32 %f274, %f266, 0f41355DC0; mov.f32 %f275, 0f41E6BD60; fma.rn.f32 %f276, %f274, %f266, %f275; mov.f32 %f277, 0f419D92C8; fma.rn.f32 %f278, %f276, %f266, %f277; rcp.rn.f32 %f279, %f278; fma.rn.f32 %f280, %f273, %f279, %f265; mov.f32 %f281, 0f3FC90FDB; sub.f32 %f282, %f281, %f280; setp.gt.f32 %p49, %f31, %f30; selp.f32 %f283, %f282, %f280, %p49; mov.b32 %r187, %f29; setp.lt.s32 %p50, %r187, 0; mov.f32 %f284, 0f40490FDB; sub.f32 %f285, %f284, %f283; selp.f32 %f286, %f285, %f283, %p50; mov.b32 %r188, %f286; mov.b32 %r189, %f28; and.b32 %r190, %r189, -2147483648; or.b32 %r191, %r190, %r188; mov.b32 %f287, %r191; add.f32 %f288, %f30, %f31; setp.le.f32 %p51, %f288, 0f7F800000; selp.f32 %f859, %f287, %f288, %p51; $L__BB0_41: add.f32 %f289, %f859, %f859; setp.eq.f32 %p53, %f27, 0f00000000; selp.f32 %f290, 0f3F800000, 0f3F000000, %p53; fma.rn.f32 %f862, %f290, %f289, %f862; $L__BB0_42: setp.lt.s32 %p54, %r3, 1; and.pred %p56, %p54, %p13; @%p56 bra $L__BB0_55; @%p13 bra $L__BB0_45; bra.uni $L__BB0_44; 
$L__BB0_45: max.s32 %r490, %r18, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r203, %r18, %r126; add.s32 %r204, %r203, %r126; rem.s32 %r490, %r204, %r126; $L__BB0_46: @%p12 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r491, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r205, %r14, %r125; add.s32 %r206, %r205, %r125; rem.s32 %r491, %r206, %r125; $L__BB0_49: mad.lo.s32 %r207, %r490, %r125, %r491; mad.lo.s32 %r208, %r207, %r124, %r1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f291, [%rd45]; ld.global.nc.f32 %f292, [%rd46]; mul.f32 %f293, %f292, %f292; fma.rn.f32 %f294, %f291, %f291, %f293; ld.global.nc.f32 %f295, [%rd47]; fma.rn.f32 %f38, %f295, %f295, %f294; mul.f32 %f296, %f12, %f14; mul.f32 %f297, %f11, %f15; sub.f32 %f298, %f297, %f296; mul.f32 %f299, %f10, %f15; mul.f32 %f300, %f12, %f13; sub.f32 %f301, %f300, %f299; mul.f32 %f302, %f11, %f13; mul.f32 %f303, %f10, %f14; sub.f32 %f304, %f303, %f302; mul.f32 %f305, %f2, %f301; fma.rn.f32 %f306, %f1, %f298, %f305; fma.rn.f32 %f39, %f3, %f304, %f306; mul.f32 %f307, %f2, %f11; fma.rn.f32 %f308, %f1, %f10, %f307; fma.rn.f32 %f309, %f3, %f12, %f308; add.f32 %f310, %f309, 0f3F800000; mul.f32 %f311, %f2, %f14; fma.rn.f32 %f312, %f1, %f13, %f311; fma.rn.f32 %f313, %f3, %f15, %f312; add.f32 %f314, %f310, %f313; mul.f32 %f315, %f11, %f14; fma.rn.f32 %f316, %f10, %f13, %f315; fma.rn.f32 %f317, %f12, %f15, %f316; add.f32 %f40, %f317, %f314; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p59, %f41, 0f00000000; setp.eq.f32 %p60, %f42, 0f00000000; and.pred %p61, %p59, %p60; @%p61 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r219, %f40; shr.s32 %r220, %r219, 31; and.b32 %r221, %r220, 1078530011; mov.b32 %r222, %f39; and.b32 %r223, %r222, -2147483648; or.b32 %r224, %r223, %r221; mov.b32 %f861, %r224; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p62, %f41, 0f7F800000; setp.eq.f32 %p63, %f42, 0f7F800000; and.pred 
%p64, %p62, %p63; @%p64 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r214, %f40; setp.lt.s32 %p68, %r214, 0; selp.b32 %r215, 1075235812, 1061752795, %p68; mov.b32 %r216, %f39; and.b32 %r217, %r216, -2147483648; or.b32 %r218, %r217, %r215; mov.b32 %f861, %r218; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f318, %f42, %f41; min.f32 %f319, %f42, %f41; div.rn.f32 %f320, %f319, %f318; mul.rn.f32 %f321, %f320, %f320; mov.f32 %f322, 0fC0B59883; mov.f32 %f323, 0fBF52C7EA; fma.rn.f32 %f324, %f321, %f323, %f322; mov.f32 %f325, 0fC0D21907; fma.rn.f32 %f326, %f324, %f321, %f325; mul.f32 %f327, %f321, %f326; mul.f32 %f328, %f320, %f327; add.f32 %f329, %f321, 0f41355DC0; mov.f32 %f330, 0f41E6BD60; fma.rn.f32 %f331, %f329, %f321, %f330; mov.f32 %f332, 0f419D92C8; fma.rn.f32 %f333, %f331, %f321, %f332; rcp.rn.f32 %f334, %f333; fma.rn.f32 %f335, %f328, %f334, %f320; mov.f32 %f336, 0f3FC90FDB; sub.f32 %f337, %f336, %f335; setp.gt.f32 %p65, %f42, %f41; selp.f32 %f338, %f337, %f335, %p65; mov.b32 %r209, %f40; setp.lt.s32 %p66, %r209, 0; mov.f32 %f339, 0f40490FDB; sub.f32 %f340, %f339, %f338; selp.f32 %f341, %f340, %f338, %p66; mov.b32 %r210, %f341; mov.b32 %r211, %f39; and.b32 %r212, %r211, -2147483648; or.b32 %r213, %r212, %r210; mov.b32 %f342, %r213; add.f32 %f343, %f41, %f42; setp.le.f32 %p67, %f343, 0f7F800000; selp.f32 %f861, %f342, %f343, %p67; $L__BB0_54: add.f32 %f344, %f861, %f861; setp.eq.f32 %p69, %f38, 0f00000000; selp.f32 %f345, 0f3F800000, 0f3F000000, %p69; fma.rn.f32 %f862, %f345, %f344, %f862; $L__BB0_55: @%p18 bra $L__BB0_69; setp.lt.s32 %p71, %r3, 1; and.pred %p73, %p71, %p13; @%p73 bra $L__BB0_69; @%p13 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r492, %r18, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r225, %r18, %r126; add.s32 %r226, %r225, %r126; rem.s32 %r492, %r226, %r126; $L__BB0_60: @%p12 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r229, %r125, -1; min.s32 %r493, %r6, %r229; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r227, %r6, 
%r125; add.s32 %r228, %r227, %r125; rem.s32 %r493, %r228, %r125; $L__BB0_63: mad.lo.s32 %r230, %r492, %r125, %r493; mad.lo.s32 %r231, %r230, %r124, %r1; mul.wide.s32 %rd48, %r231, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f346, [%rd49]; ld.global.nc.f32 %f347, [%rd50]; mul.f32 %f348, %f347, %f347; fma.rn.f32 %f349, %f346, %f346, %f348; ld.global.nc.f32 %f350, [%rd51]; fma.rn.f32 %f49, %f350, %f350, %f349; mul.f32 %f351, %f5, %f15; mul.f32 %f352, %f6, %f14; sub.f32 %f353, %f352, %f351; mul.f32 %f354, %f6, %f13; mul.f32 %f355, %f4, %f15; sub.f32 %f356, %f355, %f354; mul.f32 %f357, %f4, %f14; mul.f32 %f358, %f5, %f13; sub.f32 %f359, %f358, %f357; mul.f32 %f360, %f2, %f356; fma.rn.f32 %f361, %f1, %f353, %f360; fma.rn.f32 %f50, %f3, %f359, %f361; mul.f32 %f362, %f2, %f14; fma.rn.f32 %f363, %f1, %f13, %f362; fma.rn.f32 %f364, %f3, %f15, %f363; add.f32 %f365, %f364, 0f3F800000; mul.f32 %f366, %f2, %f5; fma.rn.f32 %f367, %f1, %f4, %f366; fma.rn.f32 %f368, %f3, %f6, %f367; add.f32 %f369, %f368, %f365; mul.f32 %f370, %f5, %f14; fma.rn.f32 %f371, %f4, %f13, %f370; fma.rn.f32 %f372, %f6, %f15, %f371; add.f32 %f51, %f372, %f369; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p76, %f52, 0f00000000; setp.eq.f32 %p77, %f53, 0f00000000; and.pred %p78, %p76, %p77; @%p78 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r242, %f51; shr.s32 %r243, %r242, 31; and.b32 %r244, %r243, 1078530011; mov.b32 %r245, %f50; and.b32 %r246, %r245, -2147483648; or.b32 %r247, %r244, %r246; mov.b32 %f863, %r247; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p79, %f52, 0f7F800000; setp.eq.f32 %p80, %f53, 0f7F800000; and.pred %p81, %p79, %p80; @%p81 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r237, %f51; setp.lt.s32 %p85, %r237, 0; selp.b32 %r238, 1075235812, 1061752795, %p85; mov.b32 %r239, %f50; and.b32 %r240, %r239, -2147483648; or.b32 %r241, %r238, %r240; mov.b32 %f863, %r241; bra.uni $L__BB0_68; $L__BB0_65: 
max.f32 %f373, %f53, %f52; min.f32 %f374, %f53, %f52; div.rn.f32 %f375, %f374, %f373; mul.rn.f32 %f376, %f375, %f375; mov.f32 %f377, 0fC0B59883; mov.f32 %f378, 0fBF52C7EA; fma.rn.f32 %f379, %f376, %f378, %f377; mov.f32 %f380, 0fC0D21907; fma.rn.f32 %f381, %f379, %f376, %f380; mul.f32 %f382, %f376, %f381; mul.f32 %f383, %f375, %f382; add.f32 %f384, %f376, 0f41355DC0; mov.f32 %f385, 0f41E6BD60; fma.rn.f32 %f386, %f384, %f376, %f385; mov.f32 %f387, 0f419D92C8; fma.rn.f32 %f388, %f386, %f376, %f387; rcp.rn.f32 %f389, %f388; fma.rn.f32 %f390, %f383, %f389, %f375; mov.f32 %f391, 0f3FC90FDB; sub.f32 %f392, %f391, %f390; setp.gt.f32 %p82, %f53, %f52; selp.f32 %f393, %f392, %f390, %p82; mov.b32 %r232, %f51; setp.lt.s32 %p83, %r232, 0; mov.f32 %f394, 0f40490FDB; sub.f32 %f395, %f394, %f393; selp.f32 %f396, %f395, %f393, %p83; mov.b32 %r233, %f396; mov.b32 %r234, %f50; and.b32 %r235, %r234, -2147483648; or.b32 %r236, %r235, %r233; mov.b32 %f397, %r236; add.f32 %f398, %f52, %f53; setp.le.f32 %p84, %f398, 0f7F800000; selp.f32 %f863, %f397, %f398, %p84; $L__BB0_68: add.f32 %f399, %f863, %f863; setp.eq.f32 %p86, %f49, 0f00000000; selp.f32 %f400, 0f3F800000, 0f3F000000, %p86; fma.rn.f32 %f862, %f400, %f399, %f862; $L__BB0_69: @%p13 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: add.s32 %r250, %r126, -1; min.s32 %r494, %r10, %r250; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r248, %r10, %r126; add.s32 %r249, %r248, %r126; rem.s32 %r494, %r249, %r126; $L__BB0_72: ld.param.u8 %rs7, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs3, %rs7, 1; setp.eq.s16 %p88, %rs3, 0; add.s32 %r49, %r1, 1; @%p88 bra $L__BB0_74; bra.uni $L__BB0_73; $L__BB0_74: add.s32 %r253, %r124, -1; min.s32 %r495, %r49, %r253; bra.uni $L__BB0_75; $L__BB0_73: rem.s32 %r251, %r49, %r124; add.s32 %r252, %r251, %r124; rem.s32 %r495, %r252, %r124; $L__BB0_75: @%p13 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r496, %r18, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r254, %r18, %r126; add.s32 %r255, 
%r254, %r126; rem.s32 %r496, %r255, %r126; $L__BB0_78: add.s32 %r56, %r1, -1; @%p88 bra $L__BB0_80; bra.uni $L__BB0_79; $L__BB0_80: max.s32 %r497, %r56, 0; bra.uni $L__BB0_81; $L__BB0_79: rem.s32 %r256, %r56, %r124; add.s32 %r257, %r256, %r124; rem.s32 %r497, %r257, %r124; $L__BB0_81: mad.lo.s32 %r258, %r494, %r125, %r2; mad.lo.s32 %r259, %r258, %r124, %r1; mul.wide.s32 %rd52, %r259, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f60, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f61, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f62, [%rd55]; add.s32 %r260, %r495, %r5; mul.wide.s32 %rd56, %r260, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f63, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f64, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f65, [%rd59]; mad.lo.s32 %r261, %r496, %r125, %r2; mad.lo.s32 %r262, %r261, %r124, %r1; mul.wide.s32 %rd60, %r262, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f66, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f67, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f68, [%rd63]; add.s32 %r263, %r497, %r5; mul.wide.s32 %rd64, %r263, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f69, [%rd65]; add.s64 %rd66, %rd2, %rd64; ld.global.nc.f32 %f70, [%rd66]; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f71, [%rd67]; setp.ge.s32 %p92, %r10, %r126; and.pred %p93, %p92, %p13; mov.f32 %f868, 0f00000000; @%p93 bra $L__BB0_95; setp.ge.s32 %p94, %r49, %r124; and.pred %p96, %p94, %p88; @%p96 bra $L__BB0_95; @%p13 bra $L__BB0_85; bra.uni $L__BB0_84; $L__BB0_85: add.s32 %r266, %r126, -1; min.s32 %r498, %r10, %r266; bra.uni $L__BB0_86; $L__BB0_84: rem.s32 %r264, %r10, %r126; add.s32 %r265, %r264, %r126; rem.s32 %r498, %r265, %r126; $L__BB0_86: @%p88 bra $L__BB0_88; bra.uni $L__BB0_87; $L__BB0_88: add.s32 %r269, %r124, -1; min.s32 %r499, %r49, %r269; bra.uni $L__BB0_89; $L__BB0_87: rem.s32 %r267, %r49, %r124; add.s32 %r268, %r267, %r124; rem.s32 %r499, %r268, %r124; $L__BB0_89: 
mad.lo.s32 %r270, %r498, %r125, %r2; mad.lo.s32 %r271, %r270, %r124, %r499; mul.wide.s32 %rd68, %r271, 4; add.s64 %rd69, %rd3, %rd68; add.s64 %rd70, %rd2, %rd68; add.s64 %rd71, %rd1, %rd68; ld.global.nc.f32 %f403, [%rd69]; ld.global.nc.f32 %f404, [%rd70]; mul.f32 %f405, %f404, %f404; fma.rn.f32 %f406, %f403, %f403, %f405; ld.global.nc.f32 %f407, [%rd71]; fma.rn.f32 %f72, %f407, %f407, %f406; mul.f32 %f408, %f62, %f64; mul.f32 %f409, %f61, %f65; sub.f32 %f410, %f409, %f408; mul.f32 %f411, %f60, %f65; mul.f32 %f412, %f62, %f63; sub.f32 %f413, %f412, %f411; mul.f32 %f414, %f61, %f63; mul.f32 %f415, %f60, %f64; sub.f32 %f416, %f415, %f414; mul.f32 %f417, %f2, %f413; fma.rn.f32 %f418, %f1, %f410, %f417; fma.rn.f32 %f73, %f3, %f416, %f418; mul.f32 %f419, %f2, %f61; fma.rn.f32 %f420, %f1, %f60, %f419; fma.rn.f32 %f421, %f3, %f62, %f420; add.f32 %f422, %f421, 0f3F800000; mul.f32 %f423, %f2, %f64; fma.rn.f32 %f424, %f1, %f63, %f423; fma.rn.f32 %f425, %f3, %f65, %f424; add.f32 %f426, %f422, %f425; mul.f32 %f427, %f61, %f64; fma.rn.f32 %f428, %f60, %f63, %f427; fma.rn.f32 %f429, %f62, %f65, %f428; add.f32 %f74, %f429, %f426; abs.f32 %f75, %f74; abs.f32 %f76, %f73; setp.eq.f32 %p99, %f75, 0f00000000; setp.eq.f32 %p100, %f76, 0f00000000; and.pred %p101, %p99, %p100; @%p101 bra $L__BB0_93; bra.uni $L__BB0_90; $L__BB0_93: mov.b32 %r282, %f74; shr.s32 %r283, %r282, 31; and.b32 %r284, %r283, 1078530011; mov.b32 %r285, %f73; and.b32 %r286, %r285, -2147483648; or.b32 %r287, %r286, %r284; mov.b32 %f865, %r287; bra.uni $L__BB0_94; $L__BB0_90: setp.eq.f32 %p102, %f75, 0f7F800000; setp.eq.f32 %p103, %f76, 0f7F800000; and.pred %p104, %p102, %p103; @%p104 bra $L__BB0_92; bra.uni $L__BB0_91; $L__BB0_92: mov.b32 %r277, %f74; setp.lt.s32 %p108, %r277, 0; selp.b32 %r278, 1075235812, 1061752795, %p108; mov.b32 %r279, %f73; and.b32 %r280, %r279, -2147483648; or.b32 %r281, %r280, %r278; mov.b32 %f865, %r281; bra.uni $L__BB0_94; $L__BB0_91: max.f32 %f430, %f76, %f75; min.f32 %f431, %f76, %f75; 
div.rn.f32 %f432, %f431, %f430; mul.rn.f32 %f433, %f432, %f432; mov.f32 %f434, 0fC0B59883; mov.f32 %f435, 0fBF52C7EA; fma.rn.f32 %f436, %f433, %f435, %f434; mov.f32 %f437, 0fC0D21907; fma.rn.f32 %f438, %f436, %f433, %f437; mul.f32 %f439, %f433, %f438; mul.f32 %f440, %f432, %f439; add.f32 %f441, %f433, 0f41355DC0; mov.f32 %f442, 0f41E6BD60; fma.rn.f32 %f443, %f441, %f433, %f442; mov.f32 %f444, 0f419D92C8; fma.rn.f32 %f445, %f443, %f433, %f444; rcp.rn.f32 %f446, %f445; fma.rn.f32 %f447, %f440, %f446, %f432; mov.f32 %f448, 0f3FC90FDB; sub.f32 %f449, %f448, %f447; setp.gt.f32 %p105, %f76, %f75; selp.f32 %f450, %f449, %f447, %p105; mov.b32 %r272, %f74; setp.lt.s32 %p106, %r272, 0; mov.f32 %f451, 0f40490FDB; sub.f32 %f452, %f451, %f450; selp.f32 %f453, %f452, %f450, %p106; mov.b32 %r273, %f453; mov.b32 %r274, %f73; and.b32 %r275, %r274, -2147483648; or.b32 %r276, %r275, %r273; mov.b32 %f454, %r276; add.f32 %f455, %f75, %f76; setp.le.f32 %p107, %f455, 0f7F800000; selp.f32 %f865, %f454, %f455, %p107; $L__BB0_94: add.f32 %f456, %f865, %f865; setp.eq.f32 %p109, %f72, 0f00000000; selp.f32 %f457, 0f3F800000, 0f3F000000, %p109; fma.rn.f32 %f868, %f457, %f456, 0f00000000; $L__BB0_95: setp.lt.s32 %p110, %r3, 1; and.pred %p112, %p110, %p13; @%p112 bra $L__BB0_109; setp.ge.s32 %p113, %r49, %r124; and.pred %p115, %p113, %p88; @%p115 bra $L__BB0_109; @%p13 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: max.s32 %r500, %r18, 0; bra.uni $L__BB0_100; $L__BB0_98: rem.s32 %r288, %r18, %r126; add.s32 %r289, %r288, %r126; rem.s32 %r500, %r289, %r126; $L__BB0_100: @%p88 bra $L__BB0_102; bra.uni $L__BB0_101; $L__BB0_102: add.s32 %r292, %r124, -1; min.s32 %r501, %r49, %r292; bra.uni $L__BB0_103; $L__BB0_101: rem.s32 %r290, %r49, %r124; add.s32 %r291, %r290, %r124; rem.s32 %r501, %r291, %r124; $L__BB0_103: mad.lo.s32 %r293, %r500, %r125, %r2; mad.lo.s32 %r294, %r293, %r124, %r501; mul.wide.s32 %rd72, %r294, 4; add.s64 %rd73, %rd3, %rd72; add.s64 %rd74, %rd2, %rd72; add.s64 %rd75, %rd1, %rd72; 
ld.global.nc.f32 %f458, [%rd73]; ld.global.nc.f32 %f459, [%rd74]; mul.f32 %f460, %f459, %f459; fma.rn.f32 %f461, %f458, %f458, %f460; ld.global.nc.f32 %f462, [%rd75]; fma.rn.f32 %f83, %f462, %f462, %f461; mul.f32 %f463, %f65, %f67; mul.f32 %f464, %f64, %f68; sub.f32 %f465, %f464, %f463; mul.f32 %f466, %f63, %f68; mul.f32 %f467, %f65, %f66; sub.f32 %f468, %f467, %f466; mul.f32 %f469, %f64, %f66; mul.f32 %f470, %f63, %f67; sub.f32 %f471, %f470, %f469; mul.f32 %f472, %f2, %f468; fma.rn.f32 %f473, %f1, %f465, %f472; fma.rn.f32 %f84, %f3, %f471, %f473; mul.f32 %f474, %f2, %f64; fma.rn.f32 %f475, %f1, %f63, %f474; fma.rn.f32 %f476, %f3, %f65, %f475; add.f32 %f477, %f476, 0f3F800000; mul.f32 %f478, %f2, %f67; fma.rn.f32 %f479, %f1, %f66, %f478; fma.rn.f32 %f480, %f3, %f68, %f479; add.f32 %f481, %f477, %f480; mul.f32 %f482, %f64, %f67; fma.rn.f32 %f483, %f63, %f66, %f482; fma.rn.f32 %f484, %f65, %f68, %f483; add.f32 %f85, %f484, %f481; abs.f32 %f86, %f85; abs.f32 %f87, %f84; setp.eq.f32 %p118, %f86, 0f00000000; setp.eq.f32 %p119, %f87, 0f00000000; and.pred %p120, %p118, %p119; @%p120 bra $L__BB0_107; bra.uni $L__BB0_104; $L__BB0_107: mov.b32 %r305, %f85; shr.s32 %r306, %r305, 31; and.b32 %r307, %r306, 1078530011; mov.b32 %r308, %f84; and.b32 %r309, %r308, -2147483648; or.b32 %r310, %r309, %r307; mov.b32 %f867, %r310; bra.uni $L__BB0_108; $L__BB0_104: setp.eq.f32 %p121, %f86, 0f7F800000; setp.eq.f32 %p122, %f87, 0f7F800000; and.pred %p123, %p121, %p122; @%p123 bra $L__BB0_106; bra.uni $L__BB0_105; $L__BB0_106: mov.b32 %r300, %f85; setp.lt.s32 %p127, %r300, 0; selp.b32 %r301, 1075235812, 1061752795, %p127; mov.b32 %r302, %f84; and.b32 %r303, %r302, -2147483648; or.b32 %r304, %r303, %r301; mov.b32 %f867, %r304; bra.uni $L__BB0_108; $L__BB0_105: max.f32 %f485, %f87, %f86; min.f32 %f486, %f87, %f86; div.rn.f32 %f487, %f486, %f485; mul.rn.f32 %f488, %f487, %f487; mov.f32 %f489, 0fC0B59883; mov.f32 %f490, 0fBF52C7EA; fma.rn.f32 %f491, %f488, %f490, %f489; mov.f32 %f492, 
0fC0D21907; fma.rn.f32 %f493, %f491, %f488, %f492; mul.f32 %f494, %f488, %f493; mul.f32 %f495, %f487, %f494; add.f32 %f496, %f488, 0f41355DC0; mov.f32 %f497, 0f41E6BD60; fma.rn.f32 %f498, %f496, %f488, %f497; mov.f32 %f499, 0f419D92C8; fma.rn.f32 %f500, %f498, %f488, %f499; rcp.rn.f32 %f501, %f500; fma.rn.f32 %f502, %f495, %f501, %f487; mov.f32 %f503, 0f3FC90FDB; sub.f32 %f504, %f503, %f502; setp.gt.f32 %p124, %f87, %f86; selp.f32 %f505, %f504, %f502, %p124; mov.b32 %r295, %f85; setp.lt.s32 %p125, %r295, 0; mov.f32 %f506, 0f40490FDB; sub.f32 %f507, %f506, %f505; selp.f32 %f508, %f507, %f505, %p125; mov.b32 %r296, %f508; mov.b32 %r297, %f84; and.b32 %r298, %r297, -2147483648; or.b32 %r299, %r298, %r296; mov.b32 %f509, %r299; add.f32 %f510, %f86, %f87; setp.le.f32 %p126, %f510, 0f7F800000; selp.f32 %f867, %f509, %f510, %p126; $L__BB0_108: add.f32 %f511, %f867, %f867; setp.eq.f32 %p128, %f83, 0f00000000; selp.f32 %f512, 0f3F800000, 0f3F000000, %p128; fma.rn.f32 %f868, %f512, %f511, %f868; $L__BB0_109: ld.param.u8 %rs6, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs5, %rs6, 1; setp.ne.s16 %p129, %rs5, 0; setp.gt.s32 %p130, %r1, 0; or.pred %p3, %p130, %p129; and.pred %p131, %p3, %p2; not.pred %p132, %p131; @%p132 bra $L__BB0_122; @%p13 bra $L__BB0_112; bra.uni $L__BB0_111; $L__BB0_112: max.s32 %r502, %r18, 0; bra.uni $L__BB0_113; $L__BB0_111: rem.s32 %r311, %r18, %r126; add.s32 %r312, %r311, %r126; rem.s32 %r502, %r312, %r126; $L__BB0_113: @%p88 bra $L__BB0_115; bra.uni $L__BB0_114; $L__BB0_115: max.s32 %r503, %r56, 0; bra.uni $L__BB0_116; $L__BB0_114: rem.s32 %r313, %r56, %r124; add.s32 %r314, %r313, %r124; rem.s32 %r503, %r314, %r124; $L__BB0_116: mad.lo.s32 %r315, %r502, %r125, %r2; mad.lo.s32 %r316, %r315, %r124, %r503; mul.wide.s32 %rd76, %r316, 4; add.s64 %rd77, %rd3, %rd76; add.s64 %rd78, %rd2, %rd76; add.s64 %rd79, %rd1, %rd76; ld.global.nc.f32 %f513, [%rd77]; ld.global.nc.f32 %f514, [%rd78]; mul.f32 %f515, %f514, %f514; fma.rn.f32 %f516, %f513, 
%f513, %f515; ld.global.nc.f32 %f517, [%rd79]; fma.rn.f32 %f94, %f517, %f517, %f516; mul.f32 %f518, %f68, %f70; mul.f32 %f519, %f67, %f71; sub.f32 %f520, %f519, %f518; mul.f32 %f521, %f66, %f71; mul.f32 %f522, %f68, %f69; sub.f32 %f523, %f522, %f521; mul.f32 %f524, %f67, %f69; mul.f32 %f525, %f66, %f70; sub.f32 %f526, %f525, %f524; mul.f32 %f527, %f2, %f523; fma.rn.f32 %f528, %f1, %f520, %f527; fma.rn.f32 %f95, %f3, %f526, %f528; mul.f32 %f529, %f2, %f67; fma.rn.f32 %f530, %f1, %f66, %f529; fma.rn.f32 %f531, %f3, %f68, %f530; add.f32 %f532, %f531, 0f3F800000; mul.f32 %f533, %f2, %f70; fma.rn.f32 %f534, %f1, %f69, %f533; fma.rn.f32 %f535, %f3, %f71, %f534; add.f32 %f536, %f532, %f535; mul.f32 %f537, %f67, %f70; fma.rn.f32 %f538, %f66, %f69, %f537; fma.rn.f32 %f539, %f68, %f71, %f538; add.f32 %f96, %f539, %f536; abs.f32 %f97, %f96; abs.f32 %f98, %f95; setp.eq.f32 %p135, %f97, 0f00000000; setp.eq.f32 %p136, %f98, 0f00000000; and.pred %p137, %p135, %p136; @%p137 bra $L__BB0_120; bra.uni $L__BB0_117; $L__BB0_120: mov.b32 %r327, %f96; shr.s32 %r328, %r327, 31; and.b32 %r329, %r328, 1078530011; mov.b32 %r330, %f95; and.b32 %r331, %r330, -2147483648; or.b32 %r332, %r331, %r329; mov.b32 %f869, %r332; bra.uni $L__BB0_121; $L__BB0_117: setp.eq.f32 %p138, %f97, 0f7F800000; setp.eq.f32 %p139, %f98, 0f7F800000; and.pred %p140, %p138, %p139; @%p140 bra $L__BB0_119; bra.uni $L__BB0_118; $L__BB0_119: mov.b32 %r322, %f96; setp.lt.s32 %p144, %r322, 0; selp.b32 %r323, 1075235812, 1061752795, %p144; mov.b32 %r324, %f95; and.b32 %r325, %r324, -2147483648; or.b32 %r326, %r325, %r323; mov.b32 %f869, %r326; bra.uni $L__BB0_121; $L__BB0_118: max.f32 %f540, %f98, %f97; min.f32 %f541, %f98, %f97; div.rn.f32 %f542, %f541, %f540; mul.rn.f32 %f543, %f542, %f542; mov.f32 %f544, 0fC0B59883; mov.f32 %f545, 0fBF52C7EA; fma.rn.f32 %f546, %f543, %f545, %f544; mov.f32 %f547, 0fC0D21907; fma.rn.f32 %f548, %f546, %f543, %f547; mul.f32 %f549, %f543, %f548; mul.f32 %f550, %f542, %f549; add.f32 %f551, 
%f543, 0f41355DC0; mov.f32 %f552, 0f41E6BD60; fma.rn.f32 %f553, %f551, %f543, %f552; mov.f32 %f554, 0f419D92C8; fma.rn.f32 %f555, %f553, %f543, %f554; rcp.rn.f32 %f556, %f555; fma.rn.f32 %f557, %f550, %f556, %f542; mov.f32 %f558, 0f3FC90FDB; sub.f32 %f559, %f558, %f557; setp.gt.f32 %p141, %f98, %f97; selp.f32 %f560, %f559, %f557, %p141; mov.b32 %r317, %f96; setp.lt.s32 %p142, %r317, 0; mov.f32 %f561, 0f40490FDB; sub.f32 %f562, %f561, %f560; selp.f32 %f563, %f562, %f560, %p142; mov.b32 %r318, %f563; mov.b32 %r319, %f95; and.b32 %r320, %r319, -2147483648; or.b32 %r321, %r320, %r318; mov.b32 %f564, %r321; add.f32 %f565, %f97, %f98; setp.le.f32 %p143, %f565, 0f7F800000; selp.f32 %f869, %f564, %f565, %p143; $L__BB0_121: add.f32 %f566, %f869, %f869; setp.eq.f32 %p145, %f94, 0f00000000; selp.f32 %f567, 0f3F800000, 0f3F000000, %p145; fma.rn.f32 %f868, %f567, %f566, %f868; $L__BB0_122: setp.lt.s32 %p146, %r49, %r124; or.pred %p4, %p146, %p129; and.pred %p5, %p4, %p2; not.pred %p148, %p5; @%p148 bra $L__BB0_135; @%p13 bra $L__BB0_125; bra.uni $L__BB0_124; $L__BB0_125: add.s32 %r335, %r126, -1; min.s32 %r504, %r10, %r335; bra.uni $L__BB0_126; $L__BB0_124: rem.s32 %r333, %r10, %r126; add.s32 %r334, %r333, %r126; rem.s32 %r504, %r334, %r126; $L__BB0_126: mad.lo.s32 %r81, %r504, %r125, %r2; @%p88 bra $L__BB0_128; bra.uni $L__BB0_127; $L__BB0_128: max.s32 %r505, %r56, 0; bra.uni $L__BB0_129; $L__BB0_127: rem.s32 %r336, %r56, %r124; add.s32 %r337, %r336, %r124; rem.s32 %r505, %r337, %r124; $L__BB0_129: mad.lo.s32 %r338, %r81, %r124, %r505; mul.wide.s32 %rd80, %r338, 4; add.s64 %rd81, %rd3, %rd80; add.s64 %rd82, %rd2, %rd80; add.s64 %rd83, %rd1, %rd80; ld.global.nc.f32 %f568, [%rd81]; ld.global.nc.f32 %f569, [%rd82]; mul.f32 %f570, %f569, %f569; fma.rn.f32 %f571, %f568, %f568, %f570; ld.global.nc.f32 %f572, [%rd83]; fma.rn.f32 %f105, %f572, %f572, %f571; mul.f32 %f573, %f61, %f71; mul.f32 %f574, %f62, %f70; sub.f32 %f575, %f574, %f573; mul.f32 %f576, %f62, %f69; mul.f32 %f577, 
%f60, %f71; sub.f32 %f578, %f577, %f576; mul.f32 %f579, %f60, %f70; mul.f32 %f580, %f61, %f69; sub.f32 %f581, %f580, %f579; mul.f32 %f582, %f2, %f578; fma.rn.f32 %f583, %f1, %f575, %f582; fma.rn.f32 %f106, %f3, %f581, %f583; mul.f32 %f584, %f2, %f70; fma.rn.f32 %f585, %f1, %f69, %f584; fma.rn.f32 %f586, %f3, %f71, %f585; add.f32 %f587, %f586, 0f3F800000; mul.f32 %f588, %f2, %f61; fma.rn.f32 %f589, %f1, %f60, %f588; fma.rn.f32 %f590, %f3, %f62, %f589; add.f32 %f591, %f590, %f587; mul.f32 %f592, %f61, %f70; fma.rn.f32 %f593, %f60, %f69, %f592; fma.rn.f32 %f594, %f62, %f71, %f593; add.f32 %f107, %f594, %f591; abs.f32 %f108, %f107; abs.f32 %f109, %f106; setp.eq.f32 %p151, %f108, 0f00000000; setp.eq.f32 %p152, %f109, 0f00000000; and.pred %p153, %p151, %p152; @%p153 bra $L__BB0_133; bra.uni $L__BB0_130; $L__BB0_133: mov.b32 %r349, %f107; shr.s32 %r350, %r349, 31; and.b32 %r351, %r350, 1078530011; mov.b32 %r352, %f106; and.b32 %r353, %r352, -2147483648; or.b32 %r354, %r351, %r353; mov.b32 %f871, %r354; bra.uni $L__BB0_134; $L__BB0_130: setp.eq.f32 %p154, %f108, 0f7F800000; setp.eq.f32 %p155, %f109, 0f7F800000; and.pred %p156, %p154, %p155; @%p156 bra $L__BB0_132; bra.uni $L__BB0_131; $L__BB0_132: mov.b32 %r344, %f107; setp.lt.s32 %p160, %r344, 0; selp.b32 %r345, 1075235812, 1061752795, %p160; mov.b32 %r346, %f106; and.b32 %r347, %r346, -2147483648; or.b32 %r348, %r345, %r347; mov.b32 %f871, %r348; bra.uni $L__BB0_134; $L__BB0_131: max.f32 %f595, %f109, %f108; min.f32 %f596, %f109, %f108; div.rn.f32 %f597, %f596, %f595; mul.rn.f32 %f598, %f597, %f597; mov.f32 %f599, 0fC0B59883; mov.f32 %f600, 0fBF52C7EA; fma.rn.f32 %f601, %f598, %f600, %f599; mov.f32 %f602, 0fC0D21907; fma.rn.f32 %f603, %f601, %f598, %f602; mul.f32 %f604, %f598, %f603; mul.f32 %f605, %f597, %f604; add.f32 %f606, %f598, 0f41355DC0; mov.f32 %f607, 0f41E6BD60; fma.rn.f32 %f608, %f606, %f598, %f607; mov.f32 %f609, 0f419D92C8; fma.rn.f32 %f610, %f608, %f598, %f609; rcp.rn.f32 %f611, %f610; fma.rn.f32 %f612, 
%f605, %f611, %f597; mov.f32 %f613, 0f3FC90FDB; sub.f32 %f614, %f613, %f612; setp.gt.f32 %p157, %f109, %f108; selp.f32 %f615, %f614, %f612, %p157; mov.b32 %r339, %f107; setp.lt.s32 %p158, %r339, 0; mov.f32 %f616, 0f40490FDB; sub.f32 %f617, %f616, %f615; selp.f32 %f618, %f617, %f615, %p158; mov.b32 %r340, %f618; mov.b32 %r341, %f106; and.b32 %r342, %r341, -2147483648; or.b32 %r343, %r342, %r340; mov.b32 %f619, %r343; add.f32 %f620, %f108, %f109; setp.le.f32 %p159, %f620, 0f7F800000; selp.f32 %f871, %f619, %f620, %p159; $L__BB0_134: add.f32 %f621, %f871, %f871; setp.eq.f32 %p161, %f105, 0f00000000; selp.f32 %f622, 0f3F800000, 0f3F000000, %p161; fma.rn.f32 %f868, %f622, %f621, %f868; $L__BB0_135: @%p88 bra $L__BB0_137; bra.uni $L__BB0_136; $L__BB0_137: add.s32 %r357, %r124, -1; min.s32 %r506, %r49, %r357; bra.uni $L__BB0_138; $L__BB0_136: rem.s32 %r355, %r49, %r124; add.s32 %r356, %r355, %r124; rem.s32 %r506, %r356, %r124; $L__BB0_138: @%p12 bra $L__BB0_140; bra.uni $L__BB0_139; $L__BB0_140: add.s32 %r360, %r125, -1; min.s32 %r507, %r6, %r360; bra.uni $L__BB0_141; $L__BB0_139: rem.s32 %r358, %r6, %r125; add.s32 %r359, %r358, %r125; rem.s32 %r507, %r359, %r125; $L__BB0_141: @%p88 bra $L__BB0_143; bra.uni $L__BB0_142; $L__BB0_143: max.s32 %r508, %r56, 0; bra.uni $L__BB0_144; $L__BB0_142: rem.s32 %r361, %r56, %r124; add.s32 %r362, %r361, %r124; rem.s32 %r508, %r362, %r124; $L__BB0_144: add.s32 %r94, %r508, %r5; add.s32 %r95, %r506, %r5; @%p12 bra $L__BB0_146; bra.uni $L__BB0_145; $L__BB0_146: max.s32 %r509, %r14, 0; bra.uni $L__BB0_147; $L__BB0_145: rem.s32 %r363, %r14, %r125; add.s32 %r364, %r363, %r125; rem.s32 %r509, %r364, %r125; $L__BB0_147: add.s32 %r365, %r509, %r4; mad.lo.s32 %r366, %r365, %r124, %r1; mul.wide.s32 %rd84, %r95, 4; add.s64 %rd85, %rd3, %rd84; ld.global.nc.f32 %f116, [%rd85]; add.s64 %rd86, %rd2, %rd84; ld.global.nc.f32 %f117, [%rd86]; add.s64 %rd87, %rd1, %rd84; ld.global.nc.f32 %f118, [%rd87]; add.s32 %r367, %r507, %r4; mad.lo.s32 %r368, %r367, 
%r124, %r1; mul.wide.s32 %rd88, %r368, 4; add.s64 %rd89, %rd3, %rd88; ld.global.nc.f32 %f119, [%rd89]; add.s64 %rd90, %rd2, %rd88; ld.global.nc.f32 %f120, [%rd90]; add.s64 %rd91, %rd1, %rd88; ld.global.nc.f32 %f121, [%rd91]; mul.wide.s32 %rd92, %r94, 4; add.s64 %rd93, %rd3, %rd92; ld.global.nc.f32 %f122, [%rd93]; add.s64 %rd94, %rd2, %rd92; ld.global.nc.f32 %f123, [%rd94]; add.s64 %rd95, %rd1, %rd92; ld.global.nc.f32 %f124, [%rd95]; mul.wide.s32 %rd96, %r366, 4; add.s64 %rd97, %rd3, %rd96; ld.global.nc.f32 %f125, [%rd97]; add.s64 %rd98, %rd2, %rd96; ld.global.nc.f32 %f126, [%rd98]; add.s64 %rd99, %rd1, %rd96; ld.global.nc.f32 %f127, [%rd99]; and.pred %p166, %p4, %p1; mov.f32 %f878, 0f00000000; not.pred %p167, %p166; @%p167 bra $L__BB0_160; @%p12 bra $L__BB0_150; bra.uni $L__BB0_149; $L__BB0_150: add.s32 %r371, %r125, -1; min.s32 %r510, %r6, %r371; bra.uni $L__BB0_151; $L__BB0_149: rem.s32 %r369, %r6, %r125; add.s32 %r370, %r369, %r125; rem.s32 %r510, %r370, %r125; $L__BB0_151: @%p88 bra $L__BB0_153; bra.uni $L__BB0_152; $L__BB0_153: add.s32 %r374, %r124, -1; min.s32 %r511, %r49, %r374; bra.uni $L__BB0_154; $L__BB0_152: rem.s32 %r372, %r49, %r124; add.s32 %r373, %r372, %r124; rem.s32 %r511, %r373, %r124; $L__BB0_154: add.s32 %r375, %r510, %r4; mad.lo.s32 %r376, %r375, %r124, %r511; mul.wide.s32 %rd100, %r376, 4; add.s64 %rd101, %rd3, %rd100; add.s64 %rd102, %rd2, %rd100; add.s64 %rd103, %rd1, %rd100; ld.global.nc.f32 %f624, [%rd101]; ld.global.nc.f32 %f625, [%rd102]; mul.f32 %f626, %f625, %f625; fma.rn.f32 %f627, %f624, %f624, %f626; ld.global.nc.f32 %f628, [%rd103]; fma.rn.f32 %f128, %f628, %f628, %f627; mul.f32 %f629, %f118, %f120; mul.f32 %f630, %f117, %f121; sub.f32 %f631, %f630, %f629; mul.f32 %f632, %f116, %f121; mul.f32 %f633, %f118, %f119; sub.f32 %f634, %f633, %f632; mul.f32 %f635, %f117, %f119; mul.f32 %f636, %f116, %f120; sub.f32 %f637, %f636, %f635; mul.f32 %f638, %f2, %f634; fma.rn.f32 %f639, %f1, %f631, %f638; fma.rn.f32 %f129, %f3, %f637, %f639; 
mul.f32 %f640, %f2, %f117; fma.rn.f32 %f641, %f1, %f116, %f640; fma.rn.f32 %f642, %f3, %f118, %f641; add.f32 %f643, %f642, 0f3F800000; mul.f32 %f644, %f2, %f120; fma.rn.f32 %f645, %f1, %f119, %f644; fma.rn.f32 %f646, %f3, %f121, %f645; add.f32 %f647, %f643, %f646; mul.f32 %f648, %f117, %f120; fma.rn.f32 %f649, %f116, %f119, %f648; fma.rn.f32 %f650, %f118, %f121, %f649; add.f32 %f130, %f650, %f647; abs.f32 %f131, %f130; abs.f32 %f132, %f129; setp.eq.f32 %p170, %f131, 0f00000000; setp.eq.f32 %p171, %f132, 0f00000000; and.pred %p172, %p170, %p171; @%p172 bra $L__BB0_158; bra.uni $L__BB0_155; $L__BB0_158: mov.b32 %r387, %f130; shr.s32 %r388, %r387, 31; and.b32 %r389, %r388, 1078530011; mov.b32 %r390, %f129; and.b32 %r391, %r390, -2147483648; or.b32 %r392, %r391, %r389; mov.b32 %f873, %r392; bra.uni $L__BB0_159; $L__BB0_155: setp.eq.f32 %p173, %f131, 0f7F800000; setp.eq.f32 %p174, %f132, 0f7F800000; and.pred %p175, %p173, %p174; @%p175 bra $L__BB0_157; bra.uni $L__BB0_156; $L__BB0_157: mov.b32 %r382, %f130; setp.lt.s32 %p179, %r382, 0; selp.b32 %r383, 1075235812, 1061752795, %p179; mov.b32 %r384, %f129; and.b32 %r385, %r384, -2147483648; or.b32 %r386, %r385, %r383; mov.b32 %f873, %r386; bra.uni $L__BB0_159; $L__BB0_156: max.f32 %f651, %f132, %f131; min.f32 %f652, %f132, %f131; div.rn.f32 %f653, %f652, %f651; mul.rn.f32 %f654, %f653, %f653; mov.f32 %f655, 0fC0B59883; mov.f32 %f656, 0fBF52C7EA; fma.rn.f32 %f657, %f654, %f656, %f655; mov.f32 %f658, 0fC0D21907; fma.rn.f32 %f659, %f657, %f654, %f658; mul.f32 %f660, %f654, %f659; mul.f32 %f661, %f653, %f660; add.f32 %f662, %f654, 0f41355DC0; mov.f32 %f663, 0f41E6BD60; fma.rn.f32 %f664, %f662, %f654, %f663; mov.f32 %f665, 0f419D92C8; fma.rn.f32 %f666, %f664, %f654, %f665; rcp.rn.f32 %f667, %f666; fma.rn.f32 %f668, %f661, %f667, %f653; mov.f32 %f669, 0f3FC90FDB; sub.f32 %f670, %f669, %f668; setp.gt.f32 %p176, %f132, %f131; selp.f32 %f671, %f670, %f668, %p176; mov.b32 %r377, %f130; setp.lt.s32 %p177, %r377, 0; mov.f32 %f672, 
0f40490FDB; sub.f32 %f673, %f672, %f671; selp.f32 %f674, %f673, %f671, %p177; mov.b32 %r378, %f674; mov.b32 %r379, %f129; and.b32 %r380, %r379, -2147483648; or.b32 %r381, %r380, %r378; mov.b32 %f675, %r381; add.f32 %f676, %f131, %f132; setp.le.f32 %p178, %f676, 0f7F800000; selp.f32 %f873, %f675, %f676, %p178; $L__BB0_159: add.f32 %f677, %f873, %f873; setp.eq.f32 %p180, %f128, 0f00000000; selp.f32 %f678, 0f3F800000, 0f3F000000, %p180; fma.rn.f32 %f878, %f678, %f677, 0f00000000; $L__BB0_160: not.pred %p181, %p3; @%p181 bra $L__BB0_187; @%p18 bra $L__BB0_174; @%p12 bra $L__BB0_164; bra.uni $L__BB0_163; $L__BB0_164: add.s32 %r395, %r125, -1; min.s32 %r512, %r6, %r395; bra.uni $L__BB0_165; $L__BB0_163: rem.s32 %r393, %r6, %r125; add.s32 %r394, %r393, %r125; rem.s32 %r512, %r394, %r125; $L__BB0_165: @%p88 bra $L__BB0_167; bra.uni $L__BB0_166; $L__BB0_167: max.s32 %r513, %r56, 0; bra.uni $L__BB0_168; $L__BB0_166: rem.s32 %r396, %r56, %r124; add.s32 %r397, %r396, %r124; rem.s32 %r513, %r397, %r124; $L__BB0_168: add.s32 %r398, %r512, %r4; mad.lo.s32 %r399, %r398, %r124, %r513; mul.wide.s32 %rd104, %r399, 4; add.s64 %rd105, %rd3, %rd104; add.s64 %rd106, %rd2, %rd104; add.s64 %rd107, %rd1, %rd104; ld.global.nc.f32 %f679, [%rd105]; ld.global.nc.f32 %f680, [%rd106]; mul.f32 %f681, %f680, %f680; fma.rn.f32 %f682, %f679, %f679, %f681; ld.global.nc.f32 %f683, [%rd107]; fma.rn.f32 %f139, %f683, %f683, %f682; mul.f32 %f684, %f121, %f123; mul.f32 %f685, %f120, %f124; sub.f32 %f686, %f685, %f684; mul.f32 %f687, %f119, %f124; mul.f32 %f688, %f121, %f122; sub.f32 %f689, %f688, %f687; mul.f32 %f690, %f120, %f122; mul.f32 %f691, %f119, %f123; sub.f32 %f692, %f691, %f690; mul.f32 %f693, %f2, %f689; fma.rn.f32 %f694, %f1, %f686, %f693; fma.rn.f32 %f140, %f3, %f692, %f694; mul.f32 %f695, %f2, %f120; fma.rn.f32 %f696, %f1, %f119, %f695; fma.rn.f32 %f697, %f3, %f121, %f696; add.f32 %f698, %f697, 0f3F800000; mul.f32 %f699, %f2, %f123; fma.rn.f32 %f700, %f1, %f122, %f699; fma.rn.f32 %f701, %f3, 
%f124, %f700; add.f32 %f702, %f698, %f701; mul.f32 %f703, %f120, %f123; fma.rn.f32 %f704, %f119, %f122, %f703; fma.rn.f32 %f705, %f121, %f124, %f704; add.f32 %f141, %f705, %f702; abs.f32 %f142, %f141; abs.f32 %f143, %f140; setp.eq.f32 %p185, %f142, 0f00000000; setp.eq.f32 %p186, %f143, 0f00000000; and.pred %p187, %p185, %p186; @%p187 bra $L__BB0_172; bra.uni $L__BB0_169; $L__BB0_172: mov.b32 %r410, %f141; shr.s32 %r411, %r410, 31; and.b32 %r412, %r411, 1078530011; mov.b32 %r413, %f140; and.b32 %r414, %r413, -2147483648; or.b32 %r415, %r414, %r412; mov.b32 %f875, %r415; bra.uni $L__BB0_173; $L__BB0_169: setp.eq.f32 %p188, %f142, 0f7F800000; setp.eq.f32 %p189, %f143, 0f7F800000; and.pred %p190, %p188, %p189; @%p190 bra $L__BB0_171; bra.uni $L__BB0_170; $L__BB0_171: mov.b32 %r405, %f141; setp.lt.s32 %p194, %r405, 0; selp.b32 %r406, 1075235812, 1061752795, %p194; mov.b32 %r407, %f140; and.b32 %r408, %r407, -2147483648; or.b32 %r409, %r408, %r406; mov.b32 %f875, %r409; bra.uni $L__BB0_173; $L__BB0_170: max.f32 %f706, %f143, %f142; min.f32 %f707, %f143, %f142; div.rn.f32 %f708, %f707, %f706; mul.rn.f32 %f709, %f708, %f708; mov.f32 %f710, 0fC0B59883; mov.f32 %f711, 0fBF52C7EA; fma.rn.f32 %f712, %f709, %f711, %f710; mov.f32 %f713, 0fC0D21907; fma.rn.f32 %f714, %f712, %f709, %f713; mul.f32 %f715, %f709, %f714; mul.f32 %f716, %f708, %f715; add.f32 %f717, %f709, 0f41355DC0; mov.f32 %f718, 0f41E6BD60; fma.rn.f32 %f719, %f717, %f709, %f718; mov.f32 %f720, 0f419D92C8; fma.rn.f32 %f721, %f719, %f709, %f720; rcp.rn.f32 %f722, %f721; fma.rn.f32 %f723, %f716, %f722, %f708; mov.f32 %f724, 0f3FC90FDB; sub.f32 %f725, %f724, %f723; setp.gt.f32 %p191, %f143, %f142; selp.f32 %f726, %f725, %f723, %p191; mov.b32 %r400, %f141; setp.lt.s32 %p192, %r400, 0; mov.f32 %f727, 0f40490FDB; sub.f32 %f728, %f727, %f726; selp.f32 %f729, %f728, %f726, %p192; mov.b32 %r401, %f729; mov.b32 %r402, %f140; and.b32 %r403, %r402, -2147483648; or.b32 %r404, %r403, %r401; mov.b32 %f730, %r404; add.f32 %f731, 
%f142, %f143; setp.le.f32 %p193, %f731, 0f7F800000; selp.f32 %f875, %f730, %f731, %p193; $L__BB0_173: add.f32 %f732, %f875, %f875; setp.eq.f32 %p195, %f139, 0f00000000; selp.f32 %f733, 0f3F800000, 0f3F000000, %p195; fma.rn.f32 %f878, %f733, %f732, %f878; $L__BB0_174: not.pred %p224, %p2; @%p224 bra $L__BB0_187; @%p12 bra $L__BB0_177; bra.uni $L__BB0_176; $L__BB0_177: max.s32 %r514, %r14, 0; bra.uni $L__BB0_178; $L__BB0_176: rem.s32 %r416, %r14, %r125; add.s32 %r417, %r416, %r125; rem.s32 %r514, %r417, %r125; $L__BB0_178: @%p88 bra $L__BB0_180; bra.uni $L__BB0_179; $L__BB0_180: max.s32 %r515, %r56, 0; bra.uni $L__BB0_181; $L__BB0_179: rem.s32 %r418, %r56, %r124; add.s32 %r419, %r418, %r124; rem.s32 %r515, %r419, %r124; $L__BB0_181: add.s32 %r420, %r514, %r4; mad.lo.s32 %r421, %r420, %r124, %r515; mul.wide.s32 %rd108, %r421, 4; add.s64 %rd109, %rd3, %rd108; add.s64 %rd110, %rd2, %rd108; add.s64 %rd111, %rd1, %rd108; ld.global.nc.f32 %f734, [%rd109]; ld.global.nc.f32 %f735, [%rd110]; mul.f32 %f736, %f735, %f735; fma.rn.f32 %f737, %f734, %f734, %f736; ld.global.nc.f32 %f738, [%rd111]; fma.rn.f32 %f150, %f738, %f738, %f737; mul.f32 %f739, %f124, %f126; mul.f32 %f740, %f123, %f127; sub.f32 %f741, %f740, %f739; mul.f32 %f742, %f122, %f127; mul.f32 %f743, %f124, %f125; sub.f32 %f744, %f743, %f742; mul.f32 %f745, %f123, %f125; mul.f32 %f746, %f122, %f126; sub.f32 %f747, %f746, %f745; mul.f32 %f748, %f2, %f744; fma.rn.f32 %f749, %f1, %f741, %f748; fma.rn.f32 %f151, %f3, %f747, %f749; mul.f32 %f750, %f2, %f123; fma.rn.f32 %f751, %f1, %f122, %f750; fma.rn.f32 %f752, %f3, %f124, %f751; add.f32 %f753, %f752, 0f3F800000; mul.f32 %f754, %f2, %f126; fma.rn.f32 %f755, %f1, %f125, %f754; fma.rn.f32 %f756, %f3, %f127, %f755; add.f32 %f757, %f753, %f756; mul.f32 %f758, %f123, %f126; fma.rn.f32 %f759, %f122, %f125, %f758; fma.rn.f32 %f760, %f124, %f127, %f759; add.f32 %f152, %f760, %f757; abs.f32 %f153, %f152; abs.f32 %f154, %f151; setp.eq.f32 %p199, %f153, 0f00000000; setp.eq.f32 
%p200, %f154, 0f00000000; and.pred %p201, %p199, %p200; @%p201 bra $L__BB0_185; bra.uni $L__BB0_182; $L__BB0_185: mov.b32 %r432, %f152; shr.s32 %r433, %r432, 31; and.b32 %r434, %r433, 1078530011; mov.b32 %r435, %f151; and.b32 %r436, %r435, -2147483648; or.b32 %r437, %r436, %r434; mov.b32 %f877, %r437; bra.uni $L__BB0_186; $L__BB0_182: setp.eq.f32 %p202, %f153, 0f7F800000; setp.eq.f32 %p203, %f154, 0f7F800000; and.pred %p204, %p202, %p203; @%p204 bra $L__BB0_184; bra.uni $L__BB0_183; $L__BB0_184: mov.b32 %r427, %f152; setp.lt.s32 %p208, %r427, 0; selp.b32 %r428, 1075235812, 1061752795, %p208; mov.b32 %r429, %f151; and.b32 %r430, %r429, -2147483648; or.b32 %r431, %r430, %r428; mov.b32 %f877, %r431; bra.uni $L__BB0_186; $L__BB0_183: max.f32 %f761, %f154, %f153; min.f32 %f762, %f154, %f153; div.rn.f32 %f763, %f762, %f761; mul.rn.f32 %f764, %f763, %f763; mov.f32 %f765, 0fC0B59883; mov.f32 %f766, 0fBF52C7EA; fma.rn.f32 %f767, %f764, %f766, %f765; mov.f32 %f768, 0fC0D21907; fma.rn.f32 %f769, %f767, %f764, %f768; mul.f32 %f770, %f764, %f769; mul.f32 %f771, %f763, %f770; add.f32 %f772, %f764, 0f41355DC0; mov.f32 %f773, 0f41E6BD60; fma.rn.f32 %f774, %f772, %f764, %f773; mov.f32 %f775, 0f419D92C8; fma.rn.f32 %f776, %f774, %f764, %f775; rcp.rn.f32 %f777, %f776; fma.rn.f32 %f778, %f771, %f777, %f763; mov.f32 %f779, 0f3FC90FDB; sub.f32 %f780, %f779, %f778; setp.gt.f32 %p205, %f154, %f153; selp.f32 %f781, %f780, %f778, %p205; mov.b32 %r422, %f152; setp.lt.s32 %p206, %r422, 0; mov.f32 %f782, 0f40490FDB; sub.f32 %f783, %f782, %f781; selp.f32 %f784, %f783, %f781, %p206; mov.b32 %r423, %f784; mov.b32 %r424, %f151; and.b32 %r425, %r424, -2147483648; or.b32 %r426, %r425, %r423; mov.b32 %f785, %r426; add.f32 %f786, %f153, %f154; setp.le.f32 %p207, %f786, 0f7F800000; selp.f32 %f877, %f785, %f786, %p207; $L__BB0_186: add.f32 %f787, %f877, %f877; setp.eq.f32 %p209, %f150, 0f00000000; selp.f32 %f788, 0f3F800000, 0f3F000000, %p209; fma.rn.f32 %f878, %f788, %f787, %f878; $L__BB0_187: @%p148 
bra $L__BB0_200; @%p12 bra $L__BB0_190; bra.uni $L__BB0_189; $L__BB0_190: max.s32 %r516, %r14, 0; bra.uni $L__BB0_191; $L__BB0_189: rem.s32 %r438, %r14, %r125; add.s32 %r439, %r438, %r125; rem.s32 %r516, %r439, %r125; $L__BB0_191: add.s32 %r120, %r516, %r4; @%p88 bra $L__BB0_193; bra.uni $L__BB0_192; $L__BB0_193: add.s32 %r442, %r124, -1; min.s32 %r517, %r49, %r442; bra.uni $L__BB0_194; $L__BB0_192: rem.s32 %r440, %r49, %r124; add.s32 %r441, %r440, %r124; rem.s32 %r517, %r441, %r124; $L__BB0_194: mad.lo.s32 %r443, %r120, %r124, %r517; mul.wide.s32 %rd112, %r443, 4; add.s64 %rd113, %rd3, %rd112; add.s64 %rd114, %rd2, %rd112; add.s64 %rd115, %rd1, %rd112; ld.global.nc.f32 %f789, [%rd113]; ld.global.nc.f32 %f790, [%rd114]; mul.f32 %f791, %f790, %f790; fma.rn.f32 %f792, %f789, %f789, %f791; ld.global.nc.f32 %f793, [%rd115]; fma.rn.f32 %f161, %f793, %f793, %f792; mul.f32 %f794, %f117, %f127; mul.f32 %f795, %f118, %f126; sub.f32 %f796, %f795, %f794; mul.f32 %f797, %f118, %f125; mul.f32 %f798, %f116, %f127; sub.f32 %f799, %f798, %f797; mul.f32 %f800, %f116, %f126; mul.f32 %f801, %f117, %f125; sub.f32 %f802, %f801, %f800; mul.f32 %f803, %f2, %f799; fma.rn.f32 %f804, %f1, %f796, %f803; fma.rn.f32 %f162, %f3, %f802, %f804; mul.f32 %f805, %f2, %f126; fma.rn.f32 %f806, %f1, %f125, %f805; fma.rn.f32 %f807, %f3, %f127, %f806; add.f32 %f808, %f807, 0f3F800000; mul.f32 %f809, %f2, %f117; fma.rn.f32 %f810, %f1, %f116, %f809; fma.rn.f32 %f811, %f3, %f118, %f810; add.f32 %f812, %f811, %f808; mul.f32 %f813, %f117, %f126; fma.rn.f32 %f814, %f116, %f125, %f813; fma.rn.f32 %f815, %f118, %f127, %f814; add.f32 %f163, %f815, %f812; abs.f32 %f164, %f163; abs.f32 %f165, %f162; setp.eq.f32 %p213, %f164, 0f00000000; setp.eq.f32 %p214, %f165, 0f00000000; and.pred %p215, %p213, %p214; @%p215 bra $L__BB0_198; bra.uni $L__BB0_195; $L__BB0_198: mov.b32 %r454, %f163; shr.s32 %r455, %r454, 31; and.b32 %r456, %r455, 1078530011; mov.b32 %r457, %f162; and.b32 %r458, %r457, -2147483648; or.b32 %r459, 
%r456, %r458; mov.b32 %f879, %r459; bra.uni $L__BB0_199; $L__BB0_195: setp.eq.f32 %p216, %f164, 0f7F800000; setp.eq.f32 %p217, %f165, 0f7F800000; and.pred %p218, %p216, %p217; @%p218 bra $L__BB0_197; bra.uni $L__BB0_196; $L__BB0_197: mov.b32 %r449, %f163; setp.lt.s32 %p222, %r449, 0; selp.b32 %r450, 1075235812, 1061752795, %p222; mov.b32 %r451, %f162; and.b32 %r452, %r451, -2147483648; or.b32 %r453, %r450, %r452; mov.b32 %f879, %r453; bra.uni $L__BB0_199; $L__BB0_196: max.f32 %f816, %f165, %f164; min.f32 %f817, %f165, %f164; div.rn.f32 %f818, %f817, %f816; mul.rn.f32 %f819, %f818, %f818; mov.f32 %f820, 0fC0B59883; mov.f32 %f821, 0fBF52C7EA; fma.rn.f32 %f822, %f819, %f821, %f820; mov.f32 %f823, 0fC0D21907; fma.rn.f32 %f824, %f822, %f819, %f823; mul.f32 %f825, %f819, %f824; mul.f32 %f826, %f818, %f825; add.f32 %f827, %f819, 0f41355DC0; mov.f32 %f828, 0f41E6BD60; fma.rn.f32 %f829, %f827, %f819, %f828; mov.f32 %f830, 0f419D92C8; fma.rn.f32 %f831, %f829, %f819, %f830; rcp.rn.f32 %f832, %f831; fma.rn.f32 %f833, %f826, %f832, %f818; mov.f32 %f834, 0f3FC90FDB; sub.f32 %f835, %f834, %f833; setp.gt.f32 %p219, %f165, %f164; selp.f32 %f836, %f835, %f833, %p219; mov.b32 %r444, %f163; setp.lt.s32 %p220, %r444, 0; mov.f32 %f837, 0f40490FDB; sub.f32 %f838, %f837, %f836; selp.f32 %f839, %f838, %f836, %p220; mov.b32 %r445, %f839; mov.b32 %r446, %f162; and.b32 %r447, %r446, -2147483648; or.b32 %r448, %r447, %r445; mov.b32 %f840, %r448; add.f32 %f841, %f164, %f165; setp.le.f32 %p221, %f841, 0f7F800000; selp.f32 %f879, %f840, %f841, %p221; $L__BB0_199: add.f32 %f842, %f879, %f879; setp.eq.f32 %p223, %f161, 0f00000000; selp.f32 %f843, 0f3F800000, 0f3F000000, %p223; fma.rn.f32 %f878, %f843, %f842, %f878; $L__BB0_200: ld.param.u64 %rd123, [setemergentmagneticfieldsolidangle_param_1]; mov.u32 %r481, %tid.x; mov.u32 %r480, %ntid.x; mov.u32 %r479, %ctaid.x; mad.lo.s32 %r478, %r479, %r480, %r481; add.s32 %r477, %r5, %r478; mul.wide.s32 %rd122, %r477, 4; cvta.to.global.u64 %rd121, %rd123; 
add.s64 %rd120, %rd121, %rd122; ld.param.f32 %f856, [setemergentmagneticfieldsolidangle_param_9]; ld.param.f32 %f855, [setemergentmagneticfieldsolidangle_param_8]; ld.param.u64 %rd119, [setemergentmagneticfieldsolidangle_param_0]; mov.u32 %r476, %tid.z; mov.u32 %r475, %ntid.z; mov.u32 %r474, %ctaid.z; mad.lo.s32 %r473, %r474, %r475, %r476; mov.u32 %r472, %tid.y; mov.u32 %r471, %ntid.y; mov.u32 %r470, %ctaid.y; mad.lo.s32 %r469, %r470, %r471, %r472; mul.lo.s32 %r468, %r473, %r125; add.s32 %r467, %r468, %r469; mov.u32 %r466, %tid.x; mov.u32 %r465, %ntid.x; mov.u32 %r464, %ctaid.x; mad.lo.s32 %r463, %r464, %r465, %r466; mul.lo.s32 %r462, %r467, %r124; add.s32 %r461, %r462, %r463; mul.wide.s32 %rd118, %r461, 4; cvta.to.global.u64 %rd117, %rd119; add.s64 %rd116, %rd117, %rd118; ld.param.f32 %f854, [setemergentmagneticfieldsolidangle_param_7]; ld.param.f32 %f853, [setemergentmagneticfieldsolidangle_param_6]; add.f32 %f844, %f853, %f853; mul.f32 %f845, %f844, %f854; mul.f32 %f846, %f845, %f862; st.global.f32 [%rd116], %f846; mul.f32 %f847, %f844, %f855; mul.f32 %f848, %f847, %f868; st.global.f32 [%rd120], %f848; mul.f32 %f849, %f844, %f856; mul.f32 %f850, %f849, %f878; st.global.f32 [%rd6], %f850; $L__BB0_202: ret; } ` setemergentmagneticfieldsolidangle_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl setemergentmagneticfieldsolidangle .visible .entry setemergentmagneticfieldsolidangle( .param .u64 setemergentmagneticfieldsolidangle_param_0, .param .u64 setemergentmagneticfieldsolidangle_param_1, .param .u64 setemergentmagneticfieldsolidangle_param_2, .param .u64 setemergentmagneticfieldsolidangle_param_3, .param .u64 setemergentmagneticfieldsolidangle_param_4, .param .u64 setemergentmagneticfieldsolidangle_param_5, .param .f32 setemergentmagneticfieldsolidangle_param_6, .param .f32 setemergentmagneticfieldsolidangle_param_7, .param .f32 setemergentmagneticfieldsolidangle_param_8, .param .f32 setemergentmagneticfieldsolidangle_param_9, .param .u32 
setemergentmagneticfieldsolidangle_param_10, .param .u32 setemergentmagneticfieldsolidangle_param_11, .param .u32 setemergentmagneticfieldsolidangle_param_12, .param .u8 setemergentmagneticfieldsolidangle_param_13 ) { .reg .pred %p<225>; .reg .b16 %rs<8>; .reg .f32 %f<881>; .reg .b32 %r<518>; .reg .b64 %rd<124>; ld.param.u8 %rs4, [setemergentmagneticfieldsolidangle_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldsolidangle_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldsolidangle_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldsolidangle_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldsolidangle_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldsolidangle_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldsolidangle_param_5]; ld.param.u32 %r124, [setemergentmagneticfieldsolidangle_param_10]; ld.param.u32 %r125, [setemergentmagneticfieldsolidangle_param_11]; ld.param.u32 %r126, [setemergentmagneticfieldsolidangle_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r127, %ntid.x; mov.u32 %r128, %ctaid.x; mov.u32 %r129, %tid.x; mad.lo.s32 %r1, %r128, %r127, %r129; mov.u32 %r130, %ntid.y; mov.u32 %r131, %ctaid.y; mov.u32 %r132, %tid.y; mad.lo.s32 %r2, %r131, %r130, %r132; mov.u32 %r133, %ntid.z; mov.u32 %r134, %ctaid.z; mov.u32 %r135, %tid.z; mad.lo.s32 %r3, %r134, %r133, %r135; setp.ge.s32 %p6, %r1, %r124; setp.ge.s32 %p7, %r2, %r125; or.pred %p8, %p6, %p7; setp.ge.s32 %p9, %r3, %r126; or.pred %p10, %p8, %p9; @%p10 bra $L__BB0_202; mul.lo.s32 %r4, %r3, %r125; add.s32 %r136, %r4, %r2; mul.lo.s32 %r5, %r136, %r124; add.s32 %r137, %r5, %r1; mul.wide.s32 %rd13, %r137, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f176, %f2, %f2; fma.rn.f32 %f177, %f1, %f1, %f176; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f178, %f3, %f3, %f177; setp.eq.f32 %p11, %f178, 0f00000000; 
cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p11 bra $L__BB0_201; bra.uni $L__BB0_2; $L__BB0_201: mov.u32 %r460, 0; st.global.u32 [%rd4], %r460; st.global.u32 [%rd5], %r460; st.global.u32 [%rd6], %r460; bra.uni $L__BB0_202; $L__BB0_2: and.b16 %rs1, %rs4, 2; setp.eq.s16 %p12, %rs1, 0; add.s32 %r6, %r2, 1; @%p12 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r140, %r125, -1; min.s32 %r482, %r6, %r140; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r138, %r6, %r125; add.s32 %r139, %r138, %r125; rem.s32 %r482, %r139, %r125; $L__BB0_5: and.b16 %rs2, %rs4, 4; setp.eq.s16 %p13, %rs2, 0; add.s32 %r10, %r3, 1; @%p13 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r143, %r126, -1; min.s32 %r483, %r10, %r143; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r141, %r10, %r126; add.s32 %r142, %r141, %r126; rem.s32 %r483, %r142, %r126; $L__BB0_8: add.s32 %r14, %r2, -1; @%p12 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r484, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r144, %r14, %r125; add.s32 %r145, %r144, %r125; rem.s32 %r484, %r145, %r125; $L__BB0_11: add.s32 %r18, %r3, -1; @%p13 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r485, %r18, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r146, %r18, %r126; add.s32 %r147, %r146, %r126; rem.s32 %r485, %r147, %r126; $L__BB0_14: mad.lo.s32 %r148, %r485, %r125, %r2; mad.lo.s32 %r149, %r148, %r124, %r1; add.s32 %r150, %r482, %r4; mad.lo.s32 %r151, %r150, %r124, %r1; mul.wide.s32 %rd20, %r151, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f4, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f5, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f6, [%rd23]; mad.lo.s32 %r152, %r483, %r125, %r2; mad.lo.s32 %r153, %r152, %r124, %r1; mul.wide.s32 %rd24, %r153, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f7, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f8, 
[%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f9, [%rd27]; add.s32 %r154, %r484, %r4; mad.lo.s32 %r155, %r154, %r124, %r1; mul.wide.s32 %rd28, %r155, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f10, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f12, [%rd31]; mul.wide.s32 %rd32, %r149, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f13, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f14, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f15, [%rd35]; setp.ne.s16 %p16, %rs1, 0; setp.lt.s32 %p17, %r6, %r125; or.pred %p1, %p17, %p16; not.pred %p18, %p1; mov.f32 %f862, 0f00000000; @%p18 bra $L__BB0_28; setp.ge.s32 %p19, %r10, %r126; and.pred %p21, %p19, %p13; @%p21 bra $L__BB0_28; @%p13 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r158, %r126, -1; min.s32 %r486, %r10, %r158; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r156, %r10, %r126; add.s32 %r157, %r156, %r126; rem.s32 %r486, %r157, %r126; $L__BB0_19: @%p12 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r161, %r125, -1; min.s32 %r487, %r6, %r161; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r159, %r6, %r125; add.s32 %r160, %r159, %r125; rem.s32 %r487, %r160, %r125; $L__BB0_22: mad.lo.s32 %r162, %r486, %r125, %r487; mad.lo.s32 %r163, %r162, %r124, %r1; mul.wide.s32 %rd36, %r163, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f181, [%rd37]; ld.global.nc.f32 %f182, [%rd38]; mul.f32 %f183, %f182, %f182; fma.rn.f32 %f184, %f181, %f181, %f183; ld.global.nc.f32 %f185, [%rd39]; fma.rn.f32 %f16, %f185, %f185, %f184; mul.f32 %f186, %f6, %f8; mul.f32 %f187, %f5, %f9; sub.f32 %f188, %f187, %f186; mul.f32 %f189, %f4, %f9; mul.f32 %f190, %f6, %f7; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f5, %f7; mul.f32 %f193, %f4, %f8; sub.f32 %f194, %f193, %f192; mul.f32 %f195, %f2, %f191; fma.rn.f32 %f196, %f1, %f188, %f195; fma.rn.f32 %f17, %f3, %f194, %f196; mul.f32 
%f197, %f2, %f5; fma.rn.f32 %f198, %f1, %f4, %f197; fma.rn.f32 %f199, %f3, %f6, %f198; add.f32 %f200, %f199, 0f3F800000; mul.f32 %f201, %f2, %f8; fma.rn.f32 %f202, %f1, %f7, %f201; fma.rn.f32 %f203, %f3, %f9, %f202; add.f32 %f204, %f200, %f203; mul.f32 %f205, %f5, %f8; fma.rn.f32 %f206, %f4, %f7, %f205; fma.rn.f32 %f207, %f6, %f9, %f206; add.f32 %f18, %f207, %f204; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p24, %f19, 0f00000000; setp.eq.f32 %p25, %f20, 0f00000000; and.pred %p26, %p24, %p25; @%p26 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r174, %f18; shr.s32 %r175, %r174, 31; and.b32 %r176, %r175, 1078530011; mov.b32 %r177, %f17; and.b32 %r178, %r177, -2147483648; or.b32 %r179, %r178, %r176; mov.b32 %f857, %r179; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p27, %f19, 0f7F800000; setp.eq.f32 %p28, %f20, 0f7F800000; and.pred %p29, %p27, %p28; @%p29 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r169, %f18; setp.lt.s32 %p33, %r169, 0; selp.b32 %r170, 1075235812, 1061752795, %p33; mov.b32 %r171, %f17; and.b32 %r172, %r171, -2147483648; or.b32 %r173, %r172, %r170; mov.b32 %f857, %r173; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f208, %f20, %f19; min.f32 %f209, %f20, %f19; div.rn.f32 %f210, %f209, %f208; mul.rn.f32 %f211, %f210, %f210; mov.f32 %f212, 0fC0B59883; mov.f32 %f213, 0fBF52C7EA; fma.rn.f32 %f214, %f211, %f213, %f212; mov.f32 %f215, 0fC0D21907; fma.rn.f32 %f216, %f214, %f211, %f215; mul.f32 %f217, %f211, %f216; mul.f32 %f218, %f210, %f217; add.f32 %f219, %f211, 0f41355DC0; mov.f32 %f220, 0f41E6BD60; fma.rn.f32 %f221, %f219, %f211, %f220; mov.f32 %f222, 0f419D92C8; fma.rn.f32 %f223, %f221, %f211, %f222; rcp.rn.f32 %f224, %f223; fma.rn.f32 %f225, %f218, %f224, %f210; mov.f32 %f226, 0f3FC90FDB; sub.f32 %f227, %f226, %f225; setp.gt.f32 %p30, %f20, %f19; selp.f32 %f228, %f227, %f225, %p30; mov.b32 %r164, %f18; setp.lt.s32 %p31, %r164, 0; mov.f32 %f229, 0f40490FDB; sub.f32 %f230, %f229, %f228; selp.f32 %f231, %f230, %f228, %p31; 
mov.b32 %r165, %f231; mov.b32 %r166, %f17; and.b32 %r167, %r166, -2147483648; or.b32 %r168, %r167, %r165; mov.b32 %f232, %r168; add.f32 %f233, %f19, %f20; setp.le.f32 %p32, %f233, 0f7F800000; selp.f32 %f857, %f232, %f233, %p32; $L__BB0_27: add.f32 %f234, %f857, %f857; setp.eq.f32 %p34, %f16, 0f00000000; selp.f32 %f235, 0f3F800000, 0f3F000000, %p34; fma.rn.f32 %f862, %f235, %f234, 0f00000000; $L__BB0_28: setp.gt.s32 %p35, %r2, 0; or.pred %p2, %p35, %p16; not.pred %p37, %p2; @%p37 bra $L__BB0_55; setp.ge.s32 %p38, %r10, %r126; and.pred %p40, %p38, %p13; @%p40 bra $L__BB0_42; @%p13 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r182, %r126, -1; min.s32 %r488, %r10, %r182; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r180, %r10, %r126; add.s32 %r181, %r180, %r126; rem.s32 %r488, %r181, %r126; $L__BB0_33: @%p12 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r489, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r183, %r14, %r125; add.s32 %r184, %r183, %r125; rem.s32 %r489, %r184, %r125; $L__BB0_36: mad.lo.s32 %r185, %r488, %r125, %r489; mad.lo.s32 %r186, %r185, %r124, %r1; mul.wide.s32 %rd40, %r186, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f236, [%rd41]; ld.global.nc.f32 %f237, [%rd42]; mul.f32 %f238, %f237, %f237; fma.rn.f32 %f239, %f236, %f236, %f238; ld.global.nc.f32 %f240, [%rd43]; fma.rn.f32 %f27, %f240, %f240, %f239; mul.f32 %f241, %f9, %f11; mul.f32 %f242, %f8, %f12; sub.f32 %f243, %f242, %f241; mul.f32 %f244, %f7, %f12; mul.f32 %f245, %f9, %f10; sub.f32 %f246, %f245, %f244; mul.f32 %f247, %f8, %f10; mul.f32 %f248, %f7, %f11; sub.f32 %f249, %f248, %f247; mul.f32 %f250, %f2, %f246; fma.rn.f32 %f251, %f1, %f243, %f250; fma.rn.f32 %f28, %f3, %f249, %f251; mul.f32 %f252, %f2, %f8; fma.rn.f32 %f253, %f1, %f7, %f252; fma.rn.f32 %f254, %f3, %f9, %f253; add.f32 %f255, %f254, 0f3F800000; mul.f32 %f256, %f2, %f11; fma.rn.f32 %f257, %f1, %f10, %f256; fma.rn.f32 %f258, %f3, %f12, %f257; add.f32 
%f259, %f255, %f258; mul.f32 %f260, %f8, %f11; fma.rn.f32 %f261, %f7, %f10, %f260; fma.rn.f32 %f262, %f9, %f12, %f261; add.f32 %f29, %f262, %f259; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p43, %f30, 0f00000000; setp.eq.f32 %p44, %f31, 0f00000000; and.pred %p45, %p43, %p44; @%p45 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r197, %f29; shr.s32 %r198, %r197, 31; and.b32 %r199, %r198, 1078530011; mov.b32 %r200, %f28; and.b32 %r201, %r200, -2147483648; or.b32 %r202, %r201, %r199; mov.b32 %f859, %r202; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p46, %f30, 0f7F800000; setp.eq.f32 %p47, %f31, 0f7F800000; and.pred %p48, %p46, %p47; @%p48 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r192, %f29; setp.lt.s32 %p52, %r192, 0; selp.b32 %r193, 1075235812, 1061752795, %p52; mov.b32 %r194, %f28; and.b32 %r195, %r194, -2147483648; or.b32 %r196, %r195, %r193; mov.b32 %f859, %r196; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f263, %f31, %f30; min.f32 %f264, %f31, %f30; div.rn.f32 %f265, %f264, %f263; mul.rn.f32 %f266, %f265, %f265; mov.f32 %f267, 0fC0B59883; mov.f32 %f268, 0fBF52C7EA; fma.rn.f32 %f269, %f266, %f268, %f267; mov.f32 %f270, 0fC0D21907; fma.rn.f32 %f271, %f269, %f266, %f270; mul.f32 %f272, %f266, %f271; mul.f32 %f273, %f265, %f272; add.f32 %f274, %f266, 0f41355DC0; mov.f32 %f275, 0f41E6BD60; fma.rn.f32 %f276, %f274, %f266, %f275; mov.f32 %f277, 0f419D92C8; fma.rn.f32 %f278, %f276, %f266, %f277; rcp.rn.f32 %f279, %f278; fma.rn.f32 %f280, %f273, %f279, %f265; mov.f32 %f281, 0f3FC90FDB; sub.f32 %f282, %f281, %f280; setp.gt.f32 %p49, %f31, %f30; selp.f32 %f283, %f282, %f280, %p49; mov.b32 %r187, %f29; setp.lt.s32 %p50, %r187, 0; mov.f32 %f284, 0f40490FDB; sub.f32 %f285, %f284, %f283; selp.f32 %f286, %f285, %f283, %p50; mov.b32 %r188, %f286; mov.b32 %r189, %f28; and.b32 %r190, %r189, -2147483648; or.b32 %r191, %r190, %r188; mov.b32 %f287, %r191; add.f32 %f288, %f30, %f31; setp.le.f32 %p51, %f288, 0f7F800000; selp.f32 %f859, %f287, %f288, 
%p51; $L__BB0_41: add.f32 %f289, %f859, %f859; setp.eq.f32 %p53, %f27, 0f00000000; selp.f32 %f290, 0f3F800000, 0f3F000000, %p53; fma.rn.f32 %f862, %f290, %f289, %f862; $L__BB0_42: setp.lt.s32 %p54, %r3, 1; and.pred %p56, %p54, %p13; @%p56 bra $L__BB0_55; @%p13 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r490, %r18, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r203, %r18, %r126; add.s32 %r204, %r203, %r126; rem.s32 %r490, %r204, %r126; $L__BB0_46: @%p12 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r491, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r205, %r14, %r125; add.s32 %r206, %r205, %r125; rem.s32 %r491, %r206, %r125; $L__BB0_49: mad.lo.s32 %r207, %r490, %r125, %r491; mad.lo.s32 %r208, %r207, %r124, %r1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f291, [%rd45]; ld.global.nc.f32 %f292, [%rd46]; mul.f32 %f293, %f292, %f292; fma.rn.f32 %f294, %f291, %f291, %f293; ld.global.nc.f32 %f295, [%rd47]; fma.rn.f32 %f38, %f295, %f295, %f294; mul.f32 %f296, %f12, %f14; mul.f32 %f297, %f11, %f15; sub.f32 %f298, %f297, %f296; mul.f32 %f299, %f10, %f15; mul.f32 %f300, %f12, %f13; sub.f32 %f301, %f300, %f299; mul.f32 %f302, %f11, %f13; mul.f32 %f303, %f10, %f14; sub.f32 %f304, %f303, %f302; mul.f32 %f305, %f2, %f301; fma.rn.f32 %f306, %f1, %f298, %f305; fma.rn.f32 %f39, %f3, %f304, %f306; mul.f32 %f307, %f2, %f11; fma.rn.f32 %f308, %f1, %f10, %f307; fma.rn.f32 %f309, %f3, %f12, %f308; add.f32 %f310, %f309, 0f3F800000; mul.f32 %f311, %f2, %f14; fma.rn.f32 %f312, %f1, %f13, %f311; fma.rn.f32 %f313, %f3, %f15, %f312; add.f32 %f314, %f310, %f313; mul.f32 %f315, %f11, %f14; fma.rn.f32 %f316, %f10, %f13, %f315; fma.rn.f32 %f317, %f12, %f15, %f316; add.f32 %f40, %f317, %f314; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p59, %f41, 0f00000000; setp.eq.f32 %p60, %f42, 0f00000000; and.pred %p61, %p59, %p60; @%p61 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: 
mov.b32 %r219, %f40; shr.s32 %r220, %r219, 31; and.b32 %r221, %r220, 1078530011; mov.b32 %r222, %f39; and.b32 %r223, %r222, -2147483648; or.b32 %r224, %r223, %r221; mov.b32 %f861, %r224; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p62, %f41, 0f7F800000; setp.eq.f32 %p63, %f42, 0f7F800000; and.pred %p64, %p62, %p63; @%p64 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r214, %f40; setp.lt.s32 %p68, %r214, 0; selp.b32 %r215, 1075235812, 1061752795, %p68; mov.b32 %r216, %f39; and.b32 %r217, %r216, -2147483648; or.b32 %r218, %r217, %r215; mov.b32 %f861, %r218; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f318, %f42, %f41; min.f32 %f319, %f42, %f41; div.rn.f32 %f320, %f319, %f318; mul.rn.f32 %f321, %f320, %f320; mov.f32 %f322, 0fC0B59883; mov.f32 %f323, 0fBF52C7EA; fma.rn.f32 %f324, %f321, %f323, %f322; mov.f32 %f325, 0fC0D21907; fma.rn.f32 %f326, %f324, %f321, %f325; mul.f32 %f327, %f321, %f326; mul.f32 %f328, %f320, %f327; add.f32 %f329, %f321, 0f41355DC0; mov.f32 %f330, 0f41E6BD60; fma.rn.f32 %f331, %f329, %f321, %f330; mov.f32 %f332, 0f419D92C8; fma.rn.f32 %f333, %f331, %f321, %f332; rcp.rn.f32 %f334, %f333; fma.rn.f32 %f335, %f328, %f334, %f320; mov.f32 %f336, 0f3FC90FDB; sub.f32 %f337, %f336, %f335; setp.gt.f32 %p65, %f42, %f41; selp.f32 %f338, %f337, %f335, %p65; mov.b32 %r209, %f40; setp.lt.s32 %p66, %r209, 0; mov.f32 %f339, 0f40490FDB; sub.f32 %f340, %f339, %f338; selp.f32 %f341, %f340, %f338, %p66; mov.b32 %r210, %f341; mov.b32 %r211, %f39; and.b32 %r212, %r211, -2147483648; or.b32 %r213, %r212, %r210; mov.b32 %f342, %r213; add.f32 %f343, %f41, %f42; setp.le.f32 %p67, %f343, 0f7F800000; selp.f32 %f861, %f342, %f343, %p67; $L__BB0_54: add.f32 %f344, %f861, %f861; setp.eq.f32 %p69, %f38, 0f00000000; selp.f32 %f345, 0f3F800000, 0f3F000000, %p69; fma.rn.f32 %f862, %f345, %f344, %f862; $L__BB0_55: @%p18 bra $L__BB0_69; setp.lt.s32 %p71, %r3, 1; and.pred %p73, %p71, %p13; @%p73 bra $L__BB0_69; @%p13 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 
%r492, %r18, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r225, %r18, %r126; add.s32 %r226, %r225, %r126; rem.s32 %r492, %r226, %r126; $L__BB0_60: @%p12 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r229, %r125, -1; min.s32 %r493, %r6, %r229; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r227, %r6, %r125; add.s32 %r228, %r227, %r125; rem.s32 %r493, %r228, %r125; $L__BB0_63: mad.lo.s32 %r230, %r492, %r125, %r493; mad.lo.s32 %r231, %r230, %r124, %r1; mul.wide.s32 %rd48, %r231, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f346, [%rd49]; ld.global.nc.f32 %f347, [%rd50]; mul.f32 %f348, %f347, %f347; fma.rn.f32 %f349, %f346, %f346, %f348; ld.global.nc.f32 %f350, [%rd51]; fma.rn.f32 %f49, %f350, %f350, %f349; mul.f32 %f351, %f5, %f15; mul.f32 %f352, %f6, %f14; sub.f32 %f353, %f352, %f351; mul.f32 %f354, %f6, %f13; mul.f32 %f355, %f4, %f15; sub.f32 %f356, %f355, %f354; mul.f32 %f357, %f4, %f14; mul.f32 %f358, %f5, %f13; sub.f32 %f359, %f358, %f357; mul.f32 %f360, %f2, %f356; fma.rn.f32 %f361, %f1, %f353, %f360; fma.rn.f32 %f50, %f3, %f359, %f361; mul.f32 %f362, %f2, %f14; fma.rn.f32 %f363, %f1, %f13, %f362; fma.rn.f32 %f364, %f3, %f15, %f363; add.f32 %f365, %f364, 0f3F800000; mul.f32 %f366, %f2, %f5; fma.rn.f32 %f367, %f1, %f4, %f366; fma.rn.f32 %f368, %f3, %f6, %f367; add.f32 %f369, %f368, %f365; mul.f32 %f370, %f5, %f14; fma.rn.f32 %f371, %f4, %f13, %f370; fma.rn.f32 %f372, %f6, %f15, %f371; add.f32 %f51, %f372, %f369; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p76, %f52, 0f00000000; setp.eq.f32 %p77, %f53, 0f00000000; and.pred %p78, %p76, %p77; @%p78 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r242, %f51; shr.s32 %r243, %r242, 31; and.b32 %r244, %r243, 1078530011; mov.b32 %r245, %f50; and.b32 %r246, %r245, -2147483648; or.b32 %r247, %r244, %r246; mov.b32 %f863, %r247; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p79, %f52, 0f7F800000; setp.eq.f32 %p80, %f53, 0f7F800000; and.pred 
%p81, %p79, %p80; @%p81 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r237, %f51; setp.lt.s32 %p85, %r237, 0; selp.b32 %r238, 1075235812, 1061752795, %p85; mov.b32 %r239, %f50; and.b32 %r240, %r239, -2147483648; or.b32 %r241, %r238, %r240; mov.b32 %f863, %r241; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f373, %f53, %f52; min.f32 %f374, %f53, %f52; div.rn.f32 %f375, %f374, %f373; mul.rn.f32 %f376, %f375, %f375; mov.f32 %f377, 0fC0B59883; mov.f32 %f378, 0fBF52C7EA; fma.rn.f32 %f379, %f376, %f378, %f377; mov.f32 %f380, 0fC0D21907; fma.rn.f32 %f381, %f379, %f376, %f380; mul.f32 %f382, %f376, %f381; mul.f32 %f383, %f375, %f382; add.f32 %f384, %f376, 0f41355DC0; mov.f32 %f385, 0f41E6BD60; fma.rn.f32 %f386, %f384, %f376, %f385; mov.f32 %f387, 0f419D92C8; fma.rn.f32 %f388, %f386, %f376, %f387; rcp.rn.f32 %f389, %f388; fma.rn.f32 %f390, %f383, %f389, %f375; mov.f32 %f391, 0f3FC90FDB; sub.f32 %f392, %f391, %f390; setp.gt.f32 %p82, %f53, %f52; selp.f32 %f393, %f392, %f390, %p82; mov.b32 %r232, %f51; setp.lt.s32 %p83, %r232, 0; mov.f32 %f394, 0f40490FDB; sub.f32 %f395, %f394, %f393; selp.f32 %f396, %f395, %f393, %p83; mov.b32 %r233, %f396; mov.b32 %r234, %f50; and.b32 %r235, %r234, -2147483648; or.b32 %r236, %r235, %r233; mov.b32 %f397, %r236; add.f32 %f398, %f52, %f53; setp.le.f32 %p84, %f398, 0f7F800000; selp.f32 %f863, %f397, %f398, %p84; $L__BB0_68: add.f32 %f399, %f863, %f863; setp.eq.f32 %p86, %f49, 0f00000000; selp.f32 %f400, 0f3F800000, 0f3F000000, %p86; fma.rn.f32 %f862, %f400, %f399, %f862; $L__BB0_69: @%p13 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: add.s32 %r250, %r126, -1; min.s32 %r494, %r10, %r250; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r248, %r10, %r126; add.s32 %r249, %r248, %r126; rem.s32 %r494, %r249, %r126; $L__BB0_72: ld.param.u8 %rs7, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs3, %rs7, 1; setp.eq.s16 %p88, %rs3, 0; add.s32 %r49, %r1, 1; @%p88 bra $L__BB0_74; bra.uni $L__BB0_73; $L__BB0_74: add.s32 %r253, %r124, -1; min.s32 
%r495, %r49, %r253; bra.uni $L__BB0_75; $L__BB0_73: rem.s32 %r251, %r49, %r124; add.s32 %r252, %r251, %r124; rem.s32 %r495, %r252, %r124; $L__BB0_75: @%p13 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r496, %r18, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r254, %r18, %r126; add.s32 %r255, %r254, %r126; rem.s32 %r496, %r255, %r126; $L__BB0_78: add.s32 %r56, %r1, -1; @%p88 bra $L__BB0_80; bra.uni $L__BB0_79; $L__BB0_80: max.s32 %r497, %r56, 0; bra.uni $L__BB0_81; $L__BB0_79: rem.s32 %r256, %r56, %r124; add.s32 %r257, %r256, %r124; rem.s32 %r497, %r257, %r124; $L__BB0_81: mad.lo.s32 %r258, %r494, %r125, %r2; mad.lo.s32 %r259, %r258, %r124, %r1; mul.wide.s32 %rd52, %r259, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f60, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f61, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f62, [%rd55]; add.s32 %r260, %r495, %r5; mul.wide.s32 %rd56, %r260, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f63, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f64, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f65, [%rd59]; mad.lo.s32 %r261, %r496, %r125, %r2; mad.lo.s32 %r262, %r261, %r124, %r1; mul.wide.s32 %rd60, %r262, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f66, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f67, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f68, [%rd63]; add.s32 %r263, %r497, %r5; mul.wide.s32 %rd64, %r263, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f69, [%rd65]; add.s64 %rd66, %rd2, %rd64; ld.global.nc.f32 %f70, [%rd66]; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f71, [%rd67]; setp.ge.s32 %p92, %r10, %r126; and.pred %p93, %p92, %p13; mov.f32 %f868, 0f00000000; @%p93 bra $L__BB0_95; setp.ge.s32 %p94, %r49, %r124; and.pred %p96, %p94, %p88; @%p96 bra $L__BB0_95; @%p13 bra $L__BB0_85; bra.uni $L__BB0_84; $L__BB0_85: add.s32 %r266, %r126, -1; min.s32 %r498, %r10, %r266; bra.uni $L__BB0_86; $L__BB0_84: rem.s32 %r264, %r10, %r126; add.s32 
%r265, %r264, %r126; rem.s32 %r498, %r265, %r126; $L__BB0_86: @%p88 bra $L__BB0_88; bra.uni $L__BB0_87; $L__BB0_88: add.s32 %r269, %r124, -1; min.s32 %r499, %r49, %r269; bra.uni $L__BB0_89; $L__BB0_87: rem.s32 %r267, %r49, %r124; add.s32 %r268, %r267, %r124; rem.s32 %r499, %r268, %r124; $L__BB0_89: mad.lo.s32 %r270, %r498, %r125, %r2; mad.lo.s32 %r271, %r270, %r124, %r499; mul.wide.s32 %rd68, %r271, 4; add.s64 %rd69, %rd3, %rd68; add.s64 %rd70, %rd2, %rd68; add.s64 %rd71, %rd1, %rd68; ld.global.nc.f32 %f403, [%rd69]; ld.global.nc.f32 %f404, [%rd70]; mul.f32 %f405, %f404, %f404; fma.rn.f32 %f406, %f403, %f403, %f405; ld.global.nc.f32 %f407, [%rd71]; fma.rn.f32 %f72, %f407, %f407, %f406; mul.f32 %f408, %f62, %f64; mul.f32 %f409, %f61, %f65; sub.f32 %f410, %f409, %f408; mul.f32 %f411, %f60, %f65; mul.f32 %f412, %f62, %f63; sub.f32 %f413, %f412, %f411; mul.f32 %f414, %f61, %f63; mul.f32 %f415, %f60, %f64; sub.f32 %f416, %f415, %f414; mul.f32 %f417, %f2, %f413; fma.rn.f32 %f418, %f1, %f410, %f417; fma.rn.f32 %f73, %f3, %f416, %f418; mul.f32 %f419, %f2, %f61; fma.rn.f32 %f420, %f1, %f60, %f419; fma.rn.f32 %f421, %f3, %f62, %f420; add.f32 %f422, %f421, 0f3F800000; mul.f32 %f423, %f2, %f64; fma.rn.f32 %f424, %f1, %f63, %f423; fma.rn.f32 %f425, %f3, %f65, %f424; add.f32 %f426, %f422, %f425; mul.f32 %f427, %f61, %f64; fma.rn.f32 %f428, %f60, %f63, %f427; fma.rn.f32 %f429, %f62, %f65, %f428; add.f32 %f74, %f429, %f426; abs.f32 %f75, %f74; abs.f32 %f76, %f73; setp.eq.f32 %p99, %f75, 0f00000000; setp.eq.f32 %p100, %f76, 0f00000000; and.pred %p101, %p99, %p100; @%p101 bra $L__BB0_93; bra.uni $L__BB0_90; $L__BB0_93: mov.b32 %r282, %f74; shr.s32 %r283, %r282, 31; and.b32 %r284, %r283, 1078530011; mov.b32 %r285, %f73; and.b32 %r286, %r285, -2147483648; or.b32 %r287, %r286, %r284; mov.b32 %f865, %r287; bra.uni $L__BB0_94; $L__BB0_90: setp.eq.f32 %p102, %f75, 0f7F800000; setp.eq.f32 %p103, %f76, 0f7F800000; and.pred %p104, %p102, %p103; @%p104 bra $L__BB0_92; bra.uni $L__BB0_91; 
$L__BB0_92: mov.b32 %r277, %f74; setp.lt.s32 %p108, %r277, 0; selp.b32 %r278, 1075235812, 1061752795, %p108; mov.b32 %r279, %f73; and.b32 %r280, %r279, -2147483648; or.b32 %r281, %r280, %r278; mov.b32 %f865, %r281; bra.uni $L__BB0_94; $L__BB0_91: max.f32 %f430, %f76, %f75; min.f32 %f431, %f76, %f75; div.rn.f32 %f432, %f431, %f430; mul.rn.f32 %f433, %f432, %f432; mov.f32 %f434, 0fC0B59883; mov.f32 %f435, 0fBF52C7EA; fma.rn.f32 %f436, %f433, %f435, %f434; mov.f32 %f437, 0fC0D21907; fma.rn.f32 %f438, %f436, %f433, %f437; mul.f32 %f439, %f433, %f438; mul.f32 %f440, %f432, %f439; add.f32 %f441, %f433, 0f41355DC0; mov.f32 %f442, 0f41E6BD60; fma.rn.f32 %f443, %f441, %f433, %f442; mov.f32 %f444, 0f419D92C8; fma.rn.f32 %f445, %f443, %f433, %f444; rcp.rn.f32 %f446, %f445; fma.rn.f32 %f447, %f440, %f446, %f432; mov.f32 %f448, 0f3FC90FDB; sub.f32 %f449, %f448, %f447; setp.gt.f32 %p105, %f76, %f75; selp.f32 %f450, %f449, %f447, %p105; mov.b32 %r272, %f74; setp.lt.s32 %p106, %r272, 0; mov.f32 %f451, 0f40490FDB; sub.f32 %f452, %f451, %f450; selp.f32 %f453, %f452, %f450, %p106; mov.b32 %r273, %f453; mov.b32 %r274, %f73; and.b32 %r275, %r274, -2147483648; or.b32 %r276, %r275, %r273; mov.b32 %f454, %r276; add.f32 %f455, %f75, %f76; setp.le.f32 %p107, %f455, 0f7F800000; selp.f32 %f865, %f454, %f455, %p107; $L__BB0_94: add.f32 %f456, %f865, %f865; setp.eq.f32 %p109, %f72, 0f00000000; selp.f32 %f457, 0f3F800000, 0f3F000000, %p109; fma.rn.f32 %f868, %f457, %f456, 0f00000000; $L__BB0_95: setp.lt.s32 %p110, %r3, 1; and.pred %p112, %p110, %p13; @%p112 bra $L__BB0_109; setp.ge.s32 %p113, %r49, %r124; and.pred %p115, %p113, %p88; @%p115 bra $L__BB0_109; @%p13 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: max.s32 %r500, %r18, 0; bra.uni $L__BB0_100; $L__BB0_98: rem.s32 %r288, %r18, %r126; add.s32 %r289, %r288, %r126; rem.s32 %r500, %r289, %r126; $L__BB0_100: @%p88 bra $L__BB0_102; bra.uni $L__BB0_101; $L__BB0_102: add.s32 %r292, %r124, -1; min.s32 %r501, %r49, %r292; bra.uni $L__BB0_103; 
$L__BB0_101: rem.s32 %r290, %r49, %r124; add.s32 %r291, %r290, %r124; rem.s32 %r501, %r291, %r124; $L__BB0_103: mad.lo.s32 %r293, %r500, %r125, %r2; mad.lo.s32 %r294, %r293, %r124, %r501; mul.wide.s32 %rd72, %r294, 4; add.s64 %rd73, %rd3, %rd72; add.s64 %rd74, %rd2, %rd72; add.s64 %rd75, %rd1, %rd72; ld.global.nc.f32 %f458, [%rd73]; ld.global.nc.f32 %f459, [%rd74]; mul.f32 %f460, %f459, %f459; fma.rn.f32 %f461, %f458, %f458, %f460; ld.global.nc.f32 %f462, [%rd75]; fma.rn.f32 %f83, %f462, %f462, %f461; mul.f32 %f463, %f65, %f67; mul.f32 %f464, %f64, %f68; sub.f32 %f465, %f464, %f463; mul.f32 %f466, %f63, %f68; mul.f32 %f467, %f65, %f66; sub.f32 %f468, %f467, %f466; mul.f32 %f469, %f64, %f66; mul.f32 %f470, %f63, %f67; sub.f32 %f471, %f470, %f469; mul.f32 %f472, %f2, %f468; fma.rn.f32 %f473, %f1, %f465, %f472; fma.rn.f32 %f84, %f3, %f471, %f473; mul.f32 %f474, %f2, %f64; fma.rn.f32 %f475, %f1, %f63, %f474; fma.rn.f32 %f476, %f3, %f65, %f475; add.f32 %f477, %f476, 0f3F800000; mul.f32 %f478, %f2, %f67; fma.rn.f32 %f479, %f1, %f66, %f478; fma.rn.f32 %f480, %f3, %f68, %f479; add.f32 %f481, %f477, %f480; mul.f32 %f482, %f64, %f67; fma.rn.f32 %f483, %f63, %f66, %f482; fma.rn.f32 %f484, %f65, %f68, %f483; add.f32 %f85, %f484, %f481; abs.f32 %f86, %f85; abs.f32 %f87, %f84; setp.eq.f32 %p118, %f86, 0f00000000; setp.eq.f32 %p119, %f87, 0f00000000; and.pred %p120, %p118, %p119; @%p120 bra $L__BB0_107; bra.uni $L__BB0_104; $L__BB0_107: mov.b32 %r305, %f85; shr.s32 %r306, %r305, 31; and.b32 %r307, %r306, 1078530011; mov.b32 %r308, %f84; and.b32 %r309, %r308, -2147483648; or.b32 %r310, %r309, %r307; mov.b32 %f867, %r310; bra.uni $L__BB0_108; $L__BB0_104: setp.eq.f32 %p121, %f86, 0f7F800000; setp.eq.f32 %p122, %f87, 0f7F800000; and.pred %p123, %p121, %p122; @%p123 bra $L__BB0_106; bra.uni $L__BB0_105; $L__BB0_106: mov.b32 %r300, %f85; setp.lt.s32 %p127, %r300, 0; selp.b32 %r301, 1075235812, 1061752795, %p127; mov.b32 %r302, %f84; and.b32 %r303, %r302, -2147483648; or.b32 %r304, 
%r303, %r301; mov.b32 %f867, %r304; bra.uni $L__BB0_108; $L__BB0_105: max.f32 %f485, %f87, %f86; min.f32 %f486, %f87, %f86; div.rn.f32 %f487, %f486, %f485; mul.rn.f32 %f488, %f487, %f487; mov.f32 %f489, 0fC0B59883; mov.f32 %f490, 0fBF52C7EA; fma.rn.f32 %f491, %f488, %f490, %f489; mov.f32 %f492, 0fC0D21907; fma.rn.f32 %f493, %f491, %f488, %f492; mul.f32 %f494, %f488, %f493; mul.f32 %f495, %f487, %f494; add.f32 %f496, %f488, 0f41355DC0; mov.f32 %f497, 0f41E6BD60; fma.rn.f32 %f498, %f496, %f488, %f497; mov.f32 %f499, 0f419D92C8; fma.rn.f32 %f500, %f498, %f488, %f499; rcp.rn.f32 %f501, %f500; fma.rn.f32 %f502, %f495, %f501, %f487; mov.f32 %f503, 0f3FC90FDB; sub.f32 %f504, %f503, %f502; setp.gt.f32 %p124, %f87, %f86; selp.f32 %f505, %f504, %f502, %p124; mov.b32 %r295, %f85; setp.lt.s32 %p125, %r295, 0; mov.f32 %f506, 0f40490FDB; sub.f32 %f507, %f506, %f505; selp.f32 %f508, %f507, %f505, %p125; mov.b32 %r296, %f508; mov.b32 %r297, %f84; and.b32 %r298, %r297, -2147483648; or.b32 %r299, %r298, %r296; mov.b32 %f509, %r299; add.f32 %f510, %f86, %f87; setp.le.f32 %p126, %f510, 0f7F800000; selp.f32 %f867, %f509, %f510, %p126; $L__BB0_108: add.f32 %f511, %f867, %f867; setp.eq.f32 %p128, %f83, 0f00000000; selp.f32 %f512, 0f3F800000, 0f3F000000, %p128; fma.rn.f32 %f868, %f512, %f511, %f868; $L__BB0_109: ld.param.u8 %rs6, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs5, %rs6, 1; setp.ne.s16 %p129, %rs5, 0; setp.gt.s32 %p130, %r1, 0; or.pred %p3, %p130, %p129; and.pred %p131, %p3, %p2; not.pred %p132, %p131; @%p132 bra $L__BB0_122; @%p13 bra $L__BB0_112; bra.uni $L__BB0_111; $L__BB0_112: max.s32 %r502, %r18, 0; bra.uni $L__BB0_113; $L__BB0_111: rem.s32 %r311, %r18, %r126; add.s32 %r312, %r311, %r126; rem.s32 %r502, %r312, %r126; $L__BB0_113: @%p88 bra $L__BB0_115; bra.uni $L__BB0_114; $L__BB0_115: max.s32 %r503, %r56, 0; bra.uni $L__BB0_116; $L__BB0_114: rem.s32 %r313, %r56, %r124; add.s32 %r314, %r313, %r124; rem.s32 %r503, %r314, %r124; $L__BB0_116: mad.lo.s32 %r315, 
%r502, %r125, %r2; mad.lo.s32 %r316, %r315, %r124, %r503; mul.wide.s32 %rd76, %r316, 4; add.s64 %rd77, %rd3, %rd76; add.s64 %rd78, %rd2, %rd76; add.s64 %rd79, %rd1, %rd76; ld.global.nc.f32 %f513, [%rd77]; ld.global.nc.f32 %f514, [%rd78]; mul.f32 %f515, %f514, %f514; fma.rn.f32 %f516, %f513, %f513, %f515; ld.global.nc.f32 %f517, [%rd79]; fma.rn.f32 %f94, %f517, %f517, %f516; mul.f32 %f518, %f68, %f70; mul.f32 %f519, %f67, %f71; sub.f32 %f520, %f519, %f518; mul.f32 %f521, %f66, %f71; mul.f32 %f522, %f68, %f69; sub.f32 %f523, %f522, %f521; mul.f32 %f524, %f67, %f69; mul.f32 %f525, %f66, %f70; sub.f32 %f526, %f525, %f524; mul.f32 %f527, %f2, %f523; fma.rn.f32 %f528, %f1, %f520, %f527; fma.rn.f32 %f95, %f3, %f526, %f528; mul.f32 %f529, %f2, %f67; fma.rn.f32 %f530, %f1, %f66, %f529; fma.rn.f32 %f531, %f3, %f68, %f530; add.f32 %f532, %f531, 0f3F800000; mul.f32 %f533, %f2, %f70; fma.rn.f32 %f534, %f1, %f69, %f533; fma.rn.f32 %f535, %f3, %f71, %f534; add.f32 %f536, %f532, %f535; mul.f32 %f537, %f67, %f70; fma.rn.f32 %f538, %f66, %f69, %f537; fma.rn.f32 %f539, %f68, %f71, %f538; add.f32 %f96, %f539, %f536; abs.f32 %f97, %f96; abs.f32 %f98, %f95; setp.eq.f32 %p135, %f97, 0f00000000; setp.eq.f32 %p136, %f98, 0f00000000; and.pred %p137, %p135, %p136; @%p137 bra $L__BB0_120; bra.uni $L__BB0_117; $L__BB0_120: mov.b32 %r327, %f96; shr.s32 %r328, %r327, 31; and.b32 %r329, %r328, 1078530011; mov.b32 %r330, %f95; and.b32 %r331, %r330, -2147483648; or.b32 %r332, %r331, %r329; mov.b32 %f869, %r332; bra.uni $L__BB0_121; $L__BB0_117: setp.eq.f32 %p138, %f97, 0f7F800000; setp.eq.f32 %p139, %f98, 0f7F800000; and.pred %p140, %p138, %p139; @%p140 bra $L__BB0_119; bra.uni $L__BB0_118; $L__BB0_119: mov.b32 %r322, %f96; setp.lt.s32 %p144, %r322, 0; selp.b32 %r323, 1075235812, 1061752795, %p144; mov.b32 %r324, %f95; and.b32 %r325, %r324, -2147483648; or.b32 %r326, %r325, %r323; mov.b32 %f869, %r326; bra.uni $L__BB0_121; $L__BB0_118: max.f32 %f540, %f98, %f97; min.f32 %f541, %f98, %f97; 
div.rn.f32 %f542, %f541, %f540; mul.rn.f32 %f543, %f542, %f542; mov.f32 %f544, 0fC0B59883; mov.f32 %f545, 0fBF52C7EA; fma.rn.f32 %f546, %f543, %f545, %f544; mov.f32 %f547, 0fC0D21907; fma.rn.f32 %f548, %f546, %f543, %f547; mul.f32 %f549, %f543, %f548; mul.f32 %f550, %f542, %f549; add.f32 %f551, %f543, 0f41355DC0; mov.f32 %f552, 0f41E6BD60; fma.rn.f32 %f553, %f551, %f543, %f552; mov.f32 %f554, 0f419D92C8; fma.rn.f32 %f555, %f553, %f543, %f554; rcp.rn.f32 %f556, %f555; fma.rn.f32 %f557, %f550, %f556, %f542; mov.f32 %f558, 0f3FC90FDB; sub.f32 %f559, %f558, %f557; setp.gt.f32 %p141, %f98, %f97; selp.f32 %f560, %f559, %f557, %p141; mov.b32 %r317, %f96; setp.lt.s32 %p142, %r317, 0; mov.f32 %f561, 0f40490FDB; sub.f32 %f562, %f561, %f560; selp.f32 %f563, %f562, %f560, %p142; mov.b32 %r318, %f563; mov.b32 %r319, %f95; and.b32 %r320, %r319, -2147483648; or.b32 %r321, %r320, %r318; mov.b32 %f564, %r321; add.f32 %f565, %f97, %f98; setp.le.f32 %p143, %f565, 0f7F800000; selp.f32 %f869, %f564, %f565, %p143; $L__BB0_121: add.f32 %f566, %f869, %f869; setp.eq.f32 %p145, %f94, 0f00000000; selp.f32 %f567, 0f3F800000, 0f3F000000, %p145; fma.rn.f32 %f868, %f567, %f566, %f868; $L__BB0_122: setp.lt.s32 %p146, %r49, %r124; or.pred %p4, %p146, %p129; and.pred %p5, %p4, %p2; not.pred %p148, %p5; @%p148 bra $L__BB0_135; @%p13 bra $L__BB0_125; bra.uni $L__BB0_124; $L__BB0_125: add.s32 %r335, %r126, -1; min.s32 %r504, %r10, %r335; bra.uni $L__BB0_126; $L__BB0_124: rem.s32 %r333, %r10, %r126; add.s32 %r334, %r333, %r126; rem.s32 %r504, %r334, %r126; $L__BB0_126: mad.lo.s32 %r81, %r504, %r125, %r2; @%p88 bra $L__BB0_128; bra.uni $L__BB0_127; $L__BB0_128: max.s32 %r505, %r56, 0; bra.uni $L__BB0_129; $L__BB0_127: rem.s32 %r336, %r56, %r124; add.s32 %r337, %r336, %r124; rem.s32 %r505, %r337, %r124; $L__BB0_129: mad.lo.s32 %r338, %r81, %r124, %r505; mul.wide.s32 %rd80, %r338, 4; add.s64 %rd81, %rd3, %rd80; add.s64 %rd82, %rd2, %rd80; add.s64 %rd83, %rd1, %rd80; ld.global.nc.f32 %f568, [%rd81]; 
ld.global.nc.f32 %f569, [%rd82]; mul.f32 %f570, %f569, %f569; fma.rn.f32 %f571, %f568, %f568, %f570; ld.global.nc.f32 %f572, [%rd83]; fma.rn.f32 %f105, %f572, %f572, %f571; mul.f32 %f573, %f61, %f71; mul.f32 %f574, %f62, %f70; sub.f32 %f575, %f574, %f573; mul.f32 %f576, %f62, %f69; mul.f32 %f577, %f60, %f71; sub.f32 %f578, %f577, %f576; mul.f32 %f579, %f60, %f70; mul.f32 %f580, %f61, %f69; sub.f32 %f581, %f580, %f579; mul.f32 %f582, %f2, %f578; fma.rn.f32 %f583, %f1, %f575, %f582; fma.rn.f32 %f106, %f3, %f581, %f583; mul.f32 %f584, %f2, %f70; fma.rn.f32 %f585, %f1, %f69, %f584; fma.rn.f32 %f586, %f3, %f71, %f585; add.f32 %f587, %f586, 0f3F800000; mul.f32 %f588, %f2, %f61; fma.rn.f32 %f589, %f1, %f60, %f588; fma.rn.f32 %f590, %f3, %f62, %f589; add.f32 %f591, %f590, %f587; mul.f32 %f592, %f61, %f70; fma.rn.f32 %f593, %f60, %f69, %f592; fma.rn.f32 %f594, %f62, %f71, %f593; add.f32 %f107, %f594, %f591; abs.f32 %f108, %f107; abs.f32 %f109, %f106; setp.eq.f32 %p151, %f108, 0f00000000; setp.eq.f32 %p152, %f109, 0f00000000; and.pred %p153, %p151, %p152; @%p153 bra $L__BB0_133; bra.uni $L__BB0_130; $L__BB0_133: mov.b32 %r349, %f107; shr.s32 %r350, %r349, 31; and.b32 %r351, %r350, 1078530011; mov.b32 %r352, %f106; and.b32 %r353, %r352, -2147483648; or.b32 %r354, %r351, %r353; mov.b32 %f871, %r354; bra.uni $L__BB0_134; $L__BB0_130: setp.eq.f32 %p154, %f108, 0f7F800000; setp.eq.f32 %p155, %f109, 0f7F800000; and.pred %p156, %p154, %p155; @%p156 bra $L__BB0_132; bra.uni $L__BB0_131; $L__BB0_132: mov.b32 %r344, %f107; setp.lt.s32 %p160, %r344, 0; selp.b32 %r345, 1075235812, 1061752795, %p160; mov.b32 %r346, %f106; and.b32 %r347, %r346, -2147483648; or.b32 %r348, %r345, %r347; mov.b32 %f871, %r348; bra.uni $L__BB0_134; $L__BB0_131: max.f32 %f595, %f109, %f108; min.f32 %f596, %f109, %f108; div.rn.f32 %f597, %f596, %f595; mul.rn.f32 %f598, %f597, %f597; mov.f32 %f599, 0fC0B59883; mov.f32 %f600, 0fBF52C7EA; fma.rn.f32 %f601, %f598, %f600, %f599; mov.f32 %f602, 0fC0D21907; fma.rn.f32 
%f603, %f601, %f598, %f602; mul.f32 %f604, %f598, %f603; mul.f32 %f605, %f597, %f604; add.f32 %f606, %f598, 0f41355DC0; mov.f32 %f607, 0f41E6BD60; fma.rn.f32 %f608, %f606, %f598, %f607; mov.f32 %f609, 0f419D92C8; fma.rn.f32 %f610, %f608, %f598, %f609; rcp.rn.f32 %f611, %f610; fma.rn.f32 %f612, %f605, %f611, %f597; mov.f32 %f613, 0f3FC90FDB; sub.f32 %f614, %f613, %f612; setp.gt.f32 %p157, %f109, %f108; selp.f32 %f615, %f614, %f612, %p157; mov.b32 %r339, %f107; setp.lt.s32 %p158, %r339, 0; mov.f32 %f616, 0f40490FDB; sub.f32 %f617, %f616, %f615; selp.f32 %f618, %f617, %f615, %p158; mov.b32 %r340, %f618; mov.b32 %r341, %f106; and.b32 %r342, %r341, -2147483648; or.b32 %r343, %r342, %r340; mov.b32 %f619, %r343; add.f32 %f620, %f108, %f109; setp.le.f32 %p159, %f620, 0f7F800000; selp.f32 %f871, %f619, %f620, %p159; $L__BB0_134: add.f32 %f621, %f871, %f871; setp.eq.f32 %p161, %f105, 0f00000000; selp.f32 %f622, 0f3F800000, 0f3F000000, %p161; fma.rn.f32 %f868, %f622, %f621, %f868; $L__BB0_135: @%p88 bra $L__BB0_137; bra.uni $L__BB0_136; $L__BB0_137: add.s32 %r357, %r124, -1; min.s32 %r506, %r49, %r357; bra.uni $L__BB0_138; $L__BB0_136: rem.s32 %r355, %r49, %r124; add.s32 %r356, %r355, %r124; rem.s32 %r506, %r356, %r124; $L__BB0_138: @%p12 bra $L__BB0_140; bra.uni $L__BB0_139; $L__BB0_140: add.s32 %r360, %r125, -1; min.s32 %r507, %r6, %r360; bra.uni $L__BB0_141; $L__BB0_139: rem.s32 %r358, %r6, %r125; add.s32 %r359, %r358, %r125; rem.s32 %r507, %r359, %r125; $L__BB0_141: @%p88 bra $L__BB0_143; bra.uni $L__BB0_142; $L__BB0_143: max.s32 %r508, %r56, 0; bra.uni $L__BB0_144; $L__BB0_142: rem.s32 %r361, %r56, %r124; add.s32 %r362, %r361, %r124; rem.s32 %r508, %r362, %r124; $L__BB0_144: add.s32 %r94, %r508, %r5; add.s32 %r95, %r506, %r5; @%p12 bra $L__BB0_146; bra.uni $L__BB0_145; $L__BB0_146: max.s32 %r509, %r14, 0; bra.uni $L__BB0_147; $L__BB0_145: rem.s32 %r363, %r14, %r125; add.s32 %r364, %r363, %r125; rem.s32 %r509, %r364, %r125; $L__BB0_147: add.s32 %r365, %r509, %r4; 
mad.lo.s32 %r366, %r365, %r124, %r1; mul.wide.s32 %rd84, %r95, 4; add.s64 %rd85, %rd3, %rd84; ld.global.nc.f32 %f116, [%rd85]; add.s64 %rd86, %rd2, %rd84; ld.global.nc.f32 %f117, [%rd86]; add.s64 %rd87, %rd1, %rd84; ld.global.nc.f32 %f118, [%rd87]; add.s32 %r367, %r507, %r4; mad.lo.s32 %r368, %r367, %r124, %r1; mul.wide.s32 %rd88, %r368, 4; add.s64 %rd89, %rd3, %rd88; ld.global.nc.f32 %f119, [%rd89]; add.s64 %rd90, %rd2, %rd88; ld.global.nc.f32 %f120, [%rd90]; add.s64 %rd91, %rd1, %rd88; ld.global.nc.f32 %f121, [%rd91]; mul.wide.s32 %rd92, %r94, 4; add.s64 %rd93, %rd3, %rd92; ld.global.nc.f32 %f122, [%rd93]; add.s64 %rd94, %rd2, %rd92; ld.global.nc.f32 %f123, [%rd94]; add.s64 %rd95, %rd1, %rd92; ld.global.nc.f32 %f124, [%rd95]; mul.wide.s32 %rd96, %r366, 4; add.s64 %rd97, %rd3, %rd96; ld.global.nc.f32 %f125, [%rd97]; add.s64 %rd98, %rd2, %rd96; ld.global.nc.f32 %f126, [%rd98]; add.s64 %rd99, %rd1, %rd96; ld.global.nc.f32 %f127, [%rd99]; and.pred %p166, %p4, %p1; mov.f32 %f878, 0f00000000; not.pred %p167, %p166; @%p167 bra $L__BB0_160; @%p12 bra $L__BB0_150; bra.uni $L__BB0_149; $L__BB0_150: add.s32 %r371, %r125, -1; min.s32 %r510, %r6, %r371; bra.uni $L__BB0_151; $L__BB0_149: rem.s32 %r369, %r6, %r125; add.s32 %r370, %r369, %r125; rem.s32 %r510, %r370, %r125; $L__BB0_151: @%p88 bra $L__BB0_153; bra.uni $L__BB0_152; $L__BB0_153: add.s32 %r374, %r124, -1; min.s32 %r511, %r49, %r374; bra.uni $L__BB0_154; $L__BB0_152: rem.s32 %r372, %r49, %r124; add.s32 %r373, %r372, %r124; rem.s32 %r511, %r373, %r124; $L__BB0_154: add.s32 %r375, %r510, %r4; mad.lo.s32 %r376, %r375, %r124, %r511; mul.wide.s32 %rd100, %r376, 4; add.s64 %rd101, %rd3, %rd100; add.s64 %rd102, %rd2, %rd100; add.s64 %rd103, %rd1, %rd100; ld.global.nc.f32 %f624, [%rd101]; ld.global.nc.f32 %f625, [%rd102]; mul.f32 %f626, %f625, %f625; fma.rn.f32 %f627, %f624, %f624, %f626; ld.global.nc.f32 %f628, [%rd103]; fma.rn.f32 %f128, %f628, %f628, %f627; mul.f32 %f629, %f118, %f120; mul.f32 %f630, %f117, %f121; sub.f32 
%f631, %f630, %f629; mul.f32 %f632, %f116, %f121; mul.f32 %f633, %f118, %f119; sub.f32 %f634, %f633, %f632; mul.f32 %f635, %f117, %f119; mul.f32 %f636, %f116, %f120; sub.f32 %f637, %f636, %f635; mul.f32 %f638, %f2, %f634; fma.rn.f32 %f639, %f1, %f631, %f638; fma.rn.f32 %f129, %f3, %f637, %f639; mul.f32 %f640, %f2, %f117; fma.rn.f32 %f641, %f1, %f116, %f640; fma.rn.f32 %f642, %f3, %f118, %f641; add.f32 %f643, %f642, 0f3F800000; mul.f32 %f644, %f2, %f120; fma.rn.f32 %f645, %f1, %f119, %f644; fma.rn.f32 %f646, %f3, %f121, %f645; add.f32 %f647, %f643, %f646; mul.f32 %f648, %f117, %f120; fma.rn.f32 %f649, %f116, %f119, %f648; fma.rn.f32 %f650, %f118, %f121, %f649; add.f32 %f130, %f650, %f647; abs.f32 %f131, %f130; abs.f32 %f132, %f129; setp.eq.f32 %p170, %f131, 0f00000000; setp.eq.f32 %p171, %f132, 0f00000000; and.pred %p172, %p170, %p171; @%p172 bra $L__BB0_158; bra.uni $L__BB0_155; $L__BB0_158: mov.b32 %r387, %f130; shr.s32 %r388, %r387, 31; and.b32 %r389, %r388, 1078530011; mov.b32 %r390, %f129; and.b32 %r391, %r390, -2147483648; or.b32 %r392, %r391, %r389; mov.b32 %f873, %r392; bra.uni $L__BB0_159; $L__BB0_155: setp.eq.f32 %p173, %f131, 0f7F800000; setp.eq.f32 %p174, %f132, 0f7F800000; and.pred %p175, %p173, %p174; @%p175 bra $L__BB0_157; bra.uni $L__BB0_156; $L__BB0_157: mov.b32 %r382, %f130; setp.lt.s32 %p179, %r382, 0; selp.b32 %r383, 1075235812, 1061752795, %p179; mov.b32 %r384, %f129; and.b32 %r385, %r384, -2147483648; or.b32 %r386, %r385, %r383; mov.b32 %f873, %r386; bra.uni $L__BB0_159; $L__BB0_156: max.f32 %f651, %f132, %f131; min.f32 %f652, %f132, %f131; div.rn.f32 %f653, %f652, %f651; mul.rn.f32 %f654, %f653, %f653; mov.f32 %f655, 0fC0B59883; mov.f32 %f656, 0fBF52C7EA; fma.rn.f32 %f657, %f654, %f656, %f655; mov.f32 %f658, 0fC0D21907; fma.rn.f32 %f659, %f657, %f654, %f658; mul.f32 %f660, %f654, %f659; mul.f32 %f661, %f653, %f660; add.f32 %f662, %f654, 0f41355DC0; mov.f32 %f663, 0f41E6BD60; fma.rn.f32 %f664, %f662, %f654, %f663; mov.f32 %f665, 0f419D92C8; 
fma.rn.f32 %f666, %f664, %f654, %f665; rcp.rn.f32 %f667, %f666; fma.rn.f32 %f668, %f661, %f667, %f653; mov.f32 %f669, 0f3FC90FDB; sub.f32 %f670, %f669, %f668; setp.gt.f32 %p176, %f132, %f131; selp.f32 %f671, %f670, %f668, %p176; mov.b32 %r377, %f130; setp.lt.s32 %p177, %r377, 0; mov.f32 %f672, 0f40490FDB; sub.f32 %f673, %f672, %f671; selp.f32 %f674, %f673, %f671, %p177; mov.b32 %r378, %f674; mov.b32 %r379, %f129; and.b32 %r380, %r379, -2147483648; or.b32 %r381, %r380, %r378; mov.b32 %f675, %r381; add.f32 %f676, %f131, %f132; setp.le.f32 %p178, %f676, 0f7F800000; selp.f32 %f873, %f675, %f676, %p178; $L__BB0_159: add.f32 %f677, %f873, %f873; setp.eq.f32 %p180, %f128, 0f00000000; selp.f32 %f678, 0f3F800000, 0f3F000000, %p180; fma.rn.f32 %f878, %f678, %f677, 0f00000000; $L__BB0_160: not.pred %p181, %p3; @%p181 bra $L__BB0_187; @%p18 bra $L__BB0_174; @%p12 bra $L__BB0_164; bra.uni $L__BB0_163; $L__BB0_164: add.s32 %r395, %r125, -1; min.s32 %r512, %r6, %r395; bra.uni $L__BB0_165; $L__BB0_163: rem.s32 %r393, %r6, %r125; add.s32 %r394, %r393, %r125; rem.s32 %r512, %r394, %r125; $L__BB0_165: @%p88 bra $L__BB0_167; bra.uni $L__BB0_166; $L__BB0_167: max.s32 %r513, %r56, 0; bra.uni $L__BB0_168; $L__BB0_166: rem.s32 %r396, %r56, %r124; add.s32 %r397, %r396, %r124; rem.s32 %r513, %r397, %r124; $L__BB0_168: add.s32 %r398, %r512, %r4; mad.lo.s32 %r399, %r398, %r124, %r513; mul.wide.s32 %rd104, %r399, 4; add.s64 %rd105, %rd3, %rd104; add.s64 %rd106, %rd2, %rd104; add.s64 %rd107, %rd1, %rd104; ld.global.nc.f32 %f679, [%rd105]; ld.global.nc.f32 %f680, [%rd106]; mul.f32 %f681, %f680, %f680; fma.rn.f32 %f682, %f679, %f679, %f681; ld.global.nc.f32 %f683, [%rd107]; fma.rn.f32 %f139, %f683, %f683, %f682; mul.f32 %f684, %f121, %f123; mul.f32 %f685, %f120, %f124; sub.f32 %f686, %f685, %f684; mul.f32 %f687, %f119, %f124; mul.f32 %f688, %f121, %f122; sub.f32 %f689, %f688, %f687; mul.f32 %f690, %f120, %f122; mul.f32 %f691, %f119, %f123; sub.f32 %f692, %f691, %f690; mul.f32 %f693, %f2, %f689; 
fma.rn.f32 %f694, %f1, %f686, %f693; fma.rn.f32 %f140, %f3, %f692, %f694; mul.f32 %f695, %f2, %f120; fma.rn.f32 %f696, %f1, %f119, %f695; fma.rn.f32 %f697, %f3, %f121, %f696; add.f32 %f698, %f697, 0f3F800000; mul.f32 %f699, %f2, %f123; fma.rn.f32 %f700, %f1, %f122, %f699; fma.rn.f32 %f701, %f3, %f124, %f700; add.f32 %f702, %f698, %f701; mul.f32 %f703, %f120, %f123; fma.rn.f32 %f704, %f119, %f122, %f703; fma.rn.f32 %f705, %f121, %f124, %f704; add.f32 %f141, %f705, %f702; abs.f32 %f142, %f141; abs.f32 %f143, %f140; setp.eq.f32 %p185, %f142, 0f00000000; setp.eq.f32 %p186, %f143, 0f00000000; and.pred %p187, %p185, %p186; @%p187 bra $L__BB0_172; bra.uni $L__BB0_169; $L__BB0_172: mov.b32 %r410, %f141; shr.s32 %r411, %r410, 31; and.b32 %r412, %r411, 1078530011; mov.b32 %r413, %f140; and.b32 %r414, %r413, -2147483648; or.b32 %r415, %r414, %r412; mov.b32 %f875, %r415; bra.uni $L__BB0_173; $L__BB0_169: setp.eq.f32 %p188, %f142, 0f7F800000; setp.eq.f32 %p189, %f143, 0f7F800000; and.pred %p190, %p188, %p189; @%p190 bra $L__BB0_171; bra.uni $L__BB0_170; $L__BB0_171: mov.b32 %r405, %f141; setp.lt.s32 %p194, %r405, 0; selp.b32 %r406, 1075235812, 1061752795, %p194; mov.b32 %r407, %f140; and.b32 %r408, %r407, -2147483648; or.b32 %r409, %r408, %r406; mov.b32 %f875, %r409; bra.uni $L__BB0_173; $L__BB0_170: max.f32 %f706, %f143, %f142; min.f32 %f707, %f143, %f142; div.rn.f32 %f708, %f707, %f706; mul.rn.f32 %f709, %f708, %f708; mov.f32 %f710, 0fC0B59883; mov.f32 %f711, 0fBF52C7EA; fma.rn.f32 %f712, %f709, %f711, %f710; mov.f32 %f713, 0fC0D21907; fma.rn.f32 %f714, %f712, %f709, %f713; mul.f32 %f715, %f709, %f714; mul.f32 %f716, %f708, %f715; add.f32 %f717, %f709, 0f41355DC0; mov.f32 %f718, 0f41E6BD60; fma.rn.f32 %f719, %f717, %f709, %f718; mov.f32 %f720, 0f419D92C8; fma.rn.f32 %f721, %f719, %f709, %f720; rcp.rn.f32 %f722, %f721; fma.rn.f32 %f723, %f716, %f722, %f708; mov.f32 %f724, 0f3FC90FDB; sub.f32 %f725, %f724, %f723; setp.gt.f32 %p191, %f143, %f142; selp.f32 %f726, %f725, %f723, 
%p191; mov.b32 %r400, %f141; setp.lt.s32 %p192, %r400, 0; mov.f32 %f727, 0f40490FDB; sub.f32 %f728, %f727, %f726; selp.f32 %f729, %f728, %f726, %p192; mov.b32 %r401, %f729; mov.b32 %r402, %f140; and.b32 %r403, %r402, -2147483648; or.b32 %r404, %r403, %r401; mov.b32 %f730, %r404; add.f32 %f731, %f142, %f143; setp.le.f32 %p193, %f731, 0f7F800000; selp.f32 %f875, %f730, %f731, %p193; $L__BB0_173: add.f32 %f732, %f875, %f875; setp.eq.f32 %p195, %f139, 0f00000000; selp.f32 %f733, 0f3F800000, 0f3F000000, %p195; fma.rn.f32 %f878, %f733, %f732, %f878; $L__BB0_174: not.pred %p224, %p2; @%p224 bra $L__BB0_187; @%p12 bra $L__BB0_177; bra.uni $L__BB0_176; $L__BB0_177: max.s32 %r514, %r14, 0; bra.uni $L__BB0_178; $L__BB0_176: rem.s32 %r416, %r14, %r125; add.s32 %r417, %r416, %r125; rem.s32 %r514, %r417, %r125; $L__BB0_178: @%p88 bra $L__BB0_180; bra.uni $L__BB0_179; $L__BB0_180: max.s32 %r515, %r56, 0; bra.uni $L__BB0_181; $L__BB0_179: rem.s32 %r418, %r56, %r124; add.s32 %r419, %r418, %r124; rem.s32 %r515, %r419, %r124; $L__BB0_181: add.s32 %r420, %r514, %r4; mad.lo.s32 %r421, %r420, %r124, %r515; mul.wide.s32 %rd108, %r421, 4; add.s64 %rd109, %rd3, %rd108; add.s64 %rd110, %rd2, %rd108; add.s64 %rd111, %rd1, %rd108; ld.global.nc.f32 %f734, [%rd109]; ld.global.nc.f32 %f735, [%rd110]; mul.f32 %f736, %f735, %f735; fma.rn.f32 %f737, %f734, %f734, %f736; ld.global.nc.f32 %f738, [%rd111]; fma.rn.f32 %f150, %f738, %f738, %f737; mul.f32 %f739, %f124, %f126; mul.f32 %f740, %f123, %f127; sub.f32 %f741, %f740, %f739; mul.f32 %f742, %f122, %f127; mul.f32 %f743, %f124, %f125; sub.f32 %f744, %f743, %f742; mul.f32 %f745, %f123, %f125; mul.f32 %f746, %f122, %f126; sub.f32 %f747, %f746, %f745; mul.f32 %f748, %f2, %f744; fma.rn.f32 %f749, %f1, %f741, %f748; fma.rn.f32 %f151, %f3, %f747, %f749; mul.f32 %f750, %f2, %f123; fma.rn.f32 %f751, %f1, %f122, %f750; fma.rn.f32 %f752, %f3, %f124, %f751; add.f32 %f753, %f752, 0f3F800000; mul.f32 %f754, %f2, %f126; fma.rn.f32 %f755, %f1, %f125, %f754; 
fma.rn.f32 %f756, %f3, %f127, %f755; add.f32 %f757, %f753, %f756; mul.f32 %f758, %f123, %f126; fma.rn.f32 %f759, %f122, %f125, %f758; fma.rn.f32 %f760, %f124, %f127, %f759; add.f32 %f152, %f760, %f757; abs.f32 %f153, %f152; abs.f32 %f154, %f151; setp.eq.f32 %p199, %f153, 0f00000000; setp.eq.f32 %p200, %f154, 0f00000000; and.pred %p201, %p199, %p200; @%p201 bra $L__BB0_185; bra.uni $L__BB0_182; $L__BB0_185: mov.b32 %r432, %f152; shr.s32 %r433, %r432, 31; and.b32 %r434, %r433, 1078530011; mov.b32 %r435, %f151; and.b32 %r436, %r435, -2147483648; or.b32 %r437, %r436, %r434; mov.b32 %f877, %r437; bra.uni $L__BB0_186; $L__BB0_182: setp.eq.f32 %p202, %f153, 0f7F800000; setp.eq.f32 %p203, %f154, 0f7F800000; and.pred %p204, %p202, %p203; @%p204 bra $L__BB0_184; bra.uni $L__BB0_183; $L__BB0_184: mov.b32 %r427, %f152; setp.lt.s32 %p208, %r427, 0; selp.b32 %r428, 1075235812, 1061752795, %p208; mov.b32 %r429, %f151; and.b32 %r430, %r429, -2147483648; or.b32 %r431, %r430, %r428; mov.b32 %f877, %r431; bra.uni $L__BB0_186; $L__BB0_183: max.f32 %f761, %f154, %f153; min.f32 %f762, %f154, %f153; div.rn.f32 %f763, %f762, %f761; mul.rn.f32 %f764, %f763, %f763; mov.f32 %f765, 0fC0B59883; mov.f32 %f766, 0fBF52C7EA; fma.rn.f32 %f767, %f764, %f766, %f765; mov.f32 %f768, 0fC0D21907; fma.rn.f32 %f769, %f767, %f764, %f768; mul.f32 %f770, %f764, %f769; mul.f32 %f771, %f763, %f770; add.f32 %f772, %f764, 0f41355DC0; mov.f32 %f773, 0f41E6BD60; fma.rn.f32 %f774, %f772, %f764, %f773; mov.f32 %f775, 0f419D92C8; fma.rn.f32 %f776, %f774, %f764, %f775; rcp.rn.f32 %f777, %f776; fma.rn.f32 %f778, %f771, %f777, %f763; mov.f32 %f779, 0f3FC90FDB; sub.f32 %f780, %f779, %f778; setp.gt.f32 %p205, %f154, %f153; selp.f32 %f781, %f780, %f778, %p205; mov.b32 %r422, %f152; setp.lt.s32 %p206, %r422, 0; mov.f32 %f782, 0f40490FDB; sub.f32 %f783, %f782, %f781; selp.f32 %f784, %f783, %f781, %p206; mov.b32 %r423, %f784; mov.b32 %r424, %f151; and.b32 %r425, %r424, -2147483648; or.b32 %r426, %r425, %r423; mov.b32 %f785, 
%r426; add.f32 %f786, %f153, %f154; setp.le.f32 %p207, %f786, 0f7F800000; selp.f32 %f877, %f785, %f786, %p207; $L__BB0_186: add.f32 %f787, %f877, %f877; setp.eq.f32 %p209, %f150, 0f00000000; selp.f32 %f788, 0f3F800000, 0f3F000000, %p209; fma.rn.f32 %f878, %f788, %f787, %f878; $L__BB0_187: @%p148 bra $L__BB0_200; @%p12 bra $L__BB0_190; bra.uni $L__BB0_189; $L__BB0_190: max.s32 %r516, %r14, 0; bra.uni $L__BB0_191; $L__BB0_189: rem.s32 %r438, %r14, %r125; add.s32 %r439, %r438, %r125; rem.s32 %r516, %r439, %r125; $L__BB0_191: add.s32 %r120, %r516, %r4; @%p88 bra $L__BB0_193; bra.uni $L__BB0_192; $L__BB0_193: add.s32 %r442, %r124, -1; min.s32 %r517, %r49, %r442; bra.uni $L__BB0_194; $L__BB0_192: rem.s32 %r440, %r49, %r124; add.s32 %r441, %r440, %r124; rem.s32 %r517, %r441, %r124; $L__BB0_194: mad.lo.s32 %r443, %r120, %r124, %r517; mul.wide.s32 %rd112, %r443, 4; add.s64 %rd113, %rd3, %rd112; add.s64 %rd114, %rd2, %rd112; add.s64 %rd115, %rd1, %rd112; ld.global.nc.f32 %f789, [%rd113]; ld.global.nc.f32 %f790, [%rd114]; mul.f32 %f791, %f790, %f790; fma.rn.f32 %f792, %f789, %f789, %f791; ld.global.nc.f32 %f793, [%rd115]; fma.rn.f32 %f161, %f793, %f793, %f792; mul.f32 %f794, %f117, %f127; mul.f32 %f795, %f118, %f126; sub.f32 %f796, %f795, %f794; mul.f32 %f797, %f118, %f125; mul.f32 %f798, %f116, %f127; sub.f32 %f799, %f798, %f797; mul.f32 %f800, %f116, %f126; mul.f32 %f801, %f117, %f125; sub.f32 %f802, %f801, %f800; mul.f32 %f803, %f2, %f799; fma.rn.f32 %f804, %f1, %f796, %f803; fma.rn.f32 %f162, %f3, %f802, %f804; mul.f32 %f805, %f2, %f126; fma.rn.f32 %f806, %f1, %f125, %f805; fma.rn.f32 %f807, %f3, %f127, %f806; add.f32 %f808, %f807, 0f3F800000; mul.f32 %f809, %f2, %f117; fma.rn.f32 %f810, %f1, %f116, %f809; fma.rn.f32 %f811, %f3, %f118, %f810; add.f32 %f812, %f811, %f808; mul.f32 %f813, %f117, %f126; fma.rn.f32 %f814, %f116, %f125, %f813; fma.rn.f32 %f815, %f118, %f127, %f814; add.f32 %f163, %f815, %f812; abs.f32 %f164, %f163; abs.f32 %f165, %f162; setp.eq.f32 %p213, 
%f164, 0f00000000; setp.eq.f32 %p214, %f165, 0f00000000; and.pred %p215, %p213, %p214; @%p215 bra $L__BB0_198; bra.uni $L__BB0_195; $L__BB0_198: mov.b32 %r454, %f163; shr.s32 %r455, %r454, 31; and.b32 %r456, %r455, 1078530011; mov.b32 %r457, %f162; and.b32 %r458, %r457, -2147483648; or.b32 %r459, %r456, %r458; mov.b32 %f879, %r459; bra.uni $L__BB0_199; $L__BB0_195: setp.eq.f32 %p216, %f164, 0f7F800000; setp.eq.f32 %p217, %f165, 0f7F800000; and.pred %p218, %p216, %p217; @%p218 bra $L__BB0_197; bra.uni $L__BB0_196; $L__BB0_197: mov.b32 %r449, %f163; setp.lt.s32 %p222, %r449, 0; selp.b32 %r450, 1075235812, 1061752795, %p222; mov.b32 %r451, %f162; and.b32 %r452, %r451, -2147483648; or.b32 %r453, %r450, %r452; mov.b32 %f879, %r453; bra.uni $L__BB0_199; $L__BB0_196: max.f32 %f816, %f165, %f164; min.f32 %f817, %f165, %f164; div.rn.f32 %f818, %f817, %f816; mul.rn.f32 %f819, %f818, %f818; mov.f32 %f820, 0fC0B59883; mov.f32 %f821, 0fBF52C7EA; fma.rn.f32 %f822, %f819, %f821, %f820; mov.f32 %f823, 0fC0D21907; fma.rn.f32 %f824, %f822, %f819, %f823; mul.f32 %f825, %f819, %f824; mul.f32 %f826, %f818, %f825; add.f32 %f827, %f819, 0f41355DC0; mov.f32 %f828, 0f41E6BD60; fma.rn.f32 %f829, %f827, %f819, %f828; mov.f32 %f830, 0f419D92C8; fma.rn.f32 %f831, %f829, %f819, %f830; rcp.rn.f32 %f832, %f831; fma.rn.f32 %f833, %f826, %f832, %f818; mov.f32 %f834, 0f3FC90FDB; sub.f32 %f835, %f834, %f833; setp.gt.f32 %p219, %f165, %f164; selp.f32 %f836, %f835, %f833, %p219; mov.b32 %r444, %f163; setp.lt.s32 %p220, %r444, 0; mov.f32 %f837, 0f40490FDB; sub.f32 %f838, %f837, %f836; selp.f32 %f839, %f838, %f836, %p220; mov.b32 %r445, %f839; mov.b32 %r446, %f162; and.b32 %r447, %r446, -2147483648; or.b32 %r448, %r447, %r445; mov.b32 %f840, %r448; add.f32 %f841, %f164, %f165; setp.le.f32 %p221, %f841, 0f7F800000; selp.f32 %f879, %f840, %f841, %p221; $L__BB0_199: add.f32 %f842, %f879, %f879; setp.eq.f32 %p223, %f161, 0f00000000; selp.f32 %f843, 0f3F800000, 0f3F000000, %p223; fma.rn.f32 %f878, %f843, 
%f842, %f878; $L__BB0_200: ld.param.u64 %rd123, [setemergentmagneticfieldsolidangle_param_1]; mov.u32 %r481, %tid.x; mov.u32 %r480, %ntid.x; mov.u32 %r479, %ctaid.x; mad.lo.s32 %r478, %r479, %r480, %r481; add.s32 %r477, %r5, %r478; mul.wide.s32 %rd122, %r477, 4; cvta.to.global.u64 %rd121, %rd123; add.s64 %rd120, %rd121, %rd122; ld.param.f32 %f856, [setemergentmagneticfieldsolidangle_param_9]; ld.param.f32 %f855, [setemergentmagneticfieldsolidangle_param_8]; ld.param.u64 %rd119, [setemergentmagneticfieldsolidangle_param_0]; mov.u32 %r476, %tid.z; mov.u32 %r475, %ntid.z; mov.u32 %r474, %ctaid.z; mad.lo.s32 %r473, %r474, %r475, %r476; mov.u32 %r472, %tid.y; mov.u32 %r471, %ntid.y; mov.u32 %r470, %ctaid.y; mad.lo.s32 %r469, %r470, %r471, %r472; mul.lo.s32 %r468, %r473, %r125; add.s32 %r467, %r468, %r469; mov.u32 %r466, %tid.x; mov.u32 %r465, %ntid.x; mov.u32 %r464, %ctaid.x; mad.lo.s32 %r463, %r464, %r465, %r466; mul.lo.s32 %r462, %r467, %r124; add.s32 %r461, %r462, %r463; mul.wide.s32 %rd118, %r461, 4; cvta.to.global.u64 %rd117, %rd119; add.s64 %rd116, %rd117, %rd118; ld.param.f32 %f854, [setemergentmagneticfieldsolidangle_param_7]; ld.param.f32 %f853, [setemergentmagneticfieldsolidangle_param_6]; add.f32 %f844, %f853, %f853; mul.f32 %f845, %f844, %f854; mul.f32 %f846, %f845, %f862; st.global.f32 [%rd116], %f846; mul.f32 %f847, %f844, %f855; mul.f32 %f848, %f847, %f868; st.global.f32 [%rd120], %f848; mul.f32 %f849, %f844, %f856; mul.f32 %f850, %f849, %f878; st.global.f32 [%rd6], %f850; $L__BB0_202: ret; } ` setemergentmagneticfieldsolidangle_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl setemergentmagneticfieldsolidangle .visible .entry setemergentmagneticfieldsolidangle( .param .u64 setemergentmagneticfieldsolidangle_param_0, .param .u64 setemergentmagneticfieldsolidangle_param_1, .param .u64 setemergentmagneticfieldsolidangle_param_2, .param .u64 setemergentmagneticfieldsolidangle_param_3, .param .u64 setemergentmagneticfieldsolidangle_param_4, 
.param .u64 setemergentmagneticfieldsolidangle_param_5, .param .f32 setemergentmagneticfieldsolidangle_param_6, .param .f32 setemergentmagneticfieldsolidangle_param_7, .param .f32 setemergentmagneticfieldsolidangle_param_8, .param .f32 setemergentmagneticfieldsolidangle_param_9, .param .u32 setemergentmagneticfieldsolidangle_param_10, .param .u32 setemergentmagneticfieldsolidangle_param_11, .param .u32 setemergentmagneticfieldsolidangle_param_12, .param .u8 setemergentmagneticfieldsolidangle_param_13 ) { .reg .pred %p<225>; .reg .b16 %rs<8>; .reg .f32 %f<881>; .reg .b32 %r<518>; .reg .b64 %rd<124>; ld.param.u8 %rs4, [setemergentmagneticfieldsolidangle_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldsolidangle_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldsolidangle_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldsolidangle_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldsolidangle_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldsolidangle_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldsolidangle_param_5]; ld.param.u32 %r124, [setemergentmagneticfieldsolidangle_param_10]; ld.param.u32 %r125, [setemergentmagneticfieldsolidangle_param_11]; ld.param.u32 %r126, [setemergentmagneticfieldsolidangle_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r127, %ntid.x; mov.u32 %r128, %ctaid.x; mov.u32 %r129, %tid.x; mad.lo.s32 %r1, %r128, %r127, %r129; mov.u32 %r130, %ntid.y; mov.u32 %r131, %ctaid.y; mov.u32 %r132, %tid.y; mad.lo.s32 %r2, %r131, %r130, %r132; mov.u32 %r133, %ntid.z; mov.u32 %r134, %ctaid.z; mov.u32 %r135, %tid.z; mad.lo.s32 %r3, %r134, %r133, %r135; setp.ge.s32 %p6, %r1, %r124; setp.ge.s32 %p7, %r2, %r125; or.pred %p8, %p6, %p7; setp.ge.s32 %p9, %r3, %r126; or.pred %p10, %p8, %p9; @%p10 bra $L__BB0_202; mul.lo.s32 %r4, %r3, %r125; add.s32 %r136, %r4, %r2; mul.lo.s32 %r5, %r136, %r124; add.s32 %r137, %r5, %r1; mul.wide.s32 %rd13, %r137, 4; add.s64 %rd14, 
%rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f176, %f2, %f2; fma.rn.f32 %f177, %f1, %f1, %f176; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f178, %f3, %f3, %f177; setp.eq.f32 %p11, %f178, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p11 bra $L__BB0_201; bra.uni $L__BB0_2; $L__BB0_201: mov.u32 %r460, 0; st.global.u32 [%rd4], %r460; st.global.u32 [%rd5], %r460; st.global.u32 [%rd6], %r460; bra.uni $L__BB0_202; $L__BB0_2: and.b16 %rs1, %rs4, 2; setp.eq.s16 %p12, %rs1, 0; add.s32 %r6, %r2, 1; @%p12 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r140, %r125, -1; min.s32 %r482, %r6, %r140; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r138, %r6, %r125; add.s32 %r139, %r138, %r125; rem.s32 %r482, %r139, %r125; $L__BB0_5: and.b16 %rs2, %rs4, 4; setp.eq.s16 %p13, %rs2, 0; add.s32 %r10, %r3, 1; @%p13 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r143, %r126, -1; min.s32 %r483, %r10, %r143; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r141, %r10, %r126; add.s32 %r142, %r141, %r126; rem.s32 %r483, %r142, %r126; $L__BB0_8: add.s32 %r14, %r2, -1; @%p12 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r484, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r144, %r14, %r125; add.s32 %r145, %r144, %r125; rem.s32 %r484, %r145, %r125; $L__BB0_11: add.s32 %r18, %r3, -1; @%p13 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r485, %r18, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r146, %r18, %r126; add.s32 %r147, %r146, %r126; rem.s32 %r485, %r147, %r126; $L__BB0_14: mad.lo.s32 %r148, %r485, %r125, %r2; mad.lo.s32 %r149, %r148, %r124, %r1; add.s32 %r150, %r482, %r4; mad.lo.s32 %r151, %r150, %r124, %r1; mul.wide.s32 %rd20, %r151, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f4, [%rd21]; add.s64 %rd22, %rd2, %rd20; 
ld.global.nc.f32 %f5, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f6, [%rd23]; mad.lo.s32 %r152, %r483, %r125, %r2; mad.lo.s32 %r153, %r152, %r124, %r1; mul.wide.s32 %rd24, %r153, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f7, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f8, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f9, [%rd27]; add.s32 %r154, %r484, %r4; mad.lo.s32 %r155, %r154, %r124, %r1; mul.wide.s32 %rd28, %r155, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f10, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f12, [%rd31]; mul.wide.s32 %rd32, %r149, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f13, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f14, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f15, [%rd35]; setp.ne.s16 %p16, %rs1, 0; setp.lt.s32 %p17, %r6, %r125; or.pred %p1, %p17, %p16; not.pred %p18, %p1; mov.f32 %f862, 0f00000000; @%p18 bra $L__BB0_28; setp.ge.s32 %p19, %r10, %r126; and.pred %p21, %p19, %p13; @%p21 bra $L__BB0_28; @%p13 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r158, %r126, -1; min.s32 %r486, %r10, %r158; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r156, %r10, %r126; add.s32 %r157, %r156, %r126; rem.s32 %r486, %r157, %r126; $L__BB0_19: @%p12 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r161, %r125, -1; min.s32 %r487, %r6, %r161; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r159, %r6, %r125; add.s32 %r160, %r159, %r125; rem.s32 %r487, %r160, %r125; $L__BB0_22: mad.lo.s32 %r162, %r486, %r125, %r487; mad.lo.s32 %r163, %r162, %r124, %r1; mul.wide.s32 %rd36, %r163, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f181, [%rd37]; ld.global.nc.f32 %f182, [%rd38]; mul.f32 %f183, %f182, %f182; fma.rn.f32 %f184, %f181, %f181, %f183; ld.global.nc.f32 %f185, [%rd39]; fma.rn.f32 %f16, %f185, %f185, %f184; mul.f32 %f186, %f6, %f8; mul.f32 %f187, %f5, 
%f9; sub.f32 %f188, %f187, %f186; mul.f32 %f189, %f4, %f9; mul.f32 %f190, %f6, %f7; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f5, %f7; mul.f32 %f193, %f4, %f8; sub.f32 %f194, %f193, %f192; mul.f32 %f195, %f2, %f191; fma.rn.f32 %f196, %f1, %f188, %f195; fma.rn.f32 %f17, %f3, %f194, %f196; mul.f32 %f197, %f2, %f5; fma.rn.f32 %f198, %f1, %f4, %f197; fma.rn.f32 %f199, %f3, %f6, %f198; add.f32 %f200, %f199, 0f3F800000; mul.f32 %f201, %f2, %f8; fma.rn.f32 %f202, %f1, %f7, %f201; fma.rn.f32 %f203, %f3, %f9, %f202; add.f32 %f204, %f200, %f203; mul.f32 %f205, %f5, %f8; fma.rn.f32 %f206, %f4, %f7, %f205; fma.rn.f32 %f207, %f6, %f9, %f206; add.f32 %f18, %f207, %f204; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p24, %f19, 0f00000000; setp.eq.f32 %p25, %f20, 0f00000000; and.pred %p26, %p24, %p25; @%p26 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r174, %f18; shr.s32 %r175, %r174, 31; and.b32 %r176, %r175, 1078530011; mov.b32 %r177, %f17; and.b32 %r178, %r177, -2147483648; or.b32 %r179, %r178, %r176; mov.b32 %f857, %r179; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p27, %f19, 0f7F800000; setp.eq.f32 %p28, %f20, 0f7F800000; and.pred %p29, %p27, %p28; @%p29 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r169, %f18; setp.lt.s32 %p33, %r169, 0; selp.b32 %r170, 1075235812, 1061752795, %p33; mov.b32 %r171, %f17; and.b32 %r172, %r171, -2147483648; or.b32 %r173, %r172, %r170; mov.b32 %f857, %r173; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f208, %f20, %f19; min.f32 %f209, %f20, %f19; div.rn.f32 %f210, %f209, %f208; mul.rn.f32 %f211, %f210, %f210; mov.f32 %f212, 0fC0B59883; mov.f32 %f213, 0fBF52C7EA; fma.rn.f32 %f214, %f211, %f213, %f212; mov.f32 %f215, 0fC0D21907; fma.rn.f32 %f216, %f214, %f211, %f215; mul.f32 %f217, %f211, %f216; mul.f32 %f218, %f210, %f217; add.f32 %f219, %f211, 0f41355DC0; mov.f32 %f220, 0f41E6BD60; fma.rn.f32 %f221, %f219, %f211, %f220; mov.f32 %f222, 0f419D92C8; fma.rn.f32 %f223, %f221, %f211, %f222; rcp.rn.f32 %f224, %f223; 
fma.rn.f32 %f225, %f218, %f224, %f210; mov.f32 %f226, 0f3FC90FDB; sub.f32 %f227, %f226, %f225; setp.gt.f32 %p30, %f20, %f19; selp.f32 %f228, %f227, %f225, %p30; mov.b32 %r164, %f18; setp.lt.s32 %p31, %r164, 0; mov.f32 %f229, 0f40490FDB; sub.f32 %f230, %f229, %f228; selp.f32 %f231, %f230, %f228, %p31; mov.b32 %r165, %f231; mov.b32 %r166, %f17; and.b32 %r167, %r166, -2147483648; or.b32 %r168, %r167, %r165; mov.b32 %f232, %r168; add.f32 %f233, %f19, %f20; setp.le.f32 %p32, %f233, 0f7F800000; selp.f32 %f857, %f232, %f233, %p32; $L__BB0_27: add.f32 %f234, %f857, %f857; setp.eq.f32 %p34, %f16, 0f00000000; selp.f32 %f235, 0f3F800000, 0f3F000000, %p34; fma.rn.f32 %f862, %f235, %f234, 0f00000000; $L__BB0_28: setp.gt.s32 %p35, %r2, 0; or.pred %p2, %p35, %p16; not.pred %p37, %p2; @%p37 bra $L__BB0_55; setp.ge.s32 %p38, %r10, %r126; and.pred %p40, %p38, %p13; @%p40 bra $L__BB0_42; @%p13 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r182, %r126, -1; min.s32 %r488, %r10, %r182; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r180, %r10, %r126; add.s32 %r181, %r180, %r126; rem.s32 %r488, %r181, %r126; $L__BB0_33: @%p12 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r489, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r183, %r14, %r125; add.s32 %r184, %r183, %r125; rem.s32 %r489, %r184, %r125; $L__BB0_36: mad.lo.s32 %r185, %r488, %r125, %r489; mad.lo.s32 %r186, %r185, %r124, %r1; mul.wide.s32 %rd40, %r186, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f236, [%rd41]; ld.global.nc.f32 %f237, [%rd42]; mul.f32 %f238, %f237, %f237; fma.rn.f32 %f239, %f236, %f236, %f238; ld.global.nc.f32 %f240, [%rd43]; fma.rn.f32 %f27, %f240, %f240, %f239; mul.f32 %f241, %f9, %f11; mul.f32 %f242, %f8, %f12; sub.f32 %f243, %f242, %f241; mul.f32 %f244, %f7, %f12; mul.f32 %f245, %f9, %f10; sub.f32 %f246, %f245, %f244; mul.f32 %f247, %f8, %f10; mul.f32 %f248, %f7, %f11; sub.f32 %f249, %f248, %f247; mul.f32 %f250, %f2, %f246; 
fma.rn.f32 %f251, %f1, %f243, %f250; fma.rn.f32 %f28, %f3, %f249, %f251; mul.f32 %f252, %f2, %f8; fma.rn.f32 %f253, %f1, %f7, %f252; fma.rn.f32 %f254, %f3, %f9, %f253; add.f32 %f255, %f254, 0f3F800000; mul.f32 %f256, %f2, %f11; fma.rn.f32 %f257, %f1, %f10, %f256; fma.rn.f32 %f258, %f3, %f12, %f257; add.f32 %f259, %f255, %f258; mul.f32 %f260, %f8, %f11; fma.rn.f32 %f261, %f7, %f10, %f260; fma.rn.f32 %f262, %f9, %f12, %f261; add.f32 %f29, %f262, %f259; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p43, %f30, 0f00000000; setp.eq.f32 %p44, %f31, 0f00000000; and.pred %p45, %p43, %p44; @%p45 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r197, %f29; shr.s32 %r198, %r197, 31; and.b32 %r199, %r198, 1078530011; mov.b32 %r200, %f28; and.b32 %r201, %r200, -2147483648; or.b32 %r202, %r201, %r199; mov.b32 %f859, %r202; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p46, %f30, 0f7F800000; setp.eq.f32 %p47, %f31, 0f7F800000; and.pred %p48, %p46, %p47; @%p48 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r192, %f29; setp.lt.s32 %p52, %r192, 0; selp.b32 %r193, 1075235812, 1061752795, %p52; mov.b32 %r194, %f28; and.b32 %r195, %r194, -2147483648; or.b32 %r196, %r195, %r193; mov.b32 %f859, %r196; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f263, %f31, %f30; min.f32 %f264, %f31, %f30; div.rn.f32 %f265, %f264, %f263; mul.rn.f32 %f266, %f265, %f265; mov.f32 %f267, 0fC0B59883; mov.f32 %f268, 0fBF52C7EA; fma.rn.f32 %f269, %f266, %f268, %f267; mov.f32 %f270, 0fC0D21907; fma.rn.f32 %f271, %f269, %f266, %f270; mul.f32 %f272, %f266, %f271; mul.f32 %f273, %f265, %f272; add.f32 %f274, %f266, 0f41355DC0; mov.f32 %f275, 0f41E6BD60; fma.rn.f32 %f276, %f274, %f266, %f275; mov.f32 %f277, 0f419D92C8; fma.rn.f32 %f278, %f276, %f266, %f277; rcp.rn.f32 %f279, %f278; fma.rn.f32 %f280, %f273, %f279, %f265; mov.f32 %f281, 0f3FC90FDB; sub.f32 %f282, %f281, %f280; setp.gt.f32 %p49, %f31, %f30; selp.f32 %f283, %f282, %f280, %p49; mov.b32 %r187, %f29; setp.lt.s32 %p50, %r187, 0; mov.f32 
%f284, 0f40490FDB; sub.f32 %f285, %f284, %f283; selp.f32 %f286, %f285, %f283, %p50; mov.b32 %r188, %f286; mov.b32 %r189, %f28; and.b32 %r190, %r189, -2147483648; or.b32 %r191, %r190, %r188; mov.b32 %f287, %r191; add.f32 %f288, %f30, %f31; setp.le.f32 %p51, %f288, 0f7F800000; selp.f32 %f859, %f287, %f288, %p51; $L__BB0_41: add.f32 %f289, %f859, %f859; setp.eq.f32 %p53, %f27, 0f00000000; selp.f32 %f290, 0f3F800000, 0f3F000000, %p53; fma.rn.f32 %f862, %f290, %f289, %f862; $L__BB0_42: setp.lt.s32 %p54, %r3, 1; and.pred %p56, %p54, %p13; @%p56 bra $L__BB0_55; @%p13 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r490, %r18, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r203, %r18, %r126; add.s32 %r204, %r203, %r126; rem.s32 %r490, %r204, %r126; $L__BB0_46: @%p12 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r491, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r205, %r14, %r125; add.s32 %r206, %r205, %r125; rem.s32 %r491, %r206, %r125; $L__BB0_49: mad.lo.s32 %r207, %r490, %r125, %r491; mad.lo.s32 %r208, %r207, %r124, %r1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f291, [%rd45]; ld.global.nc.f32 %f292, [%rd46]; mul.f32 %f293, %f292, %f292; fma.rn.f32 %f294, %f291, %f291, %f293; ld.global.nc.f32 %f295, [%rd47]; fma.rn.f32 %f38, %f295, %f295, %f294; mul.f32 %f296, %f12, %f14; mul.f32 %f297, %f11, %f15; sub.f32 %f298, %f297, %f296; mul.f32 %f299, %f10, %f15; mul.f32 %f300, %f12, %f13; sub.f32 %f301, %f300, %f299; mul.f32 %f302, %f11, %f13; mul.f32 %f303, %f10, %f14; sub.f32 %f304, %f303, %f302; mul.f32 %f305, %f2, %f301; fma.rn.f32 %f306, %f1, %f298, %f305; fma.rn.f32 %f39, %f3, %f304, %f306; mul.f32 %f307, %f2, %f11; fma.rn.f32 %f308, %f1, %f10, %f307; fma.rn.f32 %f309, %f3, %f12, %f308; add.f32 %f310, %f309, 0f3F800000; mul.f32 %f311, %f2, %f14; fma.rn.f32 %f312, %f1, %f13, %f311; fma.rn.f32 %f313, %f3, %f15, %f312; add.f32 %f314, %f310, %f313; mul.f32 %f315, 
%f11, %f14; fma.rn.f32 %f316, %f10, %f13, %f315; fma.rn.f32 %f317, %f12, %f15, %f316; add.f32 %f40, %f317, %f314; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p59, %f41, 0f00000000; setp.eq.f32 %p60, %f42, 0f00000000; and.pred %p61, %p59, %p60; @%p61 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r219, %f40; shr.s32 %r220, %r219, 31; and.b32 %r221, %r220, 1078530011; mov.b32 %r222, %f39; and.b32 %r223, %r222, -2147483648; or.b32 %r224, %r223, %r221; mov.b32 %f861, %r224; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p62, %f41, 0f7F800000; setp.eq.f32 %p63, %f42, 0f7F800000; and.pred %p64, %p62, %p63; @%p64 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r214, %f40; setp.lt.s32 %p68, %r214, 0; selp.b32 %r215, 1075235812, 1061752795, %p68; mov.b32 %r216, %f39; and.b32 %r217, %r216, -2147483648; or.b32 %r218, %r217, %r215; mov.b32 %f861, %r218; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f318, %f42, %f41; min.f32 %f319, %f42, %f41; div.rn.f32 %f320, %f319, %f318; mul.rn.f32 %f321, %f320, %f320; mov.f32 %f322, 0fC0B59883; mov.f32 %f323, 0fBF52C7EA; fma.rn.f32 %f324, %f321, %f323, %f322; mov.f32 %f325, 0fC0D21907; fma.rn.f32 %f326, %f324, %f321, %f325; mul.f32 %f327, %f321, %f326; mul.f32 %f328, %f320, %f327; add.f32 %f329, %f321, 0f41355DC0; mov.f32 %f330, 0f41E6BD60; fma.rn.f32 %f331, %f329, %f321, %f330; mov.f32 %f332, 0f419D92C8; fma.rn.f32 %f333, %f331, %f321, %f332; rcp.rn.f32 %f334, %f333; fma.rn.f32 %f335, %f328, %f334, %f320; mov.f32 %f336, 0f3FC90FDB; sub.f32 %f337, %f336, %f335; setp.gt.f32 %p65, %f42, %f41; selp.f32 %f338, %f337, %f335, %p65; mov.b32 %r209, %f40; setp.lt.s32 %p66, %r209, 0; mov.f32 %f339, 0f40490FDB; sub.f32 %f340, %f339, %f338; selp.f32 %f341, %f340, %f338, %p66; mov.b32 %r210, %f341; mov.b32 %r211, %f39; and.b32 %r212, %r211, -2147483648; or.b32 %r213, %r212, %r210; mov.b32 %f342, %r213; add.f32 %f343, %f41, %f42; setp.le.f32 %p67, %f343, 0f7F800000; selp.f32 %f861, %f342, %f343, %p67; $L__BB0_54: add.f32 %f344, 
%f861, %f861; setp.eq.f32 %p69, %f38, 0f00000000; selp.f32 %f345, 0f3F800000, 0f3F000000, %p69; fma.rn.f32 %f862, %f345, %f344, %f862; $L__BB0_55: @%p18 bra $L__BB0_69; setp.lt.s32 %p71, %r3, 1; and.pred %p73, %p71, %p13; @%p73 bra $L__BB0_69; @%p13 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r492, %r18, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r225, %r18, %r126; add.s32 %r226, %r225, %r126; rem.s32 %r492, %r226, %r126; $L__BB0_60: @%p12 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r229, %r125, -1; min.s32 %r493, %r6, %r229; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r227, %r6, %r125; add.s32 %r228, %r227, %r125; rem.s32 %r493, %r228, %r125; $L__BB0_63: mad.lo.s32 %r230, %r492, %r125, %r493; mad.lo.s32 %r231, %r230, %r124, %r1; mul.wide.s32 %rd48, %r231, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f346, [%rd49]; ld.global.nc.f32 %f347, [%rd50]; mul.f32 %f348, %f347, %f347; fma.rn.f32 %f349, %f346, %f346, %f348; ld.global.nc.f32 %f350, [%rd51]; fma.rn.f32 %f49, %f350, %f350, %f349; mul.f32 %f351, %f5, %f15; mul.f32 %f352, %f6, %f14; sub.f32 %f353, %f352, %f351; mul.f32 %f354, %f6, %f13; mul.f32 %f355, %f4, %f15; sub.f32 %f356, %f355, %f354; mul.f32 %f357, %f4, %f14; mul.f32 %f358, %f5, %f13; sub.f32 %f359, %f358, %f357; mul.f32 %f360, %f2, %f356; fma.rn.f32 %f361, %f1, %f353, %f360; fma.rn.f32 %f50, %f3, %f359, %f361; mul.f32 %f362, %f2, %f14; fma.rn.f32 %f363, %f1, %f13, %f362; fma.rn.f32 %f364, %f3, %f15, %f363; add.f32 %f365, %f364, 0f3F800000; mul.f32 %f366, %f2, %f5; fma.rn.f32 %f367, %f1, %f4, %f366; fma.rn.f32 %f368, %f3, %f6, %f367; add.f32 %f369, %f368, %f365; mul.f32 %f370, %f5, %f14; fma.rn.f32 %f371, %f4, %f13, %f370; fma.rn.f32 %f372, %f6, %f15, %f371; add.f32 %f51, %f372, %f369; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p76, %f52, 0f00000000; setp.eq.f32 %p77, %f53, 0f00000000; and.pred %p78, %p76, %p77; @%p78 bra $L__BB0_67; bra.uni $L__BB0_64; 
$L__BB0_67: mov.b32 %r242, %f51; shr.s32 %r243, %r242, 31; and.b32 %r244, %r243, 1078530011; mov.b32 %r245, %f50; and.b32 %r246, %r245, -2147483648; or.b32 %r247, %r244, %r246; mov.b32 %f863, %r247; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p79, %f52, 0f7F800000; setp.eq.f32 %p80, %f53, 0f7F800000; and.pred %p81, %p79, %p80; @%p81 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r237, %f51; setp.lt.s32 %p85, %r237, 0; selp.b32 %r238, 1075235812, 1061752795, %p85; mov.b32 %r239, %f50; and.b32 %r240, %r239, -2147483648; or.b32 %r241, %r238, %r240; mov.b32 %f863, %r241; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f373, %f53, %f52; min.f32 %f374, %f53, %f52; div.rn.f32 %f375, %f374, %f373; mul.rn.f32 %f376, %f375, %f375; mov.f32 %f377, 0fC0B59883; mov.f32 %f378, 0fBF52C7EA; fma.rn.f32 %f379, %f376, %f378, %f377; mov.f32 %f380, 0fC0D21907; fma.rn.f32 %f381, %f379, %f376, %f380; mul.f32 %f382, %f376, %f381; mul.f32 %f383, %f375, %f382; add.f32 %f384, %f376, 0f41355DC0; mov.f32 %f385, 0f41E6BD60; fma.rn.f32 %f386, %f384, %f376, %f385; mov.f32 %f387, 0f419D92C8; fma.rn.f32 %f388, %f386, %f376, %f387; rcp.rn.f32 %f389, %f388; fma.rn.f32 %f390, %f383, %f389, %f375; mov.f32 %f391, 0f3FC90FDB; sub.f32 %f392, %f391, %f390; setp.gt.f32 %p82, %f53, %f52; selp.f32 %f393, %f392, %f390, %p82; mov.b32 %r232, %f51; setp.lt.s32 %p83, %r232, 0; mov.f32 %f394, 0f40490FDB; sub.f32 %f395, %f394, %f393; selp.f32 %f396, %f395, %f393, %p83; mov.b32 %r233, %f396; mov.b32 %r234, %f50; and.b32 %r235, %r234, -2147483648; or.b32 %r236, %r235, %r233; mov.b32 %f397, %r236; add.f32 %f398, %f52, %f53; setp.le.f32 %p84, %f398, 0f7F800000; selp.f32 %f863, %f397, %f398, %p84; $L__BB0_68: add.f32 %f399, %f863, %f863; setp.eq.f32 %p86, %f49, 0f00000000; selp.f32 %f400, 0f3F800000, 0f3F000000, %p86; fma.rn.f32 %f862, %f400, %f399, %f862; $L__BB0_69: @%p13 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: add.s32 %r250, %r126, -1; min.s32 %r494, %r10, %r250; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 
%r248, %r10, %r126; add.s32 %r249, %r248, %r126; rem.s32 %r494, %r249, %r126; $L__BB0_72: ld.param.u8 %rs7, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs3, %rs7, 1; setp.eq.s16 %p88, %rs3, 0; add.s32 %r49, %r1, 1; @%p88 bra $L__BB0_74; bra.uni $L__BB0_73; $L__BB0_74: add.s32 %r253, %r124, -1; min.s32 %r495, %r49, %r253; bra.uni $L__BB0_75; $L__BB0_73: rem.s32 %r251, %r49, %r124; add.s32 %r252, %r251, %r124; rem.s32 %r495, %r252, %r124; $L__BB0_75: @%p13 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r496, %r18, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r254, %r18, %r126; add.s32 %r255, %r254, %r126; rem.s32 %r496, %r255, %r126; $L__BB0_78: add.s32 %r56, %r1, -1; @%p88 bra $L__BB0_80; bra.uni $L__BB0_79; $L__BB0_80: max.s32 %r497, %r56, 0; bra.uni $L__BB0_81; $L__BB0_79: rem.s32 %r256, %r56, %r124; add.s32 %r257, %r256, %r124; rem.s32 %r497, %r257, %r124; $L__BB0_81: mad.lo.s32 %r258, %r494, %r125, %r2; mad.lo.s32 %r259, %r258, %r124, %r1; mul.wide.s32 %rd52, %r259, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f60, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f61, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f62, [%rd55]; add.s32 %r260, %r495, %r5; mul.wide.s32 %rd56, %r260, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f63, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f64, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f65, [%rd59]; mad.lo.s32 %r261, %r496, %r125, %r2; mad.lo.s32 %r262, %r261, %r124, %r1; mul.wide.s32 %rd60, %r262, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f66, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f67, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f68, [%rd63]; add.s32 %r263, %r497, %r5; mul.wide.s32 %rd64, %r263, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f69, [%rd65]; add.s64 %rd66, %rd2, %rd64; ld.global.nc.f32 %f70, [%rd66]; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f71, [%rd67]; setp.ge.s32 %p92, %r10, %r126; and.pred %p93, %p92, 
%p13; mov.f32 %f868, 0f00000000; @%p93 bra $L__BB0_95; setp.ge.s32 %p94, %r49, %r124; and.pred %p96, %p94, %p88; @%p96 bra $L__BB0_95; @%p13 bra $L__BB0_85; bra.uni $L__BB0_84; $L__BB0_85: add.s32 %r266, %r126, -1; min.s32 %r498, %r10, %r266; bra.uni $L__BB0_86; $L__BB0_84: rem.s32 %r264, %r10, %r126; add.s32 %r265, %r264, %r126; rem.s32 %r498, %r265, %r126; $L__BB0_86: @%p88 bra $L__BB0_88; bra.uni $L__BB0_87; $L__BB0_88: add.s32 %r269, %r124, -1; min.s32 %r499, %r49, %r269; bra.uni $L__BB0_89; $L__BB0_87: rem.s32 %r267, %r49, %r124; add.s32 %r268, %r267, %r124; rem.s32 %r499, %r268, %r124; $L__BB0_89: mad.lo.s32 %r270, %r498, %r125, %r2; mad.lo.s32 %r271, %r270, %r124, %r499; mul.wide.s32 %rd68, %r271, 4; add.s64 %rd69, %rd3, %rd68; add.s64 %rd70, %rd2, %rd68; add.s64 %rd71, %rd1, %rd68; ld.global.nc.f32 %f403, [%rd69]; ld.global.nc.f32 %f404, [%rd70]; mul.f32 %f405, %f404, %f404; fma.rn.f32 %f406, %f403, %f403, %f405; ld.global.nc.f32 %f407, [%rd71]; fma.rn.f32 %f72, %f407, %f407, %f406; mul.f32 %f408, %f62, %f64; mul.f32 %f409, %f61, %f65; sub.f32 %f410, %f409, %f408; mul.f32 %f411, %f60, %f65; mul.f32 %f412, %f62, %f63; sub.f32 %f413, %f412, %f411; mul.f32 %f414, %f61, %f63; mul.f32 %f415, %f60, %f64; sub.f32 %f416, %f415, %f414; mul.f32 %f417, %f2, %f413; fma.rn.f32 %f418, %f1, %f410, %f417; fma.rn.f32 %f73, %f3, %f416, %f418; mul.f32 %f419, %f2, %f61; fma.rn.f32 %f420, %f1, %f60, %f419; fma.rn.f32 %f421, %f3, %f62, %f420; add.f32 %f422, %f421, 0f3F800000; mul.f32 %f423, %f2, %f64; fma.rn.f32 %f424, %f1, %f63, %f423; fma.rn.f32 %f425, %f3, %f65, %f424; add.f32 %f426, %f422, %f425; mul.f32 %f427, %f61, %f64; fma.rn.f32 %f428, %f60, %f63, %f427; fma.rn.f32 %f429, %f62, %f65, %f428; add.f32 %f74, %f429, %f426; abs.f32 %f75, %f74; abs.f32 %f76, %f73; setp.eq.f32 %p99, %f75, 0f00000000; setp.eq.f32 %p100, %f76, 0f00000000; and.pred %p101, %p99, %p100; @%p101 bra $L__BB0_93; bra.uni $L__BB0_90; $L__BB0_93: mov.b32 %r282, %f74; shr.s32 %r283, %r282, 31; and.b32 
%r284, %r283, 1078530011; mov.b32 %r285, %f73; and.b32 %r286, %r285, -2147483648; or.b32 %r287, %r286, %r284; mov.b32 %f865, %r287; bra.uni $L__BB0_94; $L__BB0_90: setp.eq.f32 %p102, %f75, 0f7F800000; setp.eq.f32 %p103, %f76, 0f7F800000; and.pred %p104, %p102, %p103; @%p104 bra $L__BB0_92; bra.uni $L__BB0_91; $L__BB0_92: mov.b32 %r277, %f74; setp.lt.s32 %p108, %r277, 0; selp.b32 %r278, 1075235812, 1061752795, %p108; mov.b32 %r279, %f73; and.b32 %r280, %r279, -2147483648; or.b32 %r281, %r280, %r278; mov.b32 %f865, %r281; bra.uni $L__BB0_94; $L__BB0_91: max.f32 %f430, %f76, %f75; min.f32 %f431, %f76, %f75; div.rn.f32 %f432, %f431, %f430; mul.rn.f32 %f433, %f432, %f432; mov.f32 %f434, 0fC0B59883; mov.f32 %f435, 0fBF52C7EA; fma.rn.f32 %f436, %f433, %f435, %f434; mov.f32 %f437, 0fC0D21907; fma.rn.f32 %f438, %f436, %f433, %f437; mul.f32 %f439, %f433, %f438; mul.f32 %f440, %f432, %f439; add.f32 %f441, %f433, 0f41355DC0; mov.f32 %f442, 0f41E6BD60; fma.rn.f32 %f443, %f441, %f433, %f442; mov.f32 %f444, 0f419D92C8; fma.rn.f32 %f445, %f443, %f433, %f444; rcp.rn.f32 %f446, %f445; fma.rn.f32 %f447, %f440, %f446, %f432; mov.f32 %f448, 0f3FC90FDB; sub.f32 %f449, %f448, %f447; setp.gt.f32 %p105, %f76, %f75; selp.f32 %f450, %f449, %f447, %p105; mov.b32 %r272, %f74; setp.lt.s32 %p106, %r272, 0; mov.f32 %f451, 0f40490FDB; sub.f32 %f452, %f451, %f450; selp.f32 %f453, %f452, %f450, %p106; mov.b32 %r273, %f453; mov.b32 %r274, %f73; and.b32 %r275, %r274, -2147483648; or.b32 %r276, %r275, %r273; mov.b32 %f454, %r276; add.f32 %f455, %f75, %f76; setp.le.f32 %p107, %f455, 0f7F800000; selp.f32 %f865, %f454, %f455, %p107; $L__BB0_94: add.f32 %f456, %f865, %f865; setp.eq.f32 %p109, %f72, 0f00000000; selp.f32 %f457, 0f3F800000, 0f3F000000, %p109; fma.rn.f32 %f868, %f457, %f456, 0f00000000; $L__BB0_95: setp.lt.s32 %p110, %r3, 1; and.pred %p112, %p110, %p13; @%p112 bra $L__BB0_109; setp.ge.s32 %p113, %r49, %r124; and.pred %p115, %p113, %p88; @%p115 bra $L__BB0_109; @%p13 bra $L__BB0_99; bra.uni 
$L__BB0_98; $L__BB0_99: max.s32 %r500, %r18, 0; bra.uni $L__BB0_100; $L__BB0_98: rem.s32 %r288, %r18, %r126; add.s32 %r289, %r288, %r126; rem.s32 %r500, %r289, %r126; $L__BB0_100: @%p88 bra $L__BB0_102; bra.uni $L__BB0_101; $L__BB0_102: add.s32 %r292, %r124, -1; min.s32 %r501, %r49, %r292; bra.uni $L__BB0_103; $L__BB0_101: rem.s32 %r290, %r49, %r124; add.s32 %r291, %r290, %r124; rem.s32 %r501, %r291, %r124; $L__BB0_103: mad.lo.s32 %r293, %r500, %r125, %r2; mad.lo.s32 %r294, %r293, %r124, %r501; mul.wide.s32 %rd72, %r294, 4; add.s64 %rd73, %rd3, %rd72; add.s64 %rd74, %rd2, %rd72; add.s64 %rd75, %rd1, %rd72; ld.global.nc.f32 %f458, [%rd73]; ld.global.nc.f32 %f459, [%rd74]; mul.f32 %f460, %f459, %f459; fma.rn.f32 %f461, %f458, %f458, %f460; ld.global.nc.f32 %f462, [%rd75]; fma.rn.f32 %f83, %f462, %f462, %f461; mul.f32 %f463, %f65, %f67; mul.f32 %f464, %f64, %f68; sub.f32 %f465, %f464, %f463; mul.f32 %f466, %f63, %f68; mul.f32 %f467, %f65, %f66; sub.f32 %f468, %f467, %f466; mul.f32 %f469, %f64, %f66; mul.f32 %f470, %f63, %f67; sub.f32 %f471, %f470, %f469; mul.f32 %f472, %f2, %f468; fma.rn.f32 %f473, %f1, %f465, %f472; fma.rn.f32 %f84, %f3, %f471, %f473; mul.f32 %f474, %f2, %f64; fma.rn.f32 %f475, %f1, %f63, %f474; fma.rn.f32 %f476, %f3, %f65, %f475; add.f32 %f477, %f476, 0f3F800000; mul.f32 %f478, %f2, %f67; fma.rn.f32 %f479, %f1, %f66, %f478; fma.rn.f32 %f480, %f3, %f68, %f479; add.f32 %f481, %f477, %f480; mul.f32 %f482, %f64, %f67; fma.rn.f32 %f483, %f63, %f66, %f482; fma.rn.f32 %f484, %f65, %f68, %f483; add.f32 %f85, %f484, %f481; abs.f32 %f86, %f85; abs.f32 %f87, %f84; setp.eq.f32 %p118, %f86, 0f00000000; setp.eq.f32 %p119, %f87, 0f00000000; and.pred %p120, %p118, %p119; @%p120 bra $L__BB0_107; bra.uni $L__BB0_104; $L__BB0_107: mov.b32 %r305, %f85; shr.s32 %r306, %r305, 31; and.b32 %r307, %r306, 1078530011; mov.b32 %r308, %f84; and.b32 %r309, %r308, -2147483648; or.b32 %r310, %r309, %r307; mov.b32 %f867, %r310; bra.uni $L__BB0_108; $L__BB0_104: setp.eq.f32 %p121, 
%f86, 0f7F800000; setp.eq.f32 %p122, %f87, 0f7F800000; and.pred %p123, %p121, %p122; @%p123 bra $L__BB0_106; bra.uni $L__BB0_105; $L__BB0_106: mov.b32 %r300, %f85; setp.lt.s32 %p127, %r300, 0; selp.b32 %r301, 1075235812, 1061752795, %p127; mov.b32 %r302, %f84; and.b32 %r303, %r302, -2147483648; or.b32 %r304, %r303, %r301; mov.b32 %f867, %r304; bra.uni $L__BB0_108; $L__BB0_105: max.f32 %f485, %f87, %f86; min.f32 %f486, %f87, %f86; div.rn.f32 %f487, %f486, %f485; mul.rn.f32 %f488, %f487, %f487; mov.f32 %f489, 0fC0B59883; mov.f32 %f490, 0fBF52C7EA; fma.rn.f32 %f491, %f488, %f490, %f489; mov.f32 %f492, 0fC0D21907; fma.rn.f32 %f493, %f491, %f488, %f492; mul.f32 %f494, %f488, %f493; mul.f32 %f495, %f487, %f494; add.f32 %f496, %f488, 0f41355DC0; mov.f32 %f497, 0f41E6BD60; fma.rn.f32 %f498, %f496, %f488, %f497; mov.f32 %f499, 0f419D92C8; fma.rn.f32 %f500, %f498, %f488, %f499; rcp.rn.f32 %f501, %f500; fma.rn.f32 %f502, %f495, %f501, %f487; mov.f32 %f503, 0f3FC90FDB; sub.f32 %f504, %f503, %f502; setp.gt.f32 %p124, %f87, %f86; selp.f32 %f505, %f504, %f502, %p124; mov.b32 %r295, %f85; setp.lt.s32 %p125, %r295, 0; mov.f32 %f506, 0f40490FDB; sub.f32 %f507, %f506, %f505; selp.f32 %f508, %f507, %f505, %p125; mov.b32 %r296, %f508; mov.b32 %r297, %f84; and.b32 %r298, %r297, -2147483648; or.b32 %r299, %r298, %r296; mov.b32 %f509, %r299; add.f32 %f510, %f86, %f87; setp.le.f32 %p126, %f510, 0f7F800000; selp.f32 %f867, %f509, %f510, %p126; $L__BB0_108: add.f32 %f511, %f867, %f867; setp.eq.f32 %p128, %f83, 0f00000000; selp.f32 %f512, 0f3F800000, 0f3F000000, %p128; fma.rn.f32 %f868, %f512, %f511, %f868; $L__BB0_109: ld.param.u8 %rs6, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs5, %rs6, 1; setp.ne.s16 %p129, %rs5, 0; setp.gt.s32 %p130, %r1, 0; or.pred %p3, %p130, %p129; and.pred %p131, %p3, %p2; not.pred %p132, %p131; @%p132 bra $L__BB0_122; @%p13 bra $L__BB0_112; bra.uni $L__BB0_111; $L__BB0_112: max.s32 %r502, %r18, 0; bra.uni $L__BB0_113; $L__BB0_111: rem.s32 %r311, %r18, 
%r126; add.s32 %r312, %r311, %r126; rem.s32 %r502, %r312, %r126; $L__BB0_113: @%p88 bra $L__BB0_115; bra.uni $L__BB0_114; $L__BB0_115: max.s32 %r503, %r56, 0; bra.uni $L__BB0_116; $L__BB0_114: rem.s32 %r313, %r56, %r124; add.s32 %r314, %r313, %r124; rem.s32 %r503, %r314, %r124; $L__BB0_116: mad.lo.s32 %r315, %r502, %r125, %r2; mad.lo.s32 %r316, %r315, %r124, %r503; mul.wide.s32 %rd76, %r316, 4; add.s64 %rd77, %rd3, %rd76; add.s64 %rd78, %rd2, %rd76; add.s64 %rd79, %rd1, %rd76; ld.global.nc.f32 %f513, [%rd77]; ld.global.nc.f32 %f514, [%rd78]; mul.f32 %f515, %f514, %f514; fma.rn.f32 %f516, %f513, %f513, %f515; ld.global.nc.f32 %f517, [%rd79]; fma.rn.f32 %f94, %f517, %f517, %f516; mul.f32 %f518, %f68, %f70; mul.f32 %f519, %f67, %f71; sub.f32 %f520, %f519, %f518; mul.f32 %f521, %f66, %f71; mul.f32 %f522, %f68, %f69; sub.f32 %f523, %f522, %f521; mul.f32 %f524, %f67, %f69; mul.f32 %f525, %f66, %f70; sub.f32 %f526, %f525, %f524; mul.f32 %f527, %f2, %f523; fma.rn.f32 %f528, %f1, %f520, %f527; fma.rn.f32 %f95, %f3, %f526, %f528; mul.f32 %f529, %f2, %f67; fma.rn.f32 %f530, %f1, %f66, %f529; fma.rn.f32 %f531, %f3, %f68, %f530; add.f32 %f532, %f531, 0f3F800000; mul.f32 %f533, %f2, %f70; fma.rn.f32 %f534, %f1, %f69, %f533; fma.rn.f32 %f535, %f3, %f71, %f534; add.f32 %f536, %f532, %f535; mul.f32 %f537, %f67, %f70; fma.rn.f32 %f538, %f66, %f69, %f537; fma.rn.f32 %f539, %f68, %f71, %f538; add.f32 %f96, %f539, %f536; abs.f32 %f97, %f96; abs.f32 %f98, %f95; setp.eq.f32 %p135, %f97, 0f00000000; setp.eq.f32 %p136, %f98, 0f00000000; and.pred %p137, %p135, %p136; @%p137 bra $L__BB0_120; bra.uni $L__BB0_117; $L__BB0_120: mov.b32 %r327, %f96; shr.s32 %r328, %r327, 31; and.b32 %r329, %r328, 1078530011; mov.b32 %r330, %f95; and.b32 %r331, %r330, -2147483648; or.b32 %r332, %r331, %r329; mov.b32 %f869, %r332; bra.uni $L__BB0_121; $L__BB0_117: setp.eq.f32 %p138, %f97, 0f7F800000; setp.eq.f32 %p139, %f98, 0f7F800000; and.pred %p140, %p138, %p139; @%p140 bra $L__BB0_119; bra.uni $L__BB0_118; 
$L__BB0_119: mov.b32 %r322, %f96; setp.lt.s32 %p144, %r322, 0; selp.b32 %r323, 1075235812, 1061752795, %p144; mov.b32 %r324, %f95; and.b32 %r325, %r324, -2147483648; or.b32 %r326, %r325, %r323; mov.b32 %f869, %r326; bra.uni $L__BB0_121; $L__BB0_118: max.f32 %f540, %f98, %f97; min.f32 %f541, %f98, %f97; div.rn.f32 %f542, %f541, %f540; mul.rn.f32 %f543, %f542, %f542; mov.f32 %f544, 0fC0B59883; mov.f32 %f545, 0fBF52C7EA; fma.rn.f32 %f546, %f543, %f545, %f544; mov.f32 %f547, 0fC0D21907; fma.rn.f32 %f548, %f546, %f543, %f547; mul.f32 %f549, %f543, %f548; mul.f32 %f550, %f542, %f549; add.f32 %f551, %f543, 0f41355DC0; mov.f32 %f552, 0f41E6BD60; fma.rn.f32 %f553, %f551, %f543, %f552; mov.f32 %f554, 0f419D92C8; fma.rn.f32 %f555, %f553, %f543, %f554; rcp.rn.f32 %f556, %f555; fma.rn.f32 %f557, %f550, %f556, %f542; mov.f32 %f558, 0f3FC90FDB; sub.f32 %f559, %f558, %f557; setp.gt.f32 %p141, %f98, %f97; selp.f32 %f560, %f559, %f557, %p141; mov.b32 %r317, %f96; setp.lt.s32 %p142, %r317, 0; mov.f32 %f561, 0f40490FDB; sub.f32 %f562, %f561, %f560; selp.f32 %f563, %f562, %f560, %p142; mov.b32 %r318, %f563; mov.b32 %r319, %f95; and.b32 %r320, %r319, -2147483648; or.b32 %r321, %r320, %r318; mov.b32 %f564, %r321; add.f32 %f565, %f97, %f98; setp.le.f32 %p143, %f565, 0f7F800000; selp.f32 %f869, %f564, %f565, %p143; $L__BB0_121: add.f32 %f566, %f869, %f869; setp.eq.f32 %p145, %f94, 0f00000000; selp.f32 %f567, 0f3F800000, 0f3F000000, %p145; fma.rn.f32 %f868, %f567, %f566, %f868; $L__BB0_122: setp.lt.s32 %p146, %r49, %r124; or.pred %p4, %p146, %p129; and.pred %p5, %p4, %p2; not.pred %p148, %p5; @%p148 bra $L__BB0_135; @%p13 bra $L__BB0_125; bra.uni $L__BB0_124; $L__BB0_125: add.s32 %r335, %r126, -1; min.s32 %r504, %r10, %r335; bra.uni $L__BB0_126; $L__BB0_124: rem.s32 %r333, %r10, %r126; add.s32 %r334, %r333, %r126; rem.s32 %r504, %r334, %r126; $L__BB0_126: mad.lo.s32 %r81, %r504, %r125, %r2; @%p88 bra $L__BB0_128; bra.uni $L__BB0_127; $L__BB0_128: max.s32 %r505, %r56, 0; bra.uni $L__BB0_129; 
$L__BB0_127: rem.s32 %r336, %r56, %r124; add.s32 %r337, %r336, %r124; rem.s32 %r505, %r337, %r124; $L__BB0_129: mad.lo.s32 %r338, %r81, %r124, %r505; mul.wide.s32 %rd80, %r338, 4; add.s64 %rd81, %rd3, %rd80; add.s64 %rd82, %rd2, %rd80; add.s64 %rd83, %rd1, %rd80; ld.global.nc.f32 %f568, [%rd81]; ld.global.nc.f32 %f569, [%rd82]; mul.f32 %f570, %f569, %f569; fma.rn.f32 %f571, %f568, %f568, %f570; ld.global.nc.f32 %f572, [%rd83]; fma.rn.f32 %f105, %f572, %f572, %f571; mul.f32 %f573, %f61, %f71; mul.f32 %f574, %f62, %f70; sub.f32 %f575, %f574, %f573; mul.f32 %f576, %f62, %f69; mul.f32 %f577, %f60, %f71; sub.f32 %f578, %f577, %f576; mul.f32 %f579, %f60, %f70; mul.f32 %f580, %f61, %f69; sub.f32 %f581, %f580, %f579; mul.f32 %f582, %f2, %f578; fma.rn.f32 %f583, %f1, %f575, %f582; fma.rn.f32 %f106, %f3, %f581, %f583; mul.f32 %f584, %f2, %f70; fma.rn.f32 %f585, %f1, %f69, %f584; fma.rn.f32 %f586, %f3, %f71, %f585; add.f32 %f587, %f586, 0f3F800000; mul.f32 %f588, %f2, %f61; fma.rn.f32 %f589, %f1, %f60, %f588; fma.rn.f32 %f590, %f3, %f62, %f589; add.f32 %f591, %f590, %f587; mul.f32 %f592, %f61, %f70; fma.rn.f32 %f593, %f60, %f69, %f592; fma.rn.f32 %f594, %f62, %f71, %f593; add.f32 %f107, %f594, %f591; abs.f32 %f108, %f107; abs.f32 %f109, %f106; setp.eq.f32 %p151, %f108, 0f00000000; setp.eq.f32 %p152, %f109, 0f00000000; and.pred %p153, %p151, %p152; @%p153 bra $L__BB0_133; bra.uni $L__BB0_130; $L__BB0_133: mov.b32 %r349, %f107; shr.s32 %r350, %r349, 31; and.b32 %r351, %r350, 1078530011; mov.b32 %r352, %f106; and.b32 %r353, %r352, -2147483648; or.b32 %r354, %r351, %r353; mov.b32 %f871, %r354; bra.uni $L__BB0_134; $L__BB0_130: setp.eq.f32 %p154, %f108, 0f7F800000; setp.eq.f32 %p155, %f109, 0f7F800000; and.pred %p156, %p154, %p155; @%p156 bra $L__BB0_132; bra.uni $L__BB0_131; $L__BB0_132: mov.b32 %r344, %f107; setp.lt.s32 %p160, %r344, 0; selp.b32 %r345, 1075235812, 1061752795, %p160; mov.b32 %r346, %f106; and.b32 %r347, %r346, -2147483648; or.b32 %r348, %r345, %r347; mov.b32 
%f871, %r348; bra.uni $L__BB0_134; $L__BB0_131: max.f32 %f595, %f109, %f108; min.f32 %f596, %f109, %f108; div.rn.f32 %f597, %f596, %f595; mul.rn.f32 %f598, %f597, %f597; mov.f32 %f599, 0fC0B59883; mov.f32 %f600, 0fBF52C7EA; fma.rn.f32 %f601, %f598, %f600, %f599; mov.f32 %f602, 0fC0D21907; fma.rn.f32 %f603, %f601, %f598, %f602; mul.f32 %f604, %f598, %f603; mul.f32 %f605, %f597, %f604; add.f32 %f606, %f598, 0f41355DC0; mov.f32 %f607, 0f41E6BD60; fma.rn.f32 %f608, %f606, %f598, %f607; mov.f32 %f609, 0f419D92C8; fma.rn.f32 %f610, %f608, %f598, %f609; rcp.rn.f32 %f611, %f610; fma.rn.f32 %f612, %f605, %f611, %f597; mov.f32 %f613, 0f3FC90FDB; sub.f32 %f614, %f613, %f612; setp.gt.f32 %p157, %f109, %f108; selp.f32 %f615, %f614, %f612, %p157; mov.b32 %r339, %f107; setp.lt.s32 %p158, %r339, 0; mov.f32 %f616, 0f40490FDB; sub.f32 %f617, %f616, %f615; selp.f32 %f618, %f617, %f615, %p158; mov.b32 %r340, %f618; mov.b32 %r341, %f106; and.b32 %r342, %r341, -2147483648; or.b32 %r343, %r342, %r340; mov.b32 %f619, %r343; add.f32 %f620, %f108, %f109; setp.le.f32 %p159, %f620, 0f7F800000; selp.f32 %f871, %f619, %f620, %p159; $L__BB0_134: add.f32 %f621, %f871, %f871; setp.eq.f32 %p161, %f105, 0f00000000; selp.f32 %f622, 0f3F800000, 0f3F000000, %p161; fma.rn.f32 %f868, %f622, %f621, %f868; $L__BB0_135: @%p88 bra $L__BB0_137; bra.uni $L__BB0_136; $L__BB0_137: add.s32 %r357, %r124, -1; min.s32 %r506, %r49, %r357; bra.uni $L__BB0_138; $L__BB0_136: rem.s32 %r355, %r49, %r124; add.s32 %r356, %r355, %r124; rem.s32 %r506, %r356, %r124; $L__BB0_138: @%p12 bra $L__BB0_140; bra.uni $L__BB0_139; $L__BB0_140: add.s32 %r360, %r125, -1; min.s32 %r507, %r6, %r360; bra.uni $L__BB0_141; $L__BB0_139: rem.s32 %r358, %r6, %r125; add.s32 %r359, %r358, %r125; rem.s32 %r507, %r359, %r125; $L__BB0_141: @%p88 bra $L__BB0_143; bra.uni $L__BB0_142; $L__BB0_143: max.s32 %r508, %r56, 0; bra.uni $L__BB0_144; $L__BB0_142: rem.s32 %r361, %r56, %r124; add.s32 %r362, %r361, %r124; rem.s32 %r508, %r362, %r124; $L__BB0_144: 
add.s32 %r94, %r508, %r5; add.s32 %r95, %r506, %r5; @%p12 bra $L__BB0_146; bra.uni $L__BB0_145; $L__BB0_146: max.s32 %r509, %r14, 0; bra.uni $L__BB0_147; $L__BB0_145: rem.s32 %r363, %r14, %r125; add.s32 %r364, %r363, %r125; rem.s32 %r509, %r364, %r125; $L__BB0_147: add.s32 %r365, %r509, %r4; mad.lo.s32 %r366, %r365, %r124, %r1; mul.wide.s32 %rd84, %r95, 4; add.s64 %rd85, %rd3, %rd84; ld.global.nc.f32 %f116, [%rd85]; add.s64 %rd86, %rd2, %rd84; ld.global.nc.f32 %f117, [%rd86]; add.s64 %rd87, %rd1, %rd84; ld.global.nc.f32 %f118, [%rd87]; add.s32 %r367, %r507, %r4; mad.lo.s32 %r368, %r367, %r124, %r1; mul.wide.s32 %rd88, %r368, 4; add.s64 %rd89, %rd3, %rd88; ld.global.nc.f32 %f119, [%rd89]; add.s64 %rd90, %rd2, %rd88; ld.global.nc.f32 %f120, [%rd90]; add.s64 %rd91, %rd1, %rd88; ld.global.nc.f32 %f121, [%rd91]; mul.wide.s32 %rd92, %r94, 4; add.s64 %rd93, %rd3, %rd92; ld.global.nc.f32 %f122, [%rd93]; add.s64 %rd94, %rd2, %rd92; ld.global.nc.f32 %f123, [%rd94]; add.s64 %rd95, %rd1, %rd92; ld.global.nc.f32 %f124, [%rd95]; mul.wide.s32 %rd96, %r366, 4; add.s64 %rd97, %rd3, %rd96; ld.global.nc.f32 %f125, [%rd97]; add.s64 %rd98, %rd2, %rd96; ld.global.nc.f32 %f126, [%rd98]; add.s64 %rd99, %rd1, %rd96; ld.global.nc.f32 %f127, [%rd99]; and.pred %p166, %p4, %p1; mov.f32 %f878, 0f00000000; not.pred %p167, %p166; @%p167 bra $L__BB0_160; @%p12 bra $L__BB0_150; bra.uni $L__BB0_149; $L__BB0_150: add.s32 %r371, %r125, -1; min.s32 %r510, %r6, %r371; bra.uni $L__BB0_151; $L__BB0_149: rem.s32 %r369, %r6, %r125; add.s32 %r370, %r369, %r125; rem.s32 %r510, %r370, %r125; $L__BB0_151: @%p88 bra $L__BB0_153; bra.uni $L__BB0_152; $L__BB0_153: add.s32 %r374, %r124, -1; min.s32 %r511, %r49, %r374; bra.uni $L__BB0_154; $L__BB0_152: rem.s32 %r372, %r49, %r124; add.s32 %r373, %r372, %r124; rem.s32 %r511, %r373, %r124; $L__BB0_154: add.s32 %r375, %r510, %r4; mad.lo.s32 %r376, %r375, %r124, %r511; mul.wide.s32 %rd100, %r376, 4; add.s64 %rd101, %rd3, %rd100; add.s64 %rd102, %rd2, %rd100; add.s64 
%rd103, %rd1, %rd100; ld.global.nc.f32 %f624, [%rd101]; ld.global.nc.f32 %f625, [%rd102]; mul.f32 %f626, %f625, %f625; fma.rn.f32 %f627, %f624, %f624, %f626; ld.global.nc.f32 %f628, [%rd103]; fma.rn.f32 %f128, %f628, %f628, %f627; mul.f32 %f629, %f118, %f120; mul.f32 %f630, %f117, %f121; sub.f32 %f631, %f630, %f629; mul.f32 %f632, %f116, %f121; mul.f32 %f633, %f118, %f119; sub.f32 %f634, %f633, %f632; mul.f32 %f635, %f117, %f119; mul.f32 %f636, %f116, %f120; sub.f32 %f637, %f636, %f635; mul.f32 %f638, %f2, %f634; fma.rn.f32 %f639, %f1, %f631, %f638; fma.rn.f32 %f129, %f3, %f637, %f639; mul.f32 %f640, %f2, %f117; fma.rn.f32 %f641, %f1, %f116, %f640; fma.rn.f32 %f642, %f3, %f118, %f641; add.f32 %f643, %f642, 0f3F800000; mul.f32 %f644, %f2, %f120; fma.rn.f32 %f645, %f1, %f119, %f644; fma.rn.f32 %f646, %f3, %f121, %f645; add.f32 %f647, %f643, %f646; mul.f32 %f648, %f117, %f120; fma.rn.f32 %f649, %f116, %f119, %f648; fma.rn.f32 %f650, %f118, %f121, %f649; add.f32 %f130, %f650, %f647; abs.f32 %f131, %f130; abs.f32 %f132, %f129; setp.eq.f32 %p170, %f131, 0f00000000; setp.eq.f32 %p171, %f132, 0f00000000; and.pred %p172, %p170, %p171; @%p172 bra $L__BB0_158; bra.uni $L__BB0_155; $L__BB0_158: mov.b32 %r387, %f130; shr.s32 %r388, %r387, 31; and.b32 %r389, %r388, 1078530011; mov.b32 %r390, %f129; and.b32 %r391, %r390, -2147483648; or.b32 %r392, %r391, %r389; mov.b32 %f873, %r392; bra.uni $L__BB0_159; $L__BB0_155: setp.eq.f32 %p173, %f131, 0f7F800000; setp.eq.f32 %p174, %f132, 0f7F800000; and.pred %p175, %p173, %p174; @%p175 bra $L__BB0_157; bra.uni $L__BB0_156; $L__BB0_157: mov.b32 %r382, %f130; setp.lt.s32 %p179, %r382, 0; selp.b32 %r383, 1075235812, 1061752795, %p179; mov.b32 %r384, %f129; and.b32 %r385, %r384, -2147483648; or.b32 %r386, %r385, %r383; mov.b32 %f873, %r386; bra.uni $L__BB0_159; $L__BB0_156: max.f32 %f651, %f132, %f131; min.f32 %f652, %f132, %f131; div.rn.f32 %f653, %f652, %f651; mul.rn.f32 %f654, %f653, %f653; mov.f32 %f655, 0fC0B59883; mov.f32 %f656, 
0fBF52C7EA; fma.rn.f32 %f657, %f654, %f656, %f655; mov.f32 %f658, 0fC0D21907; fma.rn.f32 %f659, %f657, %f654, %f658; mul.f32 %f660, %f654, %f659; mul.f32 %f661, %f653, %f660; add.f32 %f662, %f654, 0f41355DC0; mov.f32 %f663, 0f41E6BD60; fma.rn.f32 %f664, %f662, %f654, %f663; mov.f32 %f665, 0f419D92C8; fma.rn.f32 %f666, %f664, %f654, %f665; rcp.rn.f32 %f667, %f666; fma.rn.f32 %f668, %f661, %f667, %f653; mov.f32 %f669, 0f3FC90FDB; sub.f32 %f670, %f669, %f668; setp.gt.f32 %p176, %f132, %f131; selp.f32 %f671, %f670, %f668, %p176; mov.b32 %r377, %f130; setp.lt.s32 %p177, %r377, 0; mov.f32 %f672, 0f40490FDB; sub.f32 %f673, %f672, %f671; selp.f32 %f674, %f673, %f671, %p177; mov.b32 %r378, %f674; mov.b32 %r379, %f129; and.b32 %r380, %r379, -2147483648; or.b32 %r381, %r380, %r378; mov.b32 %f675, %r381; add.f32 %f676, %f131, %f132; setp.le.f32 %p178, %f676, 0f7F800000; selp.f32 %f873, %f675, %f676, %p178; $L__BB0_159: add.f32 %f677, %f873, %f873; setp.eq.f32 %p180, %f128, 0f00000000; selp.f32 %f678, 0f3F800000, 0f3F000000, %p180; fma.rn.f32 %f878, %f678, %f677, 0f00000000; $L__BB0_160: not.pred %p181, %p3; @%p181 bra $L__BB0_187; @%p18 bra $L__BB0_174; @%p12 bra $L__BB0_164; bra.uni $L__BB0_163; $L__BB0_164: add.s32 %r395, %r125, -1; min.s32 %r512, %r6, %r395; bra.uni $L__BB0_165; $L__BB0_163: rem.s32 %r393, %r6, %r125; add.s32 %r394, %r393, %r125; rem.s32 %r512, %r394, %r125; $L__BB0_165: @%p88 bra $L__BB0_167; bra.uni $L__BB0_166; $L__BB0_167: max.s32 %r513, %r56, 0; bra.uni $L__BB0_168; $L__BB0_166: rem.s32 %r396, %r56, %r124; add.s32 %r397, %r396, %r124; rem.s32 %r513, %r397, %r124; $L__BB0_168: add.s32 %r398, %r512, %r4; mad.lo.s32 %r399, %r398, %r124, %r513; mul.wide.s32 %rd104, %r399, 4; add.s64 %rd105, %rd3, %rd104; add.s64 %rd106, %rd2, %rd104; add.s64 %rd107, %rd1, %rd104; ld.global.nc.f32 %f679, [%rd105]; ld.global.nc.f32 %f680, [%rd106]; mul.f32 %f681, %f680, %f680; fma.rn.f32 %f682, %f679, %f679, %f681; ld.global.nc.f32 %f683, [%rd107]; fma.rn.f32 %f139, %f683, 
%f683, %f682; mul.f32 %f684, %f121, %f123; mul.f32 %f685, %f120, %f124; sub.f32 %f686, %f685, %f684; mul.f32 %f687, %f119, %f124; mul.f32 %f688, %f121, %f122; sub.f32 %f689, %f688, %f687; mul.f32 %f690, %f120, %f122; mul.f32 %f691, %f119, %f123; sub.f32 %f692, %f691, %f690; mul.f32 %f693, %f2, %f689; fma.rn.f32 %f694, %f1, %f686, %f693; fma.rn.f32 %f140, %f3, %f692, %f694; mul.f32 %f695, %f2, %f120; fma.rn.f32 %f696, %f1, %f119, %f695; fma.rn.f32 %f697, %f3, %f121, %f696; add.f32 %f698, %f697, 0f3F800000; mul.f32 %f699, %f2, %f123; fma.rn.f32 %f700, %f1, %f122, %f699; fma.rn.f32 %f701, %f3, %f124, %f700; add.f32 %f702, %f698, %f701; mul.f32 %f703, %f120, %f123; fma.rn.f32 %f704, %f119, %f122, %f703; fma.rn.f32 %f705, %f121, %f124, %f704; add.f32 %f141, %f705, %f702; abs.f32 %f142, %f141; abs.f32 %f143, %f140; setp.eq.f32 %p185, %f142, 0f00000000; setp.eq.f32 %p186, %f143, 0f00000000; and.pred %p187, %p185, %p186; @%p187 bra $L__BB0_172; bra.uni $L__BB0_169; $L__BB0_172: mov.b32 %r410, %f141; shr.s32 %r411, %r410, 31; and.b32 %r412, %r411, 1078530011; mov.b32 %r413, %f140; and.b32 %r414, %r413, -2147483648; or.b32 %r415, %r414, %r412; mov.b32 %f875, %r415; bra.uni $L__BB0_173; $L__BB0_169: setp.eq.f32 %p188, %f142, 0f7F800000; setp.eq.f32 %p189, %f143, 0f7F800000; and.pred %p190, %p188, %p189; @%p190 bra $L__BB0_171; bra.uni $L__BB0_170; $L__BB0_171: mov.b32 %r405, %f141; setp.lt.s32 %p194, %r405, 0; selp.b32 %r406, 1075235812, 1061752795, %p194; mov.b32 %r407, %f140; and.b32 %r408, %r407, -2147483648; or.b32 %r409, %r408, %r406; mov.b32 %f875, %r409; bra.uni $L__BB0_173; $L__BB0_170: max.f32 %f706, %f143, %f142; min.f32 %f707, %f143, %f142; div.rn.f32 %f708, %f707, %f706; mul.rn.f32 %f709, %f708, %f708; mov.f32 %f710, 0fC0B59883; mov.f32 %f711, 0fBF52C7EA; fma.rn.f32 %f712, %f709, %f711, %f710; mov.f32 %f713, 0fC0D21907; fma.rn.f32 %f714, %f712, %f709, %f713; mul.f32 %f715, %f709, %f714; mul.f32 %f716, %f708, %f715; add.f32 %f717, %f709, 0f41355DC0; mov.f32 %f718, 
0f41E6BD60; fma.rn.f32 %f719, %f717, %f709, %f718; mov.f32 %f720, 0f419D92C8; fma.rn.f32 %f721, %f719, %f709, %f720; rcp.rn.f32 %f722, %f721; fma.rn.f32 %f723, %f716, %f722, %f708; mov.f32 %f724, 0f3FC90FDB; sub.f32 %f725, %f724, %f723; setp.gt.f32 %p191, %f143, %f142; selp.f32 %f726, %f725, %f723, %p191; mov.b32 %r400, %f141; setp.lt.s32 %p192, %r400, 0; mov.f32 %f727, 0f40490FDB; sub.f32 %f728, %f727, %f726; selp.f32 %f729, %f728, %f726, %p192; mov.b32 %r401, %f729; mov.b32 %r402, %f140; and.b32 %r403, %r402, -2147483648; or.b32 %r404, %r403, %r401; mov.b32 %f730, %r404; add.f32 %f731, %f142, %f143; setp.le.f32 %p193, %f731, 0f7F800000; selp.f32 %f875, %f730, %f731, %p193; $L__BB0_173: add.f32 %f732, %f875, %f875; setp.eq.f32 %p195, %f139, 0f00000000; selp.f32 %f733, 0f3F800000, 0f3F000000, %p195; fma.rn.f32 %f878, %f733, %f732, %f878; $L__BB0_174: not.pred %p224, %p2; @%p224 bra $L__BB0_187; @%p12 bra $L__BB0_177; bra.uni $L__BB0_176; $L__BB0_177: max.s32 %r514, %r14, 0; bra.uni $L__BB0_178; $L__BB0_176: rem.s32 %r416, %r14, %r125; add.s32 %r417, %r416, %r125; rem.s32 %r514, %r417, %r125; $L__BB0_178: @%p88 bra $L__BB0_180; bra.uni $L__BB0_179; $L__BB0_180: max.s32 %r515, %r56, 0; bra.uni $L__BB0_181; $L__BB0_179: rem.s32 %r418, %r56, %r124; add.s32 %r419, %r418, %r124; rem.s32 %r515, %r419, %r124; $L__BB0_181: add.s32 %r420, %r514, %r4; mad.lo.s32 %r421, %r420, %r124, %r515; mul.wide.s32 %rd108, %r421, 4; add.s64 %rd109, %rd3, %rd108; add.s64 %rd110, %rd2, %rd108; add.s64 %rd111, %rd1, %rd108; ld.global.nc.f32 %f734, [%rd109]; ld.global.nc.f32 %f735, [%rd110]; mul.f32 %f736, %f735, %f735; fma.rn.f32 %f737, %f734, %f734, %f736; ld.global.nc.f32 %f738, [%rd111]; fma.rn.f32 %f150, %f738, %f738, %f737; mul.f32 %f739, %f124, %f126; mul.f32 %f740, %f123, %f127; sub.f32 %f741, %f740, %f739; mul.f32 %f742, %f122, %f127; mul.f32 %f743, %f124, %f125; sub.f32 %f744, %f743, %f742; mul.f32 %f745, %f123, %f125; mul.f32 %f746, %f122, %f126; sub.f32 %f747, %f746, %f745; 
mul.f32 %f748, %f2, %f744; fma.rn.f32 %f749, %f1, %f741, %f748; fma.rn.f32 %f151, %f3, %f747, %f749; mul.f32 %f750, %f2, %f123; fma.rn.f32 %f751, %f1, %f122, %f750; fma.rn.f32 %f752, %f3, %f124, %f751; add.f32 %f753, %f752, 0f3F800000; mul.f32 %f754, %f2, %f126; fma.rn.f32 %f755, %f1, %f125, %f754; fma.rn.f32 %f756, %f3, %f127, %f755; add.f32 %f757, %f753, %f756; mul.f32 %f758, %f123, %f126; fma.rn.f32 %f759, %f122, %f125, %f758; fma.rn.f32 %f760, %f124, %f127, %f759; add.f32 %f152, %f760, %f757; abs.f32 %f153, %f152; abs.f32 %f154, %f151; setp.eq.f32 %p199, %f153, 0f00000000; setp.eq.f32 %p200, %f154, 0f00000000; and.pred %p201, %p199, %p200; @%p201 bra $L__BB0_185; bra.uni $L__BB0_182; $L__BB0_185: mov.b32 %r432, %f152; shr.s32 %r433, %r432, 31; and.b32 %r434, %r433, 1078530011; mov.b32 %r435, %f151; and.b32 %r436, %r435, -2147483648; or.b32 %r437, %r436, %r434; mov.b32 %f877, %r437; bra.uni $L__BB0_186; $L__BB0_182: setp.eq.f32 %p202, %f153, 0f7F800000; setp.eq.f32 %p203, %f154, 0f7F800000; and.pred %p204, %p202, %p203; @%p204 bra $L__BB0_184; bra.uni $L__BB0_183; $L__BB0_184: mov.b32 %r427, %f152; setp.lt.s32 %p208, %r427, 0; selp.b32 %r428, 1075235812, 1061752795, %p208; mov.b32 %r429, %f151; and.b32 %r430, %r429, -2147483648; or.b32 %r431, %r430, %r428; mov.b32 %f877, %r431; bra.uni $L__BB0_186; $L__BB0_183: max.f32 %f761, %f154, %f153; min.f32 %f762, %f154, %f153; div.rn.f32 %f763, %f762, %f761; mul.rn.f32 %f764, %f763, %f763; mov.f32 %f765, 0fC0B59883; mov.f32 %f766, 0fBF52C7EA; fma.rn.f32 %f767, %f764, %f766, %f765; mov.f32 %f768, 0fC0D21907; fma.rn.f32 %f769, %f767, %f764, %f768; mul.f32 %f770, %f764, %f769; mul.f32 %f771, %f763, %f770; add.f32 %f772, %f764, 0f41355DC0; mov.f32 %f773, 0f41E6BD60; fma.rn.f32 %f774, %f772, %f764, %f773; mov.f32 %f775, 0f419D92C8; fma.rn.f32 %f776, %f774, %f764, %f775; rcp.rn.f32 %f777, %f776; fma.rn.f32 %f778, %f771, %f777, %f763; mov.f32 %f779, 0f3FC90FDB; sub.f32 %f780, %f779, %f778; setp.gt.f32 %p205, %f154, %f153; 
selp.f32 %f781, %f780, %f778, %p205; mov.b32 %r422, %f152; setp.lt.s32 %p206, %r422, 0; mov.f32 %f782, 0f40490FDB; sub.f32 %f783, %f782, %f781; selp.f32 %f784, %f783, %f781, %p206; mov.b32 %r423, %f784; mov.b32 %r424, %f151; and.b32 %r425, %r424, -2147483648; or.b32 %r426, %r425, %r423; mov.b32 %f785, %r426; add.f32 %f786, %f153, %f154; setp.le.f32 %p207, %f786, 0f7F800000; selp.f32 %f877, %f785, %f786, %p207; $L__BB0_186: add.f32 %f787, %f877, %f877; setp.eq.f32 %p209, %f150, 0f00000000; selp.f32 %f788, 0f3F800000, 0f3F000000, %p209; fma.rn.f32 %f878, %f788, %f787, %f878; $L__BB0_187: @%p148 bra $L__BB0_200; @%p12 bra $L__BB0_190; bra.uni $L__BB0_189; $L__BB0_190: max.s32 %r516, %r14, 0; bra.uni $L__BB0_191; $L__BB0_189: rem.s32 %r438, %r14, %r125; add.s32 %r439, %r438, %r125; rem.s32 %r516, %r439, %r125; $L__BB0_191: add.s32 %r120, %r516, %r4; @%p88 bra $L__BB0_193; bra.uni $L__BB0_192; $L__BB0_193: add.s32 %r442, %r124, -1; min.s32 %r517, %r49, %r442; bra.uni $L__BB0_194; $L__BB0_192: rem.s32 %r440, %r49, %r124; add.s32 %r441, %r440, %r124; rem.s32 %r517, %r441, %r124; $L__BB0_194: mad.lo.s32 %r443, %r120, %r124, %r517; mul.wide.s32 %rd112, %r443, 4; add.s64 %rd113, %rd3, %rd112; add.s64 %rd114, %rd2, %rd112; add.s64 %rd115, %rd1, %rd112; ld.global.nc.f32 %f789, [%rd113]; ld.global.nc.f32 %f790, [%rd114]; mul.f32 %f791, %f790, %f790; fma.rn.f32 %f792, %f789, %f789, %f791; ld.global.nc.f32 %f793, [%rd115]; fma.rn.f32 %f161, %f793, %f793, %f792; mul.f32 %f794, %f117, %f127; mul.f32 %f795, %f118, %f126; sub.f32 %f796, %f795, %f794; mul.f32 %f797, %f118, %f125; mul.f32 %f798, %f116, %f127; sub.f32 %f799, %f798, %f797; mul.f32 %f800, %f116, %f126; mul.f32 %f801, %f117, %f125; sub.f32 %f802, %f801, %f800; mul.f32 %f803, %f2, %f799; fma.rn.f32 %f804, %f1, %f796, %f803; fma.rn.f32 %f162, %f3, %f802, %f804; mul.f32 %f805, %f2, %f126; fma.rn.f32 %f806, %f1, %f125, %f805; fma.rn.f32 %f807, %f3, %f127, %f806; add.f32 %f808, %f807, 0f3F800000; mul.f32 %f809, %f2, %f117; 
fma.rn.f32 %f810, %f1, %f116, %f809; fma.rn.f32 %f811, %f3, %f118, %f810; add.f32 %f812, %f811, %f808; mul.f32 %f813, %f117, %f126; fma.rn.f32 %f814, %f116, %f125, %f813; fma.rn.f32 %f815, %f118, %f127, %f814; add.f32 %f163, %f815, %f812; abs.f32 %f164, %f163; abs.f32 %f165, %f162; setp.eq.f32 %p213, %f164, 0f00000000; setp.eq.f32 %p214, %f165, 0f00000000; and.pred %p215, %p213, %p214; @%p215 bra $L__BB0_198; bra.uni $L__BB0_195; $L__BB0_198: mov.b32 %r454, %f163; shr.s32 %r455, %r454, 31; and.b32 %r456, %r455, 1078530011; mov.b32 %r457, %f162; and.b32 %r458, %r457, -2147483648; or.b32 %r459, %r456, %r458; mov.b32 %f879, %r459; bra.uni $L__BB0_199; $L__BB0_195: setp.eq.f32 %p216, %f164, 0f7F800000; setp.eq.f32 %p217, %f165, 0f7F800000; and.pred %p218, %p216, %p217; @%p218 bra $L__BB0_197; bra.uni $L__BB0_196; $L__BB0_197: mov.b32 %r449, %f163; setp.lt.s32 %p222, %r449, 0; selp.b32 %r450, 1075235812, 1061752795, %p222; mov.b32 %r451, %f162; and.b32 %r452, %r451, -2147483648; or.b32 %r453, %r450, %r452; mov.b32 %f879, %r453; bra.uni $L__BB0_199; $L__BB0_196: max.f32 %f816, %f165, %f164; min.f32 %f817, %f165, %f164; div.rn.f32 %f818, %f817, %f816; mul.rn.f32 %f819, %f818, %f818; mov.f32 %f820, 0fC0B59883; mov.f32 %f821, 0fBF52C7EA; fma.rn.f32 %f822, %f819, %f821, %f820; mov.f32 %f823, 0fC0D21907; fma.rn.f32 %f824, %f822, %f819, %f823; mul.f32 %f825, %f819, %f824; mul.f32 %f826, %f818, %f825; add.f32 %f827, %f819, 0f41355DC0; mov.f32 %f828, 0f41E6BD60; fma.rn.f32 %f829, %f827, %f819, %f828; mov.f32 %f830, 0f419D92C8; fma.rn.f32 %f831, %f829, %f819, %f830; rcp.rn.f32 %f832, %f831; fma.rn.f32 %f833, %f826, %f832, %f818; mov.f32 %f834, 0f3FC90FDB; sub.f32 %f835, %f834, %f833; setp.gt.f32 %p219, %f165, %f164; selp.f32 %f836, %f835, %f833, %p219; mov.b32 %r444, %f163; setp.lt.s32 %p220, %r444, 0; mov.f32 %f837, 0f40490FDB; sub.f32 %f838, %f837, %f836; selp.f32 %f839, %f838, %f836, %p220; mov.b32 %r445, %f839; mov.b32 %r446, %f162; and.b32 %r447, %r446, -2147483648; or.b32 
%r448, %r447, %r445; mov.b32 %f840, %r448; add.f32 %f841, %f164, %f165; setp.le.f32 %p221, %f841, 0f7F800000; selp.f32 %f879, %f840, %f841, %p221; $L__BB0_199: add.f32 %f842, %f879, %f879; setp.eq.f32 %p223, %f161, 0f00000000; selp.f32 %f843, 0f3F800000, 0f3F000000, %p223; fma.rn.f32 %f878, %f843, %f842, %f878; $L__BB0_200: ld.param.u64 %rd123, [setemergentmagneticfieldsolidangle_param_1]; mov.u32 %r481, %tid.x; mov.u32 %r480, %ntid.x; mov.u32 %r479, %ctaid.x; mad.lo.s32 %r478, %r479, %r480, %r481; add.s32 %r477, %r5, %r478; mul.wide.s32 %rd122, %r477, 4; cvta.to.global.u64 %rd121, %rd123; add.s64 %rd120, %rd121, %rd122; ld.param.f32 %f856, [setemergentmagneticfieldsolidangle_param_9]; ld.param.f32 %f855, [setemergentmagneticfieldsolidangle_param_8]; ld.param.u64 %rd119, [setemergentmagneticfieldsolidangle_param_0]; mov.u32 %r476, %tid.z; mov.u32 %r475, %ntid.z; mov.u32 %r474, %ctaid.z; mad.lo.s32 %r473, %r474, %r475, %r476; mov.u32 %r472, %tid.y; mov.u32 %r471, %ntid.y; mov.u32 %r470, %ctaid.y; mad.lo.s32 %r469, %r470, %r471, %r472; mul.lo.s32 %r468, %r473, %r125; add.s32 %r467, %r468, %r469; mov.u32 %r466, %tid.x; mov.u32 %r465, %ntid.x; mov.u32 %r464, %ctaid.x; mad.lo.s32 %r463, %r464, %r465, %r466; mul.lo.s32 %r462, %r467, %r124; add.s32 %r461, %r462, %r463; mul.wide.s32 %rd118, %r461, 4; cvta.to.global.u64 %rd117, %rd119; add.s64 %rd116, %rd117, %rd118; ld.param.f32 %f854, [setemergentmagneticfieldsolidangle_param_7]; ld.param.f32 %f853, [setemergentmagneticfieldsolidangle_param_6]; add.f32 %f844, %f853, %f853; mul.f32 %f845, %f844, %f854; mul.f32 %f846, %f845, %f862; st.global.f32 [%rd116], %f846; mul.f32 %f847, %f844, %f855; mul.f32 %f848, %f847, %f868; st.global.f32 [%rd120], %f848; mul.f32 %f849, %f844, %f856; mul.f32 %f850, %f849, %f878; st.global.f32 [%rd6], %f850; $L__BB0_202: ret; } ` setemergentmagneticfieldsolidangle_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl setemergentmagneticfieldsolidangle .visible .entry 
setemergentmagneticfieldsolidangle( .param .u64 setemergentmagneticfieldsolidangle_param_0, .param .u64 setemergentmagneticfieldsolidangle_param_1, .param .u64 setemergentmagneticfieldsolidangle_param_2, .param .u64 setemergentmagneticfieldsolidangle_param_3, .param .u64 setemergentmagneticfieldsolidangle_param_4, .param .u64 setemergentmagneticfieldsolidangle_param_5, .param .f32 setemergentmagneticfieldsolidangle_param_6, .param .f32 setemergentmagneticfieldsolidangle_param_7, .param .f32 setemergentmagneticfieldsolidangle_param_8, .param .f32 setemergentmagneticfieldsolidangle_param_9, .param .u32 setemergentmagneticfieldsolidangle_param_10, .param .u32 setemergentmagneticfieldsolidangle_param_11, .param .u32 setemergentmagneticfieldsolidangle_param_12, .param .u8 setemergentmagneticfieldsolidangle_param_13 ) { .reg .pred %p<225>; .reg .b16 %rs<8>; .reg .f32 %f<881>; .reg .b32 %r<518>; .reg .b64 %rd<124>; ld.param.u8 %rs4, [setemergentmagneticfieldsolidangle_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldsolidangle_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldsolidangle_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldsolidangle_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldsolidangle_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldsolidangle_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldsolidangle_param_5]; ld.param.u32 %r124, [setemergentmagneticfieldsolidangle_param_10]; ld.param.u32 %r125, [setemergentmagneticfieldsolidangle_param_11]; ld.param.u32 %r126, [setemergentmagneticfieldsolidangle_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r127, %ntid.x; mov.u32 %r128, %ctaid.x; mov.u32 %r129, %tid.x; mad.lo.s32 %r1, %r128, %r127, %r129; mov.u32 %r130, %ntid.y; mov.u32 %r131, %ctaid.y; mov.u32 %r132, %tid.y; mad.lo.s32 %r2, %r131, %r130, %r132; mov.u32 %r133, %ntid.z; mov.u32 %r134, %ctaid.z; mov.u32 %r135, %tid.z; mad.lo.s32 %r3, %r134, %r133, 
%r135; setp.ge.s32 %p6, %r1, %r124; setp.ge.s32 %p7, %r2, %r125; or.pred %p8, %p6, %p7; setp.ge.s32 %p9, %r3, %r126; or.pred %p10, %p8, %p9; @%p10 bra $L__BB0_202; mul.lo.s32 %r4, %r3, %r125; add.s32 %r136, %r4, %r2; mul.lo.s32 %r5, %r136, %r124; add.s32 %r137, %r5, %r1; mul.wide.s32 %rd13, %r137, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f176, %f2, %f2; fma.rn.f32 %f177, %f1, %f1, %f176; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f178, %f3, %f3, %f177; setp.eq.f32 %p11, %f178, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p11 bra $L__BB0_201; bra.uni $L__BB0_2; $L__BB0_201: mov.u32 %r460, 0; st.global.u32 [%rd4], %r460; st.global.u32 [%rd5], %r460; st.global.u32 [%rd6], %r460; bra.uni $L__BB0_202; $L__BB0_2: and.b16 %rs1, %rs4, 2; setp.eq.s16 %p12, %rs1, 0; add.s32 %r6, %r2, 1; @%p12 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r140, %r125, -1; min.s32 %r482, %r6, %r140; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r138, %r6, %r125; add.s32 %r139, %r138, %r125; rem.s32 %r482, %r139, %r125; $L__BB0_5: and.b16 %rs2, %rs4, 4; setp.eq.s16 %p13, %rs2, 0; add.s32 %r10, %r3, 1; @%p13 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r143, %r126, -1; min.s32 %r483, %r10, %r143; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r141, %r10, %r126; add.s32 %r142, %r141, %r126; rem.s32 %r483, %r142, %r126; $L__BB0_8: add.s32 %r14, %r2, -1; @%p12 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r484, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r144, %r14, %r125; add.s32 %r145, %r144, %r125; rem.s32 %r484, %r145, %r125; $L__BB0_11: add.s32 %r18, %r3, -1; @%p13 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r485, %r18, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r146, %r18, %r126; add.s32 %r147, %r146, 
%r126; rem.s32 %r485, %r147, %r126; $L__BB0_14: mad.lo.s32 %r148, %r485, %r125, %r2; mad.lo.s32 %r149, %r148, %r124, %r1; add.s32 %r150, %r482, %r4; mad.lo.s32 %r151, %r150, %r124, %r1; mul.wide.s32 %rd20, %r151, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f4, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f5, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f6, [%rd23]; mad.lo.s32 %r152, %r483, %r125, %r2; mad.lo.s32 %r153, %r152, %r124, %r1; mul.wide.s32 %rd24, %r153, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f7, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f8, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f9, [%rd27]; add.s32 %r154, %r484, %r4; mad.lo.s32 %r155, %r154, %r124, %r1; mul.wide.s32 %rd28, %r155, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f10, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f12, [%rd31]; mul.wide.s32 %rd32, %r149, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f13, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f14, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f15, [%rd35]; setp.ne.s16 %p16, %rs1, 0; setp.lt.s32 %p17, %r6, %r125; or.pred %p1, %p17, %p16; not.pred %p18, %p1; mov.f32 %f862, 0f00000000; @%p18 bra $L__BB0_28; setp.ge.s32 %p19, %r10, %r126; and.pred %p21, %p19, %p13; @%p21 bra $L__BB0_28; @%p13 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r158, %r126, -1; min.s32 %r486, %r10, %r158; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r156, %r10, %r126; add.s32 %r157, %r156, %r126; rem.s32 %r486, %r157, %r126; $L__BB0_19: @%p12 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r161, %r125, -1; min.s32 %r487, %r6, %r161; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r159, %r6, %r125; add.s32 %r160, %r159, %r125; rem.s32 %r487, %r160, %r125; $L__BB0_22: mad.lo.s32 %r162, %r486, %r125, %r487; mad.lo.s32 %r163, %r162, %r124, %r1; mul.wide.s32 %rd36, %r163, 4; add.s64 %rd37, %rd3, %rd36; 
add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f181, [%rd37]; ld.global.nc.f32 %f182, [%rd38]; mul.f32 %f183, %f182, %f182; fma.rn.f32 %f184, %f181, %f181, %f183; ld.global.nc.f32 %f185, [%rd39]; fma.rn.f32 %f16, %f185, %f185, %f184; mul.f32 %f186, %f6, %f8; mul.f32 %f187, %f5, %f9; sub.f32 %f188, %f187, %f186; mul.f32 %f189, %f4, %f9; mul.f32 %f190, %f6, %f7; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f5, %f7; mul.f32 %f193, %f4, %f8; sub.f32 %f194, %f193, %f192; mul.f32 %f195, %f2, %f191; fma.rn.f32 %f196, %f1, %f188, %f195; fma.rn.f32 %f17, %f3, %f194, %f196; mul.f32 %f197, %f2, %f5; fma.rn.f32 %f198, %f1, %f4, %f197; fma.rn.f32 %f199, %f3, %f6, %f198; add.f32 %f200, %f199, 0f3F800000; mul.f32 %f201, %f2, %f8; fma.rn.f32 %f202, %f1, %f7, %f201; fma.rn.f32 %f203, %f3, %f9, %f202; add.f32 %f204, %f200, %f203; mul.f32 %f205, %f5, %f8; fma.rn.f32 %f206, %f4, %f7, %f205; fma.rn.f32 %f207, %f6, %f9, %f206; add.f32 %f18, %f207, %f204; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p24, %f19, 0f00000000; setp.eq.f32 %p25, %f20, 0f00000000; and.pred %p26, %p24, %p25; @%p26 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r174, %f18; shr.s32 %r175, %r174, 31; and.b32 %r176, %r175, 1078530011; mov.b32 %r177, %f17; and.b32 %r178, %r177, -2147483648; or.b32 %r179, %r178, %r176; mov.b32 %f857, %r179; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p27, %f19, 0f7F800000; setp.eq.f32 %p28, %f20, 0f7F800000; and.pred %p29, %p27, %p28; @%p29 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r169, %f18; setp.lt.s32 %p33, %r169, 0; selp.b32 %r170, 1075235812, 1061752795, %p33; mov.b32 %r171, %f17; and.b32 %r172, %r171, -2147483648; or.b32 %r173, %r172, %r170; mov.b32 %f857, %r173; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f208, %f20, %f19; min.f32 %f209, %f20, %f19; div.rn.f32 %f210, %f209, %f208; mul.rn.f32 %f211, %f210, %f210; mov.f32 %f212, 0fC0B59883; mov.f32 %f213, 0fBF52C7EA; fma.rn.f32 %f214, %f211, %f213, %f212; mov.f32 %f215, 
0fC0D21907; fma.rn.f32 %f216, %f214, %f211, %f215; mul.f32 %f217, %f211, %f216; mul.f32 %f218, %f210, %f217; add.f32 %f219, %f211, 0f41355DC0; mov.f32 %f220, 0f41E6BD60; fma.rn.f32 %f221, %f219, %f211, %f220; mov.f32 %f222, 0f419D92C8; fma.rn.f32 %f223, %f221, %f211, %f222; rcp.rn.f32 %f224, %f223; fma.rn.f32 %f225, %f218, %f224, %f210; mov.f32 %f226, 0f3FC90FDB; sub.f32 %f227, %f226, %f225; setp.gt.f32 %p30, %f20, %f19; selp.f32 %f228, %f227, %f225, %p30; mov.b32 %r164, %f18; setp.lt.s32 %p31, %r164, 0; mov.f32 %f229, 0f40490FDB; sub.f32 %f230, %f229, %f228; selp.f32 %f231, %f230, %f228, %p31; mov.b32 %r165, %f231; mov.b32 %r166, %f17; and.b32 %r167, %r166, -2147483648; or.b32 %r168, %r167, %r165; mov.b32 %f232, %r168; add.f32 %f233, %f19, %f20; setp.le.f32 %p32, %f233, 0f7F800000; selp.f32 %f857, %f232, %f233, %p32; $L__BB0_27: add.f32 %f234, %f857, %f857; setp.eq.f32 %p34, %f16, 0f00000000; selp.f32 %f235, 0f3F800000, 0f3F000000, %p34; fma.rn.f32 %f862, %f235, %f234, 0f00000000; $L__BB0_28: setp.gt.s32 %p35, %r2, 0; or.pred %p2, %p35, %p16; not.pred %p37, %p2; @%p37 bra $L__BB0_55; setp.ge.s32 %p38, %r10, %r126; and.pred %p40, %p38, %p13; @%p40 bra $L__BB0_42; @%p13 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r182, %r126, -1; min.s32 %r488, %r10, %r182; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r180, %r10, %r126; add.s32 %r181, %r180, %r126; rem.s32 %r488, %r181, %r126; $L__BB0_33: @%p12 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r489, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r183, %r14, %r125; add.s32 %r184, %r183, %r125; rem.s32 %r489, %r184, %r125; $L__BB0_36: mad.lo.s32 %r185, %r488, %r125, %r489; mad.lo.s32 %r186, %r185, %r124, %r1; mul.wide.s32 %rd40, %r186, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f236, [%rd41]; ld.global.nc.f32 %f237, [%rd42]; mul.f32 %f238, %f237, %f237; fma.rn.f32 %f239, %f236, %f236, %f238; ld.global.nc.f32 %f240, [%rd43]; fma.rn.f32 
%f27, %f240, %f240, %f239; mul.f32 %f241, %f9, %f11; mul.f32 %f242, %f8, %f12; sub.f32 %f243, %f242, %f241; mul.f32 %f244, %f7, %f12; mul.f32 %f245, %f9, %f10; sub.f32 %f246, %f245, %f244; mul.f32 %f247, %f8, %f10; mul.f32 %f248, %f7, %f11; sub.f32 %f249, %f248, %f247; mul.f32 %f250, %f2, %f246; fma.rn.f32 %f251, %f1, %f243, %f250; fma.rn.f32 %f28, %f3, %f249, %f251; mul.f32 %f252, %f2, %f8; fma.rn.f32 %f253, %f1, %f7, %f252; fma.rn.f32 %f254, %f3, %f9, %f253; add.f32 %f255, %f254, 0f3F800000; mul.f32 %f256, %f2, %f11; fma.rn.f32 %f257, %f1, %f10, %f256; fma.rn.f32 %f258, %f3, %f12, %f257; add.f32 %f259, %f255, %f258; mul.f32 %f260, %f8, %f11; fma.rn.f32 %f261, %f7, %f10, %f260; fma.rn.f32 %f262, %f9, %f12, %f261; add.f32 %f29, %f262, %f259; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p43, %f30, 0f00000000; setp.eq.f32 %p44, %f31, 0f00000000; and.pred %p45, %p43, %p44; @%p45 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r197, %f29; shr.s32 %r198, %r197, 31; and.b32 %r199, %r198, 1078530011; mov.b32 %r200, %f28; and.b32 %r201, %r200, -2147483648; or.b32 %r202, %r201, %r199; mov.b32 %f859, %r202; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p46, %f30, 0f7F800000; setp.eq.f32 %p47, %f31, 0f7F800000; and.pred %p48, %p46, %p47; @%p48 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r192, %f29; setp.lt.s32 %p52, %r192, 0; selp.b32 %r193, 1075235812, 1061752795, %p52; mov.b32 %r194, %f28; and.b32 %r195, %r194, -2147483648; or.b32 %r196, %r195, %r193; mov.b32 %f859, %r196; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f263, %f31, %f30; min.f32 %f264, %f31, %f30; div.rn.f32 %f265, %f264, %f263; mul.rn.f32 %f266, %f265, %f265; mov.f32 %f267, 0fC0B59883; mov.f32 %f268, 0fBF52C7EA; fma.rn.f32 %f269, %f266, %f268, %f267; mov.f32 %f270, 0fC0D21907; fma.rn.f32 %f271, %f269, %f266, %f270; mul.f32 %f272, %f266, %f271; mul.f32 %f273, %f265, %f272; add.f32 %f274, %f266, 0f41355DC0; mov.f32 %f275, 0f41E6BD60; fma.rn.f32 %f276, %f274, %f266, %f275; mov.f32 %f277, 
0f419D92C8; fma.rn.f32 %f278, %f276, %f266, %f277; rcp.rn.f32 %f279, %f278; fma.rn.f32 %f280, %f273, %f279, %f265; mov.f32 %f281, 0f3FC90FDB; sub.f32 %f282, %f281, %f280; setp.gt.f32 %p49, %f31, %f30; selp.f32 %f283, %f282, %f280, %p49; mov.b32 %r187, %f29; setp.lt.s32 %p50, %r187, 0; mov.f32 %f284, 0f40490FDB; sub.f32 %f285, %f284, %f283; selp.f32 %f286, %f285, %f283, %p50; mov.b32 %r188, %f286; mov.b32 %r189, %f28; and.b32 %r190, %r189, -2147483648; or.b32 %r191, %r190, %r188; mov.b32 %f287, %r191; add.f32 %f288, %f30, %f31; setp.le.f32 %p51, %f288, 0f7F800000; selp.f32 %f859, %f287, %f288, %p51; $L__BB0_41: add.f32 %f289, %f859, %f859; setp.eq.f32 %p53, %f27, 0f00000000; selp.f32 %f290, 0f3F800000, 0f3F000000, %p53; fma.rn.f32 %f862, %f290, %f289, %f862; $L__BB0_42: setp.lt.s32 %p54, %r3, 1; and.pred %p56, %p54, %p13; @%p56 bra $L__BB0_55; @%p13 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r490, %r18, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r203, %r18, %r126; add.s32 %r204, %r203, %r126; rem.s32 %r490, %r204, %r126; $L__BB0_46: @%p12 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r491, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r205, %r14, %r125; add.s32 %r206, %r205, %r125; rem.s32 %r491, %r206, %r125; $L__BB0_49: mad.lo.s32 %r207, %r490, %r125, %r491; mad.lo.s32 %r208, %r207, %r124, %r1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f291, [%rd45]; ld.global.nc.f32 %f292, [%rd46]; mul.f32 %f293, %f292, %f292; fma.rn.f32 %f294, %f291, %f291, %f293; ld.global.nc.f32 %f295, [%rd47]; fma.rn.f32 %f38, %f295, %f295, %f294; mul.f32 %f296, %f12, %f14; mul.f32 %f297, %f11, %f15; sub.f32 %f298, %f297, %f296; mul.f32 %f299, %f10, %f15; mul.f32 %f300, %f12, %f13; sub.f32 %f301, %f300, %f299; mul.f32 %f302, %f11, %f13; mul.f32 %f303, %f10, %f14; sub.f32 %f304, %f303, %f302; mul.f32 %f305, %f2, %f301; fma.rn.f32 %f306, %f1, %f298, %f305; fma.rn.f32 %f39, 
%f3, %f304, %f306; mul.f32 %f307, %f2, %f11; fma.rn.f32 %f308, %f1, %f10, %f307; fma.rn.f32 %f309, %f3, %f12, %f308; add.f32 %f310, %f309, 0f3F800000; mul.f32 %f311, %f2, %f14; fma.rn.f32 %f312, %f1, %f13, %f311; fma.rn.f32 %f313, %f3, %f15, %f312; add.f32 %f314, %f310, %f313; mul.f32 %f315, %f11, %f14; fma.rn.f32 %f316, %f10, %f13, %f315; fma.rn.f32 %f317, %f12, %f15, %f316; add.f32 %f40, %f317, %f314; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p59, %f41, 0f00000000; setp.eq.f32 %p60, %f42, 0f00000000; and.pred %p61, %p59, %p60; @%p61 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r219, %f40; shr.s32 %r220, %r219, 31; and.b32 %r221, %r220, 1078530011; mov.b32 %r222, %f39; and.b32 %r223, %r222, -2147483648; or.b32 %r224, %r223, %r221; mov.b32 %f861, %r224; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p62, %f41, 0f7F800000; setp.eq.f32 %p63, %f42, 0f7F800000; and.pred %p64, %p62, %p63; @%p64 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r214, %f40; setp.lt.s32 %p68, %r214, 0; selp.b32 %r215, 1075235812, 1061752795, %p68; mov.b32 %r216, %f39; and.b32 %r217, %r216, -2147483648; or.b32 %r218, %r217, %r215; mov.b32 %f861, %r218; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f318, %f42, %f41; min.f32 %f319, %f42, %f41; div.rn.f32 %f320, %f319, %f318; mul.rn.f32 %f321, %f320, %f320; mov.f32 %f322, 0fC0B59883; mov.f32 %f323, 0fBF52C7EA; fma.rn.f32 %f324, %f321, %f323, %f322; mov.f32 %f325, 0fC0D21907; fma.rn.f32 %f326, %f324, %f321, %f325; mul.f32 %f327, %f321, %f326; mul.f32 %f328, %f320, %f327; add.f32 %f329, %f321, 0f41355DC0; mov.f32 %f330, 0f41E6BD60; fma.rn.f32 %f331, %f329, %f321, %f330; mov.f32 %f332, 0f419D92C8; fma.rn.f32 %f333, %f331, %f321, %f332; rcp.rn.f32 %f334, %f333; fma.rn.f32 %f335, %f328, %f334, %f320; mov.f32 %f336, 0f3FC90FDB; sub.f32 %f337, %f336, %f335; setp.gt.f32 %p65, %f42, %f41; selp.f32 %f338, %f337, %f335, %p65; mov.b32 %r209, %f40; setp.lt.s32 %p66, %r209, 0; mov.f32 %f339, 0f40490FDB; sub.f32 %f340, %f339, %f338; 
selp.f32 %f341, %f340, %f338, %p66; mov.b32 %r210, %f341; mov.b32 %r211, %f39; and.b32 %r212, %r211, -2147483648; or.b32 %r213, %r212, %r210; mov.b32 %f342, %r213; add.f32 %f343, %f41, %f42; setp.le.f32 %p67, %f343, 0f7F800000; selp.f32 %f861, %f342, %f343, %p67; $L__BB0_54: add.f32 %f344, %f861, %f861; setp.eq.f32 %p69, %f38, 0f00000000; selp.f32 %f345, 0f3F800000, 0f3F000000, %p69; fma.rn.f32 %f862, %f345, %f344, %f862; $L__BB0_55: @%p18 bra $L__BB0_69; setp.lt.s32 %p71, %r3, 1; and.pred %p73, %p71, %p13; @%p73 bra $L__BB0_69; @%p13 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r492, %r18, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r225, %r18, %r126; add.s32 %r226, %r225, %r126; rem.s32 %r492, %r226, %r126; $L__BB0_60: @%p12 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r229, %r125, -1; min.s32 %r493, %r6, %r229; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r227, %r6, %r125; add.s32 %r228, %r227, %r125; rem.s32 %r493, %r228, %r125; $L__BB0_63: mad.lo.s32 %r230, %r492, %r125, %r493; mad.lo.s32 %r231, %r230, %r124, %r1; mul.wide.s32 %rd48, %r231, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f346, [%rd49]; ld.global.nc.f32 %f347, [%rd50]; mul.f32 %f348, %f347, %f347; fma.rn.f32 %f349, %f346, %f346, %f348; ld.global.nc.f32 %f350, [%rd51]; fma.rn.f32 %f49, %f350, %f350, %f349; mul.f32 %f351, %f5, %f15; mul.f32 %f352, %f6, %f14; sub.f32 %f353, %f352, %f351; mul.f32 %f354, %f6, %f13; mul.f32 %f355, %f4, %f15; sub.f32 %f356, %f355, %f354; mul.f32 %f357, %f4, %f14; mul.f32 %f358, %f5, %f13; sub.f32 %f359, %f358, %f357; mul.f32 %f360, %f2, %f356; fma.rn.f32 %f361, %f1, %f353, %f360; fma.rn.f32 %f50, %f3, %f359, %f361; mul.f32 %f362, %f2, %f14; fma.rn.f32 %f363, %f1, %f13, %f362; fma.rn.f32 %f364, %f3, %f15, %f363; add.f32 %f365, %f364, 0f3F800000; mul.f32 %f366, %f2, %f5; fma.rn.f32 %f367, %f1, %f4, %f366; fma.rn.f32 %f368, %f3, %f6, %f367; add.f32 %f369, %f368, %f365; mul.f32 %f370, %f5, %f14; 
fma.rn.f32 %f371, %f4, %f13, %f370; fma.rn.f32 %f372, %f6, %f15, %f371; add.f32 %f51, %f372, %f369; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p76, %f52, 0f00000000; setp.eq.f32 %p77, %f53, 0f00000000; and.pred %p78, %p76, %p77; @%p78 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r242, %f51; shr.s32 %r243, %r242, 31; and.b32 %r244, %r243, 1078530011; mov.b32 %r245, %f50; and.b32 %r246, %r245, -2147483648; or.b32 %r247, %r244, %r246; mov.b32 %f863, %r247; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p79, %f52, 0f7F800000; setp.eq.f32 %p80, %f53, 0f7F800000; and.pred %p81, %p79, %p80; @%p81 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r237, %f51; setp.lt.s32 %p85, %r237, 0; selp.b32 %r238, 1075235812, 1061752795, %p85; mov.b32 %r239, %f50; and.b32 %r240, %r239, -2147483648; or.b32 %r241, %r238, %r240; mov.b32 %f863, %r241; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f373, %f53, %f52; min.f32 %f374, %f53, %f52; div.rn.f32 %f375, %f374, %f373; mul.rn.f32 %f376, %f375, %f375; mov.f32 %f377, 0fC0B59883; mov.f32 %f378, 0fBF52C7EA; fma.rn.f32 %f379, %f376, %f378, %f377; mov.f32 %f380, 0fC0D21907; fma.rn.f32 %f381, %f379, %f376, %f380; mul.f32 %f382, %f376, %f381; mul.f32 %f383, %f375, %f382; add.f32 %f384, %f376, 0f41355DC0; mov.f32 %f385, 0f41E6BD60; fma.rn.f32 %f386, %f384, %f376, %f385; mov.f32 %f387, 0f419D92C8; fma.rn.f32 %f388, %f386, %f376, %f387; rcp.rn.f32 %f389, %f388; fma.rn.f32 %f390, %f383, %f389, %f375; mov.f32 %f391, 0f3FC90FDB; sub.f32 %f392, %f391, %f390; setp.gt.f32 %p82, %f53, %f52; selp.f32 %f393, %f392, %f390, %p82; mov.b32 %r232, %f51; setp.lt.s32 %p83, %r232, 0; mov.f32 %f394, 0f40490FDB; sub.f32 %f395, %f394, %f393; selp.f32 %f396, %f395, %f393, %p83; mov.b32 %r233, %f396; mov.b32 %r234, %f50; and.b32 %r235, %r234, -2147483648; or.b32 %r236, %r235, %r233; mov.b32 %f397, %r236; add.f32 %f398, %f52, %f53; setp.le.f32 %p84, %f398, 0f7F800000; selp.f32 %f863, %f397, %f398, %p84; $L__BB0_68: add.f32 %f399, %f863, %f863; 
setp.eq.f32 %p86, %f49, 0f00000000; selp.f32 %f400, 0f3F800000, 0f3F000000, %p86; fma.rn.f32 %f862, %f400, %f399, %f862; $L__BB0_69: @%p13 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: add.s32 %r250, %r126, -1; min.s32 %r494, %r10, %r250; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r248, %r10, %r126; add.s32 %r249, %r248, %r126; rem.s32 %r494, %r249, %r126; $L__BB0_72: ld.param.u8 %rs7, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs3, %rs7, 1; setp.eq.s16 %p88, %rs3, 0; add.s32 %r49, %r1, 1; @%p88 bra $L__BB0_74; bra.uni $L__BB0_73; $L__BB0_74: add.s32 %r253, %r124, -1; min.s32 %r495, %r49, %r253; bra.uni $L__BB0_75; $L__BB0_73: rem.s32 %r251, %r49, %r124; add.s32 %r252, %r251, %r124; rem.s32 %r495, %r252, %r124; $L__BB0_75: @%p13 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r496, %r18, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r254, %r18, %r126; add.s32 %r255, %r254, %r126; rem.s32 %r496, %r255, %r126; $L__BB0_78: add.s32 %r56, %r1, -1; @%p88 bra $L__BB0_80; bra.uni $L__BB0_79; $L__BB0_80: max.s32 %r497, %r56, 0; bra.uni $L__BB0_81; $L__BB0_79: rem.s32 %r256, %r56, %r124; add.s32 %r257, %r256, %r124; rem.s32 %r497, %r257, %r124; $L__BB0_81: mad.lo.s32 %r258, %r494, %r125, %r2; mad.lo.s32 %r259, %r258, %r124, %r1; mul.wide.s32 %rd52, %r259, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f60, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f61, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f62, [%rd55]; add.s32 %r260, %r495, %r5; mul.wide.s32 %rd56, %r260, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f63, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f64, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f65, [%rd59]; mad.lo.s32 %r261, %r496, %r125, %r2; mad.lo.s32 %r262, %r261, %r124, %r1; mul.wide.s32 %rd60, %r262, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f66, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f67, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f68, [%rd63]; add.s32 
%r263, %r497, %r5; mul.wide.s32 %rd64, %r263, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f69, [%rd65]; add.s64 %rd66, %rd2, %rd64; ld.global.nc.f32 %f70, [%rd66]; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f71, [%rd67]; setp.ge.s32 %p92, %r10, %r126; and.pred %p93, %p92, %p13; mov.f32 %f868, 0f00000000; @%p93 bra $L__BB0_95; setp.ge.s32 %p94, %r49, %r124; and.pred %p96, %p94, %p88; @%p96 bra $L__BB0_95; @%p13 bra $L__BB0_85; bra.uni $L__BB0_84; $L__BB0_85: add.s32 %r266, %r126, -1; min.s32 %r498, %r10, %r266; bra.uni $L__BB0_86; $L__BB0_84: rem.s32 %r264, %r10, %r126; add.s32 %r265, %r264, %r126; rem.s32 %r498, %r265, %r126; $L__BB0_86: @%p88 bra $L__BB0_88; bra.uni $L__BB0_87; $L__BB0_88: add.s32 %r269, %r124, -1; min.s32 %r499, %r49, %r269; bra.uni $L__BB0_89; $L__BB0_87: rem.s32 %r267, %r49, %r124; add.s32 %r268, %r267, %r124; rem.s32 %r499, %r268, %r124; $L__BB0_89: mad.lo.s32 %r270, %r498, %r125, %r2; mad.lo.s32 %r271, %r270, %r124, %r499; mul.wide.s32 %rd68, %r271, 4; add.s64 %rd69, %rd3, %rd68; add.s64 %rd70, %rd2, %rd68; add.s64 %rd71, %rd1, %rd68; ld.global.nc.f32 %f403, [%rd69]; ld.global.nc.f32 %f404, [%rd70]; mul.f32 %f405, %f404, %f404; fma.rn.f32 %f406, %f403, %f403, %f405; ld.global.nc.f32 %f407, [%rd71]; fma.rn.f32 %f72, %f407, %f407, %f406; mul.f32 %f408, %f62, %f64; mul.f32 %f409, %f61, %f65; sub.f32 %f410, %f409, %f408; mul.f32 %f411, %f60, %f65; mul.f32 %f412, %f62, %f63; sub.f32 %f413, %f412, %f411; mul.f32 %f414, %f61, %f63; mul.f32 %f415, %f60, %f64; sub.f32 %f416, %f415, %f414; mul.f32 %f417, %f2, %f413; fma.rn.f32 %f418, %f1, %f410, %f417; fma.rn.f32 %f73, %f3, %f416, %f418; mul.f32 %f419, %f2, %f61; fma.rn.f32 %f420, %f1, %f60, %f419; fma.rn.f32 %f421, %f3, %f62, %f420; add.f32 %f422, %f421, 0f3F800000; mul.f32 %f423, %f2, %f64; fma.rn.f32 %f424, %f1, %f63, %f423; fma.rn.f32 %f425, %f3, %f65, %f424; add.f32 %f426, %f422, %f425; mul.f32 %f427, %f61, %f64; fma.rn.f32 %f428, %f60, %f63, %f427; fma.rn.f32 %f429, %f62, %f65, %f428; 
add.f32 %f74, %f429, %f426; abs.f32 %f75, %f74; abs.f32 %f76, %f73; setp.eq.f32 %p99, %f75, 0f00000000; setp.eq.f32 %p100, %f76, 0f00000000; and.pred %p101, %p99, %p100; @%p101 bra $L__BB0_93; bra.uni $L__BB0_90; $L__BB0_93: mov.b32 %r282, %f74; shr.s32 %r283, %r282, 31; and.b32 %r284, %r283, 1078530011; mov.b32 %r285, %f73; and.b32 %r286, %r285, -2147483648; or.b32 %r287, %r286, %r284; mov.b32 %f865, %r287; bra.uni $L__BB0_94; $L__BB0_90: setp.eq.f32 %p102, %f75, 0f7F800000; setp.eq.f32 %p103, %f76, 0f7F800000; and.pred %p104, %p102, %p103; @%p104 bra $L__BB0_92; bra.uni $L__BB0_91; $L__BB0_92: mov.b32 %r277, %f74; setp.lt.s32 %p108, %r277, 0; selp.b32 %r278, 1075235812, 1061752795, %p108; mov.b32 %r279, %f73; and.b32 %r280, %r279, -2147483648; or.b32 %r281, %r280, %r278; mov.b32 %f865, %r281; bra.uni $L__BB0_94; $L__BB0_91: max.f32 %f430, %f76, %f75; min.f32 %f431, %f76, %f75; div.rn.f32 %f432, %f431, %f430; mul.rn.f32 %f433, %f432, %f432; mov.f32 %f434, 0fC0B59883; mov.f32 %f435, 0fBF52C7EA; fma.rn.f32 %f436, %f433, %f435, %f434; mov.f32 %f437, 0fC0D21907; fma.rn.f32 %f438, %f436, %f433, %f437; mul.f32 %f439, %f433, %f438; mul.f32 %f440, %f432, %f439; add.f32 %f441, %f433, 0f41355DC0; mov.f32 %f442, 0f41E6BD60; fma.rn.f32 %f443, %f441, %f433, %f442; mov.f32 %f444, 0f419D92C8; fma.rn.f32 %f445, %f443, %f433, %f444; rcp.rn.f32 %f446, %f445; fma.rn.f32 %f447, %f440, %f446, %f432; mov.f32 %f448, 0f3FC90FDB; sub.f32 %f449, %f448, %f447; setp.gt.f32 %p105, %f76, %f75; selp.f32 %f450, %f449, %f447, %p105; mov.b32 %r272, %f74; setp.lt.s32 %p106, %r272, 0; mov.f32 %f451, 0f40490FDB; sub.f32 %f452, %f451, %f450; selp.f32 %f453, %f452, %f450, %p106; mov.b32 %r273, %f453; mov.b32 %r274, %f73; and.b32 %r275, %r274, -2147483648; or.b32 %r276, %r275, %r273; mov.b32 %f454, %r276; add.f32 %f455, %f75, %f76; setp.le.f32 %p107, %f455, 0f7F800000; selp.f32 %f865, %f454, %f455, %p107; $L__BB0_94: add.f32 %f456, %f865, %f865; setp.eq.f32 %p109, %f72, 0f00000000; selp.f32 %f457, 
0f3F800000, 0f3F000000, %p109; fma.rn.f32 %f868, %f457, %f456, 0f00000000; $L__BB0_95: setp.lt.s32 %p110, %r3, 1; and.pred %p112, %p110, %p13; @%p112 bra $L__BB0_109; setp.ge.s32 %p113, %r49, %r124; and.pred %p115, %p113, %p88; @%p115 bra $L__BB0_109; @%p13 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: max.s32 %r500, %r18, 0; bra.uni $L__BB0_100; $L__BB0_98: rem.s32 %r288, %r18, %r126; add.s32 %r289, %r288, %r126; rem.s32 %r500, %r289, %r126; $L__BB0_100: @%p88 bra $L__BB0_102; bra.uni $L__BB0_101; $L__BB0_102: add.s32 %r292, %r124, -1; min.s32 %r501, %r49, %r292; bra.uni $L__BB0_103; $L__BB0_101: rem.s32 %r290, %r49, %r124; add.s32 %r291, %r290, %r124; rem.s32 %r501, %r291, %r124; $L__BB0_103: mad.lo.s32 %r293, %r500, %r125, %r2; mad.lo.s32 %r294, %r293, %r124, %r501; mul.wide.s32 %rd72, %r294, 4; add.s64 %rd73, %rd3, %rd72; add.s64 %rd74, %rd2, %rd72; add.s64 %rd75, %rd1, %rd72; ld.global.nc.f32 %f458, [%rd73]; ld.global.nc.f32 %f459, [%rd74]; mul.f32 %f460, %f459, %f459; fma.rn.f32 %f461, %f458, %f458, %f460; ld.global.nc.f32 %f462, [%rd75]; fma.rn.f32 %f83, %f462, %f462, %f461; mul.f32 %f463, %f65, %f67; mul.f32 %f464, %f64, %f68; sub.f32 %f465, %f464, %f463; mul.f32 %f466, %f63, %f68; mul.f32 %f467, %f65, %f66; sub.f32 %f468, %f467, %f466; mul.f32 %f469, %f64, %f66; mul.f32 %f470, %f63, %f67; sub.f32 %f471, %f470, %f469; mul.f32 %f472, %f2, %f468; fma.rn.f32 %f473, %f1, %f465, %f472; fma.rn.f32 %f84, %f3, %f471, %f473; mul.f32 %f474, %f2, %f64; fma.rn.f32 %f475, %f1, %f63, %f474; fma.rn.f32 %f476, %f3, %f65, %f475; add.f32 %f477, %f476, 0f3F800000; mul.f32 %f478, %f2, %f67; fma.rn.f32 %f479, %f1, %f66, %f478; fma.rn.f32 %f480, %f3, %f68, %f479; add.f32 %f481, %f477, %f480; mul.f32 %f482, %f64, %f67; fma.rn.f32 %f483, %f63, %f66, %f482; fma.rn.f32 %f484, %f65, %f68, %f483; add.f32 %f85, %f484, %f481; abs.f32 %f86, %f85; abs.f32 %f87, %f84; setp.eq.f32 %p118, %f86, 0f00000000; setp.eq.f32 %p119, %f87, 0f00000000; and.pred %p120, %p118, %p119; @%p120 bra 
$L__BB0_107; bra.uni $L__BB0_104; $L__BB0_107: mov.b32 %r305, %f85; shr.s32 %r306, %r305, 31; and.b32 %r307, %r306, 1078530011; mov.b32 %r308, %f84; and.b32 %r309, %r308, -2147483648; or.b32 %r310, %r309, %r307; mov.b32 %f867, %r310; bra.uni $L__BB0_108; $L__BB0_104: setp.eq.f32 %p121, %f86, 0f7F800000; setp.eq.f32 %p122, %f87, 0f7F800000; and.pred %p123, %p121, %p122; @%p123 bra $L__BB0_106; bra.uni $L__BB0_105; $L__BB0_106: mov.b32 %r300, %f85; setp.lt.s32 %p127, %r300, 0; selp.b32 %r301, 1075235812, 1061752795, %p127; mov.b32 %r302, %f84; and.b32 %r303, %r302, -2147483648; or.b32 %r304, %r303, %r301; mov.b32 %f867, %r304; bra.uni $L__BB0_108; $L__BB0_105: max.f32 %f485, %f87, %f86; min.f32 %f486, %f87, %f86; div.rn.f32 %f487, %f486, %f485; mul.rn.f32 %f488, %f487, %f487; mov.f32 %f489, 0fC0B59883; mov.f32 %f490, 0fBF52C7EA; fma.rn.f32 %f491, %f488, %f490, %f489; mov.f32 %f492, 0fC0D21907; fma.rn.f32 %f493, %f491, %f488, %f492; mul.f32 %f494, %f488, %f493; mul.f32 %f495, %f487, %f494; add.f32 %f496, %f488, 0f41355DC0; mov.f32 %f497, 0f41E6BD60; fma.rn.f32 %f498, %f496, %f488, %f497; mov.f32 %f499, 0f419D92C8; fma.rn.f32 %f500, %f498, %f488, %f499; rcp.rn.f32 %f501, %f500; fma.rn.f32 %f502, %f495, %f501, %f487; mov.f32 %f503, 0f3FC90FDB; sub.f32 %f504, %f503, %f502; setp.gt.f32 %p124, %f87, %f86; selp.f32 %f505, %f504, %f502, %p124; mov.b32 %r295, %f85; setp.lt.s32 %p125, %r295, 0; mov.f32 %f506, 0f40490FDB; sub.f32 %f507, %f506, %f505; selp.f32 %f508, %f507, %f505, %p125; mov.b32 %r296, %f508; mov.b32 %r297, %f84; and.b32 %r298, %r297, -2147483648; or.b32 %r299, %r298, %r296; mov.b32 %f509, %r299; add.f32 %f510, %f86, %f87; setp.le.f32 %p126, %f510, 0f7F800000; selp.f32 %f867, %f509, %f510, %p126; $L__BB0_108: add.f32 %f511, %f867, %f867; setp.eq.f32 %p128, %f83, 0f00000000; selp.f32 %f512, 0f3F800000, 0f3F000000, %p128; fma.rn.f32 %f868, %f512, %f511, %f868; $L__BB0_109: ld.param.u8 %rs6, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs5, %rs6, 1; 
setp.ne.s16 %p129, %rs5, 0; setp.gt.s32 %p130, %r1, 0; or.pred %p3, %p130, %p129; and.pred %p131, %p3, %p2; not.pred %p132, %p131; @%p132 bra $L__BB0_122; @%p13 bra $L__BB0_112; bra.uni $L__BB0_111; $L__BB0_112: max.s32 %r502, %r18, 0; bra.uni $L__BB0_113; $L__BB0_111: rem.s32 %r311, %r18, %r126; add.s32 %r312, %r311, %r126; rem.s32 %r502, %r312, %r126; $L__BB0_113: @%p88 bra $L__BB0_115; bra.uni $L__BB0_114; $L__BB0_115: max.s32 %r503, %r56, 0; bra.uni $L__BB0_116; $L__BB0_114: rem.s32 %r313, %r56, %r124; add.s32 %r314, %r313, %r124; rem.s32 %r503, %r314, %r124; $L__BB0_116: mad.lo.s32 %r315, %r502, %r125, %r2; mad.lo.s32 %r316, %r315, %r124, %r503; mul.wide.s32 %rd76, %r316, 4; add.s64 %rd77, %rd3, %rd76; add.s64 %rd78, %rd2, %rd76; add.s64 %rd79, %rd1, %rd76; ld.global.nc.f32 %f513, [%rd77]; ld.global.nc.f32 %f514, [%rd78]; mul.f32 %f515, %f514, %f514; fma.rn.f32 %f516, %f513, %f513, %f515; ld.global.nc.f32 %f517, [%rd79]; fma.rn.f32 %f94, %f517, %f517, %f516; mul.f32 %f518, %f68, %f70; mul.f32 %f519, %f67, %f71; sub.f32 %f520, %f519, %f518; mul.f32 %f521, %f66, %f71; mul.f32 %f522, %f68, %f69; sub.f32 %f523, %f522, %f521; mul.f32 %f524, %f67, %f69; mul.f32 %f525, %f66, %f70; sub.f32 %f526, %f525, %f524; mul.f32 %f527, %f2, %f523; fma.rn.f32 %f528, %f1, %f520, %f527; fma.rn.f32 %f95, %f3, %f526, %f528; mul.f32 %f529, %f2, %f67; fma.rn.f32 %f530, %f1, %f66, %f529; fma.rn.f32 %f531, %f3, %f68, %f530; add.f32 %f532, %f531, 0f3F800000; mul.f32 %f533, %f2, %f70; fma.rn.f32 %f534, %f1, %f69, %f533; fma.rn.f32 %f535, %f3, %f71, %f534; add.f32 %f536, %f532, %f535; mul.f32 %f537, %f67, %f70; fma.rn.f32 %f538, %f66, %f69, %f537; fma.rn.f32 %f539, %f68, %f71, %f538; add.f32 %f96, %f539, %f536; abs.f32 %f97, %f96; abs.f32 %f98, %f95; setp.eq.f32 %p135, %f97, 0f00000000; setp.eq.f32 %p136, %f98, 0f00000000; and.pred %p137, %p135, %p136; @%p137 bra $L__BB0_120; bra.uni $L__BB0_117; $L__BB0_120: mov.b32 %r327, %f96; shr.s32 %r328, %r327, 31; and.b32 %r329, %r328, 1078530011; 
mov.b32 %r330, %f95; and.b32 %r331, %r330, -2147483648; or.b32 %r332, %r331, %r329; mov.b32 %f869, %r332; bra.uni $L__BB0_121; $L__BB0_117: setp.eq.f32 %p138, %f97, 0f7F800000; setp.eq.f32 %p139, %f98, 0f7F800000; and.pred %p140, %p138, %p139; @%p140 bra $L__BB0_119; bra.uni $L__BB0_118; $L__BB0_119: mov.b32 %r322, %f96; setp.lt.s32 %p144, %r322, 0; selp.b32 %r323, 1075235812, 1061752795, %p144; mov.b32 %r324, %f95; and.b32 %r325, %r324, -2147483648; or.b32 %r326, %r325, %r323; mov.b32 %f869, %r326; bra.uni $L__BB0_121; $L__BB0_118: max.f32 %f540, %f98, %f97; min.f32 %f541, %f98, %f97; div.rn.f32 %f542, %f541, %f540; mul.rn.f32 %f543, %f542, %f542; mov.f32 %f544, 0fC0B59883; mov.f32 %f545, 0fBF52C7EA; fma.rn.f32 %f546, %f543, %f545, %f544; mov.f32 %f547, 0fC0D21907; fma.rn.f32 %f548, %f546, %f543, %f547; mul.f32 %f549, %f543, %f548; mul.f32 %f550, %f542, %f549; add.f32 %f551, %f543, 0f41355DC0; mov.f32 %f552, 0f41E6BD60; fma.rn.f32 %f553, %f551, %f543, %f552; mov.f32 %f554, 0f419D92C8; fma.rn.f32 %f555, %f553, %f543, %f554; rcp.rn.f32 %f556, %f555; fma.rn.f32 %f557, %f550, %f556, %f542; mov.f32 %f558, 0f3FC90FDB; sub.f32 %f559, %f558, %f557; setp.gt.f32 %p141, %f98, %f97; selp.f32 %f560, %f559, %f557, %p141; mov.b32 %r317, %f96; setp.lt.s32 %p142, %r317, 0; mov.f32 %f561, 0f40490FDB; sub.f32 %f562, %f561, %f560; selp.f32 %f563, %f562, %f560, %p142; mov.b32 %r318, %f563; mov.b32 %r319, %f95; and.b32 %r320, %r319, -2147483648; or.b32 %r321, %r320, %r318; mov.b32 %f564, %r321; add.f32 %f565, %f97, %f98; setp.le.f32 %p143, %f565, 0f7F800000; selp.f32 %f869, %f564, %f565, %p143; $L__BB0_121: add.f32 %f566, %f869, %f869; setp.eq.f32 %p145, %f94, 0f00000000; selp.f32 %f567, 0f3F800000, 0f3F000000, %p145; fma.rn.f32 %f868, %f567, %f566, %f868; $L__BB0_122: setp.lt.s32 %p146, %r49, %r124; or.pred %p4, %p146, %p129; and.pred %p5, %p4, %p2; not.pred %p148, %p5; @%p148 bra $L__BB0_135; @%p13 bra $L__BB0_125; bra.uni $L__BB0_124; $L__BB0_125: add.s32 %r335, %r126, -1; min.s32 
%r504, %r10, %r335; bra.uni $L__BB0_126; $L__BB0_124: rem.s32 %r333, %r10, %r126; add.s32 %r334, %r333, %r126; rem.s32 %r504, %r334, %r126; $L__BB0_126: mad.lo.s32 %r81, %r504, %r125, %r2; @%p88 bra $L__BB0_128; bra.uni $L__BB0_127; $L__BB0_128: max.s32 %r505, %r56, 0; bra.uni $L__BB0_129; $L__BB0_127: rem.s32 %r336, %r56, %r124; add.s32 %r337, %r336, %r124; rem.s32 %r505, %r337, %r124; $L__BB0_129: mad.lo.s32 %r338, %r81, %r124, %r505; mul.wide.s32 %rd80, %r338, 4; add.s64 %rd81, %rd3, %rd80; add.s64 %rd82, %rd2, %rd80; add.s64 %rd83, %rd1, %rd80; ld.global.nc.f32 %f568, [%rd81]; ld.global.nc.f32 %f569, [%rd82]; mul.f32 %f570, %f569, %f569; fma.rn.f32 %f571, %f568, %f568, %f570; ld.global.nc.f32 %f572, [%rd83]; fma.rn.f32 %f105, %f572, %f572, %f571; mul.f32 %f573, %f61, %f71; mul.f32 %f574, %f62, %f70; sub.f32 %f575, %f574, %f573; mul.f32 %f576, %f62, %f69; mul.f32 %f577, %f60, %f71; sub.f32 %f578, %f577, %f576; mul.f32 %f579, %f60, %f70; mul.f32 %f580, %f61, %f69; sub.f32 %f581, %f580, %f579; mul.f32 %f582, %f2, %f578; fma.rn.f32 %f583, %f1, %f575, %f582; fma.rn.f32 %f106, %f3, %f581, %f583; mul.f32 %f584, %f2, %f70; fma.rn.f32 %f585, %f1, %f69, %f584; fma.rn.f32 %f586, %f3, %f71, %f585; add.f32 %f587, %f586, 0f3F800000; mul.f32 %f588, %f2, %f61; fma.rn.f32 %f589, %f1, %f60, %f588; fma.rn.f32 %f590, %f3, %f62, %f589; add.f32 %f591, %f590, %f587; mul.f32 %f592, %f61, %f70; fma.rn.f32 %f593, %f60, %f69, %f592; fma.rn.f32 %f594, %f62, %f71, %f593; add.f32 %f107, %f594, %f591; abs.f32 %f108, %f107; abs.f32 %f109, %f106; setp.eq.f32 %p151, %f108, 0f00000000; setp.eq.f32 %p152, %f109, 0f00000000; and.pred %p153, %p151, %p152; @%p153 bra $L__BB0_133; bra.uni $L__BB0_130; $L__BB0_133: mov.b32 %r349, %f107; shr.s32 %r350, %r349, 31; and.b32 %r351, %r350, 1078530011; mov.b32 %r352, %f106; and.b32 %r353, %r352, -2147483648; or.b32 %r354, %r351, %r353; mov.b32 %f871, %r354; bra.uni $L__BB0_134; $L__BB0_130: setp.eq.f32 %p154, %f108, 0f7F800000; setp.eq.f32 %p155, %f109, 
0f7F800000; and.pred %p156, %p154, %p155; @%p156 bra $L__BB0_132; bra.uni $L__BB0_131; $L__BB0_132: mov.b32 %r344, %f107; setp.lt.s32 %p160, %r344, 0; selp.b32 %r345, 1075235812, 1061752795, %p160; mov.b32 %r346, %f106; and.b32 %r347, %r346, -2147483648; or.b32 %r348, %r345, %r347; mov.b32 %f871, %r348; bra.uni $L__BB0_134; $L__BB0_131: max.f32 %f595, %f109, %f108; min.f32 %f596, %f109, %f108; div.rn.f32 %f597, %f596, %f595; mul.rn.f32 %f598, %f597, %f597; mov.f32 %f599, 0fC0B59883; mov.f32 %f600, 0fBF52C7EA; fma.rn.f32 %f601, %f598, %f600, %f599; mov.f32 %f602, 0fC0D21907; fma.rn.f32 %f603, %f601, %f598, %f602; mul.f32 %f604, %f598, %f603; mul.f32 %f605, %f597, %f604; add.f32 %f606, %f598, 0f41355DC0; mov.f32 %f607, 0f41E6BD60; fma.rn.f32 %f608, %f606, %f598, %f607; mov.f32 %f609, 0f419D92C8; fma.rn.f32 %f610, %f608, %f598, %f609; rcp.rn.f32 %f611, %f610; fma.rn.f32 %f612, %f605, %f611, %f597; mov.f32 %f613, 0f3FC90FDB; sub.f32 %f614, %f613, %f612; setp.gt.f32 %p157, %f109, %f108; selp.f32 %f615, %f614, %f612, %p157; mov.b32 %r339, %f107; setp.lt.s32 %p158, %r339, 0; mov.f32 %f616, 0f40490FDB; sub.f32 %f617, %f616, %f615; selp.f32 %f618, %f617, %f615, %p158; mov.b32 %r340, %f618; mov.b32 %r341, %f106; and.b32 %r342, %r341, -2147483648; or.b32 %r343, %r342, %r340; mov.b32 %f619, %r343; add.f32 %f620, %f108, %f109; setp.le.f32 %p159, %f620, 0f7F800000; selp.f32 %f871, %f619, %f620, %p159; $L__BB0_134: add.f32 %f621, %f871, %f871; setp.eq.f32 %p161, %f105, 0f00000000; selp.f32 %f622, 0f3F800000, 0f3F000000, %p161; fma.rn.f32 %f868, %f622, %f621, %f868; $L__BB0_135: @%p88 bra $L__BB0_137; bra.uni $L__BB0_136; $L__BB0_137: add.s32 %r357, %r124, -1; min.s32 %r506, %r49, %r357; bra.uni $L__BB0_138; $L__BB0_136: rem.s32 %r355, %r49, %r124; add.s32 %r356, %r355, %r124; rem.s32 %r506, %r356, %r124; $L__BB0_138: @%p12 bra $L__BB0_140; bra.uni $L__BB0_139; $L__BB0_140: add.s32 %r360, %r125, -1; min.s32 %r507, %r6, %r360; bra.uni $L__BB0_141; $L__BB0_139: rem.s32 %r358, %r6, 
%r125; add.s32 %r359, %r358, %r125; rem.s32 %r507, %r359, %r125; $L__BB0_141: @%p88 bra $L__BB0_143; bra.uni $L__BB0_142; $L__BB0_143: max.s32 %r508, %r56, 0; bra.uni $L__BB0_144; $L__BB0_142: rem.s32 %r361, %r56, %r124; add.s32 %r362, %r361, %r124; rem.s32 %r508, %r362, %r124; $L__BB0_144: add.s32 %r94, %r508, %r5; add.s32 %r95, %r506, %r5; @%p12 bra $L__BB0_146; bra.uni $L__BB0_145; $L__BB0_146: max.s32 %r509, %r14, 0; bra.uni $L__BB0_147; $L__BB0_145: rem.s32 %r363, %r14, %r125; add.s32 %r364, %r363, %r125; rem.s32 %r509, %r364, %r125; $L__BB0_147: add.s32 %r365, %r509, %r4; mad.lo.s32 %r366, %r365, %r124, %r1; mul.wide.s32 %rd84, %r95, 4; add.s64 %rd85, %rd3, %rd84; ld.global.nc.f32 %f116, [%rd85]; add.s64 %rd86, %rd2, %rd84; ld.global.nc.f32 %f117, [%rd86]; add.s64 %rd87, %rd1, %rd84; ld.global.nc.f32 %f118, [%rd87]; add.s32 %r367, %r507, %r4; mad.lo.s32 %r368, %r367, %r124, %r1; mul.wide.s32 %rd88, %r368, 4; add.s64 %rd89, %rd3, %rd88; ld.global.nc.f32 %f119, [%rd89]; add.s64 %rd90, %rd2, %rd88; ld.global.nc.f32 %f120, [%rd90]; add.s64 %rd91, %rd1, %rd88; ld.global.nc.f32 %f121, [%rd91]; mul.wide.s32 %rd92, %r94, 4; add.s64 %rd93, %rd3, %rd92; ld.global.nc.f32 %f122, [%rd93]; add.s64 %rd94, %rd2, %rd92; ld.global.nc.f32 %f123, [%rd94]; add.s64 %rd95, %rd1, %rd92; ld.global.nc.f32 %f124, [%rd95]; mul.wide.s32 %rd96, %r366, 4; add.s64 %rd97, %rd3, %rd96; ld.global.nc.f32 %f125, [%rd97]; add.s64 %rd98, %rd2, %rd96; ld.global.nc.f32 %f126, [%rd98]; add.s64 %rd99, %rd1, %rd96; ld.global.nc.f32 %f127, [%rd99]; and.pred %p166, %p4, %p1; mov.f32 %f878, 0f00000000; not.pred %p167, %p166; @%p167 bra $L__BB0_160; @%p12 bra $L__BB0_150; bra.uni $L__BB0_149; $L__BB0_150: add.s32 %r371, %r125, -1; min.s32 %r510, %r6, %r371; bra.uni $L__BB0_151; $L__BB0_149: rem.s32 %r369, %r6, %r125; add.s32 %r370, %r369, %r125; rem.s32 %r510, %r370, %r125; $L__BB0_151: @%p88 bra $L__BB0_153; bra.uni $L__BB0_152; $L__BB0_153: add.s32 %r374, %r124, -1; min.s32 %r511, %r49, %r374; bra.uni 
$L__BB0_154; $L__BB0_152: rem.s32 %r372, %r49, %r124; add.s32 %r373, %r372, %r124; rem.s32 %r511, %r373, %r124; $L__BB0_154: add.s32 %r375, %r510, %r4; mad.lo.s32 %r376, %r375, %r124, %r511; mul.wide.s32 %rd100, %r376, 4; add.s64 %rd101, %rd3, %rd100; add.s64 %rd102, %rd2, %rd100; add.s64 %rd103, %rd1, %rd100; ld.global.nc.f32 %f624, [%rd101]; ld.global.nc.f32 %f625, [%rd102]; mul.f32 %f626, %f625, %f625; fma.rn.f32 %f627, %f624, %f624, %f626; ld.global.nc.f32 %f628, [%rd103]; fma.rn.f32 %f128, %f628, %f628, %f627; mul.f32 %f629, %f118, %f120; mul.f32 %f630, %f117, %f121; sub.f32 %f631, %f630, %f629; mul.f32 %f632, %f116, %f121; mul.f32 %f633, %f118, %f119; sub.f32 %f634, %f633, %f632; mul.f32 %f635, %f117, %f119; mul.f32 %f636, %f116, %f120; sub.f32 %f637, %f636, %f635; mul.f32 %f638, %f2, %f634; fma.rn.f32 %f639, %f1, %f631, %f638; fma.rn.f32 %f129, %f3, %f637, %f639; mul.f32 %f640, %f2, %f117; fma.rn.f32 %f641, %f1, %f116, %f640; fma.rn.f32 %f642, %f3, %f118, %f641; add.f32 %f643, %f642, 0f3F800000; mul.f32 %f644, %f2, %f120; fma.rn.f32 %f645, %f1, %f119, %f644; fma.rn.f32 %f646, %f3, %f121, %f645; add.f32 %f647, %f643, %f646; mul.f32 %f648, %f117, %f120; fma.rn.f32 %f649, %f116, %f119, %f648; fma.rn.f32 %f650, %f118, %f121, %f649; add.f32 %f130, %f650, %f647; abs.f32 %f131, %f130; abs.f32 %f132, %f129; setp.eq.f32 %p170, %f131, 0f00000000; setp.eq.f32 %p171, %f132, 0f00000000; and.pred %p172, %p170, %p171; @%p172 bra $L__BB0_158; bra.uni $L__BB0_155; $L__BB0_158: mov.b32 %r387, %f130; shr.s32 %r388, %r387, 31; and.b32 %r389, %r388, 1078530011; mov.b32 %r390, %f129; and.b32 %r391, %r390, -2147483648; or.b32 %r392, %r391, %r389; mov.b32 %f873, %r392; bra.uni $L__BB0_159; $L__BB0_155: setp.eq.f32 %p173, %f131, 0f7F800000; setp.eq.f32 %p174, %f132, 0f7F800000; and.pred %p175, %p173, %p174; @%p175 bra $L__BB0_157; bra.uni $L__BB0_156; $L__BB0_157: mov.b32 %r382, %f130; setp.lt.s32 %p179, %r382, 0; selp.b32 %r383, 1075235812, 1061752795, %p179; mov.b32 %r384, %f129; 
and.b32 %r385, %r384, -2147483648; or.b32 %r386, %r385, %r383; mov.b32 %f873, %r386; bra.uni $L__BB0_159; $L__BB0_156: max.f32 %f651, %f132, %f131; min.f32 %f652, %f132, %f131; div.rn.f32 %f653, %f652, %f651; mul.rn.f32 %f654, %f653, %f653; mov.f32 %f655, 0fC0B59883; mov.f32 %f656, 0fBF52C7EA; fma.rn.f32 %f657, %f654, %f656, %f655; mov.f32 %f658, 0fC0D21907; fma.rn.f32 %f659, %f657, %f654, %f658; mul.f32 %f660, %f654, %f659; mul.f32 %f661, %f653, %f660; add.f32 %f662, %f654, 0f41355DC0; mov.f32 %f663, 0f41E6BD60; fma.rn.f32 %f664, %f662, %f654, %f663; mov.f32 %f665, 0f419D92C8; fma.rn.f32 %f666, %f664, %f654, %f665; rcp.rn.f32 %f667, %f666; fma.rn.f32 %f668, %f661, %f667, %f653; mov.f32 %f669, 0f3FC90FDB; sub.f32 %f670, %f669, %f668; setp.gt.f32 %p176, %f132, %f131; selp.f32 %f671, %f670, %f668, %p176; mov.b32 %r377, %f130; setp.lt.s32 %p177, %r377, 0; mov.f32 %f672, 0f40490FDB; sub.f32 %f673, %f672, %f671; selp.f32 %f674, %f673, %f671, %p177; mov.b32 %r378, %f674; mov.b32 %r379, %f129; and.b32 %r380, %r379, -2147483648; or.b32 %r381, %r380, %r378; mov.b32 %f675, %r381; add.f32 %f676, %f131, %f132; setp.le.f32 %p178, %f676, 0f7F800000; selp.f32 %f873, %f675, %f676, %p178; $L__BB0_159: add.f32 %f677, %f873, %f873; setp.eq.f32 %p180, %f128, 0f00000000; selp.f32 %f678, 0f3F800000, 0f3F000000, %p180; fma.rn.f32 %f878, %f678, %f677, 0f00000000; $L__BB0_160: not.pred %p181, %p3; @%p181 bra $L__BB0_187; @%p18 bra $L__BB0_174; @%p12 bra $L__BB0_164; bra.uni $L__BB0_163; $L__BB0_164: add.s32 %r395, %r125, -1; min.s32 %r512, %r6, %r395; bra.uni $L__BB0_165; $L__BB0_163: rem.s32 %r393, %r6, %r125; add.s32 %r394, %r393, %r125; rem.s32 %r512, %r394, %r125; $L__BB0_165: @%p88 bra $L__BB0_167; bra.uni $L__BB0_166; $L__BB0_167: max.s32 %r513, %r56, 0; bra.uni $L__BB0_168; $L__BB0_166: rem.s32 %r396, %r56, %r124; add.s32 %r397, %r396, %r124; rem.s32 %r513, %r397, %r124; $L__BB0_168: add.s32 %r398, %r512, %r4; mad.lo.s32 %r399, %r398, %r124, %r513; mul.wide.s32 %rd104, %r399, 4; 
add.s64 %rd105, %rd3, %rd104; add.s64 %rd106, %rd2, %rd104; add.s64 %rd107, %rd1, %rd104; ld.global.nc.f32 %f679, [%rd105]; ld.global.nc.f32 %f680, [%rd106]; mul.f32 %f681, %f680, %f680; fma.rn.f32 %f682, %f679, %f679, %f681; ld.global.nc.f32 %f683, [%rd107]; fma.rn.f32 %f139, %f683, %f683, %f682; mul.f32 %f684, %f121, %f123; mul.f32 %f685, %f120, %f124; sub.f32 %f686, %f685, %f684; mul.f32 %f687, %f119, %f124; mul.f32 %f688, %f121, %f122; sub.f32 %f689, %f688, %f687; mul.f32 %f690, %f120, %f122; mul.f32 %f691, %f119, %f123; sub.f32 %f692, %f691, %f690; mul.f32 %f693, %f2, %f689; fma.rn.f32 %f694, %f1, %f686, %f693; fma.rn.f32 %f140, %f3, %f692, %f694; mul.f32 %f695, %f2, %f120; fma.rn.f32 %f696, %f1, %f119, %f695; fma.rn.f32 %f697, %f3, %f121, %f696; add.f32 %f698, %f697, 0f3F800000; mul.f32 %f699, %f2, %f123; fma.rn.f32 %f700, %f1, %f122, %f699; fma.rn.f32 %f701, %f3, %f124, %f700; add.f32 %f702, %f698, %f701; mul.f32 %f703, %f120, %f123; fma.rn.f32 %f704, %f119, %f122, %f703; fma.rn.f32 %f705, %f121, %f124, %f704; add.f32 %f141, %f705, %f702; abs.f32 %f142, %f141; abs.f32 %f143, %f140; setp.eq.f32 %p185, %f142, 0f00000000; setp.eq.f32 %p186, %f143, 0f00000000; and.pred %p187, %p185, %p186; @%p187 bra $L__BB0_172; bra.uni $L__BB0_169; $L__BB0_172: mov.b32 %r410, %f141; shr.s32 %r411, %r410, 31; and.b32 %r412, %r411, 1078530011; mov.b32 %r413, %f140; and.b32 %r414, %r413, -2147483648; or.b32 %r415, %r414, %r412; mov.b32 %f875, %r415; bra.uni $L__BB0_173; $L__BB0_169: setp.eq.f32 %p188, %f142, 0f7F800000; setp.eq.f32 %p189, %f143, 0f7F800000; and.pred %p190, %p188, %p189; @%p190 bra $L__BB0_171; bra.uni $L__BB0_170; $L__BB0_171: mov.b32 %r405, %f141; setp.lt.s32 %p194, %r405, 0; selp.b32 %r406, 1075235812, 1061752795, %p194; mov.b32 %r407, %f140; and.b32 %r408, %r407, -2147483648; or.b32 %r409, %r408, %r406; mov.b32 %f875, %r409; bra.uni $L__BB0_173; $L__BB0_170: max.f32 %f706, %f143, %f142; min.f32 %f707, %f143, %f142; div.rn.f32 %f708, %f707, %f706; mul.rn.f32 
%f709, %f708, %f708; mov.f32 %f710, 0fC0B59883; mov.f32 %f711, 0fBF52C7EA; fma.rn.f32 %f712, %f709, %f711, %f710; mov.f32 %f713, 0fC0D21907; fma.rn.f32 %f714, %f712, %f709, %f713; mul.f32 %f715, %f709, %f714; mul.f32 %f716, %f708, %f715; add.f32 %f717, %f709, 0f41355DC0; mov.f32 %f718, 0f41E6BD60; fma.rn.f32 %f719, %f717, %f709, %f718; mov.f32 %f720, 0f419D92C8; fma.rn.f32 %f721, %f719, %f709, %f720; rcp.rn.f32 %f722, %f721; fma.rn.f32 %f723, %f716, %f722, %f708; mov.f32 %f724, 0f3FC90FDB; sub.f32 %f725, %f724, %f723; setp.gt.f32 %p191, %f143, %f142; selp.f32 %f726, %f725, %f723, %p191; mov.b32 %r400, %f141; setp.lt.s32 %p192, %r400, 0; mov.f32 %f727, 0f40490FDB; sub.f32 %f728, %f727, %f726; selp.f32 %f729, %f728, %f726, %p192; mov.b32 %r401, %f729; mov.b32 %r402, %f140; and.b32 %r403, %r402, -2147483648; or.b32 %r404, %r403, %r401; mov.b32 %f730, %r404; add.f32 %f731, %f142, %f143; setp.le.f32 %p193, %f731, 0f7F800000; selp.f32 %f875, %f730, %f731, %p193; $L__BB0_173: add.f32 %f732, %f875, %f875; setp.eq.f32 %p195, %f139, 0f00000000; selp.f32 %f733, 0f3F800000, 0f3F000000, %p195; fma.rn.f32 %f878, %f733, %f732, %f878; $L__BB0_174: not.pred %p224, %p2; @%p224 bra $L__BB0_187; @%p12 bra $L__BB0_177; bra.uni $L__BB0_176; $L__BB0_177: max.s32 %r514, %r14, 0; bra.uni $L__BB0_178; $L__BB0_176: rem.s32 %r416, %r14, %r125; add.s32 %r417, %r416, %r125; rem.s32 %r514, %r417, %r125; $L__BB0_178: @%p88 bra $L__BB0_180; bra.uni $L__BB0_179; $L__BB0_180: max.s32 %r515, %r56, 0; bra.uni $L__BB0_181; $L__BB0_179: rem.s32 %r418, %r56, %r124; add.s32 %r419, %r418, %r124; rem.s32 %r515, %r419, %r124; $L__BB0_181: add.s32 %r420, %r514, %r4; mad.lo.s32 %r421, %r420, %r124, %r515; mul.wide.s32 %rd108, %r421, 4; add.s64 %rd109, %rd3, %rd108; add.s64 %rd110, %rd2, %rd108; add.s64 %rd111, %rd1, %rd108; ld.global.nc.f32 %f734, [%rd109]; ld.global.nc.f32 %f735, [%rd110]; mul.f32 %f736, %f735, %f735; fma.rn.f32 %f737, %f734, %f734, %f736; ld.global.nc.f32 %f738, [%rd111]; fma.rn.f32 %f150, 
%f738, %f738, %f737; mul.f32 %f739, %f124, %f126; mul.f32 %f740, %f123, %f127; sub.f32 %f741, %f740, %f739; mul.f32 %f742, %f122, %f127; mul.f32 %f743, %f124, %f125; sub.f32 %f744, %f743, %f742; mul.f32 %f745, %f123, %f125; mul.f32 %f746, %f122, %f126; sub.f32 %f747, %f746, %f745; mul.f32 %f748, %f2, %f744; fma.rn.f32 %f749, %f1, %f741, %f748; fma.rn.f32 %f151, %f3, %f747, %f749; mul.f32 %f750, %f2, %f123; fma.rn.f32 %f751, %f1, %f122, %f750; fma.rn.f32 %f752, %f3, %f124, %f751; add.f32 %f753, %f752, 0f3F800000; mul.f32 %f754, %f2, %f126; fma.rn.f32 %f755, %f1, %f125, %f754; fma.rn.f32 %f756, %f3, %f127, %f755; add.f32 %f757, %f753, %f756; mul.f32 %f758, %f123, %f126; fma.rn.f32 %f759, %f122, %f125, %f758; fma.rn.f32 %f760, %f124, %f127, %f759; add.f32 %f152, %f760, %f757; abs.f32 %f153, %f152; abs.f32 %f154, %f151; setp.eq.f32 %p199, %f153, 0f00000000; setp.eq.f32 %p200, %f154, 0f00000000; and.pred %p201, %p199, %p200; @%p201 bra $L__BB0_185; bra.uni $L__BB0_182; $L__BB0_185: mov.b32 %r432, %f152; shr.s32 %r433, %r432, 31; and.b32 %r434, %r433, 1078530011; mov.b32 %r435, %f151; and.b32 %r436, %r435, -2147483648; or.b32 %r437, %r436, %r434; mov.b32 %f877, %r437; bra.uni $L__BB0_186; $L__BB0_182: setp.eq.f32 %p202, %f153, 0f7F800000; setp.eq.f32 %p203, %f154, 0f7F800000; and.pred %p204, %p202, %p203; @%p204 bra $L__BB0_184; bra.uni $L__BB0_183; $L__BB0_184: mov.b32 %r427, %f152; setp.lt.s32 %p208, %r427, 0; selp.b32 %r428, 1075235812, 1061752795, %p208; mov.b32 %r429, %f151; and.b32 %r430, %r429, -2147483648; or.b32 %r431, %r430, %r428; mov.b32 %f877, %r431; bra.uni $L__BB0_186; $L__BB0_183: max.f32 %f761, %f154, %f153; min.f32 %f762, %f154, %f153; div.rn.f32 %f763, %f762, %f761; mul.rn.f32 %f764, %f763, %f763; mov.f32 %f765, 0fC0B59883; mov.f32 %f766, 0fBF52C7EA; fma.rn.f32 %f767, %f764, %f766, %f765; mov.f32 %f768, 0fC0D21907; fma.rn.f32 %f769, %f767, %f764, %f768; mul.f32 %f770, %f764, %f769; mul.f32 %f771, %f763, %f770; add.f32 %f772, %f764, 0f41355DC0; mov.f32 
%f773, 0f41E6BD60; fma.rn.f32 %f774, %f772, %f764, %f773; mov.f32 %f775, 0f419D92C8; fma.rn.f32 %f776, %f774, %f764, %f775; rcp.rn.f32 %f777, %f776; fma.rn.f32 %f778, %f771, %f777, %f763; mov.f32 %f779, 0f3FC90FDB; sub.f32 %f780, %f779, %f778; setp.gt.f32 %p205, %f154, %f153; selp.f32 %f781, %f780, %f778, %p205; mov.b32 %r422, %f152; setp.lt.s32 %p206, %r422, 0; mov.f32 %f782, 0f40490FDB; sub.f32 %f783, %f782, %f781; selp.f32 %f784, %f783, %f781, %p206; mov.b32 %r423, %f784; mov.b32 %r424, %f151; and.b32 %r425, %r424, -2147483648; or.b32 %r426, %r425, %r423; mov.b32 %f785, %r426; add.f32 %f786, %f153, %f154; setp.le.f32 %p207, %f786, 0f7F800000; selp.f32 %f877, %f785, %f786, %p207; $L__BB0_186: add.f32 %f787, %f877, %f877; setp.eq.f32 %p209, %f150, 0f00000000; selp.f32 %f788, 0f3F800000, 0f3F000000, %p209; fma.rn.f32 %f878, %f788, %f787, %f878; $L__BB0_187: @%p148 bra $L__BB0_200; @%p12 bra $L__BB0_190; bra.uni $L__BB0_189; $L__BB0_190: max.s32 %r516, %r14, 0; bra.uni $L__BB0_191; $L__BB0_189: rem.s32 %r438, %r14, %r125; add.s32 %r439, %r438, %r125; rem.s32 %r516, %r439, %r125; $L__BB0_191: add.s32 %r120, %r516, %r4; @%p88 bra $L__BB0_193; bra.uni $L__BB0_192; $L__BB0_193: add.s32 %r442, %r124, -1; min.s32 %r517, %r49, %r442; bra.uni $L__BB0_194; $L__BB0_192: rem.s32 %r440, %r49, %r124; add.s32 %r441, %r440, %r124; rem.s32 %r517, %r441, %r124; $L__BB0_194: mad.lo.s32 %r443, %r120, %r124, %r517; mul.wide.s32 %rd112, %r443, 4; add.s64 %rd113, %rd3, %rd112; add.s64 %rd114, %rd2, %rd112; add.s64 %rd115, %rd1, %rd112; ld.global.nc.f32 %f789, [%rd113]; ld.global.nc.f32 %f790, [%rd114]; mul.f32 %f791, %f790, %f790; fma.rn.f32 %f792, %f789, %f789, %f791; ld.global.nc.f32 %f793, [%rd115]; fma.rn.f32 %f161, %f793, %f793, %f792; mul.f32 %f794, %f117, %f127; mul.f32 %f795, %f118, %f126; sub.f32 %f796, %f795, %f794; mul.f32 %f797, %f118, %f125; mul.f32 %f798, %f116, %f127; sub.f32 %f799, %f798, %f797; mul.f32 %f800, %f116, %f126; mul.f32 %f801, %f117, %f125; sub.f32 %f802, 
%f801, %f800; mul.f32 %f803, %f2, %f799; fma.rn.f32 %f804, %f1, %f796, %f803; fma.rn.f32 %f162, %f3, %f802, %f804; mul.f32 %f805, %f2, %f126; fma.rn.f32 %f806, %f1, %f125, %f805; fma.rn.f32 %f807, %f3, %f127, %f806; add.f32 %f808, %f807, 0f3F800000; mul.f32 %f809, %f2, %f117; fma.rn.f32 %f810, %f1, %f116, %f809; fma.rn.f32 %f811, %f3, %f118, %f810; add.f32 %f812, %f811, %f808; mul.f32 %f813, %f117, %f126; fma.rn.f32 %f814, %f116, %f125, %f813; fma.rn.f32 %f815, %f118, %f127, %f814; add.f32 %f163, %f815, %f812; abs.f32 %f164, %f163; abs.f32 %f165, %f162; setp.eq.f32 %p213, %f164, 0f00000000; setp.eq.f32 %p214, %f165, 0f00000000; and.pred %p215, %p213, %p214; @%p215 bra $L__BB0_198; bra.uni $L__BB0_195; $L__BB0_198: mov.b32 %r454, %f163; shr.s32 %r455, %r454, 31; and.b32 %r456, %r455, 1078530011; mov.b32 %r457, %f162; and.b32 %r458, %r457, -2147483648; or.b32 %r459, %r456, %r458; mov.b32 %f879, %r459; bra.uni $L__BB0_199; $L__BB0_195: setp.eq.f32 %p216, %f164, 0f7F800000; setp.eq.f32 %p217, %f165, 0f7F800000; and.pred %p218, %p216, %p217; @%p218 bra $L__BB0_197; bra.uni $L__BB0_196; $L__BB0_197: mov.b32 %r449, %f163; setp.lt.s32 %p222, %r449, 0; selp.b32 %r450, 1075235812, 1061752795, %p222; mov.b32 %r451, %f162; and.b32 %r452, %r451, -2147483648; or.b32 %r453, %r450, %r452; mov.b32 %f879, %r453; bra.uni $L__BB0_199; $L__BB0_196: max.f32 %f816, %f165, %f164; min.f32 %f817, %f165, %f164; div.rn.f32 %f818, %f817, %f816; mul.rn.f32 %f819, %f818, %f818; mov.f32 %f820, 0fC0B59883; mov.f32 %f821, 0fBF52C7EA; fma.rn.f32 %f822, %f819, %f821, %f820; mov.f32 %f823, 0fC0D21907; fma.rn.f32 %f824, %f822, %f819, %f823; mul.f32 %f825, %f819, %f824; mul.f32 %f826, %f818, %f825; add.f32 %f827, %f819, 0f41355DC0; mov.f32 %f828, 0f41E6BD60; fma.rn.f32 %f829, %f827, %f819, %f828; mov.f32 %f830, 0f419D92C8; fma.rn.f32 %f831, %f829, %f819, %f830; rcp.rn.f32 %f832, %f831; fma.rn.f32 %f833, %f826, %f832, %f818; mov.f32 %f834, 0f3FC90FDB; sub.f32 %f835, %f834, %f833; setp.gt.f32 %p219, 
%f165, %f164; selp.f32 %f836, %f835, %f833, %p219; mov.b32 %r444, %f163; setp.lt.s32 %p220, %r444, 0; mov.f32 %f837, 0f40490FDB; sub.f32 %f838, %f837, %f836; selp.f32 %f839, %f838, %f836, %p220; mov.b32 %r445, %f839; mov.b32 %r446, %f162; and.b32 %r447, %r446, -2147483648; or.b32 %r448, %r447, %r445; mov.b32 %f840, %r448; add.f32 %f841, %f164, %f165; setp.le.f32 %p221, %f841, 0f7F800000; selp.f32 %f879, %f840, %f841, %p221; $L__BB0_199: add.f32 %f842, %f879, %f879; setp.eq.f32 %p223, %f161, 0f00000000; selp.f32 %f843, 0f3F800000, 0f3F000000, %p223; fma.rn.f32 %f878, %f843, %f842, %f878; $L__BB0_200: ld.param.u64 %rd123, [setemergentmagneticfieldsolidangle_param_1]; mov.u32 %r481, %tid.x; mov.u32 %r480, %ntid.x; mov.u32 %r479, %ctaid.x; mad.lo.s32 %r478, %r479, %r480, %r481; add.s32 %r477, %r5, %r478; mul.wide.s32 %rd122, %r477, 4; cvta.to.global.u64 %rd121, %rd123; add.s64 %rd120, %rd121, %rd122; ld.param.f32 %f856, [setemergentmagneticfieldsolidangle_param_9]; ld.param.f32 %f855, [setemergentmagneticfieldsolidangle_param_8]; ld.param.u64 %rd119, [setemergentmagneticfieldsolidangle_param_0]; mov.u32 %r476, %tid.z; mov.u32 %r475, %ntid.z; mov.u32 %r474, %ctaid.z; mad.lo.s32 %r473, %r474, %r475, %r476; mov.u32 %r472, %tid.y; mov.u32 %r471, %ntid.y; mov.u32 %r470, %ctaid.y; mad.lo.s32 %r469, %r470, %r471, %r472; mul.lo.s32 %r468, %r473, %r125; add.s32 %r467, %r468, %r469; mov.u32 %r466, %tid.x; mov.u32 %r465, %ntid.x; mov.u32 %r464, %ctaid.x; mad.lo.s32 %r463, %r464, %r465, %r466; mul.lo.s32 %r462, %r467, %r124; add.s32 %r461, %r462, %r463; mul.wide.s32 %rd118, %r461, 4; cvta.to.global.u64 %rd117, %rd119; add.s64 %rd116, %rd117, %rd118; ld.param.f32 %f854, [setemergentmagneticfieldsolidangle_param_7]; ld.param.f32 %f853, [setemergentmagneticfieldsolidangle_param_6]; add.f32 %f844, %f853, %f853; mul.f32 %f845, %f844, %f854; mul.f32 %f846, %f845, %f862; st.global.f32 [%rd116], %f846; mul.f32 %f847, %f844, %f855; mul.f32 %f848, %f847, %f868; st.global.f32 [%rd120], 
%f848; mul.f32 %f849, %f844, %f856; mul.f32 %f850, %f849, %f878; st.global.f32 [%rd6], %f850; $L__BB0_202: ret; } ` setemergentmagneticfieldsolidangle_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl setemergentmagneticfieldsolidangle .visible .entry setemergentmagneticfieldsolidangle( .param .u64 setemergentmagneticfieldsolidangle_param_0, .param .u64 setemergentmagneticfieldsolidangle_param_1, .param .u64 setemergentmagneticfieldsolidangle_param_2, .param .u64 setemergentmagneticfieldsolidangle_param_3, .param .u64 setemergentmagneticfieldsolidangle_param_4, .param .u64 setemergentmagneticfieldsolidangle_param_5, .param .f32 setemergentmagneticfieldsolidangle_param_6, .param .f32 setemergentmagneticfieldsolidangle_param_7, .param .f32 setemergentmagneticfieldsolidangle_param_8, .param .f32 setemergentmagneticfieldsolidangle_param_9, .param .u32 setemergentmagneticfieldsolidangle_param_10, .param .u32 setemergentmagneticfieldsolidangle_param_11, .param .u32 setemergentmagneticfieldsolidangle_param_12, .param .u8 setemergentmagneticfieldsolidangle_param_13 ) { .reg .pred %p<225>; .reg .b16 %rs<8>; .reg .f32 %f<881>; .reg .b32 %r<518>; .reg .b64 %rd<124>; ld.param.u8 %rs4, [setemergentmagneticfieldsolidangle_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldsolidangle_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldsolidangle_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldsolidangle_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldsolidangle_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldsolidangle_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldsolidangle_param_5]; ld.param.u32 %r124, [setemergentmagneticfieldsolidangle_param_10]; ld.param.u32 %r125, [setemergentmagneticfieldsolidangle_param_11]; ld.param.u32 %r126, [setemergentmagneticfieldsolidangle_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r127, %ntid.x; mov.u32 %r128, %ctaid.x; 
mov.u32 %r129, %tid.x; mad.lo.s32 %r1, %r128, %r127, %r129; mov.u32 %r130, %ntid.y; mov.u32 %r131, %ctaid.y; mov.u32 %r132, %tid.y; mad.lo.s32 %r2, %r131, %r130, %r132; mov.u32 %r133, %ntid.z; mov.u32 %r134, %ctaid.z; mov.u32 %r135, %tid.z; mad.lo.s32 %r3, %r134, %r133, %r135; setp.ge.s32 %p6, %r1, %r124; setp.ge.s32 %p7, %r2, %r125; or.pred %p8, %p6, %p7; setp.ge.s32 %p9, %r3, %r126; or.pred %p10, %p8, %p9; @%p10 bra $L__BB0_202; mul.lo.s32 %r4, %r3, %r125; add.s32 %r136, %r4, %r2; mul.lo.s32 %r5, %r136, %r124; add.s32 %r137, %r5, %r1; mul.wide.s32 %rd13, %r137, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f176, %f2, %f2; fma.rn.f32 %f177, %f1, %f1, %f176; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f178, %f3, %f3, %f177; setp.eq.f32 %p11, %f178, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p11 bra $L__BB0_201; bra.uni $L__BB0_2; $L__BB0_201: mov.u32 %r460, 0; st.global.u32 [%rd4], %r460; st.global.u32 [%rd5], %r460; st.global.u32 [%rd6], %r460; bra.uni $L__BB0_202; $L__BB0_2: and.b16 %rs1, %rs4, 2; setp.eq.s16 %p12, %rs1, 0; add.s32 %r6, %r2, 1; @%p12 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r140, %r125, -1; min.s32 %r482, %r6, %r140; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r138, %r6, %r125; add.s32 %r139, %r138, %r125; rem.s32 %r482, %r139, %r125; $L__BB0_5: and.b16 %rs2, %rs4, 4; setp.eq.s16 %p13, %rs2, 0; add.s32 %r10, %r3, 1; @%p13 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r143, %r126, -1; min.s32 %r483, %r10, %r143; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r141, %r10, %r126; add.s32 %r142, %r141, %r126; rem.s32 %r483, %r142, %r126; $L__BB0_8: add.s32 %r14, %r2, -1; @%p12 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r484, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 
%r144, %r14, %r125; add.s32 %r145, %r144, %r125; rem.s32 %r484, %r145, %r125; $L__BB0_11: add.s32 %r18, %r3, -1; @%p13 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r485, %r18, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r146, %r18, %r126; add.s32 %r147, %r146, %r126; rem.s32 %r485, %r147, %r126; $L__BB0_14: mad.lo.s32 %r148, %r485, %r125, %r2; mad.lo.s32 %r149, %r148, %r124, %r1; add.s32 %r150, %r482, %r4; mad.lo.s32 %r151, %r150, %r124, %r1; mul.wide.s32 %rd20, %r151, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f4, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f5, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f6, [%rd23]; mad.lo.s32 %r152, %r483, %r125, %r2; mad.lo.s32 %r153, %r152, %r124, %r1; mul.wide.s32 %rd24, %r153, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f7, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f8, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f9, [%rd27]; add.s32 %r154, %r484, %r4; mad.lo.s32 %r155, %r154, %r124, %r1; mul.wide.s32 %rd28, %r155, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f10, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f12, [%rd31]; mul.wide.s32 %rd32, %r149, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f13, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f14, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f15, [%rd35]; setp.ne.s16 %p16, %rs1, 0; setp.lt.s32 %p17, %r6, %r125; or.pred %p1, %p17, %p16; not.pred %p18, %p1; mov.f32 %f862, 0f00000000; @%p18 bra $L__BB0_28; setp.ge.s32 %p19, %r10, %r126; and.pred %p21, %p19, %p13; @%p21 bra $L__BB0_28; @%p13 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r158, %r126, -1; min.s32 %r486, %r10, %r158; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r156, %r10, %r126; add.s32 %r157, %r156, %r126; rem.s32 %r486, %r157, %r126; $L__BB0_19: @%p12 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r161, %r125, -1; min.s32 %r487, %r6, 
%r161; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r159, %r6, %r125; add.s32 %r160, %r159, %r125; rem.s32 %r487, %r160, %r125; $L__BB0_22: mad.lo.s32 %r162, %r486, %r125, %r487; mad.lo.s32 %r163, %r162, %r124, %r1; mul.wide.s32 %rd36, %r163, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f181, [%rd37]; ld.global.nc.f32 %f182, [%rd38]; mul.f32 %f183, %f182, %f182; fma.rn.f32 %f184, %f181, %f181, %f183; ld.global.nc.f32 %f185, [%rd39]; fma.rn.f32 %f16, %f185, %f185, %f184; mul.f32 %f186, %f6, %f8; mul.f32 %f187, %f5, %f9; sub.f32 %f188, %f187, %f186; mul.f32 %f189, %f4, %f9; mul.f32 %f190, %f6, %f7; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f5, %f7; mul.f32 %f193, %f4, %f8; sub.f32 %f194, %f193, %f192; mul.f32 %f195, %f2, %f191; fma.rn.f32 %f196, %f1, %f188, %f195; fma.rn.f32 %f17, %f3, %f194, %f196; mul.f32 %f197, %f2, %f5; fma.rn.f32 %f198, %f1, %f4, %f197; fma.rn.f32 %f199, %f3, %f6, %f198; add.f32 %f200, %f199, 0f3F800000; mul.f32 %f201, %f2, %f8; fma.rn.f32 %f202, %f1, %f7, %f201; fma.rn.f32 %f203, %f3, %f9, %f202; add.f32 %f204, %f200, %f203; mul.f32 %f205, %f5, %f8; fma.rn.f32 %f206, %f4, %f7, %f205; fma.rn.f32 %f207, %f6, %f9, %f206; add.f32 %f18, %f207, %f204; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p24, %f19, 0f00000000; setp.eq.f32 %p25, %f20, 0f00000000; and.pred %p26, %p24, %p25; @%p26 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r174, %f18; shr.s32 %r175, %r174, 31; and.b32 %r176, %r175, 1078530011; mov.b32 %r177, %f17; and.b32 %r178, %r177, -2147483648; or.b32 %r179, %r178, %r176; mov.b32 %f857, %r179; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p27, %f19, 0f7F800000; setp.eq.f32 %p28, %f20, 0f7F800000; and.pred %p29, %p27, %p28; @%p29 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r169, %f18; setp.lt.s32 %p33, %r169, 0; selp.b32 %r170, 1075235812, 1061752795, %p33; mov.b32 %r171, %f17; and.b32 %r172, %r171, -2147483648; or.b32 %r173, %r172, %r170; mov.b32 
%f857, %r173; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f208, %f20, %f19; min.f32 %f209, %f20, %f19; div.rn.f32 %f210, %f209, %f208; mul.rn.f32 %f211, %f210, %f210; mov.f32 %f212, 0fC0B59883; mov.f32 %f213, 0fBF52C7EA; fma.rn.f32 %f214, %f211, %f213, %f212; mov.f32 %f215, 0fC0D21907; fma.rn.f32 %f216, %f214, %f211, %f215; mul.f32 %f217, %f211, %f216; mul.f32 %f218, %f210, %f217; add.f32 %f219, %f211, 0f41355DC0; mov.f32 %f220, 0f41E6BD60; fma.rn.f32 %f221, %f219, %f211, %f220; mov.f32 %f222, 0f419D92C8; fma.rn.f32 %f223, %f221, %f211, %f222; rcp.rn.f32 %f224, %f223; fma.rn.f32 %f225, %f218, %f224, %f210; mov.f32 %f226, 0f3FC90FDB; sub.f32 %f227, %f226, %f225; setp.gt.f32 %p30, %f20, %f19; selp.f32 %f228, %f227, %f225, %p30; mov.b32 %r164, %f18; setp.lt.s32 %p31, %r164, 0; mov.f32 %f229, 0f40490FDB; sub.f32 %f230, %f229, %f228; selp.f32 %f231, %f230, %f228, %p31; mov.b32 %r165, %f231; mov.b32 %r166, %f17; and.b32 %r167, %r166, -2147483648; or.b32 %r168, %r167, %r165; mov.b32 %f232, %r168; add.f32 %f233, %f19, %f20; setp.le.f32 %p32, %f233, 0f7F800000; selp.f32 %f857, %f232, %f233, %p32; $L__BB0_27: add.f32 %f234, %f857, %f857; setp.eq.f32 %p34, %f16, 0f00000000; selp.f32 %f235, 0f3F800000, 0f3F000000, %p34; fma.rn.f32 %f862, %f235, %f234, 0f00000000; $L__BB0_28: setp.gt.s32 %p35, %r2, 0; or.pred %p2, %p35, %p16; not.pred %p37, %p2; @%p37 bra $L__BB0_55; setp.ge.s32 %p38, %r10, %r126; and.pred %p40, %p38, %p13; @%p40 bra $L__BB0_42; @%p13 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r182, %r126, -1; min.s32 %r488, %r10, %r182; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r180, %r10, %r126; add.s32 %r181, %r180, %r126; rem.s32 %r488, %r181, %r126; $L__BB0_33: @%p12 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r489, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r183, %r14, %r125; add.s32 %r184, %r183, %r125; rem.s32 %r489, %r184, %r125; $L__BB0_36: mad.lo.s32 %r185, %r488, %r125, %r489; mad.lo.s32 %r186, %r185, %r124, %r1; mul.wide.s32 %rd40, 
%r186, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f236, [%rd41]; ld.global.nc.f32 %f237, [%rd42]; mul.f32 %f238, %f237, %f237; fma.rn.f32 %f239, %f236, %f236, %f238; ld.global.nc.f32 %f240, [%rd43]; fma.rn.f32 %f27, %f240, %f240, %f239; mul.f32 %f241, %f9, %f11; mul.f32 %f242, %f8, %f12; sub.f32 %f243, %f242, %f241; mul.f32 %f244, %f7, %f12; mul.f32 %f245, %f9, %f10; sub.f32 %f246, %f245, %f244; mul.f32 %f247, %f8, %f10; mul.f32 %f248, %f7, %f11; sub.f32 %f249, %f248, %f247; mul.f32 %f250, %f2, %f246; fma.rn.f32 %f251, %f1, %f243, %f250; fma.rn.f32 %f28, %f3, %f249, %f251; mul.f32 %f252, %f2, %f8; fma.rn.f32 %f253, %f1, %f7, %f252; fma.rn.f32 %f254, %f3, %f9, %f253; add.f32 %f255, %f254, 0f3F800000; mul.f32 %f256, %f2, %f11; fma.rn.f32 %f257, %f1, %f10, %f256; fma.rn.f32 %f258, %f3, %f12, %f257; add.f32 %f259, %f255, %f258; mul.f32 %f260, %f8, %f11; fma.rn.f32 %f261, %f7, %f10, %f260; fma.rn.f32 %f262, %f9, %f12, %f261; add.f32 %f29, %f262, %f259; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p43, %f30, 0f00000000; setp.eq.f32 %p44, %f31, 0f00000000; and.pred %p45, %p43, %p44; @%p45 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r197, %f29; shr.s32 %r198, %r197, 31; and.b32 %r199, %r198, 1078530011; mov.b32 %r200, %f28; and.b32 %r201, %r200, -2147483648; or.b32 %r202, %r201, %r199; mov.b32 %f859, %r202; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p46, %f30, 0f7F800000; setp.eq.f32 %p47, %f31, 0f7F800000; and.pred %p48, %p46, %p47; @%p48 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r192, %f29; setp.lt.s32 %p52, %r192, 0; selp.b32 %r193, 1075235812, 1061752795, %p52; mov.b32 %r194, %f28; and.b32 %r195, %r194, -2147483648; or.b32 %r196, %r195, %r193; mov.b32 %f859, %r196; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f263, %f31, %f30; min.f32 %f264, %f31, %f30; div.rn.f32 %f265, %f264, %f263; mul.rn.f32 %f266, %f265, %f265; mov.f32 %f267, 0fC0B59883; mov.f32 %f268, 0fBF52C7EA; 
fma.rn.f32 %f269, %f266, %f268, %f267; mov.f32 %f270, 0fC0D21907; fma.rn.f32 %f271, %f269, %f266, %f270; mul.f32 %f272, %f266, %f271; mul.f32 %f273, %f265, %f272; add.f32 %f274, %f266, 0f41355DC0; mov.f32 %f275, 0f41E6BD60; fma.rn.f32 %f276, %f274, %f266, %f275; mov.f32 %f277, 0f419D92C8; fma.rn.f32 %f278, %f276, %f266, %f277; rcp.rn.f32 %f279, %f278; fma.rn.f32 %f280, %f273, %f279, %f265; mov.f32 %f281, 0f3FC90FDB; sub.f32 %f282, %f281, %f280; setp.gt.f32 %p49, %f31, %f30; selp.f32 %f283, %f282, %f280, %p49; mov.b32 %r187, %f29; setp.lt.s32 %p50, %r187, 0; mov.f32 %f284, 0f40490FDB; sub.f32 %f285, %f284, %f283; selp.f32 %f286, %f285, %f283, %p50; mov.b32 %r188, %f286; mov.b32 %r189, %f28; and.b32 %r190, %r189, -2147483648; or.b32 %r191, %r190, %r188; mov.b32 %f287, %r191; add.f32 %f288, %f30, %f31; setp.le.f32 %p51, %f288, 0f7F800000; selp.f32 %f859, %f287, %f288, %p51; $L__BB0_41: add.f32 %f289, %f859, %f859; setp.eq.f32 %p53, %f27, 0f00000000; selp.f32 %f290, 0f3F800000, 0f3F000000, %p53; fma.rn.f32 %f862, %f290, %f289, %f862; $L__BB0_42: setp.lt.s32 %p54, %r3, 1; and.pred %p56, %p54, %p13; @%p56 bra $L__BB0_55; @%p13 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r490, %r18, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r203, %r18, %r126; add.s32 %r204, %r203, %r126; rem.s32 %r490, %r204, %r126; $L__BB0_46: @%p12 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r491, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r205, %r14, %r125; add.s32 %r206, %r205, %r125; rem.s32 %r491, %r206, %r125; $L__BB0_49: mad.lo.s32 %r207, %r490, %r125, %r491; mad.lo.s32 %r208, %r207, %r124, %r1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f291, [%rd45]; ld.global.nc.f32 %f292, [%rd46]; mul.f32 %f293, %f292, %f292; fma.rn.f32 %f294, %f291, %f291, %f293; ld.global.nc.f32 %f295, [%rd47]; fma.rn.f32 %f38, %f295, %f295, %f294; mul.f32 %f296, %f12, %f14; mul.f32 %f297, %f11, %f15; 
sub.f32 %f298, %f297, %f296; mul.f32 %f299, %f10, %f15; mul.f32 %f300, %f12, %f13; sub.f32 %f301, %f300, %f299; mul.f32 %f302, %f11, %f13; mul.f32 %f303, %f10, %f14; sub.f32 %f304, %f303, %f302; mul.f32 %f305, %f2, %f301; fma.rn.f32 %f306, %f1, %f298, %f305; fma.rn.f32 %f39, %f3, %f304, %f306; mul.f32 %f307, %f2, %f11; fma.rn.f32 %f308, %f1, %f10, %f307; fma.rn.f32 %f309, %f3, %f12, %f308; add.f32 %f310, %f309, 0f3F800000; mul.f32 %f311, %f2, %f14; fma.rn.f32 %f312, %f1, %f13, %f311; fma.rn.f32 %f313, %f3, %f15, %f312; add.f32 %f314, %f310, %f313; mul.f32 %f315, %f11, %f14; fma.rn.f32 %f316, %f10, %f13, %f315; fma.rn.f32 %f317, %f12, %f15, %f316; add.f32 %f40, %f317, %f314; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p59, %f41, 0f00000000; setp.eq.f32 %p60, %f42, 0f00000000; and.pred %p61, %p59, %p60; @%p61 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r219, %f40; shr.s32 %r220, %r219, 31; and.b32 %r221, %r220, 1078530011; mov.b32 %r222, %f39; and.b32 %r223, %r222, -2147483648; or.b32 %r224, %r223, %r221; mov.b32 %f861, %r224; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p62, %f41, 0f7F800000; setp.eq.f32 %p63, %f42, 0f7F800000; and.pred %p64, %p62, %p63; @%p64 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r214, %f40; setp.lt.s32 %p68, %r214, 0; selp.b32 %r215, 1075235812, 1061752795, %p68; mov.b32 %r216, %f39; and.b32 %r217, %r216, -2147483648; or.b32 %r218, %r217, %r215; mov.b32 %f861, %r218; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f318, %f42, %f41; min.f32 %f319, %f42, %f41; div.rn.f32 %f320, %f319, %f318; mul.rn.f32 %f321, %f320, %f320; mov.f32 %f322, 0fC0B59883; mov.f32 %f323, 0fBF52C7EA; fma.rn.f32 %f324, %f321, %f323, %f322; mov.f32 %f325, 0fC0D21907; fma.rn.f32 %f326, %f324, %f321, %f325; mul.f32 %f327, %f321, %f326; mul.f32 %f328, %f320, %f327; add.f32 %f329, %f321, 0f41355DC0; mov.f32 %f330, 0f41E6BD60; fma.rn.f32 %f331, %f329, %f321, %f330; mov.f32 %f332, 0f419D92C8; fma.rn.f32 %f333, %f331, %f321, %f332; rcp.rn.f32 %f334, 
%f333; fma.rn.f32 %f335, %f328, %f334, %f320; mov.f32 %f336, 0f3FC90FDB; sub.f32 %f337, %f336, %f335; setp.gt.f32 %p65, %f42, %f41; selp.f32 %f338, %f337, %f335, %p65; mov.b32 %r209, %f40; setp.lt.s32 %p66, %r209, 0; mov.f32 %f339, 0f40490FDB; sub.f32 %f340, %f339, %f338; selp.f32 %f341, %f340, %f338, %p66; mov.b32 %r210, %f341; mov.b32 %r211, %f39; and.b32 %r212, %r211, -2147483648; or.b32 %r213, %r212, %r210; mov.b32 %f342, %r213; add.f32 %f343, %f41, %f42; setp.le.f32 %p67, %f343, 0f7F800000; selp.f32 %f861, %f342, %f343, %p67; $L__BB0_54: add.f32 %f344, %f861, %f861; setp.eq.f32 %p69, %f38, 0f00000000; selp.f32 %f345, 0f3F800000, 0f3F000000, %p69; fma.rn.f32 %f862, %f345, %f344, %f862; $L__BB0_55: @%p18 bra $L__BB0_69; setp.lt.s32 %p71, %r3, 1; and.pred %p73, %p71, %p13; @%p73 bra $L__BB0_69; @%p13 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r492, %r18, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r225, %r18, %r126; add.s32 %r226, %r225, %r126; rem.s32 %r492, %r226, %r126; $L__BB0_60: @%p12 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r229, %r125, -1; min.s32 %r493, %r6, %r229; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r227, %r6, %r125; add.s32 %r228, %r227, %r125; rem.s32 %r493, %r228, %r125; $L__BB0_63: mad.lo.s32 %r230, %r492, %r125, %r493; mad.lo.s32 %r231, %r230, %r124, %r1; mul.wide.s32 %rd48, %r231, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f346, [%rd49]; ld.global.nc.f32 %f347, [%rd50]; mul.f32 %f348, %f347, %f347; fma.rn.f32 %f349, %f346, %f346, %f348; ld.global.nc.f32 %f350, [%rd51]; fma.rn.f32 %f49, %f350, %f350, %f349; mul.f32 %f351, %f5, %f15; mul.f32 %f352, %f6, %f14; sub.f32 %f353, %f352, %f351; mul.f32 %f354, %f6, %f13; mul.f32 %f355, %f4, %f15; sub.f32 %f356, %f355, %f354; mul.f32 %f357, %f4, %f14; mul.f32 %f358, %f5, %f13; sub.f32 %f359, %f358, %f357; mul.f32 %f360, %f2, %f356; fma.rn.f32 %f361, %f1, %f353, %f360; fma.rn.f32 %f50, %f3, %f359, %f361; mul.f32 
%f362, %f2, %f14; fma.rn.f32 %f363, %f1, %f13, %f362; fma.rn.f32 %f364, %f3, %f15, %f363; add.f32 %f365, %f364, 0f3F800000; mul.f32 %f366, %f2, %f5; fma.rn.f32 %f367, %f1, %f4, %f366; fma.rn.f32 %f368, %f3, %f6, %f367; add.f32 %f369, %f368, %f365; mul.f32 %f370, %f5, %f14; fma.rn.f32 %f371, %f4, %f13, %f370; fma.rn.f32 %f372, %f6, %f15, %f371; add.f32 %f51, %f372, %f369; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p76, %f52, 0f00000000; setp.eq.f32 %p77, %f53, 0f00000000; and.pred %p78, %p76, %p77; @%p78 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r242, %f51; shr.s32 %r243, %r242, 31; and.b32 %r244, %r243, 1078530011; mov.b32 %r245, %f50; and.b32 %r246, %r245, -2147483648; or.b32 %r247, %r244, %r246; mov.b32 %f863, %r247; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p79, %f52, 0f7F800000; setp.eq.f32 %p80, %f53, 0f7F800000; and.pred %p81, %p79, %p80; @%p81 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r237, %f51; setp.lt.s32 %p85, %r237, 0; selp.b32 %r238, 1075235812, 1061752795, %p85; mov.b32 %r239, %f50; and.b32 %r240, %r239, -2147483648; or.b32 %r241, %r238, %r240; mov.b32 %f863, %r241; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f373, %f53, %f52; min.f32 %f374, %f53, %f52; div.rn.f32 %f375, %f374, %f373; mul.rn.f32 %f376, %f375, %f375; mov.f32 %f377, 0fC0B59883; mov.f32 %f378, 0fBF52C7EA; fma.rn.f32 %f379, %f376, %f378, %f377; mov.f32 %f380, 0fC0D21907; fma.rn.f32 %f381, %f379, %f376, %f380; mul.f32 %f382, %f376, %f381; mul.f32 %f383, %f375, %f382; add.f32 %f384, %f376, 0f41355DC0; mov.f32 %f385, 0f41E6BD60; fma.rn.f32 %f386, %f384, %f376, %f385; mov.f32 %f387, 0f419D92C8; fma.rn.f32 %f388, %f386, %f376, %f387; rcp.rn.f32 %f389, %f388; fma.rn.f32 %f390, %f383, %f389, %f375; mov.f32 %f391, 0f3FC90FDB; sub.f32 %f392, %f391, %f390; setp.gt.f32 %p82, %f53, %f52; selp.f32 %f393, %f392, %f390, %p82; mov.b32 %r232, %f51; setp.lt.s32 %p83, %r232, 0; mov.f32 %f394, 0f40490FDB; sub.f32 %f395, %f394, %f393; selp.f32 %f396, %f395, %f393, %p83; 
mov.b32 %r233, %f396; mov.b32 %r234, %f50; and.b32 %r235, %r234, -2147483648; or.b32 %r236, %r235, %r233; mov.b32 %f397, %r236; add.f32 %f398, %f52, %f53; setp.le.f32 %p84, %f398, 0f7F800000; selp.f32 %f863, %f397, %f398, %p84; $L__BB0_68: add.f32 %f399, %f863, %f863; setp.eq.f32 %p86, %f49, 0f00000000; selp.f32 %f400, 0f3F800000, 0f3F000000, %p86; fma.rn.f32 %f862, %f400, %f399, %f862; $L__BB0_69: @%p13 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: add.s32 %r250, %r126, -1; min.s32 %r494, %r10, %r250; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r248, %r10, %r126; add.s32 %r249, %r248, %r126; rem.s32 %r494, %r249, %r126; $L__BB0_72: ld.param.u8 %rs7, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs3, %rs7, 1; setp.eq.s16 %p88, %rs3, 0; add.s32 %r49, %r1, 1; @%p88 bra $L__BB0_74; bra.uni $L__BB0_73; $L__BB0_74: add.s32 %r253, %r124, -1; min.s32 %r495, %r49, %r253; bra.uni $L__BB0_75; $L__BB0_73: rem.s32 %r251, %r49, %r124; add.s32 %r252, %r251, %r124; rem.s32 %r495, %r252, %r124; $L__BB0_75: @%p13 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r496, %r18, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r254, %r18, %r126; add.s32 %r255, %r254, %r126; rem.s32 %r496, %r255, %r126; $L__BB0_78: add.s32 %r56, %r1, -1; @%p88 bra $L__BB0_80; bra.uni $L__BB0_79; $L__BB0_80: max.s32 %r497, %r56, 0; bra.uni $L__BB0_81; $L__BB0_79: rem.s32 %r256, %r56, %r124; add.s32 %r257, %r256, %r124; rem.s32 %r497, %r257, %r124; $L__BB0_81: mad.lo.s32 %r258, %r494, %r125, %r2; mad.lo.s32 %r259, %r258, %r124, %r1; mul.wide.s32 %rd52, %r259, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f60, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f61, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f62, [%rd55]; add.s32 %r260, %r495, %r5; mul.wide.s32 %rd56, %r260, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f63, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f64, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f65, [%rd59]; mad.lo.s32 %r261, %r496, 
%r125, %r2; mad.lo.s32 %r262, %r261, %r124, %r1; mul.wide.s32 %rd60, %r262, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f66, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f67, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f68, [%rd63]; add.s32 %r263, %r497, %r5; mul.wide.s32 %rd64, %r263, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f69, [%rd65]; add.s64 %rd66, %rd2, %rd64; ld.global.nc.f32 %f70, [%rd66]; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f71, [%rd67]; setp.ge.s32 %p92, %r10, %r126; and.pred %p93, %p92, %p13; mov.f32 %f868, 0f00000000; @%p93 bra $L__BB0_95; setp.ge.s32 %p94, %r49, %r124; and.pred %p96, %p94, %p88; @%p96 bra $L__BB0_95; @%p13 bra $L__BB0_85; bra.uni $L__BB0_84; $L__BB0_85: add.s32 %r266, %r126, -1; min.s32 %r498, %r10, %r266; bra.uni $L__BB0_86; $L__BB0_84: rem.s32 %r264, %r10, %r126; add.s32 %r265, %r264, %r126; rem.s32 %r498, %r265, %r126; $L__BB0_86: @%p88 bra $L__BB0_88; bra.uni $L__BB0_87; $L__BB0_88: add.s32 %r269, %r124, -1; min.s32 %r499, %r49, %r269; bra.uni $L__BB0_89; $L__BB0_87: rem.s32 %r267, %r49, %r124; add.s32 %r268, %r267, %r124; rem.s32 %r499, %r268, %r124; $L__BB0_89: mad.lo.s32 %r270, %r498, %r125, %r2; mad.lo.s32 %r271, %r270, %r124, %r499; mul.wide.s32 %rd68, %r271, 4; add.s64 %rd69, %rd3, %rd68; add.s64 %rd70, %rd2, %rd68; add.s64 %rd71, %rd1, %rd68; ld.global.nc.f32 %f403, [%rd69]; ld.global.nc.f32 %f404, [%rd70]; mul.f32 %f405, %f404, %f404; fma.rn.f32 %f406, %f403, %f403, %f405; ld.global.nc.f32 %f407, [%rd71]; fma.rn.f32 %f72, %f407, %f407, %f406; mul.f32 %f408, %f62, %f64; mul.f32 %f409, %f61, %f65; sub.f32 %f410, %f409, %f408; mul.f32 %f411, %f60, %f65; mul.f32 %f412, %f62, %f63; sub.f32 %f413, %f412, %f411; mul.f32 %f414, %f61, %f63; mul.f32 %f415, %f60, %f64; sub.f32 %f416, %f415, %f414; mul.f32 %f417, %f2, %f413; fma.rn.f32 %f418, %f1, %f410, %f417; fma.rn.f32 %f73, %f3, %f416, %f418; mul.f32 %f419, %f2, %f61; fma.rn.f32 %f420, %f1, %f60, %f419; fma.rn.f32 %f421, %f3, %f62, 
%f420; add.f32 %f422, %f421, 0f3F800000; mul.f32 %f423, %f2, %f64; fma.rn.f32 %f424, %f1, %f63, %f423; fma.rn.f32 %f425, %f3, %f65, %f424; add.f32 %f426, %f422, %f425; mul.f32 %f427, %f61, %f64; fma.rn.f32 %f428, %f60, %f63, %f427; fma.rn.f32 %f429, %f62, %f65, %f428; add.f32 %f74, %f429, %f426; abs.f32 %f75, %f74; abs.f32 %f76, %f73; setp.eq.f32 %p99, %f75, 0f00000000; setp.eq.f32 %p100, %f76, 0f00000000; and.pred %p101, %p99, %p100; @%p101 bra $L__BB0_93; bra.uni $L__BB0_90; $L__BB0_93: mov.b32 %r282, %f74; shr.s32 %r283, %r282, 31; and.b32 %r284, %r283, 1078530011; mov.b32 %r285, %f73; and.b32 %r286, %r285, -2147483648; or.b32 %r287, %r286, %r284; mov.b32 %f865, %r287; bra.uni $L__BB0_94; $L__BB0_90: setp.eq.f32 %p102, %f75, 0f7F800000; setp.eq.f32 %p103, %f76, 0f7F800000; and.pred %p104, %p102, %p103; @%p104 bra $L__BB0_92; bra.uni $L__BB0_91; $L__BB0_92: mov.b32 %r277, %f74; setp.lt.s32 %p108, %r277, 0; selp.b32 %r278, 1075235812, 1061752795, %p108; mov.b32 %r279, %f73; and.b32 %r280, %r279, -2147483648; or.b32 %r281, %r280, %r278; mov.b32 %f865, %r281; bra.uni $L__BB0_94; $L__BB0_91: max.f32 %f430, %f76, %f75; min.f32 %f431, %f76, %f75; div.rn.f32 %f432, %f431, %f430; mul.rn.f32 %f433, %f432, %f432; mov.f32 %f434, 0fC0B59883; mov.f32 %f435, 0fBF52C7EA; fma.rn.f32 %f436, %f433, %f435, %f434; mov.f32 %f437, 0fC0D21907; fma.rn.f32 %f438, %f436, %f433, %f437; mul.f32 %f439, %f433, %f438; mul.f32 %f440, %f432, %f439; add.f32 %f441, %f433, 0f41355DC0; mov.f32 %f442, 0f41E6BD60; fma.rn.f32 %f443, %f441, %f433, %f442; mov.f32 %f444, 0f419D92C8; fma.rn.f32 %f445, %f443, %f433, %f444; rcp.rn.f32 %f446, %f445; fma.rn.f32 %f447, %f440, %f446, %f432; mov.f32 %f448, 0f3FC90FDB; sub.f32 %f449, %f448, %f447; setp.gt.f32 %p105, %f76, %f75; selp.f32 %f450, %f449, %f447, %p105; mov.b32 %r272, %f74; setp.lt.s32 %p106, %r272, 0; mov.f32 %f451, 0f40490FDB; sub.f32 %f452, %f451, %f450; selp.f32 %f453, %f452, %f450, %p106; mov.b32 %r273, %f453; mov.b32 %r274, %f73; and.b32 %r275, 
%r274, -2147483648; or.b32 %r276, %r275, %r273; mov.b32 %f454, %r276; add.f32 %f455, %f75, %f76; setp.le.f32 %p107, %f455, 0f7F800000; selp.f32 %f865, %f454, %f455, %p107; $L__BB0_94: add.f32 %f456, %f865, %f865; setp.eq.f32 %p109, %f72, 0f00000000; selp.f32 %f457, 0f3F800000, 0f3F000000, %p109; fma.rn.f32 %f868, %f457, %f456, 0f00000000; $L__BB0_95: setp.lt.s32 %p110, %r3, 1; and.pred %p112, %p110, %p13; @%p112 bra $L__BB0_109; setp.ge.s32 %p113, %r49, %r124; and.pred %p115, %p113, %p88; @%p115 bra $L__BB0_109; @%p13 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: max.s32 %r500, %r18, 0; bra.uni $L__BB0_100; $L__BB0_98: rem.s32 %r288, %r18, %r126; add.s32 %r289, %r288, %r126; rem.s32 %r500, %r289, %r126; $L__BB0_100: @%p88 bra $L__BB0_102; bra.uni $L__BB0_101; $L__BB0_102: add.s32 %r292, %r124, -1; min.s32 %r501, %r49, %r292; bra.uni $L__BB0_103; $L__BB0_101: rem.s32 %r290, %r49, %r124; add.s32 %r291, %r290, %r124; rem.s32 %r501, %r291, %r124; $L__BB0_103: mad.lo.s32 %r293, %r500, %r125, %r2; mad.lo.s32 %r294, %r293, %r124, %r501; mul.wide.s32 %rd72, %r294, 4; add.s64 %rd73, %rd3, %rd72; add.s64 %rd74, %rd2, %rd72; add.s64 %rd75, %rd1, %rd72; ld.global.nc.f32 %f458, [%rd73]; ld.global.nc.f32 %f459, [%rd74]; mul.f32 %f460, %f459, %f459; fma.rn.f32 %f461, %f458, %f458, %f460; ld.global.nc.f32 %f462, [%rd75]; fma.rn.f32 %f83, %f462, %f462, %f461; mul.f32 %f463, %f65, %f67; mul.f32 %f464, %f64, %f68; sub.f32 %f465, %f464, %f463; mul.f32 %f466, %f63, %f68; mul.f32 %f467, %f65, %f66; sub.f32 %f468, %f467, %f466; mul.f32 %f469, %f64, %f66; mul.f32 %f470, %f63, %f67; sub.f32 %f471, %f470, %f469; mul.f32 %f472, %f2, %f468; fma.rn.f32 %f473, %f1, %f465, %f472; fma.rn.f32 %f84, %f3, %f471, %f473; mul.f32 %f474, %f2, %f64; fma.rn.f32 %f475, %f1, %f63, %f474; fma.rn.f32 %f476, %f3, %f65, %f475; add.f32 %f477, %f476, 0f3F800000; mul.f32 %f478, %f2, %f67; fma.rn.f32 %f479, %f1, %f66, %f478; fma.rn.f32 %f480, %f3, %f68, %f479; add.f32 %f481, %f477, %f480; mul.f32 %f482, %f64, 
%f67; fma.rn.f32 %f483, %f63, %f66, %f482; fma.rn.f32 %f484, %f65, %f68, %f483; add.f32 %f85, %f484, %f481; abs.f32 %f86, %f85; abs.f32 %f87, %f84; setp.eq.f32 %p118, %f86, 0f00000000; setp.eq.f32 %p119, %f87, 0f00000000; and.pred %p120, %p118, %p119; @%p120 bra $L__BB0_107; bra.uni $L__BB0_104; $L__BB0_107: mov.b32 %r305, %f85; shr.s32 %r306, %r305, 31; and.b32 %r307, %r306, 1078530011; mov.b32 %r308, %f84; and.b32 %r309, %r308, -2147483648; or.b32 %r310, %r309, %r307; mov.b32 %f867, %r310; bra.uni $L__BB0_108; $L__BB0_104: setp.eq.f32 %p121, %f86, 0f7F800000; setp.eq.f32 %p122, %f87, 0f7F800000; and.pred %p123, %p121, %p122; @%p123 bra $L__BB0_106; bra.uni $L__BB0_105; $L__BB0_106: mov.b32 %r300, %f85; setp.lt.s32 %p127, %r300, 0; selp.b32 %r301, 1075235812, 1061752795, %p127; mov.b32 %r302, %f84; and.b32 %r303, %r302, -2147483648; or.b32 %r304, %r303, %r301; mov.b32 %f867, %r304; bra.uni $L__BB0_108; $L__BB0_105: max.f32 %f485, %f87, %f86; min.f32 %f486, %f87, %f86; div.rn.f32 %f487, %f486, %f485; mul.rn.f32 %f488, %f487, %f487; mov.f32 %f489, 0fC0B59883; mov.f32 %f490, 0fBF52C7EA; fma.rn.f32 %f491, %f488, %f490, %f489; mov.f32 %f492, 0fC0D21907; fma.rn.f32 %f493, %f491, %f488, %f492; mul.f32 %f494, %f488, %f493; mul.f32 %f495, %f487, %f494; add.f32 %f496, %f488, 0f41355DC0; mov.f32 %f497, 0f41E6BD60; fma.rn.f32 %f498, %f496, %f488, %f497; mov.f32 %f499, 0f419D92C8; fma.rn.f32 %f500, %f498, %f488, %f499; rcp.rn.f32 %f501, %f500; fma.rn.f32 %f502, %f495, %f501, %f487; mov.f32 %f503, 0f3FC90FDB; sub.f32 %f504, %f503, %f502; setp.gt.f32 %p124, %f87, %f86; selp.f32 %f505, %f504, %f502, %p124; mov.b32 %r295, %f85; setp.lt.s32 %p125, %r295, 0; mov.f32 %f506, 0f40490FDB; sub.f32 %f507, %f506, %f505; selp.f32 %f508, %f507, %f505, %p125; mov.b32 %r296, %f508; mov.b32 %r297, %f84; and.b32 %r298, %r297, -2147483648; or.b32 %r299, %r298, %r296; mov.b32 %f509, %r299; add.f32 %f510, %f86, %f87; setp.le.f32 %p126, %f510, 0f7F800000; selp.f32 %f867, %f509, %f510, %p126; 
$L__BB0_108: add.f32 %f511, %f867, %f867; setp.eq.f32 %p128, %f83, 0f00000000; selp.f32 %f512, 0f3F800000, 0f3F000000, %p128; fma.rn.f32 %f868, %f512, %f511, %f868; $L__BB0_109: ld.param.u8 %rs6, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs5, %rs6, 1; setp.ne.s16 %p129, %rs5, 0; setp.gt.s32 %p130, %r1, 0; or.pred %p3, %p130, %p129; and.pred %p131, %p3, %p2; not.pred %p132, %p131; @%p132 bra $L__BB0_122; @%p13 bra $L__BB0_112; bra.uni $L__BB0_111; $L__BB0_112: max.s32 %r502, %r18, 0; bra.uni $L__BB0_113; $L__BB0_111: rem.s32 %r311, %r18, %r126; add.s32 %r312, %r311, %r126; rem.s32 %r502, %r312, %r126; $L__BB0_113: @%p88 bra $L__BB0_115; bra.uni $L__BB0_114; $L__BB0_115: max.s32 %r503, %r56, 0; bra.uni $L__BB0_116; $L__BB0_114: rem.s32 %r313, %r56, %r124; add.s32 %r314, %r313, %r124; rem.s32 %r503, %r314, %r124; $L__BB0_116: mad.lo.s32 %r315, %r502, %r125, %r2; mad.lo.s32 %r316, %r315, %r124, %r503; mul.wide.s32 %rd76, %r316, 4; add.s64 %rd77, %rd3, %rd76; add.s64 %rd78, %rd2, %rd76; add.s64 %rd79, %rd1, %rd76; ld.global.nc.f32 %f513, [%rd77]; ld.global.nc.f32 %f514, [%rd78]; mul.f32 %f515, %f514, %f514; fma.rn.f32 %f516, %f513, %f513, %f515; ld.global.nc.f32 %f517, [%rd79]; fma.rn.f32 %f94, %f517, %f517, %f516; mul.f32 %f518, %f68, %f70; mul.f32 %f519, %f67, %f71; sub.f32 %f520, %f519, %f518; mul.f32 %f521, %f66, %f71; mul.f32 %f522, %f68, %f69; sub.f32 %f523, %f522, %f521; mul.f32 %f524, %f67, %f69; mul.f32 %f525, %f66, %f70; sub.f32 %f526, %f525, %f524; mul.f32 %f527, %f2, %f523; fma.rn.f32 %f528, %f1, %f520, %f527; fma.rn.f32 %f95, %f3, %f526, %f528; mul.f32 %f529, %f2, %f67; fma.rn.f32 %f530, %f1, %f66, %f529; fma.rn.f32 %f531, %f3, %f68, %f530; add.f32 %f532, %f531, 0f3F800000; mul.f32 %f533, %f2, %f70; fma.rn.f32 %f534, %f1, %f69, %f533; fma.rn.f32 %f535, %f3, %f71, %f534; add.f32 %f536, %f532, %f535; mul.f32 %f537, %f67, %f70; fma.rn.f32 %f538, %f66, %f69, %f537; fma.rn.f32 %f539, %f68, %f71, %f538; add.f32 %f96, %f539, %f536; abs.f32 %f97, 
%f96; abs.f32 %f98, %f95; setp.eq.f32 %p135, %f97, 0f00000000; setp.eq.f32 %p136, %f98, 0f00000000; and.pred %p137, %p135, %p136; @%p137 bra $L__BB0_120; bra.uni $L__BB0_117; $L__BB0_120: mov.b32 %r327, %f96; shr.s32 %r328, %r327, 31; and.b32 %r329, %r328, 1078530011; mov.b32 %r330, %f95; and.b32 %r331, %r330, -2147483648; or.b32 %r332, %r331, %r329; mov.b32 %f869, %r332; bra.uni $L__BB0_121; $L__BB0_117: setp.eq.f32 %p138, %f97, 0f7F800000; setp.eq.f32 %p139, %f98, 0f7F800000; and.pred %p140, %p138, %p139; @%p140 bra $L__BB0_119; bra.uni $L__BB0_118; $L__BB0_119: mov.b32 %r322, %f96; setp.lt.s32 %p144, %r322, 0; selp.b32 %r323, 1075235812, 1061752795, %p144; mov.b32 %r324, %f95; and.b32 %r325, %r324, -2147483648; or.b32 %r326, %r325, %r323; mov.b32 %f869, %r326; bra.uni $L__BB0_121; $L__BB0_118: max.f32 %f540, %f98, %f97; min.f32 %f541, %f98, %f97; div.rn.f32 %f542, %f541, %f540; mul.rn.f32 %f543, %f542, %f542; mov.f32 %f544, 0fC0B59883; mov.f32 %f545, 0fBF52C7EA; fma.rn.f32 %f546, %f543, %f545, %f544; mov.f32 %f547, 0fC0D21907; fma.rn.f32 %f548, %f546, %f543, %f547; mul.f32 %f549, %f543, %f548; mul.f32 %f550, %f542, %f549; add.f32 %f551, %f543, 0f41355DC0; mov.f32 %f552, 0f41E6BD60; fma.rn.f32 %f553, %f551, %f543, %f552; mov.f32 %f554, 0f419D92C8; fma.rn.f32 %f555, %f553, %f543, %f554; rcp.rn.f32 %f556, %f555; fma.rn.f32 %f557, %f550, %f556, %f542; mov.f32 %f558, 0f3FC90FDB; sub.f32 %f559, %f558, %f557; setp.gt.f32 %p141, %f98, %f97; selp.f32 %f560, %f559, %f557, %p141; mov.b32 %r317, %f96; setp.lt.s32 %p142, %r317, 0; mov.f32 %f561, 0f40490FDB; sub.f32 %f562, %f561, %f560; selp.f32 %f563, %f562, %f560, %p142; mov.b32 %r318, %f563; mov.b32 %r319, %f95; and.b32 %r320, %r319, -2147483648; or.b32 %r321, %r320, %r318; mov.b32 %f564, %r321; add.f32 %f565, %f97, %f98; setp.le.f32 %p143, %f565, 0f7F800000; selp.f32 %f869, %f564, %f565, %p143; $L__BB0_121: add.f32 %f566, %f869, %f869; setp.eq.f32 %p145, %f94, 0f00000000; selp.f32 %f567, 0f3F800000, 0f3F000000, %p145; 
fma.rn.f32 %f868, %f567, %f566, %f868; $L__BB0_122: setp.lt.s32 %p146, %r49, %r124; or.pred %p4, %p146, %p129; and.pred %p5, %p4, %p2; not.pred %p148, %p5; @%p148 bra $L__BB0_135; @%p13 bra $L__BB0_125; bra.uni $L__BB0_124; $L__BB0_125: add.s32 %r335, %r126, -1; min.s32 %r504, %r10, %r335; bra.uni $L__BB0_126; $L__BB0_124: rem.s32 %r333, %r10, %r126; add.s32 %r334, %r333, %r126; rem.s32 %r504, %r334, %r126; $L__BB0_126: mad.lo.s32 %r81, %r504, %r125, %r2; @%p88 bra $L__BB0_128; bra.uni $L__BB0_127; $L__BB0_128: max.s32 %r505, %r56, 0; bra.uni $L__BB0_129; $L__BB0_127: rem.s32 %r336, %r56, %r124; add.s32 %r337, %r336, %r124; rem.s32 %r505, %r337, %r124; $L__BB0_129: mad.lo.s32 %r338, %r81, %r124, %r505; mul.wide.s32 %rd80, %r338, 4; add.s64 %rd81, %rd3, %rd80; add.s64 %rd82, %rd2, %rd80; add.s64 %rd83, %rd1, %rd80; ld.global.nc.f32 %f568, [%rd81]; ld.global.nc.f32 %f569, [%rd82]; mul.f32 %f570, %f569, %f569; fma.rn.f32 %f571, %f568, %f568, %f570; ld.global.nc.f32 %f572, [%rd83]; fma.rn.f32 %f105, %f572, %f572, %f571; mul.f32 %f573, %f61, %f71; mul.f32 %f574, %f62, %f70; sub.f32 %f575, %f574, %f573; mul.f32 %f576, %f62, %f69; mul.f32 %f577, %f60, %f71; sub.f32 %f578, %f577, %f576; mul.f32 %f579, %f60, %f70; mul.f32 %f580, %f61, %f69; sub.f32 %f581, %f580, %f579; mul.f32 %f582, %f2, %f578; fma.rn.f32 %f583, %f1, %f575, %f582; fma.rn.f32 %f106, %f3, %f581, %f583; mul.f32 %f584, %f2, %f70; fma.rn.f32 %f585, %f1, %f69, %f584; fma.rn.f32 %f586, %f3, %f71, %f585; add.f32 %f587, %f586, 0f3F800000; mul.f32 %f588, %f2, %f61; fma.rn.f32 %f589, %f1, %f60, %f588; fma.rn.f32 %f590, %f3, %f62, %f589; add.f32 %f591, %f590, %f587; mul.f32 %f592, %f61, %f70; fma.rn.f32 %f593, %f60, %f69, %f592; fma.rn.f32 %f594, %f62, %f71, %f593; add.f32 %f107, %f594, %f591; abs.f32 %f108, %f107; abs.f32 %f109, %f106; setp.eq.f32 %p151, %f108, 0f00000000; setp.eq.f32 %p152, %f109, 0f00000000; and.pred %p153, %p151, %p152; @%p153 bra $L__BB0_133; bra.uni $L__BB0_130; $L__BB0_133: mov.b32 %r349, 
%f107; shr.s32 %r350, %r349, 31; and.b32 %r351, %r350, 1078530011; mov.b32 %r352, %f106; and.b32 %r353, %r352, -2147483648; or.b32 %r354, %r351, %r353; mov.b32 %f871, %r354; bra.uni $L__BB0_134; $L__BB0_130: setp.eq.f32 %p154, %f108, 0f7F800000; setp.eq.f32 %p155, %f109, 0f7F800000; and.pred %p156, %p154, %p155; @%p156 bra $L__BB0_132; bra.uni $L__BB0_131; $L__BB0_132: mov.b32 %r344, %f107; setp.lt.s32 %p160, %r344, 0; selp.b32 %r345, 1075235812, 1061752795, %p160; mov.b32 %r346, %f106; and.b32 %r347, %r346, -2147483648; or.b32 %r348, %r345, %r347; mov.b32 %f871, %r348; bra.uni $L__BB0_134; $L__BB0_131: max.f32 %f595, %f109, %f108; min.f32 %f596, %f109, %f108; div.rn.f32 %f597, %f596, %f595; mul.rn.f32 %f598, %f597, %f597; mov.f32 %f599, 0fC0B59883; mov.f32 %f600, 0fBF52C7EA; fma.rn.f32 %f601, %f598, %f600, %f599; mov.f32 %f602, 0fC0D21907; fma.rn.f32 %f603, %f601, %f598, %f602; mul.f32 %f604, %f598, %f603; mul.f32 %f605, %f597, %f604; add.f32 %f606, %f598, 0f41355DC0; mov.f32 %f607, 0f41E6BD60; fma.rn.f32 %f608, %f606, %f598, %f607; mov.f32 %f609, 0f419D92C8; fma.rn.f32 %f610, %f608, %f598, %f609; rcp.rn.f32 %f611, %f610; fma.rn.f32 %f612, %f605, %f611, %f597; mov.f32 %f613, 0f3FC90FDB; sub.f32 %f614, %f613, %f612; setp.gt.f32 %p157, %f109, %f108; selp.f32 %f615, %f614, %f612, %p157; mov.b32 %r339, %f107; setp.lt.s32 %p158, %r339, 0; mov.f32 %f616, 0f40490FDB; sub.f32 %f617, %f616, %f615; selp.f32 %f618, %f617, %f615, %p158; mov.b32 %r340, %f618; mov.b32 %r341, %f106; and.b32 %r342, %r341, -2147483648; or.b32 %r343, %r342, %r340; mov.b32 %f619, %r343; add.f32 %f620, %f108, %f109; setp.le.f32 %p159, %f620, 0f7F800000; selp.f32 %f871, %f619, %f620, %p159; $L__BB0_134: add.f32 %f621, %f871, %f871; setp.eq.f32 %p161, %f105, 0f00000000; selp.f32 %f622, 0f3F800000, 0f3F000000, %p161; fma.rn.f32 %f868, %f622, %f621, %f868; $L__BB0_135: @%p88 bra $L__BB0_137; bra.uni $L__BB0_136; $L__BB0_137: add.s32 %r357, %r124, -1; min.s32 %r506, %r49, %r357; bra.uni $L__BB0_138; 
$L__BB0_136: rem.s32 %r355, %r49, %r124; add.s32 %r356, %r355, %r124; rem.s32 %r506, %r356, %r124; $L__BB0_138: @%p12 bra $L__BB0_140; bra.uni $L__BB0_139; $L__BB0_140: add.s32 %r360, %r125, -1; min.s32 %r507, %r6, %r360; bra.uni $L__BB0_141; $L__BB0_139: rem.s32 %r358, %r6, %r125; add.s32 %r359, %r358, %r125; rem.s32 %r507, %r359, %r125; $L__BB0_141: @%p88 bra $L__BB0_143; bra.uni $L__BB0_142; $L__BB0_143: max.s32 %r508, %r56, 0; bra.uni $L__BB0_144; $L__BB0_142: rem.s32 %r361, %r56, %r124; add.s32 %r362, %r361, %r124; rem.s32 %r508, %r362, %r124; $L__BB0_144: add.s32 %r94, %r508, %r5; add.s32 %r95, %r506, %r5; @%p12 bra $L__BB0_146; bra.uni $L__BB0_145; $L__BB0_146: max.s32 %r509, %r14, 0; bra.uni $L__BB0_147; $L__BB0_145: rem.s32 %r363, %r14, %r125; add.s32 %r364, %r363, %r125; rem.s32 %r509, %r364, %r125; $L__BB0_147: add.s32 %r365, %r509, %r4; mad.lo.s32 %r366, %r365, %r124, %r1; mul.wide.s32 %rd84, %r95, 4; add.s64 %rd85, %rd3, %rd84; ld.global.nc.f32 %f116, [%rd85]; add.s64 %rd86, %rd2, %rd84; ld.global.nc.f32 %f117, [%rd86]; add.s64 %rd87, %rd1, %rd84; ld.global.nc.f32 %f118, [%rd87]; add.s32 %r367, %r507, %r4; mad.lo.s32 %r368, %r367, %r124, %r1; mul.wide.s32 %rd88, %r368, 4; add.s64 %rd89, %rd3, %rd88; ld.global.nc.f32 %f119, [%rd89]; add.s64 %rd90, %rd2, %rd88; ld.global.nc.f32 %f120, [%rd90]; add.s64 %rd91, %rd1, %rd88; ld.global.nc.f32 %f121, [%rd91]; mul.wide.s32 %rd92, %r94, 4; add.s64 %rd93, %rd3, %rd92; ld.global.nc.f32 %f122, [%rd93]; add.s64 %rd94, %rd2, %rd92; ld.global.nc.f32 %f123, [%rd94]; add.s64 %rd95, %rd1, %rd92; ld.global.nc.f32 %f124, [%rd95]; mul.wide.s32 %rd96, %r366, 4; add.s64 %rd97, %rd3, %rd96; ld.global.nc.f32 %f125, [%rd97]; add.s64 %rd98, %rd2, %rd96; ld.global.nc.f32 %f126, [%rd98]; add.s64 %rd99, %rd1, %rd96; ld.global.nc.f32 %f127, [%rd99]; and.pred %p166, %p4, %p1; mov.f32 %f878, 0f00000000; not.pred %p167, %p166; @%p167 bra $L__BB0_160; @%p12 bra $L__BB0_150; bra.uni $L__BB0_149; $L__BB0_150: add.s32 %r371, %r125, -1; 
min.s32 %r510, %r6, %r371; bra.uni $L__BB0_151; $L__BB0_149: rem.s32 %r369, %r6, %r125; add.s32 %r370, %r369, %r125; rem.s32 %r510, %r370, %r125; $L__BB0_151: @%p88 bra $L__BB0_153; bra.uni $L__BB0_152; $L__BB0_153: add.s32 %r374, %r124, -1; min.s32 %r511, %r49, %r374; bra.uni $L__BB0_154; $L__BB0_152: rem.s32 %r372, %r49, %r124; add.s32 %r373, %r372, %r124; rem.s32 %r511, %r373, %r124; $L__BB0_154: add.s32 %r375, %r510, %r4; mad.lo.s32 %r376, %r375, %r124, %r511; mul.wide.s32 %rd100, %r376, 4; add.s64 %rd101, %rd3, %rd100; add.s64 %rd102, %rd2, %rd100; add.s64 %rd103, %rd1, %rd100; ld.global.nc.f32 %f624, [%rd101]; ld.global.nc.f32 %f625, [%rd102]; mul.f32 %f626, %f625, %f625; fma.rn.f32 %f627, %f624, %f624, %f626; ld.global.nc.f32 %f628, [%rd103]; fma.rn.f32 %f128, %f628, %f628, %f627; mul.f32 %f629, %f118, %f120; mul.f32 %f630, %f117, %f121; sub.f32 %f631, %f630, %f629; mul.f32 %f632, %f116, %f121; mul.f32 %f633, %f118, %f119; sub.f32 %f634, %f633, %f632; mul.f32 %f635, %f117, %f119; mul.f32 %f636, %f116, %f120; sub.f32 %f637, %f636, %f635; mul.f32 %f638, %f2, %f634; fma.rn.f32 %f639, %f1, %f631, %f638; fma.rn.f32 %f129, %f3, %f637, %f639; mul.f32 %f640, %f2, %f117; fma.rn.f32 %f641, %f1, %f116, %f640; fma.rn.f32 %f642, %f3, %f118, %f641; add.f32 %f643, %f642, 0f3F800000; mul.f32 %f644, %f2, %f120; fma.rn.f32 %f645, %f1, %f119, %f644; fma.rn.f32 %f646, %f3, %f121, %f645; add.f32 %f647, %f643, %f646; mul.f32 %f648, %f117, %f120; fma.rn.f32 %f649, %f116, %f119, %f648; fma.rn.f32 %f650, %f118, %f121, %f649; add.f32 %f130, %f650, %f647; abs.f32 %f131, %f130; abs.f32 %f132, %f129; setp.eq.f32 %p170, %f131, 0f00000000; setp.eq.f32 %p171, %f132, 0f00000000; and.pred %p172, %p170, %p171; @%p172 bra $L__BB0_158; bra.uni $L__BB0_155; $L__BB0_158: mov.b32 %r387, %f130; shr.s32 %r388, %r387, 31; and.b32 %r389, %r388, 1078530011; mov.b32 %r390, %f129; and.b32 %r391, %r390, -2147483648; or.b32 %r392, %r391, %r389; mov.b32 %f873, %r392; bra.uni $L__BB0_159; $L__BB0_155: 
setp.eq.f32 %p173, %f131, 0f7F800000; setp.eq.f32 %p174, %f132, 0f7F800000; and.pred %p175, %p173, %p174; @%p175 bra $L__BB0_157; bra.uni $L__BB0_156; $L__BB0_157: mov.b32 %r382, %f130; setp.lt.s32 %p179, %r382, 0; selp.b32 %r383, 1075235812, 1061752795, %p179; mov.b32 %r384, %f129; and.b32 %r385, %r384, -2147483648; or.b32 %r386, %r385, %r383; mov.b32 %f873, %r386; bra.uni $L__BB0_159; $L__BB0_156: max.f32 %f651, %f132, %f131; min.f32 %f652, %f132, %f131; div.rn.f32 %f653, %f652, %f651; mul.rn.f32 %f654, %f653, %f653; mov.f32 %f655, 0fC0B59883; mov.f32 %f656, 0fBF52C7EA; fma.rn.f32 %f657, %f654, %f656, %f655; mov.f32 %f658, 0fC0D21907; fma.rn.f32 %f659, %f657, %f654, %f658; mul.f32 %f660, %f654, %f659; mul.f32 %f661, %f653, %f660; add.f32 %f662, %f654, 0f41355DC0; mov.f32 %f663, 0f41E6BD60; fma.rn.f32 %f664, %f662, %f654, %f663; mov.f32 %f665, 0f419D92C8; fma.rn.f32 %f666, %f664, %f654, %f665; rcp.rn.f32 %f667, %f666; fma.rn.f32 %f668, %f661, %f667, %f653; mov.f32 %f669, 0f3FC90FDB; sub.f32 %f670, %f669, %f668; setp.gt.f32 %p176, %f132, %f131; selp.f32 %f671, %f670, %f668, %p176; mov.b32 %r377, %f130; setp.lt.s32 %p177, %r377, 0; mov.f32 %f672, 0f40490FDB; sub.f32 %f673, %f672, %f671; selp.f32 %f674, %f673, %f671, %p177; mov.b32 %r378, %f674; mov.b32 %r379, %f129; and.b32 %r380, %r379, -2147483648; or.b32 %r381, %r380, %r378; mov.b32 %f675, %r381; add.f32 %f676, %f131, %f132; setp.le.f32 %p178, %f676, 0f7F800000; selp.f32 %f873, %f675, %f676, %p178; $L__BB0_159: add.f32 %f677, %f873, %f873; setp.eq.f32 %p180, %f128, 0f00000000; selp.f32 %f678, 0f3F800000, 0f3F000000, %p180; fma.rn.f32 %f878, %f678, %f677, 0f00000000; $L__BB0_160: not.pred %p181, %p3; @%p181 bra $L__BB0_187; @%p18 bra $L__BB0_174; @%p12 bra $L__BB0_164; bra.uni $L__BB0_163; $L__BB0_164: add.s32 %r395, %r125, -1; min.s32 %r512, %r6, %r395; bra.uni $L__BB0_165; $L__BB0_163: rem.s32 %r393, %r6, %r125; add.s32 %r394, %r393, %r125; rem.s32 %r512, %r394, %r125; $L__BB0_165: @%p88 bra $L__BB0_167; bra.uni 
$L__BB0_166; $L__BB0_167: max.s32 %r513, %r56, 0; bra.uni $L__BB0_168; $L__BB0_166: rem.s32 %r396, %r56, %r124; add.s32 %r397, %r396, %r124; rem.s32 %r513, %r397, %r124; $L__BB0_168: add.s32 %r398, %r512, %r4; mad.lo.s32 %r399, %r398, %r124, %r513; mul.wide.s32 %rd104, %r399, 4; add.s64 %rd105, %rd3, %rd104; add.s64 %rd106, %rd2, %rd104; add.s64 %rd107, %rd1, %rd104; ld.global.nc.f32 %f679, [%rd105]; ld.global.nc.f32 %f680, [%rd106]; mul.f32 %f681, %f680, %f680; fma.rn.f32 %f682, %f679, %f679, %f681; ld.global.nc.f32 %f683, [%rd107]; fma.rn.f32 %f139, %f683, %f683, %f682; mul.f32 %f684, %f121, %f123; mul.f32 %f685, %f120, %f124; sub.f32 %f686, %f685, %f684; mul.f32 %f687, %f119, %f124; mul.f32 %f688, %f121, %f122; sub.f32 %f689, %f688, %f687; mul.f32 %f690, %f120, %f122; mul.f32 %f691, %f119, %f123; sub.f32 %f692, %f691, %f690; mul.f32 %f693, %f2, %f689; fma.rn.f32 %f694, %f1, %f686, %f693; fma.rn.f32 %f140, %f3, %f692, %f694; mul.f32 %f695, %f2, %f120; fma.rn.f32 %f696, %f1, %f119, %f695; fma.rn.f32 %f697, %f3, %f121, %f696; add.f32 %f698, %f697, 0f3F800000; mul.f32 %f699, %f2, %f123; fma.rn.f32 %f700, %f1, %f122, %f699; fma.rn.f32 %f701, %f3, %f124, %f700; add.f32 %f702, %f698, %f701; mul.f32 %f703, %f120, %f123; fma.rn.f32 %f704, %f119, %f122, %f703; fma.rn.f32 %f705, %f121, %f124, %f704; add.f32 %f141, %f705, %f702; abs.f32 %f142, %f141; abs.f32 %f143, %f140; setp.eq.f32 %p185, %f142, 0f00000000; setp.eq.f32 %p186, %f143, 0f00000000; and.pred %p187, %p185, %p186; @%p187 bra $L__BB0_172; bra.uni $L__BB0_169; $L__BB0_172: mov.b32 %r410, %f141; shr.s32 %r411, %r410, 31; and.b32 %r412, %r411, 1078530011; mov.b32 %r413, %f140; and.b32 %r414, %r413, -2147483648; or.b32 %r415, %r414, %r412; mov.b32 %f875, %r415; bra.uni $L__BB0_173; $L__BB0_169: setp.eq.f32 %p188, %f142, 0f7F800000; setp.eq.f32 %p189, %f143, 0f7F800000; and.pred %p190, %p188, %p189; @%p190 bra $L__BB0_171; bra.uni $L__BB0_170; $L__BB0_171: mov.b32 %r405, %f141; setp.lt.s32 %p194, %r405, 0; selp.b32 
%r406, 1075235812, 1061752795, %p194; mov.b32 %r407, %f140; and.b32 %r408, %r407, -2147483648; or.b32 %r409, %r408, %r406; mov.b32 %f875, %r409; bra.uni $L__BB0_173; $L__BB0_170: max.f32 %f706, %f143, %f142; min.f32 %f707, %f143, %f142; div.rn.f32 %f708, %f707, %f706; mul.rn.f32 %f709, %f708, %f708; mov.f32 %f710, 0fC0B59883; mov.f32 %f711, 0fBF52C7EA; fma.rn.f32 %f712, %f709, %f711, %f710; mov.f32 %f713, 0fC0D21907; fma.rn.f32 %f714, %f712, %f709, %f713; mul.f32 %f715, %f709, %f714; mul.f32 %f716, %f708, %f715; add.f32 %f717, %f709, 0f41355DC0; mov.f32 %f718, 0f41E6BD60; fma.rn.f32 %f719, %f717, %f709, %f718; mov.f32 %f720, 0f419D92C8; fma.rn.f32 %f721, %f719, %f709, %f720; rcp.rn.f32 %f722, %f721; fma.rn.f32 %f723, %f716, %f722, %f708; mov.f32 %f724, 0f3FC90FDB; sub.f32 %f725, %f724, %f723; setp.gt.f32 %p191, %f143, %f142; selp.f32 %f726, %f725, %f723, %p191; mov.b32 %r400, %f141; setp.lt.s32 %p192, %r400, 0; mov.f32 %f727, 0f40490FDB; sub.f32 %f728, %f727, %f726; selp.f32 %f729, %f728, %f726, %p192; mov.b32 %r401, %f729; mov.b32 %r402, %f140; and.b32 %r403, %r402, -2147483648; or.b32 %r404, %r403, %r401; mov.b32 %f730, %r404; add.f32 %f731, %f142, %f143; setp.le.f32 %p193, %f731, 0f7F800000; selp.f32 %f875, %f730, %f731, %p193; $L__BB0_173: add.f32 %f732, %f875, %f875; setp.eq.f32 %p195, %f139, 0f00000000; selp.f32 %f733, 0f3F800000, 0f3F000000, %p195; fma.rn.f32 %f878, %f733, %f732, %f878; $L__BB0_174: not.pred %p224, %p2; @%p224 bra $L__BB0_187; @%p12 bra $L__BB0_177; bra.uni $L__BB0_176; $L__BB0_177: max.s32 %r514, %r14, 0; bra.uni $L__BB0_178; $L__BB0_176: rem.s32 %r416, %r14, %r125; add.s32 %r417, %r416, %r125; rem.s32 %r514, %r417, %r125; $L__BB0_178: @%p88 bra $L__BB0_180; bra.uni $L__BB0_179; $L__BB0_180: max.s32 %r515, %r56, 0; bra.uni $L__BB0_181; $L__BB0_179: rem.s32 %r418, %r56, %r124; add.s32 %r419, %r418, %r124; rem.s32 %r515, %r419, %r124; $L__BB0_181: add.s32 %r420, %r514, %r4; mad.lo.s32 %r421, %r420, %r124, %r515; mul.wide.s32 %rd108, %r421, 4; 
add.s64 %rd109, %rd3, %rd108; add.s64 %rd110, %rd2, %rd108; add.s64 %rd111, %rd1, %rd108; ld.global.nc.f32 %f734, [%rd109]; ld.global.nc.f32 %f735, [%rd110]; mul.f32 %f736, %f735, %f735; fma.rn.f32 %f737, %f734, %f734, %f736; ld.global.nc.f32 %f738, [%rd111]; fma.rn.f32 %f150, %f738, %f738, %f737; mul.f32 %f739, %f124, %f126; mul.f32 %f740, %f123, %f127; sub.f32 %f741, %f740, %f739; mul.f32 %f742, %f122, %f127; mul.f32 %f743, %f124, %f125; sub.f32 %f744, %f743, %f742; mul.f32 %f745, %f123, %f125; mul.f32 %f746, %f122, %f126; sub.f32 %f747, %f746, %f745; mul.f32 %f748, %f2, %f744; fma.rn.f32 %f749, %f1, %f741, %f748; fma.rn.f32 %f151, %f3, %f747, %f749; mul.f32 %f750, %f2, %f123; fma.rn.f32 %f751, %f1, %f122, %f750; fma.rn.f32 %f752, %f3, %f124, %f751; add.f32 %f753, %f752, 0f3F800000; mul.f32 %f754, %f2, %f126; fma.rn.f32 %f755, %f1, %f125, %f754; fma.rn.f32 %f756, %f3, %f127, %f755; add.f32 %f757, %f753, %f756; mul.f32 %f758, %f123, %f126; fma.rn.f32 %f759, %f122, %f125, %f758; fma.rn.f32 %f760, %f124, %f127, %f759; add.f32 %f152, %f760, %f757; abs.f32 %f153, %f152; abs.f32 %f154, %f151; setp.eq.f32 %p199, %f153, 0f00000000; setp.eq.f32 %p200, %f154, 0f00000000; and.pred %p201, %p199, %p200; @%p201 bra $L__BB0_185; bra.uni $L__BB0_182; $L__BB0_185: mov.b32 %r432, %f152; shr.s32 %r433, %r432, 31; and.b32 %r434, %r433, 1078530011; mov.b32 %r435, %f151; and.b32 %r436, %r435, -2147483648; or.b32 %r437, %r436, %r434; mov.b32 %f877, %r437; bra.uni $L__BB0_186; $L__BB0_182: setp.eq.f32 %p202, %f153, 0f7F800000; setp.eq.f32 %p203, %f154, 0f7F800000; and.pred %p204, %p202, %p203; @%p204 bra $L__BB0_184; bra.uni $L__BB0_183; $L__BB0_184: mov.b32 %r427, %f152; setp.lt.s32 %p208, %r427, 0; selp.b32 %r428, 1075235812, 1061752795, %p208; mov.b32 %r429, %f151; and.b32 %r430, %r429, -2147483648; or.b32 %r431, %r430, %r428; mov.b32 %f877, %r431; bra.uni $L__BB0_186; $L__BB0_183: max.f32 %f761, %f154, %f153; min.f32 %f762, %f154, %f153; div.rn.f32 %f763, %f762, %f761; mul.rn.f32 
%f764, %f763, %f763; mov.f32 %f765, 0fC0B59883; mov.f32 %f766, 0fBF52C7EA; fma.rn.f32 %f767, %f764, %f766, %f765; mov.f32 %f768, 0fC0D21907; fma.rn.f32 %f769, %f767, %f764, %f768; mul.f32 %f770, %f764, %f769; mul.f32 %f771, %f763, %f770; add.f32 %f772, %f764, 0f41355DC0; mov.f32 %f773, 0f41E6BD60; fma.rn.f32 %f774, %f772, %f764, %f773; mov.f32 %f775, 0f419D92C8; fma.rn.f32 %f776, %f774, %f764, %f775; rcp.rn.f32 %f777, %f776; fma.rn.f32 %f778, %f771, %f777, %f763; mov.f32 %f779, 0f3FC90FDB; sub.f32 %f780, %f779, %f778; setp.gt.f32 %p205, %f154, %f153; selp.f32 %f781, %f780, %f778, %p205; mov.b32 %r422, %f152; setp.lt.s32 %p206, %r422, 0; mov.f32 %f782, 0f40490FDB; sub.f32 %f783, %f782, %f781; selp.f32 %f784, %f783, %f781, %p206; mov.b32 %r423, %f784; mov.b32 %r424, %f151; and.b32 %r425, %r424, -2147483648; or.b32 %r426, %r425, %r423; mov.b32 %f785, %r426; add.f32 %f786, %f153, %f154; setp.le.f32 %p207, %f786, 0f7F800000; selp.f32 %f877, %f785, %f786, %p207; $L__BB0_186: add.f32 %f787, %f877, %f877; setp.eq.f32 %p209, %f150, 0f00000000; selp.f32 %f788, 0f3F800000, 0f3F000000, %p209; fma.rn.f32 %f878, %f788, %f787, %f878; $L__BB0_187: @%p148 bra $L__BB0_200; @%p12 bra $L__BB0_190; bra.uni $L__BB0_189; $L__BB0_190: max.s32 %r516, %r14, 0; bra.uni $L__BB0_191; $L__BB0_189: rem.s32 %r438, %r14, %r125; add.s32 %r439, %r438, %r125; rem.s32 %r516, %r439, %r125; $L__BB0_191: add.s32 %r120, %r516, %r4; @%p88 bra $L__BB0_193; bra.uni $L__BB0_192; $L__BB0_193: add.s32 %r442, %r124, -1; min.s32 %r517, %r49, %r442; bra.uni $L__BB0_194; $L__BB0_192: rem.s32 %r440, %r49, %r124; add.s32 %r441, %r440, %r124; rem.s32 %r517, %r441, %r124; $L__BB0_194: mad.lo.s32 %r443, %r120, %r124, %r517; mul.wide.s32 %rd112, %r443, 4; add.s64 %rd113, %rd3, %rd112; add.s64 %rd114, %rd2, %rd112; add.s64 %rd115, %rd1, %rd112; ld.global.nc.f32 %f789, [%rd113]; ld.global.nc.f32 %f790, [%rd114]; mul.f32 %f791, %f790, %f790; fma.rn.f32 %f792, %f789, %f789, %f791; ld.global.nc.f32 %f793, [%rd115]; fma.rn.f32 
%f161, %f793, %f793, %f792; mul.f32 %f794, %f117, %f127; mul.f32 %f795, %f118, %f126; sub.f32 %f796, %f795, %f794; mul.f32 %f797, %f118, %f125; mul.f32 %f798, %f116, %f127; sub.f32 %f799, %f798, %f797; mul.f32 %f800, %f116, %f126; mul.f32 %f801, %f117, %f125; sub.f32 %f802, %f801, %f800; mul.f32 %f803, %f2, %f799; fma.rn.f32 %f804, %f1, %f796, %f803; fma.rn.f32 %f162, %f3, %f802, %f804; mul.f32 %f805, %f2, %f126; fma.rn.f32 %f806, %f1, %f125, %f805; fma.rn.f32 %f807, %f3, %f127, %f806; add.f32 %f808, %f807, 0f3F800000; mul.f32 %f809, %f2, %f117; fma.rn.f32 %f810, %f1, %f116, %f809; fma.rn.f32 %f811, %f3, %f118, %f810; add.f32 %f812, %f811, %f808; mul.f32 %f813, %f117, %f126; fma.rn.f32 %f814, %f116, %f125, %f813; fma.rn.f32 %f815, %f118, %f127, %f814; add.f32 %f163, %f815, %f812; abs.f32 %f164, %f163; abs.f32 %f165, %f162; setp.eq.f32 %p213, %f164, 0f00000000; setp.eq.f32 %p214, %f165, 0f00000000; and.pred %p215, %p213, %p214; @%p215 bra $L__BB0_198; bra.uni $L__BB0_195; $L__BB0_198: mov.b32 %r454, %f163; shr.s32 %r455, %r454, 31; and.b32 %r456, %r455, 1078530011; mov.b32 %r457, %f162; and.b32 %r458, %r457, -2147483648; or.b32 %r459, %r456, %r458; mov.b32 %f879, %r459; bra.uni $L__BB0_199; $L__BB0_195: setp.eq.f32 %p216, %f164, 0f7F800000; setp.eq.f32 %p217, %f165, 0f7F800000; and.pred %p218, %p216, %p217; @%p218 bra $L__BB0_197; bra.uni $L__BB0_196; $L__BB0_197: mov.b32 %r449, %f163; setp.lt.s32 %p222, %r449, 0; selp.b32 %r450, 1075235812, 1061752795, %p222; mov.b32 %r451, %f162; and.b32 %r452, %r451, -2147483648; or.b32 %r453, %r450, %r452; mov.b32 %f879, %r453; bra.uni $L__BB0_199; $L__BB0_196: max.f32 %f816, %f165, %f164; min.f32 %f817, %f165, %f164; div.rn.f32 %f818, %f817, %f816; mul.rn.f32 %f819, %f818, %f818; mov.f32 %f820, 0fC0B59883; mov.f32 %f821, 0fBF52C7EA; fma.rn.f32 %f822, %f819, %f821, %f820; mov.f32 %f823, 0fC0D21907; fma.rn.f32 %f824, %f822, %f819, %f823; mul.f32 %f825, %f819, %f824; mul.f32 %f826, %f818, %f825; add.f32 %f827, %f819, 0f41355DC0; 
mov.f32 %f828, 0f41E6BD60; fma.rn.f32 %f829, %f827, %f819, %f828; mov.f32 %f830, 0f419D92C8; fma.rn.f32 %f831, %f829, %f819, %f830; rcp.rn.f32 %f832, %f831; fma.rn.f32 %f833, %f826, %f832, %f818; mov.f32 %f834, 0f3FC90FDB; sub.f32 %f835, %f834, %f833; setp.gt.f32 %p219, %f165, %f164; selp.f32 %f836, %f835, %f833, %p219; mov.b32 %r444, %f163; setp.lt.s32 %p220, %r444, 0; mov.f32 %f837, 0f40490FDB; sub.f32 %f838, %f837, %f836; selp.f32 %f839, %f838, %f836, %p220; mov.b32 %r445, %f839; mov.b32 %r446, %f162; and.b32 %r447, %r446, -2147483648; or.b32 %r448, %r447, %r445; mov.b32 %f840, %r448; add.f32 %f841, %f164, %f165; setp.le.f32 %p221, %f841, 0f7F800000; selp.f32 %f879, %f840, %f841, %p221; $L__BB0_199: add.f32 %f842, %f879, %f879; setp.eq.f32 %p223, %f161, 0f00000000; selp.f32 %f843, 0f3F800000, 0f3F000000, %p223; fma.rn.f32 %f878, %f843, %f842, %f878; $L__BB0_200: ld.param.u64 %rd123, [setemergentmagneticfieldsolidangle_param_1]; mov.u32 %r481, %tid.x; mov.u32 %r480, %ntid.x; mov.u32 %r479, %ctaid.x; mad.lo.s32 %r478, %r479, %r480, %r481; add.s32 %r477, %r5, %r478; mul.wide.s32 %rd122, %r477, 4; cvta.to.global.u64 %rd121, %rd123; add.s64 %rd120, %rd121, %rd122; ld.param.f32 %f856, [setemergentmagneticfieldsolidangle_param_9]; ld.param.f32 %f855, [setemergentmagneticfieldsolidangle_param_8]; ld.param.u64 %rd119, [setemergentmagneticfieldsolidangle_param_0]; mov.u32 %r476, %tid.z; mov.u32 %r475, %ntid.z; mov.u32 %r474, %ctaid.z; mad.lo.s32 %r473, %r474, %r475, %r476; mov.u32 %r472, %tid.y; mov.u32 %r471, %ntid.y; mov.u32 %r470, %ctaid.y; mad.lo.s32 %r469, %r470, %r471, %r472; mul.lo.s32 %r468, %r473, %r125; add.s32 %r467, %r468, %r469; mov.u32 %r466, %tid.x; mov.u32 %r465, %ntid.x; mov.u32 %r464, %ctaid.x; mad.lo.s32 %r463, %r464, %r465, %r466; mul.lo.s32 %r462, %r467, %r124; add.s32 %r461, %r462, %r463; mul.wide.s32 %rd118, %r461, 4; cvta.to.global.u64 %rd117, %rd119; add.s64 %rd116, %rd117, %rd118; ld.param.f32 %f854, [setemergentmagneticfieldsolidangle_param_7]; 
ld.param.f32 %f853, [setemergentmagneticfieldsolidangle_param_6]; add.f32 %f844, %f853, %f853; mul.f32 %f845, %f844, %f854; mul.f32 %f846, %f845, %f862; st.global.f32 [%rd116], %f846; mul.f32 %f847, %f844, %f855; mul.f32 %f848, %f847, %f868; st.global.f32 [%rd120], %f848; mul.f32 %f849, %f844, %f856; mul.f32 %f850, %f849, %f878; st.global.f32 [%rd6], %f850; $L__BB0_202: ret; } ` setemergentmagneticfieldsolidangle_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl setemergentmagneticfieldsolidangle .visible .entry setemergentmagneticfieldsolidangle( .param .u64 setemergentmagneticfieldsolidangle_param_0, .param .u64 setemergentmagneticfieldsolidangle_param_1, .param .u64 setemergentmagneticfieldsolidangle_param_2, .param .u64 setemergentmagneticfieldsolidangle_param_3, .param .u64 setemergentmagneticfieldsolidangle_param_4, .param .u64 setemergentmagneticfieldsolidangle_param_5, .param .f32 setemergentmagneticfieldsolidangle_param_6, .param .f32 setemergentmagneticfieldsolidangle_param_7, .param .f32 setemergentmagneticfieldsolidangle_param_8, .param .f32 setemergentmagneticfieldsolidangle_param_9, .param .u32 setemergentmagneticfieldsolidangle_param_10, .param .u32 setemergentmagneticfieldsolidangle_param_11, .param .u32 setemergentmagneticfieldsolidangle_param_12, .param .u8 setemergentmagneticfieldsolidangle_param_13 ) { .reg .pred %p<225>; .reg .b16 %rs<8>; .reg .f32 %f<881>; .reg .b32 %r<518>; .reg .b64 %rd<124>; ld.param.u8 %rs4, [setemergentmagneticfieldsolidangle_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldsolidangle_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldsolidangle_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldsolidangle_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldsolidangle_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldsolidangle_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldsolidangle_param_5]; ld.param.u32 %r124, [setemergentmagneticfieldsolidangle_param_10]; ld.param.u32 %r125, 
[setemergentmagneticfieldsolidangle_param_11]; ld.param.u32 %r126, [setemergentmagneticfieldsolidangle_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r127, %ntid.x; mov.u32 %r128, %ctaid.x; mov.u32 %r129, %tid.x; mad.lo.s32 %r1, %r128, %r127, %r129; mov.u32 %r130, %ntid.y; mov.u32 %r131, %ctaid.y; mov.u32 %r132, %tid.y; mad.lo.s32 %r2, %r131, %r130, %r132; mov.u32 %r133, %ntid.z; mov.u32 %r134, %ctaid.z; mov.u32 %r135, %tid.z; mad.lo.s32 %r3, %r134, %r133, %r135; setp.ge.s32 %p6, %r1, %r124; setp.ge.s32 %p7, %r2, %r125; or.pred %p8, %p6, %p7; setp.ge.s32 %p9, %r3, %r126; or.pred %p10, %p8, %p9; @%p10 bra $L__BB0_202; mul.lo.s32 %r4, %r3, %r125; add.s32 %r136, %r4, %r2; mul.lo.s32 %r5, %r136, %r124; add.s32 %r137, %r5, %r1; mul.wide.s32 %rd13, %r137, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f176, %f2, %f2; fma.rn.f32 %f177, %f1, %f1, %f176; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f178, %f3, %f3, %f177; setp.eq.f32 %p11, %f178, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p11 bra $L__BB0_201; bra.uni $L__BB0_2; $L__BB0_201: mov.u32 %r460, 0; st.global.u32 [%rd4], %r460; st.global.u32 [%rd5], %r460; st.global.u32 [%rd6], %r460; bra.uni $L__BB0_202; $L__BB0_2: and.b16 %rs1, %rs4, 2; setp.eq.s16 %p12, %rs1, 0; add.s32 %r6, %r2, 1; @%p12 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r140, %r125, -1; min.s32 %r482, %r6, %r140; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r138, %r6, %r125; add.s32 %r139, %r138, %r125; rem.s32 %r482, %r139, %r125; $L__BB0_5: and.b16 %rs2, %rs4, 4; setp.eq.s16 %p13, %rs2, 0; add.s32 %r10, %r3, 1; @%p13 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r143, %r126, -1; min.s32 %r483, %r10, %r143; bra.uni 
$L__BB0_8; $L__BB0_6: rem.s32 %r141, %r10, %r126; add.s32 %r142, %r141, %r126; rem.s32 %r483, %r142, %r126; $L__BB0_8: add.s32 %r14, %r2, -1; @%p12 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r484, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r144, %r14, %r125; add.s32 %r145, %r144, %r125; rem.s32 %r484, %r145, %r125; $L__BB0_11: add.s32 %r18, %r3, -1; @%p13 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r485, %r18, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r146, %r18, %r126; add.s32 %r147, %r146, %r126; rem.s32 %r485, %r147, %r126; $L__BB0_14: mad.lo.s32 %r148, %r485, %r125, %r2; mad.lo.s32 %r149, %r148, %r124, %r1; add.s32 %r150, %r482, %r4; mad.lo.s32 %r151, %r150, %r124, %r1; mul.wide.s32 %rd20, %r151, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f4, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f5, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f6, [%rd23]; mad.lo.s32 %r152, %r483, %r125, %r2; mad.lo.s32 %r153, %r152, %r124, %r1; mul.wide.s32 %rd24, %r153, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f7, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f8, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f9, [%rd27]; add.s32 %r154, %r484, %r4; mad.lo.s32 %r155, %r154, %r124, %r1; mul.wide.s32 %rd28, %r155, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f10, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f12, [%rd31]; mul.wide.s32 %rd32, %r149, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f13, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f14, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f15, [%rd35]; setp.ne.s16 %p16, %rs1, 0; setp.lt.s32 %p17, %r6, %r125; or.pred %p1, %p17, %p16; not.pred %p18, %p1; mov.f32 %f862, 0f00000000; @%p18 bra $L__BB0_28; setp.ge.s32 %p19, %r10, %r126; and.pred %p21, %p19, %p13; @%p21 bra $L__BB0_28; @%p13 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r158, %r126, -1; 
min.s32 %r486, %r10, %r158; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r156, %r10, %r126; add.s32 %r157, %r156, %r126; rem.s32 %r486, %r157, %r126; $L__BB0_19: @%p12 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r161, %r125, -1; min.s32 %r487, %r6, %r161; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r159, %r6, %r125; add.s32 %r160, %r159, %r125; rem.s32 %r487, %r160, %r125; $L__BB0_22: mad.lo.s32 %r162, %r486, %r125, %r487; mad.lo.s32 %r163, %r162, %r124, %r1; mul.wide.s32 %rd36, %r163, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f181, [%rd37]; ld.global.nc.f32 %f182, [%rd38]; mul.f32 %f183, %f182, %f182; fma.rn.f32 %f184, %f181, %f181, %f183; ld.global.nc.f32 %f185, [%rd39]; fma.rn.f32 %f16, %f185, %f185, %f184; mul.f32 %f186, %f6, %f8; mul.f32 %f187, %f5, %f9; sub.f32 %f188, %f187, %f186; mul.f32 %f189, %f4, %f9; mul.f32 %f190, %f6, %f7; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f5, %f7; mul.f32 %f193, %f4, %f8; sub.f32 %f194, %f193, %f192; mul.f32 %f195, %f2, %f191; fma.rn.f32 %f196, %f1, %f188, %f195; fma.rn.f32 %f17, %f3, %f194, %f196; mul.f32 %f197, %f2, %f5; fma.rn.f32 %f198, %f1, %f4, %f197; fma.rn.f32 %f199, %f3, %f6, %f198; add.f32 %f200, %f199, 0f3F800000; mul.f32 %f201, %f2, %f8; fma.rn.f32 %f202, %f1, %f7, %f201; fma.rn.f32 %f203, %f3, %f9, %f202; add.f32 %f204, %f200, %f203; mul.f32 %f205, %f5, %f8; fma.rn.f32 %f206, %f4, %f7, %f205; fma.rn.f32 %f207, %f6, %f9, %f206; add.f32 %f18, %f207, %f204; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p24, %f19, 0f00000000; setp.eq.f32 %p25, %f20, 0f00000000; and.pred %p26, %p24, %p25; @%p26 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r174, %f18; shr.s32 %r175, %r174, 31; and.b32 %r176, %r175, 1078530011; mov.b32 %r177, %f17; and.b32 %r178, %r177, -2147483648; or.b32 %r179, %r178, %r176; mov.b32 %f857, %r179; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p27, %f19, 0f7F800000; setp.eq.f32 %p28, %f20, 0f7F800000; and.pred 
%p29, %p27, %p28; @%p29 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r169, %f18; setp.lt.s32 %p33, %r169, 0; selp.b32 %r170, 1075235812, 1061752795, %p33; mov.b32 %r171, %f17; and.b32 %r172, %r171, -2147483648; or.b32 %r173, %r172, %r170; mov.b32 %f857, %r173; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f208, %f20, %f19; min.f32 %f209, %f20, %f19; div.rn.f32 %f210, %f209, %f208; mul.rn.f32 %f211, %f210, %f210; mov.f32 %f212, 0fC0B59883; mov.f32 %f213, 0fBF52C7EA; fma.rn.f32 %f214, %f211, %f213, %f212; mov.f32 %f215, 0fC0D21907; fma.rn.f32 %f216, %f214, %f211, %f215; mul.f32 %f217, %f211, %f216; mul.f32 %f218, %f210, %f217; add.f32 %f219, %f211, 0f41355DC0; mov.f32 %f220, 0f41E6BD60; fma.rn.f32 %f221, %f219, %f211, %f220; mov.f32 %f222, 0f419D92C8; fma.rn.f32 %f223, %f221, %f211, %f222; rcp.rn.f32 %f224, %f223; fma.rn.f32 %f225, %f218, %f224, %f210; mov.f32 %f226, 0f3FC90FDB; sub.f32 %f227, %f226, %f225; setp.gt.f32 %p30, %f20, %f19; selp.f32 %f228, %f227, %f225, %p30; mov.b32 %r164, %f18; setp.lt.s32 %p31, %r164, 0; mov.f32 %f229, 0f40490FDB; sub.f32 %f230, %f229, %f228; selp.f32 %f231, %f230, %f228, %p31; mov.b32 %r165, %f231; mov.b32 %r166, %f17; and.b32 %r167, %r166, -2147483648; or.b32 %r168, %r167, %r165; mov.b32 %f232, %r168; add.f32 %f233, %f19, %f20; setp.le.f32 %p32, %f233, 0f7F800000; selp.f32 %f857, %f232, %f233, %p32; $L__BB0_27: add.f32 %f234, %f857, %f857; setp.eq.f32 %p34, %f16, 0f00000000; selp.f32 %f235, 0f3F800000, 0f3F000000, %p34; fma.rn.f32 %f862, %f235, %f234, 0f00000000; $L__BB0_28: setp.gt.s32 %p35, %r2, 0; or.pred %p2, %p35, %p16; not.pred %p37, %p2; @%p37 bra $L__BB0_55; setp.ge.s32 %p38, %r10, %r126; and.pred %p40, %p38, %p13; @%p40 bra $L__BB0_42; @%p13 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r182, %r126, -1; min.s32 %r488, %r10, %r182; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r180, %r10, %r126; add.s32 %r181, %r180, %r126; rem.s32 %r488, %r181, %r126; $L__BB0_33: @%p12 bra $L__BB0_35; bra.uni $L__BB0_34; 
$L__BB0_35: max.s32 %r489, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r183, %r14, %r125; add.s32 %r184, %r183, %r125; rem.s32 %r489, %r184, %r125; $L__BB0_36: mad.lo.s32 %r185, %r488, %r125, %r489; mad.lo.s32 %r186, %r185, %r124, %r1; mul.wide.s32 %rd40, %r186, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f236, [%rd41]; ld.global.nc.f32 %f237, [%rd42]; mul.f32 %f238, %f237, %f237; fma.rn.f32 %f239, %f236, %f236, %f238; ld.global.nc.f32 %f240, [%rd43]; fma.rn.f32 %f27, %f240, %f240, %f239; mul.f32 %f241, %f9, %f11; mul.f32 %f242, %f8, %f12; sub.f32 %f243, %f242, %f241; mul.f32 %f244, %f7, %f12; mul.f32 %f245, %f9, %f10; sub.f32 %f246, %f245, %f244; mul.f32 %f247, %f8, %f10; mul.f32 %f248, %f7, %f11; sub.f32 %f249, %f248, %f247; mul.f32 %f250, %f2, %f246; fma.rn.f32 %f251, %f1, %f243, %f250; fma.rn.f32 %f28, %f3, %f249, %f251; mul.f32 %f252, %f2, %f8; fma.rn.f32 %f253, %f1, %f7, %f252; fma.rn.f32 %f254, %f3, %f9, %f253; add.f32 %f255, %f254, 0f3F800000; mul.f32 %f256, %f2, %f11; fma.rn.f32 %f257, %f1, %f10, %f256; fma.rn.f32 %f258, %f3, %f12, %f257; add.f32 %f259, %f255, %f258; mul.f32 %f260, %f8, %f11; fma.rn.f32 %f261, %f7, %f10, %f260; fma.rn.f32 %f262, %f9, %f12, %f261; add.f32 %f29, %f262, %f259; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p43, %f30, 0f00000000; setp.eq.f32 %p44, %f31, 0f00000000; and.pred %p45, %p43, %p44; @%p45 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r197, %f29; shr.s32 %r198, %r197, 31; and.b32 %r199, %r198, 1078530011; mov.b32 %r200, %f28; and.b32 %r201, %r200, -2147483648; or.b32 %r202, %r201, %r199; mov.b32 %f859, %r202; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p46, %f30, 0f7F800000; setp.eq.f32 %p47, %f31, 0f7F800000; and.pred %p48, %p46, %p47; @%p48 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r192, %f29; setp.lt.s32 %p52, %r192, 0; selp.b32 %r193, 1075235812, 1061752795, %p52; mov.b32 %r194, %f28; and.b32 %r195, %r194, 
-2147483648; or.b32 %r196, %r195, %r193; mov.b32 %f859, %r196; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f263, %f31, %f30; min.f32 %f264, %f31, %f30; div.rn.f32 %f265, %f264, %f263; mul.rn.f32 %f266, %f265, %f265; mov.f32 %f267, 0fC0B59883; mov.f32 %f268, 0fBF52C7EA; fma.rn.f32 %f269, %f266, %f268, %f267; mov.f32 %f270, 0fC0D21907; fma.rn.f32 %f271, %f269, %f266, %f270; mul.f32 %f272, %f266, %f271; mul.f32 %f273, %f265, %f272; add.f32 %f274, %f266, 0f41355DC0; mov.f32 %f275, 0f41E6BD60; fma.rn.f32 %f276, %f274, %f266, %f275; mov.f32 %f277, 0f419D92C8; fma.rn.f32 %f278, %f276, %f266, %f277; rcp.rn.f32 %f279, %f278; fma.rn.f32 %f280, %f273, %f279, %f265; mov.f32 %f281, 0f3FC90FDB; sub.f32 %f282, %f281, %f280; setp.gt.f32 %p49, %f31, %f30; selp.f32 %f283, %f282, %f280, %p49; mov.b32 %r187, %f29; setp.lt.s32 %p50, %r187, 0; mov.f32 %f284, 0f40490FDB; sub.f32 %f285, %f284, %f283; selp.f32 %f286, %f285, %f283, %p50; mov.b32 %r188, %f286; mov.b32 %r189, %f28; and.b32 %r190, %r189, -2147483648; or.b32 %r191, %r190, %r188; mov.b32 %f287, %r191; add.f32 %f288, %f30, %f31; setp.le.f32 %p51, %f288, 0f7F800000; selp.f32 %f859, %f287, %f288, %p51; $L__BB0_41: add.f32 %f289, %f859, %f859; setp.eq.f32 %p53, %f27, 0f00000000; selp.f32 %f290, 0f3F800000, 0f3F000000, %p53; fma.rn.f32 %f862, %f290, %f289, %f862; $L__BB0_42: setp.lt.s32 %p54, %r3, 1; and.pred %p56, %p54, %p13; @%p56 bra $L__BB0_55; @%p13 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r490, %r18, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r203, %r18, %r126; add.s32 %r204, %r203, %r126; rem.s32 %r490, %r204, %r126; $L__BB0_46: @%p12 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r491, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r205, %r14, %r125; add.s32 %r206, %r205, %r125; rem.s32 %r491, %r206, %r125; $L__BB0_49: mad.lo.s32 %r207, %r490, %r125, %r491; mad.lo.s32 %r208, %r207, %r124, %r1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, 
%rd44; ld.global.nc.f32 %f291, [%rd45]; ld.global.nc.f32 %f292, [%rd46]; mul.f32 %f293, %f292, %f292; fma.rn.f32 %f294, %f291, %f291, %f293; ld.global.nc.f32 %f295, [%rd47]; fma.rn.f32 %f38, %f295, %f295, %f294; mul.f32 %f296, %f12, %f14; mul.f32 %f297, %f11, %f15; sub.f32 %f298, %f297, %f296; mul.f32 %f299, %f10, %f15; mul.f32 %f300, %f12, %f13; sub.f32 %f301, %f300, %f299; mul.f32 %f302, %f11, %f13; mul.f32 %f303, %f10, %f14; sub.f32 %f304, %f303, %f302; mul.f32 %f305, %f2, %f301; fma.rn.f32 %f306, %f1, %f298, %f305; fma.rn.f32 %f39, %f3, %f304, %f306; mul.f32 %f307, %f2, %f11; fma.rn.f32 %f308, %f1, %f10, %f307; fma.rn.f32 %f309, %f3, %f12, %f308; add.f32 %f310, %f309, 0f3F800000; mul.f32 %f311, %f2, %f14; fma.rn.f32 %f312, %f1, %f13, %f311; fma.rn.f32 %f313, %f3, %f15, %f312; add.f32 %f314, %f310, %f313; mul.f32 %f315, %f11, %f14; fma.rn.f32 %f316, %f10, %f13, %f315; fma.rn.f32 %f317, %f12, %f15, %f316; add.f32 %f40, %f317, %f314; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p59, %f41, 0f00000000; setp.eq.f32 %p60, %f42, 0f00000000; and.pred %p61, %p59, %p60; @%p61 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r219, %f40; shr.s32 %r220, %r219, 31; and.b32 %r221, %r220, 1078530011; mov.b32 %r222, %f39; and.b32 %r223, %r222, -2147483648; or.b32 %r224, %r223, %r221; mov.b32 %f861, %r224; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p62, %f41, 0f7F800000; setp.eq.f32 %p63, %f42, 0f7F800000; and.pred %p64, %p62, %p63; @%p64 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r214, %f40; setp.lt.s32 %p68, %r214, 0; selp.b32 %r215, 1075235812, 1061752795, %p68; mov.b32 %r216, %f39; and.b32 %r217, %r216, -2147483648; or.b32 %r218, %r217, %r215; mov.b32 %f861, %r218; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f318, %f42, %f41; min.f32 %f319, %f42, %f41; div.rn.f32 %f320, %f319, %f318; mul.rn.f32 %f321, %f320, %f320; mov.f32 %f322, 0fC0B59883; mov.f32 %f323, 0fBF52C7EA; fma.rn.f32 %f324, %f321, %f323, %f322; mov.f32 %f325, 0fC0D21907; fma.rn.f32 
%f326, %f324, %f321, %f325; mul.f32 %f327, %f321, %f326; mul.f32 %f328, %f320, %f327; add.f32 %f329, %f321, 0f41355DC0; mov.f32 %f330, 0f41E6BD60; fma.rn.f32 %f331, %f329, %f321, %f330; mov.f32 %f332, 0f419D92C8; fma.rn.f32 %f333, %f331, %f321, %f332; rcp.rn.f32 %f334, %f333; fma.rn.f32 %f335, %f328, %f334, %f320; mov.f32 %f336, 0f3FC90FDB; sub.f32 %f337, %f336, %f335; setp.gt.f32 %p65, %f42, %f41; selp.f32 %f338, %f337, %f335, %p65; mov.b32 %r209, %f40; setp.lt.s32 %p66, %r209, 0; mov.f32 %f339, 0f40490FDB; sub.f32 %f340, %f339, %f338; selp.f32 %f341, %f340, %f338, %p66; mov.b32 %r210, %f341; mov.b32 %r211, %f39; and.b32 %r212, %r211, -2147483648; or.b32 %r213, %r212, %r210; mov.b32 %f342, %r213; add.f32 %f343, %f41, %f42; setp.le.f32 %p67, %f343, 0f7F800000; selp.f32 %f861, %f342, %f343, %p67; $L__BB0_54: add.f32 %f344, %f861, %f861; setp.eq.f32 %p69, %f38, 0f00000000; selp.f32 %f345, 0f3F800000, 0f3F000000, %p69; fma.rn.f32 %f862, %f345, %f344, %f862; $L__BB0_55: @%p18 bra $L__BB0_69; setp.lt.s32 %p71, %r3, 1; and.pred %p73, %p71, %p13; @%p73 bra $L__BB0_69; @%p13 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r492, %r18, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r225, %r18, %r126; add.s32 %r226, %r225, %r126; rem.s32 %r492, %r226, %r126; $L__BB0_60: @%p12 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r229, %r125, -1; min.s32 %r493, %r6, %r229; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r227, %r6, %r125; add.s32 %r228, %r227, %r125; rem.s32 %r493, %r228, %r125; $L__BB0_63: mad.lo.s32 %r230, %r492, %r125, %r493; mad.lo.s32 %r231, %r230, %r124, %r1; mul.wide.s32 %rd48, %r231, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f346, [%rd49]; ld.global.nc.f32 %f347, [%rd50]; mul.f32 %f348, %f347, %f347; fma.rn.f32 %f349, %f346, %f346, %f348; ld.global.nc.f32 %f350, [%rd51]; fma.rn.f32 %f49, %f350, %f350, %f349; mul.f32 %f351, %f5, %f15; mul.f32 %f352, %f6, %f14; sub.f32 %f353, %f352, %f351; 
mul.f32 %f354, %f6, %f13; mul.f32 %f355, %f4, %f15; sub.f32 %f356, %f355, %f354; mul.f32 %f357, %f4, %f14; mul.f32 %f358, %f5, %f13; sub.f32 %f359, %f358, %f357; mul.f32 %f360, %f2, %f356; fma.rn.f32 %f361, %f1, %f353, %f360; fma.rn.f32 %f50, %f3, %f359, %f361; mul.f32 %f362, %f2, %f14; fma.rn.f32 %f363, %f1, %f13, %f362; fma.rn.f32 %f364, %f3, %f15, %f363; add.f32 %f365, %f364, 0f3F800000; mul.f32 %f366, %f2, %f5; fma.rn.f32 %f367, %f1, %f4, %f366; fma.rn.f32 %f368, %f3, %f6, %f367; add.f32 %f369, %f368, %f365; mul.f32 %f370, %f5, %f14; fma.rn.f32 %f371, %f4, %f13, %f370; fma.rn.f32 %f372, %f6, %f15, %f371; add.f32 %f51, %f372, %f369; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p76, %f52, 0f00000000; setp.eq.f32 %p77, %f53, 0f00000000; and.pred %p78, %p76, %p77; @%p78 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r242, %f51; shr.s32 %r243, %r242, 31; and.b32 %r244, %r243, 1078530011; mov.b32 %r245, %f50; and.b32 %r246, %r245, -2147483648; or.b32 %r247, %r244, %r246; mov.b32 %f863, %r247; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p79, %f52, 0f7F800000; setp.eq.f32 %p80, %f53, 0f7F800000; and.pred %p81, %p79, %p80; @%p81 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r237, %f51; setp.lt.s32 %p85, %r237, 0; selp.b32 %r238, 1075235812, 1061752795, %p85; mov.b32 %r239, %f50; and.b32 %r240, %r239, -2147483648; or.b32 %r241, %r238, %r240; mov.b32 %f863, %r241; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f373, %f53, %f52; min.f32 %f374, %f53, %f52; div.rn.f32 %f375, %f374, %f373; mul.rn.f32 %f376, %f375, %f375; mov.f32 %f377, 0fC0B59883; mov.f32 %f378, 0fBF52C7EA; fma.rn.f32 %f379, %f376, %f378, %f377; mov.f32 %f380, 0fC0D21907; fma.rn.f32 %f381, %f379, %f376, %f380; mul.f32 %f382, %f376, %f381; mul.f32 %f383, %f375, %f382; add.f32 %f384, %f376, 0f41355DC0; mov.f32 %f385, 0f41E6BD60; fma.rn.f32 %f386, %f384, %f376, %f385; mov.f32 %f387, 0f419D92C8; fma.rn.f32 %f388, %f386, %f376, %f387; rcp.rn.f32 %f389, %f388; fma.rn.f32 %f390, %f383, %f389, 
%f375; mov.f32 %f391, 0f3FC90FDB; sub.f32 %f392, %f391, %f390; setp.gt.f32 %p82, %f53, %f52; selp.f32 %f393, %f392, %f390, %p82; mov.b32 %r232, %f51; setp.lt.s32 %p83, %r232, 0; mov.f32 %f394, 0f40490FDB; sub.f32 %f395, %f394, %f393; selp.f32 %f396, %f395, %f393, %p83; mov.b32 %r233, %f396; mov.b32 %r234, %f50; and.b32 %r235, %r234, -2147483648; or.b32 %r236, %r235, %r233; mov.b32 %f397, %r236; add.f32 %f398, %f52, %f53; setp.le.f32 %p84, %f398, 0f7F800000; selp.f32 %f863, %f397, %f398, %p84; $L__BB0_68: add.f32 %f399, %f863, %f863; setp.eq.f32 %p86, %f49, 0f00000000; selp.f32 %f400, 0f3F800000, 0f3F000000, %p86; fma.rn.f32 %f862, %f400, %f399, %f862; $L__BB0_69: @%p13 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: add.s32 %r250, %r126, -1; min.s32 %r494, %r10, %r250; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r248, %r10, %r126; add.s32 %r249, %r248, %r126; rem.s32 %r494, %r249, %r126; $L__BB0_72: ld.param.u8 %rs7, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs3, %rs7, 1; setp.eq.s16 %p88, %rs3, 0; add.s32 %r49, %r1, 1; @%p88 bra $L__BB0_74; bra.uni $L__BB0_73; $L__BB0_74: add.s32 %r253, %r124, -1; min.s32 %r495, %r49, %r253; bra.uni $L__BB0_75; $L__BB0_73: rem.s32 %r251, %r49, %r124; add.s32 %r252, %r251, %r124; rem.s32 %r495, %r252, %r124; $L__BB0_75: @%p13 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r496, %r18, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r254, %r18, %r126; add.s32 %r255, %r254, %r126; rem.s32 %r496, %r255, %r126; $L__BB0_78: add.s32 %r56, %r1, -1; @%p88 bra $L__BB0_80; bra.uni $L__BB0_79; $L__BB0_80: max.s32 %r497, %r56, 0; bra.uni $L__BB0_81; $L__BB0_79: rem.s32 %r256, %r56, %r124; add.s32 %r257, %r256, %r124; rem.s32 %r497, %r257, %r124; $L__BB0_81: mad.lo.s32 %r258, %r494, %r125, %r2; mad.lo.s32 %r259, %r258, %r124, %r1; mul.wide.s32 %rd52, %r259, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f60, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f61, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f62, 
[%rd55]; add.s32 %r260, %r495, %r5; mul.wide.s32 %rd56, %r260, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f63, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f64, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f65, [%rd59]; mad.lo.s32 %r261, %r496, %r125, %r2; mad.lo.s32 %r262, %r261, %r124, %r1; mul.wide.s32 %rd60, %r262, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f66, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f67, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f68, [%rd63]; add.s32 %r263, %r497, %r5; mul.wide.s32 %rd64, %r263, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f69, [%rd65]; add.s64 %rd66, %rd2, %rd64; ld.global.nc.f32 %f70, [%rd66]; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f71, [%rd67]; setp.ge.s32 %p92, %r10, %r126; and.pred %p93, %p92, %p13; mov.f32 %f868, 0f00000000; @%p93 bra $L__BB0_95; setp.ge.s32 %p94, %r49, %r124; and.pred %p96, %p94, %p88; @%p96 bra $L__BB0_95; @%p13 bra $L__BB0_85; bra.uni $L__BB0_84; $L__BB0_85: add.s32 %r266, %r126, -1; min.s32 %r498, %r10, %r266; bra.uni $L__BB0_86; $L__BB0_84: rem.s32 %r264, %r10, %r126; add.s32 %r265, %r264, %r126; rem.s32 %r498, %r265, %r126; $L__BB0_86: @%p88 bra $L__BB0_88; bra.uni $L__BB0_87; $L__BB0_88: add.s32 %r269, %r124, -1; min.s32 %r499, %r49, %r269; bra.uni $L__BB0_89; $L__BB0_87: rem.s32 %r267, %r49, %r124; add.s32 %r268, %r267, %r124; rem.s32 %r499, %r268, %r124; $L__BB0_89: mad.lo.s32 %r270, %r498, %r125, %r2; mad.lo.s32 %r271, %r270, %r124, %r499; mul.wide.s32 %rd68, %r271, 4; add.s64 %rd69, %rd3, %rd68; add.s64 %rd70, %rd2, %rd68; add.s64 %rd71, %rd1, %rd68; ld.global.nc.f32 %f403, [%rd69]; ld.global.nc.f32 %f404, [%rd70]; mul.f32 %f405, %f404, %f404; fma.rn.f32 %f406, %f403, %f403, %f405; ld.global.nc.f32 %f407, [%rd71]; fma.rn.f32 %f72, %f407, %f407, %f406; mul.f32 %f408, %f62, %f64; mul.f32 %f409, %f61, %f65; sub.f32 %f410, %f409, %f408; mul.f32 %f411, %f60, %f65; mul.f32 %f412, %f62, %f63; sub.f32 %f413, %f412, %f411; mul.f32 
%f414, %f61, %f63; mul.f32 %f415, %f60, %f64; sub.f32 %f416, %f415, %f414; mul.f32 %f417, %f2, %f413; fma.rn.f32 %f418, %f1, %f410, %f417; fma.rn.f32 %f73, %f3, %f416, %f418; mul.f32 %f419, %f2, %f61; fma.rn.f32 %f420, %f1, %f60, %f419; fma.rn.f32 %f421, %f3, %f62, %f420; add.f32 %f422, %f421, 0f3F800000; mul.f32 %f423, %f2, %f64; fma.rn.f32 %f424, %f1, %f63, %f423; fma.rn.f32 %f425, %f3, %f65, %f424; add.f32 %f426, %f422, %f425; mul.f32 %f427, %f61, %f64; fma.rn.f32 %f428, %f60, %f63, %f427; fma.rn.f32 %f429, %f62, %f65, %f428; add.f32 %f74, %f429, %f426; abs.f32 %f75, %f74; abs.f32 %f76, %f73; setp.eq.f32 %p99, %f75, 0f00000000; setp.eq.f32 %p100, %f76, 0f00000000; and.pred %p101, %p99, %p100; @%p101 bra $L__BB0_93; bra.uni $L__BB0_90; $L__BB0_93: mov.b32 %r282, %f74; shr.s32 %r283, %r282, 31; and.b32 %r284, %r283, 1078530011; mov.b32 %r285, %f73; and.b32 %r286, %r285, -2147483648; or.b32 %r287, %r286, %r284; mov.b32 %f865, %r287; bra.uni $L__BB0_94; $L__BB0_90: setp.eq.f32 %p102, %f75, 0f7F800000; setp.eq.f32 %p103, %f76, 0f7F800000; and.pred %p104, %p102, %p103; @%p104 bra $L__BB0_92; bra.uni $L__BB0_91; $L__BB0_92: mov.b32 %r277, %f74; setp.lt.s32 %p108, %r277, 0; selp.b32 %r278, 1075235812, 1061752795, %p108; mov.b32 %r279, %f73; and.b32 %r280, %r279, -2147483648; or.b32 %r281, %r280, %r278; mov.b32 %f865, %r281; bra.uni $L__BB0_94; $L__BB0_91: max.f32 %f430, %f76, %f75; min.f32 %f431, %f76, %f75; div.rn.f32 %f432, %f431, %f430; mul.rn.f32 %f433, %f432, %f432; mov.f32 %f434, 0fC0B59883; mov.f32 %f435, 0fBF52C7EA; fma.rn.f32 %f436, %f433, %f435, %f434; mov.f32 %f437, 0fC0D21907; fma.rn.f32 %f438, %f436, %f433, %f437; mul.f32 %f439, %f433, %f438; mul.f32 %f440, %f432, %f439; add.f32 %f441, %f433, 0f41355DC0; mov.f32 %f442, 0f41E6BD60; fma.rn.f32 %f443, %f441, %f433, %f442; mov.f32 %f444, 0f419D92C8; fma.rn.f32 %f445, %f443, %f433, %f444; rcp.rn.f32 %f446, %f445; fma.rn.f32 %f447, %f440, %f446, %f432; mov.f32 %f448, 0f3FC90FDB; sub.f32 %f449, %f448, %f447; 
setp.gt.f32 %p105, %f76, %f75; selp.f32 %f450, %f449, %f447, %p105; mov.b32 %r272, %f74; setp.lt.s32 %p106, %r272, 0; mov.f32 %f451, 0f40490FDB; sub.f32 %f452, %f451, %f450; selp.f32 %f453, %f452, %f450, %p106; mov.b32 %r273, %f453; mov.b32 %r274, %f73; and.b32 %r275, %r274, -2147483648; or.b32 %r276, %r275, %r273; mov.b32 %f454, %r276; add.f32 %f455, %f75, %f76; setp.le.f32 %p107, %f455, 0f7F800000; selp.f32 %f865, %f454, %f455, %p107; $L__BB0_94: add.f32 %f456, %f865, %f865; setp.eq.f32 %p109, %f72, 0f00000000; selp.f32 %f457, 0f3F800000, 0f3F000000, %p109; fma.rn.f32 %f868, %f457, %f456, 0f00000000; $L__BB0_95: setp.lt.s32 %p110, %r3, 1; and.pred %p112, %p110, %p13; @%p112 bra $L__BB0_109; setp.ge.s32 %p113, %r49, %r124; and.pred %p115, %p113, %p88; @%p115 bra $L__BB0_109; @%p13 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: max.s32 %r500, %r18, 0; bra.uni $L__BB0_100; $L__BB0_98: rem.s32 %r288, %r18, %r126; add.s32 %r289, %r288, %r126; rem.s32 %r500, %r289, %r126; $L__BB0_100: @%p88 bra $L__BB0_102; bra.uni $L__BB0_101; $L__BB0_102: add.s32 %r292, %r124, -1; min.s32 %r501, %r49, %r292; bra.uni $L__BB0_103; $L__BB0_101: rem.s32 %r290, %r49, %r124; add.s32 %r291, %r290, %r124; rem.s32 %r501, %r291, %r124; $L__BB0_103: mad.lo.s32 %r293, %r500, %r125, %r2; mad.lo.s32 %r294, %r293, %r124, %r501; mul.wide.s32 %rd72, %r294, 4; add.s64 %rd73, %rd3, %rd72; add.s64 %rd74, %rd2, %rd72; add.s64 %rd75, %rd1, %rd72; ld.global.nc.f32 %f458, [%rd73]; ld.global.nc.f32 %f459, [%rd74]; mul.f32 %f460, %f459, %f459; fma.rn.f32 %f461, %f458, %f458, %f460; ld.global.nc.f32 %f462, [%rd75]; fma.rn.f32 %f83, %f462, %f462, %f461; mul.f32 %f463, %f65, %f67; mul.f32 %f464, %f64, %f68; sub.f32 %f465, %f464, %f463; mul.f32 %f466, %f63, %f68; mul.f32 %f467, %f65, %f66; sub.f32 %f468, %f467, %f466; mul.f32 %f469, %f64, %f66; mul.f32 %f470, %f63, %f67; sub.f32 %f471, %f470, %f469; mul.f32 %f472, %f2, %f468; fma.rn.f32 %f473, %f1, %f465, %f472; fma.rn.f32 %f84, %f3, %f471, %f473; mul.f32 %f474, 
%f2, %f64; fma.rn.f32 %f475, %f1, %f63, %f474; fma.rn.f32 %f476, %f3, %f65, %f475; add.f32 %f477, %f476, 0f3F800000; mul.f32 %f478, %f2, %f67; fma.rn.f32 %f479, %f1, %f66, %f478; fma.rn.f32 %f480, %f3, %f68, %f479; add.f32 %f481, %f477, %f480; mul.f32 %f482, %f64, %f67; fma.rn.f32 %f483, %f63, %f66, %f482; fma.rn.f32 %f484, %f65, %f68, %f483; add.f32 %f85, %f484, %f481; abs.f32 %f86, %f85; abs.f32 %f87, %f84; setp.eq.f32 %p118, %f86, 0f00000000; setp.eq.f32 %p119, %f87, 0f00000000; and.pred %p120, %p118, %p119; @%p120 bra $L__BB0_107; bra.uni $L__BB0_104; $L__BB0_107: mov.b32 %r305, %f85; shr.s32 %r306, %r305, 31; and.b32 %r307, %r306, 1078530011; mov.b32 %r308, %f84; and.b32 %r309, %r308, -2147483648; or.b32 %r310, %r309, %r307; mov.b32 %f867, %r310; bra.uni $L__BB0_108; $L__BB0_104: setp.eq.f32 %p121, %f86, 0f7F800000; setp.eq.f32 %p122, %f87, 0f7F800000; and.pred %p123, %p121, %p122; @%p123 bra $L__BB0_106; bra.uni $L__BB0_105; $L__BB0_106: mov.b32 %r300, %f85; setp.lt.s32 %p127, %r300, 0; selp.b32 %r301, 1075235812, 1061752795, %p127; mov.b32 %r302, %f84; and.b32 %r303, %r302, -2147483648; or.b32 %r304, %r303, %r301; mov.b32 %f867, %r304; bra.uni $L__BB0_108; $L__BB0_105: max.f32 %f485, %f87, %f86; min.f32 %f486, %f87, %f86; div.rn.f32 %f487, %f486, %f485; mul.rn.f32 %f488, %f487, %f487; mov.f32 %f489, 0fC0B59883; mov.f32 %f490, 0fBF52C7EA; fma.rn.f32 %f491, %f488, %f490, %f489; mov.f32 %f492, 0fC0D21907; fma.rn.f32 %f493, %f491, %f488, %f492; mul.f32 %f494, %f488, %f493; mul.f32 %f495, %f487, %f494; add.f32 %f496, %f488, 0f41355DC0; mov.f32 %f497, 0f41E6BD60; fma.rn.f32 %f498, %f496, %f488, %f497; mov.f32 %f499, 0f419D92C8; fma.rn.f32 %f500, %f498, %f488, %f499; rcp.rn.f32 %f501, %f500; fma.rn.f32 %f502, %f495, %f501, %f487; mov.f32 %f503, 0f3FC90FDB; sub.f32 %f504, %f503, %f502; setp.gt.f32 %p124, %f87, %f86; selp.f32 %f505, %f504, %f502, %p124; mov.b32 %r295, %f85; setp.lt.s32 %p125, %r295, 0; mov.f32 %f506, 0f40490FDB; sub.f32 %f507, %f506, %f505; selp.f32 
%f508, %f507, %f505, %p125; mov.b32 %r296, %f508; mov.b32 %r297, %f84; and.b32 %r298, %r297, -2147483648; or.b32 %r299, %r298, %r296; mov.b32 %f509, %r299; add.f32 %f510, %f86, %f87; setp.le.f32 %p126, %f510, 0f7F800000; selp.f32 %f867, %f509, %f510, %p126; $L__BB0_108: add.f32 %f511, %f867, %f867; setp.eq.f32 %p128, %f83, 0f00000000; selp.f32 %f512, 0f3F800000, 0f3F000000, %p128; fma.rn.f32 %f868, %f512, %f511, %f868; $L__BB0_109: ld.param.u8 %rs6, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs5, %rs6, 1; setp.ne.s16 %p129, %rs5, 0; setp.gt.s32 %p130, %r1, 0; or.pred %p3, %p130, %p129; and.pred %p131, %p3, %p2; not.pred %p132, %p131; @%p132 bra $L__BB0_122; @%p13 bra $L__BB0_112; bra.uni $L__BB0_111; $L__BB0_112: max.s32 %r502, %r18, 0; bra.uni $L__BB0_113; $L__BB0_111: rem.s32 %r311, %r18, %r126; add.s32 %r312, %r311, %r126; rem.s32 %r502, %r312, %r126; $L__BB0_113: @%p88 bra $L__BB0_115; bra.uni $L__BB0_114; $L__BB0_115: max.s32 %r503, %r56, 0; bra.uni $L__BB0_116; $L__BB0_114: rem.s32 %r313, %r56, %r124; add.s32 %r314, %r313, %r124; rem.s32 %r503, %r314, %r124; $L__BB0_116: mad.lo.s32 %r315, %r502, %r125, %r2; mad.lo.s32 %r316, %r315, %r124, %r503; mul.wide.s32 %rd76, %r316, 4; add.s64 %rd77, %rd3, %rd76; add.s64 %rd78, %rd2, %rd76; add.s64 %rd79, %rd1, %rd76; ld.global.nc.f32 %f513, [%rd77]; ld.global.nc.f32 %f514, [%rd78]; mul.f32 %f515, %f514, %f514; fma.rn.f32 %f516, %f513, %f513, %f515; ld.global.nc.f32 %f517, [%rd79]; fma.rn.f32 %f94, %f517, %f517, %f516; mul.f32 %f518, %f68, %f70; mul.f32 %f519, %f67, %f71; sub.f32 %f520, %f519, %f518; mul.f32 %f521, %f66, %f71; mul.f32 %f522, %f68, %f69; sub.f32 %f523, %f522, %f521; mul.f32 %f524, %f67, %f69; mul.f32 %f525, %f66, %f70; sub.f32 %f526, %f525, %f524; mul.f32 %f527, %f2, %f523; fma.rn.f32 %f528, %f1, %f520, %f527; fma.rn.f32 %f95, %f3, %f526, %f528; mul.f32 %f529, %f2, %f67; fma.rn.f32 %f530, %f1, %f66, %f529; fma.rn.f32 %f531, %f3, %f68, %f530; add.f32 %f532, %f531, 0f3F800000; mul.f32 %f533, 
%f2, %f70; fma.rn.f32 %f534, %f1, %f69, %f533; fma.rn.f32 %f535, %f3, %f71, %f534; add.f32 %f536, %f532, %f535; mul.f32 %f537, %f67, %f70; fma.rn.f32 %f538, %f66, %f69, %f537; fma.rn.f32 %f539, %f68, %f71, %f538; add.f32 %f96, %f539, %f536; abs.f32 %f97, %f96; abs.f32 %f98, %f95; setp.eq.f32 %p135, %f97, 0f00000000; setp.eq.f32 %p136, %f98, 0f00000000; and.pred %p137, %p135, %p136; @%p137 bra $L__BB0_120; bra.uni $L__BB0_117; $L__BB0_120: mov.b32 %r327, %f96; shr.s32 %r328, %r327, 31; and.b32 %r329, %r328, 1078530011; mov.b32 %r330, %f95; and.b32 %r331, %r330, -2147483648; or.b32 %r332, %r331, %r329; mov.b32 %f869, %r332; bra.uni $L__BB0_121; $L__BB0_117: setp.eq.f32 %p138, %f97, 0f7F800000; setp.eq.f32 %p139, %f98, 0f7F800000; and.pred %p140, %p138, %p139; @%p140 bra $L__BB0_119; bra.uni $L__BB0_118; $L__BB0_119: mov.b32 %r322, %f96; setp.lt.s32 %p144, %r322, 0; selp.b32 %r323, 1075235812, 1061752795, %p144; mov.b32 %r324, %f95; and.b32 %r325, %r324, -2147483648; or.b32 %r326, %r325, %r323; mov.b32 %f869, %r326; bra.uni $L__BB0_121; $L__BB0_118: max.f32 %f540, %f98, %f97; min.f32 %f541, %f98, %f97; div.rn.f32 %f542, %f541, %f540; mul.rn.f32 %f543, %f542, %f542; mov.f32 %f544, 0fC0B59883; mov.f32 %f545, 0fBF52C7EA; fma.rn.f32 %f546, %f543, %f545, %f544; mov.f32 %f547, 0fC0D21907; fma.rn.f32 %f548, %f546, %f543, %f547; mul.f32 %f549, %f543, %f548; mul.f32 %f550, %f542, %f549; add.f32 %f551, %f543, 0f41355DC0; mov.f32 %f552, 0f41E6BD60; fma.rn.f32 %f553, %f551, %f543, %f552; mov.f32 %f554, 0f419D92C8; fma.rn.f32 %f555, %f553, %f543, %f554; rcp.rn.f32 %f556, %f555; fma.rn.f32 %f557, %f550, %f556, %f542; mov.f32 %f558, 0f3FC90FDB; sub.f32 %f559, %f558, %f557; setp.gt.f32 %p141, %f98, %f97; selp.f32 %f560, %f559, %f557, %p141; mov.b32 %r317, %f96; setp.lt.s32 %p142, %r317, 0; mov.f32 %f561, 0f40490FDB; sub.f32 %f562, %f561, %f560; selp.f32 %f563, %f562, %f560, %p142; mov.b32 %r318, %f563; mov.b32 %r319, %f95; and.b32 %r320, %r319, -2147483648; or.b32 %r321, %r320, 
%r318; mov.b32 %f564, %r321; add.f32 %f565, %f97, %f98; setp.le.f32 %p143, %f565, 0f7F800000; selp.f32 %f869, %f564, %f565, %p143; $L__BB0_121: add.f32 %f566, %f869, %f869; setp.eq.f32 %p145, %f94, 0f00000000; selp.f32 %f567, 0f3F800000, 0f3F000000, %p145; fma.rn.f32 %f868, %f567, %f566, %f868; $L__BB0_122: setp.lt.s32 %p146, %r49, %r124; or.pred %p4, %p146, %p129; and.pred %p5, %p4, %p2; not.pred %p148, %p5; @%p148 bra $L__BB0_135; @%p13 bra $L__BB0_125; bra.uni $L__BB0_124; $L__BB0_125: add.s32 %r335, %r126, -1; min.s32 %r504, %r10, %r335; bra.uni $L__BB0_126; $L__BB0_124: rem.s32 %r333, %r10, %r126; add.s32 %r334, %r333, %r126; rem.s32 %r504, %r334, %r126; $L__BB0_126: mad.lo.s32 %r81, %r504, %r125, %r2; @%p88 bra $L__BB0_128; bra.uni $L__BB0_127; $L__BB0_128: max.s32 %r505, %r56, 0; bra.uni $L__BB0_129; $L__BB0_127: rem.s32 %r336, %r56, %r124; add.s32 %r337, %r336, %r124; rem.s32 %r505, %r337, %r124; $L__BB0_129: mad.lo.s32 %r338, %r81, %r124, %r505; mul.wide.s32 %rd80, %r338, 4; add.s64 %rd81, %rd3, %rd80; add.s64 %rd82, %rd2, %rd80; add.s64 %rd83, %rd1, %rd80; ld.global.nc.f32 %f568, [%rd81]; ld.global.nc.f32 %f569, [%rd82]; mul.f32 %f570, %f569, %f569; fma.rn.f32 %f571, %f568, %f568, %f570; ld.global.nc.f32 %f572, [%rd83]; fma.rn.f32 %f105, %f572, %f572, %f571; mul.f32 %f573, %f61, %f71; mul.f32 %f574, %f62, %f70; sub.f32 %f575, %f574, %f573; mul.f32 %f576, %f62, %f69; mul.f32 %f577, %f60, %f71; sub.f32 %f578, %f577, %f576; mul.f32 %f579, %f60, %f70; mul.f32 %f580, %f61, %f69; sub.f32 %f581, %f580, %f579; mul.f32 %f582, %f2, %f578; fma.rn.f32 %f583, %f1, %f575, %f582; fma.rn.f32 %f106, %f3, %f581, %f583; mul.f32 %f584, %f2, %f70; fma.rn.f32 %f585, %f1, %f69, %f584; fma.rn.f32 %f586, %f3, %f71, %f585; add.f32 %f587, %f586, 0f3F800000; mul.f32 %f588, %f2, %f61; fma.rn.f32 %f589, %f1, %f60, %f588; fma.rn.f32 %f590, %f3, %f62, %f589; add.f32 %f591, %f590, %f587; mul.f32 %f592, %f61, %f70; fma.rn.f32 %f593, %f60, %f69, %f592; fma.rn.f32 %f594, %f62, %f71, %f593; 
add.f32 %f107, %f594, %f591; abs.f32 %f108, %f107; abs.f32 %f109, %f106; setp.eq.f32 %p151, %f108, 0f00000000; setp.eq.f32 %p152, %f109, 0f00000000; and.pred %p153, %p151, %p152; @%p153 bra $L__BB0_133; bra.uni $L__BB0_130; $L__BB0_133: mov.b32 %r349, %f107; shr.s32 %r350, %r349, 31; and.b32 %r351, %r350, 1078530011; mov.b32 %r352, %f106; and.b32 %r353, %r352, -2147483648; or.b32 %r354, %r351, %r353; mov.b32 %f871, %r354; bra.uni $L__BB0_134; $L__BB0_130: setp.eq.f32 %p154, %f108, 0f7F800000; setp.eq.f32 %p155, %f109, 0f7F800000; and.pred %p156, %p154, %p155; @%p156 bra $L__BB0_132; bra.uni $L__BB0_131; $L__BB0_132: mov.b32 %r344, %f107; setp.lt.s32 %p160, %r344, 0; selp.b32 %r345, 1075235812, 1061752795, %p160; mov.b32 %r346, %f106; and.b32 %r347, %r346, -2147483648; or.b32 %r348, %r345, %r347; mov.b32 %f871, %r348; bra.uni $L__BB0_134; $L__BB0_131: max.f32 %f595, %f109, %f108; min.f32 %f596, %f109, %f108; div.rn.f32 %f597, %f596, %f595; mul.rn.f32 %f598, %f597, %f597; mov.f32 %f599, 0fC0B59883; mov.f32 %f600, 0fBF52C7EA; fma.rn.f32 %f601, %f598, %f600, %f599; mov.f32 %f602, 0fC0D21907; fma.rn.f32 %f603, %f601, %f598, %f602; mul.f32 %f604, %f598, %f603; mul.f32 %f605, %f597, %f604; add.f32 %f606, %f598, 0f41355DC0; mov.f32 %f607, 0f41E6BD60; fma.rn.f32 %f608, %f606, %f598, %f607; mov.f32 %f609, 0f419D92C8; fma.rn.f32 %f610, %f608, %f598, %f609; rcp.rn.f32 %f611, %f610; fma.rn.f32 %f612, %f605, %f611, %f597; mov.f32 %f613, 0f3FC90FDB; sub.f32 %f614, %f613, %f612; setp.gt.f32 %p157, %f109, %f108; selp.f32 %f615, %f614, %f612, %p157; mov.b32 %r339, %f107; setp.lt.s32 %p158, %r339, 0; mov.f32 %f616, 0f40490FDB; sub.f32 %f617, %f616, %f615; selp.f32 %f618, %f617, %f615, %p158; mov.b32 %r340, %f618; mov.b32 %r341, %f106; and.b32 %r342, %r341, -2147483648; or.b32 %r343, %r342, %r340; mov.b32 %f619, %r343; add.f32 %f620, %f108, %f109; setp.le.f32 %p159, %f620, 0f7F800000; selp.f32 %f871, %f619, %f620, %p159; $L__BB0_134: add.f32 %f621, %f871, %f871; setp.eq.f32 %p161, 
%f105, 0f00000000; selp.f32 %f622, 0f3F800000, 0f3F000000, %p161; fma.rn.f32 %f868, %f622, %f621, %f868; $L__BB0_135: @%p88 bra $L__BB0_137; bra.uni $L__BB0_136; $L__BB0_137: add.s32 %r357, %r124, -1; min.s32 %r506, %r49, %r357; bra.uni $L__BB0_138; $L__BB0_136: rem.s32 %r355, %r49, %r124; add.s32 %r356, %r355, %r124; rem.s32 %r506, %r356, %r124; $L__BB0_138: @%p12 bra $L__BB0_140; bra.uni $L__BB0_139; $L__BB0_140: add.s32 %r360, %r125, -1; min.s32 %r507, %r6, %r360; bra.uni $L__BB0_141; $L__BB0_139: rem.s32 %r358, %r6, %r125; add.s32 %r359, %r358, %r125; rem.s32 %r507, %r359, %r125; $L__BB0_141: @%p88 bra $L__BB0_143; bra.uni $L__BB0_142; $L__BB0_143: max.s32 %r508, %r56, 0; bra.uni $L__BB0_144; $L__BB0_142: rem.s32 %r361, %r56, %r124; add.s32 %r362, %r361, %r124; rem.s32 %r508, %r362, %r124; $L__BB0_144: add.s32 %r94, %r508, %r5; add.s32 %r95, %r506, %r5; @%p12 bra $L__BB0_146; bra.uni $L__BB0_145; $L__BB0_146: max.s32 %r509, %r14, 0; bra.uni $L__BB0_147; $L__BB0_145: rem.s32 %r363, %r14, %r125; add.s32 %r364, %r363, %r125; rem.s32 %r509, %r364, %r125; $L__BB0_147: add.s32 %r365, %r509, %r4; mad.lo.s32 %r366, %r365, %r124, %r1; mul.wide.s32 %rd84, %r95, 4; add.s64 %rd85, %rd3, %rd84; ld.global.nc.f32 %f116, [%rd85]; add.s64 %rd86, %rd2, %rd84; ld.global.nc.f32 %f117, [%rd86]; add.s64 %rd87, %rd1, %rd84; ld.global.nc.f32 %f118, [%rd87]; add.s32 %r367, %r507, %r4; mad.lo.s32 %r368, %r367, %r124, %r1; mul.wide.s32 %rd88, %r368, 4; add.s64 %rd89, %rd3, %rd88; ld.global.nc.f32 %f119, [%rd89]; add.s64 %rd90, %rd2, %rd88; ld.global.nc.f32 %f120, [%rd90]; add.s64 %rd91, %rd1, %rd88; ld.global.nc.f32 %f121, [%rd91]; mul.wide.s32 %rd92, %r94, 4; add.s64 %rd93, %rd3, %rd92; ld.global.nc.f32 %f122, [%rd93]; add.s64 %rd94, %rd2, %rd92; ld.global.nc.f32 %f123, [%rd94]; add.s64 %rd95, %rd1, %rd92; ld.global.nc.f32 %f124, [%rd95]; mul.wide.s32 %rd96, %r366, 4; add.s64 %rd97, %rd3, %rd96; ld.global.nc.f32 %f125, [%rd97]; add.s64 %rd98, %rd2, %rd96; ld.global.nc.f32 %f126, 
[%rd98]; add.s64 %rd99, %rd1, %rd96; ld.global.nc.f32 %f127, [%rd99]; and.pred %p166, %p4, %p1; mov.f32 %f878, 0f00000000; not.pred %p167, %p166; @%p167 bra $L__BB0_160; @%p12 bra $L__BB0_150; bra.uni $L__BB0_149; $L__BB0_150: add.s32 %r371, %r125, -1; min.s32 %r510, %r6, %r371; bra.uni $L__BB0_151; $L__BB0_149: rem.s32 %r369, %r6, %r125; add.s32 %r370, %r369, %r125; rem.s32 %r510, %r370, %r125; $L__BB0_151: @%p88 bra $L__BB0_153; bra.uni $L__BB0_152; $L__BB0_153: add.s32 %r374, %r124, -1; min.s32 %r511, %r49, %r374; bra.uni $L__BB0_154; $L__BB0_152: rem.s32 %r372, %r49, %r124; add.s32 %r373, %r372, %r124; rem.s32 %r511, %r373, %r124; $L__BB0_154: add.s32 %r375, %r510, %r4; mad.lo.s32 %r376, %r375, %r124, %r511; mul.wide.s32 %rd100, %r376, 4; add.s64 %rd101, %rd3, %rd100; add.s64 %rd102, %rd2, %rd100; add.s64 %rd103, %rd1, %rd100; ld.global.nc.f32 %f624, [%rd101]; ld.global.nc.f32 %f625, [%rd102]; mul.f32 %f626, %f625, %f625; fma.rn.f32 %f627, %f624, %f624, %f626; ld.global.nc.f32 %f628, [%rd103]; fma.rn.f32 %f128, %f628, %f628, %f627; mul.f32 %f629, %f118, %f120; mul.f32 %f630, %f117, %f121; sub.f32 %f631, %f630, %f629; mul.f32 %f632, %f116, %f121; mul.f32 %f633, %f118, %f119; sub.f32 %f634, %f633, %f632; mul.f32 %f635, %f117, %f119; mul.f32 %f636, %f116, %f120; sub.f32 %f637, %f636, %f635; mul.f32 %f638, %f2, %f634; fma.rn.f32 %f639, %f1, %f631, %f638; fma.rn.f32 %f129, %f3, %f637, %f639; mul.f32 %f640, %f2, %f117; fma.rn.f32 %f641, %f1, %f116, %f640; fma.rn.f32 %f642, %f3, %f118, %f641; add.f32 %f643, %f642, 0f3F800000; mul.f32 %f644, %f2, %f120; fma.rn.f32 %f645, %f1, %f119, %f644; fma.rn.f32 %f646, %f3, %f121, %f645; add.f32 %f647, %f643, %f646; mul.f32 %f648, %f117, %f120; fma.rn.f32 %f649, %f116, %f119, %f648; fma.rn.f32 %f650, %f118, %f121, %f649; add.f32 %f130, %f650, %f647; abs.f32 %f131, %f130; abs.f32 %f132, %f129; setp.eq.f32 %p170, %f131, 0f00000000; setp.eq.f32 %p171, %f132, 0f00000000; and.pred %p172, %p170, %p171; @%p172 bra $L__BB0_158; bra.uni 
$L__BB0_155; $L__BB0_158: mov.b32 %r387, %f130; shr.s32 %r388, %r387, 31; and.b32 %r389, %r388, 1078530011; mov.b32 %r390, %f129; and.b32 %r391, %r390, -2147483648; or.b32 %r392, %r391, %r389; mov.b32 %f873, %r392; bra.uni $L__BB0_159; $L__BB0_155: setp.eq.f32 %p173, %f131, 0f7F800000; setp.eq.f32 %p174, %f132, 0f7F800000; and.pred %p175, %p173, %p174; @%p175 bra $L__BB0_157; bra.uni $L__BB0_156; $L__BB0_157: mov.b32 %r382, %f130; setp.lt.s32 %p179, %r382, 0; selp.b32 %r383, 1075235812, 1061752795, %p179; mov.b32 %r384, %f129; and.b32 %r385, %r384, -2147483648; or.b32 %r386, %r385, %r383; mov.b32 %f873, %r386; bra.uni $L__BB0_159; $L__BB0_156: max.f32 %f651, %f132, %f131; min.f32 %f652, %f132, %f131; div.rn.f32 %f653, %f652, %f651; mul.rn.f32 %f654, %f653, %f653; mov.f32 %f655, 0fC0B59883; mov.f32 %f656, 0fBF52C7EA; fma.rn.f32 %f657, %f654, %f656, %f655; mov.f32 %f658, 0fC0D21907; fma.rn.f32 %f659, %f657, %f654, %f658; mul.f32 %f660, %f654, %f659; mul.f32 %f661, %f653, %f660; add.f32 %f662, %f654, 0f41355DC0; mov.f32 %f663, 0f41E6BD60; fma.rn.f32 %f664, %f662, %f654, %f663; mov.f32 %f665, 0f419D92C8; fma.rn.f32 %f666, %f664, %f654, %f665; rcp.rn.f32 %f667, %f666; fma.rn.f32 %f668, %f661, %f667, %f653; mov.f32 %f669, 0f3FC90FDB; sub.f32 %f670, %f669, %f668; setp.gt.f32 %p176, %f132, %f131; selp.f32 %f671, %f670, %f668, %p176; mov.b32 %r377, %f130; setp.lt.s32 %p177, %r377, 0; mov.f32 %f672, 0f40490FDB; sub.f32 %f673, %f672, %f671; selp.f32 %f674, %f673, %f671, %p177; mov.b32 %r378, %f674; mov.b32 %r379, %f129; and.b32 %r380, %r379, -2147483648; or.b32 %r381, %r380, %r378; mov.b32 %f675, %r381; add.f32 %f676, %f131, %f132; setp.le.f32 %p178, %f676, 0f7F800000; selp.f32 %f873, %f675, %f676, %p178; $L__BB0_159: add.f32 %f677, %f873, %f873; setp.eq.f32 %p180, %f128, 0f00000000; selp.f32 %f678, 0f3F800000, 0f3F000000, %p180; fma.rn.f32 %f878, %f678, %f677, 0f00000000; $L__BB0_160: not.pred %p181, %p3; @%p181 bra $L__BB0_187; @%p18 bra $L__BB0_174; @%p12 bra $L__BB0_164; 
bra.uni $L__BB0_163; $L__BB0_164: add.s32 %r395, %r125, -1; min.s32 %r512, %r6, %r395; bra.uni $L__BB0_165; $L__BB0_163: rem.s32 %r393, %r6, %r125; add.s32 %r394, %r393, %r125; rem.s32 %r512, %r394, %r125; $L__BB0_165: @%p88 bra $L__BB0_167; bra.uni $L__BB0_166; $L__BB0_167: max.s32 %r513, %r56, 0; bra.uni $L__BB0_168; $L__BB0_166: rem.s32 %r396, %r56, %r124; add.s32 %r397, %r396, %r124; rem.s32 %r513, %r397, %r124; $L__BB0_168: add.s32 %r398, %r512, %r4; mad.lo.s32 %r399, %r398, %r124, %r513; mul.wide.s32 %rd104, %r399, 4; add.s64 %rd105, %rd3, %rd104; add.s64 %rd106, %rd2, %rd104; add.s64 %rd107, %rd1, %rd104; ld.global.nc.f32 %f679, [%rd105]; ld.global.nc.f32 %f680, [%rd106]; mul.f32 %f681, %f680, %f680; fma.rn.f32 %f682, %f679, %f679, %f681; ld.global.nc.f32 %f683, [%rd107]; fma.rn.f32 %f139, %f683, %f683, %f682; mul.f32 %f684, %f121, %f123; mul.f32 %f685, %f120, %f124; sub.f32 %f686, %f685, %f684; mul.f32 %f687, %f119, %f124; mul.f32 %f688, %f121, %f122; sub.f32 %f689, %f688, %f687; mul.f32 %f690, %f120, %f122; mul.f32 %f691, %f119, %f123; sub.f32 %f692, %f691, %f690; mul.f32 %f693, %f2, %f689; fma.rn.f32 %f694, %f1, %f686, %f693; fma.rn.f32 %f140, %f3, %f692, %f694; mul.f32 %f695, %f2, %f120; fma.rn.f32 %f696, %f1, %f119, %f695; fma.rn.f32 %f697, %f3, %f121, %f696; add.f32 %f698, %f697, 0f3F800000; mul.f32 %f699, %f2, %f123; fma.rn.f32 %f700, %f1, %f122, %f699; fma.rn.f32 %f701, %f3, %f124, %f700; add.f32 %f702, %f698, %f701; mul.f32 %f703, %f120, %f123; fma.rn.f32 %f704, %f119, %f122, %f703; fma.rn.f32 %f705, %f121, %f124, %f704; add.f32 %f141, %f705, %f702; abs.f32 %f142, %f141; abs.f32 %f143, %f140; setp.eq.f32 %p185, %f142, 0f00000000; setp.eq.f32 %p186, %f143, 0f00000000; and.pred %p187, %p185, %p186; @%p187 bra $L__BB0_172; bra.uni $L__BB0_169; $L__BB0_172: mov.b32 %r410, %f141; shr.s32 %r411, %r410, 31; and.b32 %r412, %r411, 1078530011; mov.b32 %r413, %f140; and.b32 %r414, %r413, -2147483648; or.b32 %r415, %r414, %r412; mov.b32 %f875, %r415; bra.uni 
$L__BB0_173; $L__BB0_169: setp.eq.f32 %p188, %f142, 0f7F800000; setp.eq.f32 %p189, %f143, 0f7F800000; and.pred %p190, %p188, %p189; @%p190 bra $L__BB0_171; bra.uni $L__BB0_170; $L__BB0_171: mov.b32 %r405, %f141; setp.lt.s32 %p194, %r405, 0; selp.b32 %r406, 1075235812, 1061752795, %p194; mov.b32 %r407, %f140; and.b32 %r408, %r407, -2147483648; or.b32 %r409, %r408, %r406; mov.b32 %f875, %r409; bra.uni $L__BB0_173; $L__BB0_170: max.f32 %f706, %f143, %f142; min.f32 %f707, %f143, %f142; div.rn.f32 %f708, %f707, %f706; mul.rn.f32 %f709, %f708, %f708; mov.f32 %f710, 0fC0B59883; mov.f32 %f711, 0fBF52C7EA; fma.rn.f32 %f712, %f709, %f711, %f710; mov.f32 %f713, 0fC0D21907; fma.rn.f32 %f714, %f712, %f709, %f713; mul.f32 %f715, %f709, %f714; mul.f32 %f716, %f708, %f715; add.f32 %f717, %f709, 0f41355DC0; mov.f32 %f718, 0f41E6BD60; fma.rn.f32 %f719, %f717, %f709, %f718; mov.f32 %f720, 0f419D92C8; fma.rn.f32 %f721, %f719, %f709, %f720; rcp.rn.f32 %f722, %f721; fma.rn.f32 %f723, %f716, %f722, %f708; mov.f32 %f724, 0f3FC90FDB; sub.f32 %f725, %f724, %f723; setp.gt.f32 %p191, %f143, %f142; selp.f32 %f726, %f725, %f723, %p191; mov.b32 %r400, %f141; setp.lt.s32 %p192, %r400, 0; mov.f32 %f727, 0f40490FDB; sub.f32 %f728, %f727, %f726; selp.f32 %f729, %f728, %f726, %p192; mov.b32 %r401, %f729; mov.b32 %r402, %f140; and.b32 %r403, %r402, -2147483648; or.b32 %r404, %r403, %r401; mov.b32 %f730, %r404; add.f32 %f731, %f142, %f143; setp.le.f32 %p193, %f731, 0f7F800000; selp.f32 %f875, %f730, %f731, %p193; $L__BB0_173: add.f32 %f732, %f875, %f875; setp.eq.f32 %p195, %f139, 0f00000000; selp.f32 %f733, 0f3F800000, 0f3F000000, %p195; fma.rn.f32 %f878, %f733, %f732, %f878; $L__BB0_174: not.pred %p224, %p2; @%p224 bra $L__BB0_187; @%p12 bra $L__BB0_177; bra.uni $L__BB0_176; $L__BB0_177: max.s32 %r514, %r14, 0; bra.uni $L__BB0_178; $L__BB0_176: rem.s32 %r416, %r14, %r125; add.s32 %r417, %r416, %r125; rem.s32 %r514, %r417, %r125; $L__BB0_178: @%p88 bra $L__BB0_180; bra.uni $L__BB0_179; $L__BB0_180: 
max.s32 %r515, %r56, 0; bra.uni $L__BB0_181; $L__BB0_179: rem.s32 %r418, %r56, %r124; add.s32 %r419, %r418, %r124; rem.s32 %r515, %r419, %r124; $L__BB0_181: add.s32 %r420, %r514, %r4; mad.lo.s32 %r421, %r420, %r124, %r515; mul.wide.s32 %rd108, %r421, 4; add.s64 %rd109, %rd3, %rd108; add.s64 %rd110, %rd2, %rd108; add.s64 %rd111, %rd1, %rd108; ld.global.nc.f32 %f734, [%rd109]; ld.global.nc.f32 %f735, [%rd110]; mul.f32 %f736, %f735, %f735; fma.rn.f32 %f737, %f734, %f734, %f736; ld.global.nc.f32 %f738, [%rd111]; fma.rn.f32 %f150, %f738, %f738, %f737; mul.f32 %f739, %f124, %f126; mul.f32 %f740, %f123, %f127; sub.f32 %f741, %f740, %f739; mul.f32 %f742, %f122, %f127; mul.f32 %f743, %f124, %f125; sub.f32 %f744, %f743, %f742; mul.f32 %f745, %f123, %f125; mul.f32 %f746, %f122, %f126; sub.f32 %f747, %f746, %f745; mul.f32 %f748, %f2, %f744; fma.rn.f32 %f749, %f1, %f741, %f748; fma.rn.f32 %f151, %f3, %f747, %f749; mul.f32 %f750, %f2, %f123; fma.rn.f32 %f751, %f1, %f122, %f750; fma.rn.f32 %f752, %f3, %f124, %f751; add.f32 %f753, %f752, 0f3F800000; mul.f32 %f754, %f2, %f126; fma.rn.f32 %f755, %f1, %f125, %f754; fma.rn.f32 %f756, %f3, %f127, %f755; add.f32 %f757, %f753, %f756; mul.f32 %f758, %f123, %f126; fma.rn.f32 %f759, %f122, %f125, %f758; fma.rn.f32 %f760, %f124, %f127, %f759; add.f32 %f152, %f760, %f757; abs.f32 %f153, %f152; abs.f32 %f154, %f151; setp.eq.f32 %p199, %f153, 0f00000000; setp.eq.f32 %p200, %f154, 0f00000000; and.pred %p201, %p199, %p200; @%p201 bra $L__BB0_185; bra.uni $L__BB0_182; $L__BB0_185: mov.b32 %r432, %f152; shr.s32 %r433, %r432, 31; and.b32 %r434, %r433, 1078530011; mov.b32 %r435, %f151; and.b32 %r436, %r435, -2147483648; or.b32 %r437, %r436, %r434; mov.b32 %f877, %r437; bra.uni $L__BB0_186; $L__BB0_182: setp.eq.f32 %p202, %f153, 0f7F800000; setp.eq.f32 %p203, %f154, 0f7F800000; and.pred %p204, %p202, %p203; @%p204 bra $L__BB0_184; bra.uni $L__BB0_183; $L__BB0_184: mov.b32 %r427, %f152; setp.lt.s32 %p208, %r427, 0; selp.b32 %r428, 1075235812, 
1061752795, %p208; mov.b32 %r429, %f151; and.b32 %r430, %r429, -2147483648; or.b32 %r431, %r430, %r428; mov.b32 %f877, %r431; bra.uni $L__BB0_186; $L__BB0_183: max.f32 %f761, %f154, %f153; min.f32 %f762, %f154, %f153; div.rn.f32 %f763, %f762, %f761; mul.rn.f32 %f764, %f763, %f763; mov.f32 %f765, 0fC0B59883; mov.f32 %f766, 0fBF52C7EA; fma.rn.f32 %f767, %f764, %f766, %f765; mov.f32 %f768, 0fC0D21907; fma.rn.f32 %f769, %f767, %f764, %f768; mul.f32 %f770, %f764, %f769; mul.f32 %f771, %f763, %f770; add.f32 %f772, %f764, 0f41355DC0; mov.f32 %f773, 0f41E6BD60; fma.rn.f32 %f774, %f772, %f764, %f773; mov.f32 %f775, 0f419D92C8; fma.rn.f32 %f776, %f774, %f764, %f775; rcp.rn.f32 %f777, %f776; fma.rn.f32 %f778, %f771, %f777, %f763; mov.f32 %f779, 0f3FC90FDB; sub.f32 %f780, %f779, %f778; setp.gt.f32 %p205, %f154, %f153; selp.f32 %f781, %f780, %f778, %p205; mov.b32 %r422, %f152; setp.lt.s32 %p206, %r422, 0; mov.f32 %f782, 0f40490FDB; sub.f32 %f783, %f782, %f781; selp.f32 %f784, %f783, %f781, %p206; mov.b32 %r423, %f784; mov.b32 %r424, %f151; and.b32 %r425, %r424, -2147483648; or.b32 %r426, %r425, %r423; mov.b32 %f785, %r426; add.f32 %f786, %f153, %f154; setp.le.f32 %p207, %f786, 0f7F800000; selp.f32 %f877, %f785, %f786, %p207; $L__BB0_186: add.f32 %f787, %f877, %f877; setp.eq.f32 %p209, %f150, 0f00000000; selp.f32 %f788, 0f3F800000, 0f3F000000, %p209; fma.rn.f32 %f878, %f788, %f787, %f878; $L__BB0_187: @%p148 bra $L__BB0_200; @%p12 bra $L__BB0_190; bra.uni $L__BB0_189; $L__BB0_190: max.s32 %r516, %r14, 0; bra.uni $L__BB0_191; $L__BB0_189: rem.s32 %r438, %r14, %r125; add.s32 %r439, %r438, %r125; rem.s32 %r516, %r439, %r125; $L__BB0_191: add.s32 %r120, %r516, %r4; @%p88 bra $L__BB0_193; bra.uni $L__BB0_192; $L__BB0_193: add.s32 %r442, %r124, -1; min.s32 %r517, %r49, %r442; bra.uni $L__BB0_194; $L__BB0_192: rem.s32 %r440, %r49, %r124; add.s32 %r441, %r440, %r124; rem.s32 %r517, %r441, %r124; $L__BB0_194: mad.lo.s32 %r443, %r120, %r124, %r517; mul.wide.s32 %rd112, %r443, 4; add.s64 
%rd113, %rd3, %rd112; add.s64 %rd114, %rd2, %rd112; add.s64 %rd115, %rd1, %rd112; ld.global.nc.f32 %f789, [%rd113]; ld.global.nc.f32 %f790, [%rd114]; mul.f32 %f791, %f790, %f790; fma.rn.f32 %f792, %f789, %f789, %f791; ld.global.nc.f32 %f793, [%rd115]; fma.rn.f32 %f161, %f793, %f793, %f792; mul.f32 %f794, %f117, %f127; mul.f32 %f795, %f118, %f126; sub.f32 %f796, %f795, %f794; mul.f32 %f797, %f118, %f125; mul.f32 %f798, %f116, %f127; sub.f32 %f799, %f798, %f797; mul.f32 %f800, %f116, %f126; mul.f32 %f801, %f117, %f125; sub.f32 %f802, %f801, %f800; mul.f32 %f803, %f2, %f799; fma.rn.f32 %f804, %f1, %f796, %f803; fma.rn.f32 %f162, %f3, %f802, %f804; mul.f32 %f805, %f2, %f126; fma.rn.f32 %f806, %f1, %f125, %f805; fma.rn.f32 %f807, %f3, %f127, %f806; add.f32 %f808, %f807, 0f3F800000; mul.f32 %f809, %f2, %f117; fma.rn.f32 %f810, %f1, %f116, %f809; fma.rn.f32 %f811, %f3, %f118, %f810; add.f32 %f812, %f811, %f808; mul.f32 %f813, %f117, %f126; fma.rn.f32 %f814, %f116, %f125, %f813; fma.rn.f32 %f815, %f118, %f127, %f814; add.f32 %f163, %f815, %f812; abs.f32 %f164, %f163; abs.f32 %f165, %f162; setp.eq.f32 %p213, %f164, 0f00000000; setp.eq.f32 %p214, %f165, 0f00000000; and.pred %p215, %p213, %p214; @%p215 bra $L__BB0_198; bra.uni $L__BB0_195; $L__BB0_198: mov.b32 %r454, %f163; shr.s32 %r455, %r454, 31; and.b32 %r456, %r455, 1078530011; mov.b32 %r457, %f162; and.b32 %r458, %r457, -2147483648; or.b32 %r459, %r456, %r458; mov.b32 %f879, %r459; bra.uni $L__BB0_199; $L__BB0_195: setp.eq.f32 %p216, %f164, 0f7F800000; setp.eq.f32 %p217, %f165, 0f7F800000; and.pred %p218, %p216, %p217; @%p218 bra $L__BB0_197; bra.uni $L__BB0_196; $L__BB0_197: mov.b32 %r449, %f163; setp.lt.s32 %p222, %r449, 0; selp.b32 %r450, 1075235812, 1061752795, %p222; mov.b32 %r451, %f162; and.b32 %r452, %r451, -2147483648; or.b32 %r453, %r450, %r452; mov.b32 %f879, %r453; bra.uni $L__BB0_199; $L__BB0_196: max.f32 %f816, %f165, %f164; min.f32 %f817, %f165, %f164; div.rn.f32 %f818, %f817, %f816; mul.rn.f32 %f819, 
%f818, %f818; mov.f32 %f820, 0fC0B59883; mov.f32 %f821, 0fBF52C7EA; fma.rn.f32 %f822, %f819, %f821, %f820; mov.f32 %f823, 0fC0D21907; fma.rn.f32 %f824, %f822, %f819, %f823; mul.f32 %f825, %f819, %f824; mul.f32 %f826, %f818, %f825; add.f32 %f827, %f819, 0f41355DC0; mov.f32 %f828, 0f41E6BD60; fma.rn.f32 %f829, %f827, %f819, %f828; mov.f32 %f830, 0f419D92C8; fma.rn.f32 %f831, %f829, %f819, %f830; rcp.rn.f32 %f832, %f831; fma.rn.f32 %f833, %f826, %f832, %f818; mov.f32 %f834, 0f3FC90FDB; sub.f32 %f835, %f834, %f833; setp.gt.f32 %p219, %f165, %f164; selp.f32 %f836, %f835, %f833, %p219; mov.b32 %r444, %f163; setp.lt.s32 %p220, %r444, 0; mov.f32 %f837, 0f40490FDB; sub.f32 %f838, %f837, %f836; selp.f32 %f839, %f838, %f836, %p220; mov.b32 %r445, %f839; mov.b32 %r446, %f162; and.b32 %r447, %r446, -2147483648; or.b32 %r448, %r447, %r445; mov.b32 %f840, %r448; add.f32 %f841, %f164, %f165; setp.le.f32 %p221, %f841, 0f7F800000; selp.f32 %f879, %f840, %f841, %p221; $L__BB0_199: add.f32 %f842, %f879, %f879; setp.eq.f32 %p223, %f161, 0f00000000; selp.f32 %f843, 0f3F800000, 0f3F000000, %p223; fma.rn.f32 %f878, %f843, %f842, %f878; $L__BB0_200: ld.param.u64 %rd123, [setemergentmagneticfieldsolidangle_param_1]; mov.u32 %r481, %tid.x; mov.u32 %r480, %ntid.x; mov.u32 %r479, %ctaid.x; mad.lo.s32 %r478, %r479, %r480, %r481; add.s32 %r477, %r5, %r478; mul.wide.s32 %rd122, %r477, 4; cvta.to.global.u64 %rd121, %rd123; add.s64 %rd120, %rd121, %rd122; ld.param.f32 %f856, [setemergentmagneticfieldsolidangle_param_9]; ld.param.f32 %f855, [setemergentmagneticfieldsolidangle_param_8]; ld.param.u64 %rd119, [setemergentmagneticfieldsolidangle_param_0]; mov.u32 %r476, %tid.z; mov.u32 %r475, %ntid.z; mov.u32 %r474, %ctaid.z; mad.lo.s32 %r473, %r474, %r475, %r476; mov.u32 %r472, %tid.y; mov.u32 %r471, %ntid.y; mov.u32 %r470, %ctaid.y; mad.lo.s32 %r469, %r470, %r471, %r472; mul.lo.s32 %r468, %r473, %r125; add.s32 %r467, %r468, %r469; mov.u32 %r466, %tid.x; mov.u32 %r465, %ntid.x; mov.u32 %r464, %ctaid.x; 
mad.lo.s32 %r463, %r464, %r465, %r466; mul.lo.s32 %r462, %r467, %r124; add.s32 %r461, %r462, %r463; mul.wide.s32 %rd118, %r461, 4; cvta.to.global.u64 %rd117, %rd119; add.s64 %rd116, %rd117, %rd118; ld.param.f32 %f854, [setemergentmagneticfieldsolidangle_param_7]; ld.param.f32 %f853, [setemergentmagneticfieldsolidangle_param_6]; add.f32 %f844, %f853, %f853; mul.f32 %f845, %f844, %f854; mul.f32 %f846, %f845, %f862; st.global.f32 [%rd116], %f846; mul.f32 %f847, %f844, %f855; mul.f32 %f848, %f847, %f868; st.global.f32 [%rd120], %f848; mul.f32 %f849, %f844, %f856; mul.f32 %f850, %f849, %f878; st.global.f32 [%rd6], %f850; $L__BB0_202: ret; } ` setemergentmagneticfieldsolidangle_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl setemergentmagneticfieldsolidangle .visible .entry setemergentmagneticfieldsolidangle( .param .u64 setemergentmagneticfieldsolidangle_param_0, .param .u64 setemergentmagneticfieldsolidangle_param_1, .param .u64 setemergentmagneticfieldsolidangle_param_2, .param .u64 setemergentmagneticfieldsolidangle_param_3, .param .u64 setemergentmagneticfieldsolidangle_param_4, .param .u64 setemergentmagneticfieldsolidangle_param_5, .param .f32 setemergentmagneticfieldsolidangle_param_6, .param .f32 setemergentmagneticfieldsolidangle_param_7, .param .f32 setemergentmagneticfieldsolidangle_param_8, .param .f32 setemergentmagneticfieldsolidangle_param_9, .param .u32 setemergentmagneticfieldsolidangle_param_10, .param .u32 setemergentmagneticfieldsolidangle_param_11, .param .u32 setemergentmagneticfieldsolidangle_param_12, .param .u8 setemergentmagneticfieldsolidangle_param_13 ) { .reg .pred %p<225>; .reg .b16 %rs<8>; .reg .f32 %f<881>; .reg .b32 %r<518>; .reg .b64 %rd<124>; ld.param.u8 %rs4, [setemergentmagneticfieldsolidangle_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldsolidangle_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldsolidangle_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldsolidangle_param_2]; ld.param.u64 %rd10, 
[setemergentmagneticfieldsolidangle_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldsolidangle_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldsolidangle_param_5]; ld.param.u32 %r124, [setemergentmagneticfieldsolidangle_param_10]; ld.param.u32 %r125, [setemergentmagneticfieldsolidangle_param_11]; ld.param.u32 %r126, [setemergentmagneticfieldsolidangle_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r127, %ntid.x; mov.u32 %r128, %ctaid.x; mov.u32 %r129, %tid.x; mad.lo.s32 %r1, %r128, %r127, %r129; mov.u32 %r130, %ntid.y; mov.u32 %r131, %ctaid.y; mov.u32 %r132, %tid.y; mad.lo.s32 %r2, %r131, %r130, %r132; mov.u32 %r133, %ntid.z; mov.u32 %r134, %ctaid.z; mov.u32 %r135, %tid.z; mad.lo.s32 %r3, %r134, %r133, %r135; setp.ge.s32 %p6, %r1, %r124; setp.ge.s32 %p7, %r2, %r125; or.pred %p8, %p6, %p7; setp.ge.s32 %p9, %r3, %r126; or.pred %p10, %p8, %p9; @%p10 bra $L__BB0_202; mul.lo.s32 %r4, %r3, %r125; add.s32 %r136, %r4, %r2; mul.lo.s32 %r5, %r136, %r124; add.s32 %r137, %r5, %r1; mul.wide.s32 %rd13, %r137, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f176, %f2, %f2; fma.rn.f32 %f177, %f1, %f1, %f176; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f178, %f3, %f3, %f177; setp.eq.f32 %p11, %f178, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p11 bra $L__BB0_201; bra.uni $L__BB0_2; $L__BB0_201: mov.u32 %r460, 0; st.global.u32 [%rd4], %r460; st.global.u32 [%rd5], %r460; st.global.u32 [%rd6], %r460; bra.uni $L__BB0_202; $L__BB0_2: and.b16 %rs1, %rs4, 2; setp.eq.s16 %p12, %rs1, 0; add.s32 %r6, %r2, 1; @%p12 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r140, %r125, -1; min.s32 %r482, %r6, %r140; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r138, 
%r6, %r125; add.s32 %r139, %r138, %r125; rem.s32 %r482, %r139, %r125; $L__BB0_5: and.b16 %rs2, %rs4, 4; setp.eq.s16 %p13, %rs2, 0; add.s32 %r10, %r3, 1; @%p13 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r143, %r126, -1; min.s32 %r483, %r10, %r143; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r141, %r10, %r126; add.s32 %r142, %r141, %r126; rem.s32 %r483, %r142, %r126; $L__BB0_8: add.s32 %r14, %r2, -1; @%p12 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r484, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r144, %r14, %r125; add.s32 %r145, %r144, %r125; rem.s32 %r484, %r145, %r125; $L__BB0_11: add.s32 %r18, %r3, -1; @%p13 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r485, %r18, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r146, %r18, %r126; add.s32 %r147, %r146, %r126; rem.s32 %r485, %r147, %r126; $L__BB0_14: mad.lo.s32 %r148, %r485, %r125, %r2; mad.lo.s32 %r149, %r148, %r124, %r1; add.s32 %r150, %r482, %r4; mad.lo.s32 %r151, %r150, %r124, %r1; mul.wide.s32 %rd20, %r151, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f4, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f5, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f6, [%rd23]; mad.lo.s32 %r152, %r483, %r125, %r2; mad.lo.s32 %r153, %r152, %r124, %r1; mul.wide.s32 %rd24, %r153, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f7, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f8, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f9, [%rd27]; add.s32 %r154, %r484, %r4; mad.lo.s32 %r155, %r154, %r124, %r1; mul.wide.s32 %rd28, %r155, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f10, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f12, [%rd31]; mul.wide.s32 %rd32, %r149, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f13, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f14, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f15, [%rd35]; setp.ne.s16 %p16, %rs1, 0; setp.lt.s32 %p17, 
%r6, %r125; or.pred %p1, %p17, %p16; not.pred %p18, %p1; mov.f32 %f862, 0f00000000; @%p18 bra $L__BB0_28; setp.ge.s32 %p19, %r10, %r126; and.pred %p21, %p19, %p13; @%p21 bra $L__BB0_28; @%p13 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r158, %r126, -1; min.s32 %r486, %r10, %r158; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r156, %r10, %r126; add.s32 %r157, %r156, %r126; rem.s32 %r486, %r157, %r126; $L__BB0_19: @%p12 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r161, %r125, -1; min.s32 %r487, %r6, %r161; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r159, %r6, %r125; add.s32 %r160, %r159, %r125; rem.s32 %r487, %r160, %r125; $L__BB0_22: mad.lo.s32 %r162, %r486, %r125, %r487; mad.lo.s32 %r163, %r162, %r124, %r1; mul.wide.s32 %rd36, %r163, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f181, [%rd37]; ld.global.nc.f32 %f182, [%rd38]; mul.f32 %f183, %f182, %f182; fma.rn.f32 %f184, %f181, %f181, %f183; ld.global.nc.f32 %f185, [%rd39]; fma.rn.f32 %f16, %f185, %f185, %f184; mul.f32 %f186, %f6, %f8; mul.f32 %f187, %f5, %f9; sub.f32 %f188, %f187, %f186; mul.f32 %f189, %f4, %f9; mul.f32 %f190, %f6, %f7; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f5, %f7; mul.f32 %f193, %f4, %f8; sub.f32 %f194, %f193, %f192; mul.f32 %f195, %f2, %f191; fma.rn.f32 %f196, %f1, %f188, %f195; fma.rn.f32 %f17, %f3, %f194, %f196; mul.f32 %f197, %f2, %f5; fma.rn.f32 %f198, %f1, %f4, %f197; fma.rn.f32 %f199, %f3, %f6, %f198; add.f32 %f200, %f199, 0f3F800000; mul.f32 %f201, %f2, %f8; fma.rn.f32 %f202, %f1, %f7, %f201; fma.rn.f32 %f203, %f3, %f9, %f202; add.f32 %f204, %f200, %f203; mul.f32 %f205, %f5, %f8; fma.rn.f32 %f206, %f4, %f7, %f205; fma.rn.f32 %f207, %f6, %f9, %f206; add.f32 %f18, %f207, %f204; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p24, %f19, 0f00000000; setp.eq.f32 %p25, %f20, 0f00000000; and.pred %p26, %p24, %p25; @%p26 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r174, %f18; shr.s32 %r175, 
%r174, 31; and.b32 %r176, %r175, 1078530011; mov.b32 %r177, %f17; and.b32 %r178, %r177, -2147483648; or.b32 %r179, %r178, %r176; mov.b32 %f857, %r179; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p27, %f19, 0f7F800000; setp.eq.f32 %p28, %f20, 0f7F800000; and.pred %p29, %p27, %p28; @%p29 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r169, %f18; setp.lt.s32 %p33, %r169, 0; selp.b32 %r170, 1075235812, 1061752795, %p33; mov.b32 %r171, %f17; and.b32 %r172, %r171, -2147483648; or.b32 %r173, %r172, %r170; mov.b32 %f857, %r173; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f208, %f20, %f19; min.f32 %f209, %f20, %f19; div.rn.f32 %f210, %f209, %f208; mul.rn.f32 %f211, %f210, %f210; mov.f32 %f212, 0fC0B59883; mov.f32 %f213, 0fBF52C7EA; fma.rn.f32 %f214, %f211, %f213, %f212; mov.f32 %f215, 0fC0D21907; fma.rn.f32 %f216, %f214, %f211, %f215; mul.f32 %f217, %f211, %f216; mul.f32 %f218, %f210, %f217; add.f32 %f219, %f211, 0f41355DC0; mov.f32 %f220, 0f41E6BD60; fma.rn.f32 %f221, %f219, %f211, %f220; mov.f32 %f222, 0f419D92C8; fma.rn.f32 %f223, %f221, %f211, %f222; rcp.rn.f32 %f224, %f223; fma.rn.f32 %f225, %f218, %f224, %f210; mov.f32 %f226, 0f3FC90FDB; sub.f32 %f227, %f226, %f225; setp.gt.f32 %p30, %f20, %f19; selp.f32 %f228, %f227, %f225, %p30; mov.b32 %r164, %f18; setp.lt.s32 %p31, %r164, 0; mov.f32 %f229, 0f40490FDB; sub.f32 %f230, %f229, %f228; selp.f32 %f231, %f230, %f228, %p31; mov.b32 %r165, %f231; mov.b32 %r166, %f17; and.b32 %r167, %r166, -2147483648; or.b32 %r168, %r167, %r165; mov.b32 %f232, %r168; add.f32 %f233, %f19, %f20; setp.le.f32 %p32, %f233, 0f7F800000; selp.f32 %f857, %f232, %f233, %p32; $L__BB0_27: add.f32 %f234, %f857, %f857; setp.eq.f32 %p34, %f16, 0f00000000; selp.f32 %f235, 0f3F800000, 0f3F000000, %p34; fma.rn.f32 %f862, %f235, %f234, 0f00000000; $L__BB0_28: setp.gt.s32 %p35, %r2, 0; or.pred %p2, %p35, %p16; not.pred %p37, %p2; @%p37 bra $L__BB0_55; setp.ge.s32 %p38, %r10, %r126; and.pred %p40, %p38, %p13; @%p40 bra $L__BB0_42; @%p13 bra $L__BB0_32; 
bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r182, %r126, -1; min.s32 %r488, %r10, %r182; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r180, %r10, %r126; add.s32 %r181, %r180, %r126; rem.s32 %r488, %r181, %r126; $L__BB0_33: @%p12 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r489, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r183, %r14, %r125; add.s32 %r184, %r183, %r125; rem.s32 %r489, %r184, %r125; $L__BB0_36: mad.lo.s32 %r185, %r488, %r125, %r489; mad.lo.s32 %r186, %r185, %r124, %r1; mul.wide.s32 %rd40, %r186, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f236, [%rd41]; ld.global.nc.f32 %f237, [%rd42]; mul.f32 %f238, %f237, %f237; fma.rn.f32 %f239, %f236, %f236, %f238; ld.global.nc.f32 %f240, [%rd43]; fma.rn.f32 %f27, %f240, %f240, %f239; mul.f32 %f241, %f9, %f11; mul.f32 %f242, %f8, %f12; sub.f32 %f243, %f242, %f241; mul.f32 %f244, %f7, %f12; mul.f32 %f245, %f9, %f10; sub.f32 %f246, %f245, %f244; mul.f32 %f247, %f8, %f10; mul.f32 %f248, %f7, %f11; sub.f32 %f249, %f248, %f247; mul.f32 %f250, %f2, %f246; fma.rn.f32 %f251, %f1, %f243, %f250; fma.rn.f32 %f28, %f3, %f249, %f251; mul.f32 %f252, %f2, %f8; fma.rn.f32 %f253, %f1, %f7, %f252; fma.rn.f32 %f254, %f3, %f9, %f253; add.f32 %f255, %f254, 0f3F800000; mul.f32 %f256, %f2, %f11; fma.rn.f32 %f257, %f1, %f10, %f256; fma.rn.f32 %f258, %f3, %f12, %f257; add.f32 %f259, %f255, %f258; mul.f32 %f260, %f8, %f11; fma.rn.f32 %f261, %f7, %f10, %f260; fma.rn.f32 %f262, %f9, %f12, %f261; add.f32 %f29, %f262, %f259; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p43, %f30, 0f00000000; setp.eq.f32 %p44, %f31, 0f00000000; and.pred %p45, %p43, %p44; @%p45 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r197, %f29; shr.s32 %r198, %r197, 31; and.b32 %r199, %r198, 1078530011; mov.b32 %r200, %f28; and.b32 %r201, %r200, -2147483648; or.b32 %r202, %r201, %r199; mov.b32 %f859, %r202; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p46, %f30, 0f7F800000; 
setp.eq.f32 %p47, %f31, 0f7F800000; and.pred %p48, %p46, %p47; @%p48 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r192, %f29; setp.lt.s32 %p52, %r192, 0; selp.b32 %r193, 1075235812, 1061752795, %p52; mov.b32 %r194, %f28; and.b32 %r195, %r194, -2147483648; or.b32 %r196, %r195, %r193; mov.b32 %f859, %r196; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f263, %f31, %f30; min.f32 %f264, %f31, %f30; div.rn.f32 %f265, %f264, %f263; mul.rn.f32 %f266, %f265, %f265; mov.f32 %f267, 0fC0B59883; mov.f32 %f268, 0fBF52C7EA; fma.rn.f32 %f269, %f266, %f268, %f267; mov.f32 %f270, 0fC0D21907; fma.rn.f32 %f271, %f269, %f266, %f270; mul.f32 %f272, %f266, %f271; mul.f32 %f273, %f265, %f272; add.f32 %f274, %f266, 0f41355DC0; mov.f32 %f275, 0f41E6BD60; fma.rn.f32 %f276, %f274, %f266, %f275; mov.f32 %f277, 0f419D92C8; fma.rn.f32 %f278, %f276, %f266, %f277; rcp.rn.f32 %f279, %f278; fma.rn.f32 %f280, %f273, %f279, %f265; mov.f32 %f281, 0f3FC90FDB; sub.f32 %f282, %f281, %f280; setp.gt.f32 %p49, %f31, %f30; selp.f32 %f283, %f282, %f280, %p49; mov.b32 %r187, %f29; setp.lt.s32 %p50, %r187, 0; mov.f32 %f284, 0f40490FDB; sub.f32 %f285, %f284, %f283; selp.f32 %f286, %f285, %f283, %p50; mov.b32 %r188, %f286; mov.b32 %r189, %f28; and.b32 %r190, %r189, -2147483648; or.b32 %r191, %r190, %r188; mov.b32 %f287, %r191; add.f32 %f288, %f30, %f31; setp.le.f32 %p51, %f288, 0f7F800000; selp.f32 %f859, %f287, %f288, %p51; $L__BB0_41: add.f32 %f289, %f859, %f859; setp.eq.f32 %p53, %f27, 0f00000000; selp.f32 %f290, 0f3F800000, 0f3F000000, %p53; fma.rn.f32 %f862, %f290, %f289, %f862; $L__BB0_42: setp.lt.s32 %p54, %r3, 1; and.pred %p56, %p54, %p13; @%p56 bra $L__BB0_55; @%p13 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r490, %r18, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r203, %r18, %r126; add.s32 %r204, %r203, %r126; rem.s32 %r490, %r204, %r126; $L__BB0_46: @%p12 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r491, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r205, %r14, 
%r125; add.s32 %r206, %r205, %r125; rem.s32 %r491, %r206, %r125; $L__BB0_49: mad.lo.s32 %r207, %r490, %r125, %r491; mad.lo.s32 %r208, %r207, %r124, %r1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f291, [%rd45]; ld.global.nc.f32 %f292, [%rd46]; mul.f32 %f293, %f292, %f292; fma.rn.f32 %f294, %f291, %f291, %f293; ld.global.nc.f32 %f295, [%rd47]; fma.rn.f32 %f38, %f295, %f295, %f294; mul.f32 %f296, %f12, %f14; mul.f32 %f297, %f11, %f15; sub.f32 %f298, %f297, %f296; mul.f32 %f299, %f10, %f15; mul.f32 %f300, %f12, %f13; sub.f32 %f301, %f300, %f299; mul.f32 %f302, %f11, %f13; mul.f32 %f303, %f10, %f14; sub.f32 %f304, %f303, %f302; mul.f32 %f305, %f2, %f301; fma.rn.f32 %f306, %f1, %f298, %f305; fma.rn.f32 %f39, %f3, %f304, %f306; mul.f32 %f307, %f2, %f11; fma.rn.f32 %f308, %f1, %f10, %f307; fma.rn.f32 %f309, %f3, %f12, %f308; add.f32 %f310, %f309, 0f3F800000; mul.f32 %f311, %f2, %f14; fma.rn.f32 %f312, %f1, %f13, %f311; fma.rn.f32 %f313, %f3, %f15, %f312; add.f32 %f314, %f310, %f313; mul.f32 %f315, %f11, %f14; fma.rn.f32 %f316, %f10, %f13, %f315; fma.rn.f32 %f317, %f12, %f15, %f316; add.f32 %f40, %f317, %f314; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p59, %f41, 0f00000000; setp.eq.f32 %p60, %f42, 0f00000000; and.pred %p61, %p59, %p60; @%p61 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r219, %f40; shr.s32 %r220, %r219, 31; and.b32 %r221, %r220, 1078530011; mov.b32 %r222, %f39; and.b32 %r223, %r222, -2147483648; or.b32 %r224, %r223, %r221; mov.b32 %f861, %r224; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p62, %f41, 0f7F800000; setp.eq.f32 %p63, %f42, 0f7F800000; and.pred %p64, %p62, %p63; @%p64 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r214, %f40; setp.lt.s32 %p68, %r214, 0; selp.b32 %r215, 1075235812, 1061752795, %p68; mov.b32 %r216, %f39; and.b32 %r217, %r216, -2147483648; or.b32 %r218, %r217, %r215; mov.b32 %f861, %r218; bra.uni $L__BB0_54; 
$L__BB0_51: max.f32 %f318, %f42, %f41; min.f32 %f319, %f42, %f41; div.rn.f32 %f320, %f319, %f318; mul.rn.f32 %f321, %f320, %f320; mov.f32 %f322, 0fC0B59883; mov.f32 %f323, 0fBF52C7EA; fma.rn.f32 %f324, %f321, %f323, %f322; mov.f32 %f325, 0fC0D21907; fma.rn.f32 %f326, %f324, %f321, %f325; mul.f32 %f327, %f321, %f326; mul.f32 %f328, %f320, %f327; add.f32 %f329, %f321, 0f41355DC0; mov.f32 %f330, 0f41E6BD60; fma.rn.f32 %f331, %f329, %f321, %f330; mov.f32 %f332, 0f419D92C8; fma.rn.f32 %f333, %f331, %f321, %f332; rcp.rn.f32 %f334, %f333; fma.rn.f32 %f335, %f328, %f334, %f320; mov.f32 %f336, 0f3FC90FDB; sub.f32 %f337, %f336, %f335; setp.gt.f32 %p65, %f42, %f41; selp.f32 %f338, %f337, %f335, %p65; mov.b32 %r209, %f40; setp.lt.s32 %p66, %r209, 0; mov.f32 %f339, 0f40490FDB; sub.f32 %f340, %f339, %f338; selp.f32 %f341, %f340, %f338, %p66; mov.b32 %r210, %f341; mov.b32 %r211, %f39; and.b32 %r212, %r211, -2147483648; or.b32 %r213, %r212, %r210; mov.b32 %f342, %r213; add.f32 %f343, %f41, %f42; setp.le.f32 %p67, %f343, 0f7F800000; selp.f32 %f861, %f342, %f343, %p67; $L__BB0_54: add.f32 %f344, %f861, %f861; setp.eq.f32 %p69, %f38, 0f00000000; selp.f32 %f345, 0f3F800000, 0f3F000000, %p69; fma.rn.f32 %f862, %f345, %f344, %f862; $L__BB0_55: @%p18 bra $L__BB0_69; setp.lt.s32 %p71, %r3, 1; and.pred %p73, %p71, %p13; @%p73 bra $L__BB0_69; @%p13 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r492, %r18, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r225, %r18, %r126; add.s32 %r226, %r225, %r126; rem.s32 %r492, %r226, %r126; $L__BB0_60: @%p12 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r229, %r125, -1; min.s32 %r493, %r6, %r229; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r227, %r6, %r125; add.s32 %r228, %r227, %r125; rem.s32 %r493, %r228, %r125; $L__BB0_63: mad.lo.s32 %r230, %r492, %r125, %r493; mad.lo.s32 %r231, %r230, %r124, %r1; mul.wide.s32 %rd48, %r231, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f346, 
[%rd49]; ld.global.nc.f32 %f347, [%rd50]; mul.f32 %f348, %f347, %f347; fma.rn.f32 %f349, %f346, %f346, %f348; ld.global.nc.f32 %f350, [%rd51]; fma.rn.f32 %f49, %f350, %f350, %f349; mul.f32 %f351, %f5, %f15; mul.f32 %f352, %f6, %f14; sub.f32 %f353, %f352, %f351; mul.f32 %f354, %f6, %f13; mul.f32 %f355, %f4, %f15; sub.f32 %f356, %f355, %f354; mul.f32 %f357, %f4, %f14; mul.f32 %f358, %f5, %f13; sub.f32 %f359, %f358, %f357; mul.f32 %f360, %f2, %f356; fma.rn.f32 %f361, %f1, %f353, %f360; fma.rn.f32 %f50, %f3, %f359, %f361; mul.f32 %f362, %f2, %f14; fma.rn.f32 %f363, %f1, %f13, %f362; fma.rn.f32 %f364, %f3, %f15, %f363; add.f32 %f365, %f364, 0f3F800000; mul.f32 %f366, %f2, %f5; fma.rn.f32 %f367, %f1, %f4, %f366; fma.rn.f32 %f368, %f3, %f6, %f367; add.f32 %f369, %f368, %f365; mul.f32 %f370, %f5, %f14; fma.rn.f32 %f371, %f4, %f13, %f370; fma.rn.f32 %f372, %f6, %f15, %f371; add.f32 %f51, %f372, %f369; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p76, %f52, 0f00000000; setp.eq.f32 %p77, %f53, 0f00000000; and.pred %p78, %p76, %p77; @%p78 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r242, %f51; shr.s32 %r243, %r242, 31; and.b32 %r244, %r243, 1078530011; mov.b32 %r245, %f50; and.b32 %r246, %r245, -2147483648; or.b32 %r247, %r244, %r246; mov.b32 %f863, %r247; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p79, %f52, 0f7F800000; setp.eq.f32 %p80, %f53, 0f7F800000; and.pred %p81, %p79, %p80; @%p81 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r237, %f51; setp.lt.s32 %p85, %r237, 0; selp.b32 %r238, 1075235812, 1061752795, %p85; mov.b32 %r239, %f50; and.b32 %r240, %r239, -2147483648; or.b32 %r241, %r238, %r240; mov.b32 %f863, %r241; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f373, %f53, %f52; min.f32 %f374, %f53, %f52; div.rn.f32 %f375, %f374, %f373; mul.rn.f32 %f376, %f375, %f375; mov.f32 %f377, 0fC0B59883; mov.f32 %f378, 0fBF52C7EA; fma.rn.f32 %f379, %f376, %f378, %f377; mov.f32 %f380, 0fC0D21907; fma.rn.f32 %f381, %f379, %f376, %f380; mul.f32 %f382, 
%f376, %f381; mul.f32 %f383, %f375, %f382; add.f32 %f384, %f376, 0f41355DC0; mov.f32 %f385, 0f41E6BD60; fma.rn.f32 %f386, %f384, %f376, %f385; mov.f32 %f387, 0f419D92C8; fma.rn.f32 %f388, %f386, %f376, %f387; rcp.rn.f32 %f389, %f388; fma.rn.f32 %f390, %f383, %f389, %f375; mov.f32 %f391, 0f3FC90FDB; sub.f32 %f392, %f391, %f390; setp.gt.f32 %p82, %f53, %f52; selp.f32 %f393, %f392, %f390, %p82; mov.b32 %r232, %f51; setp.lt.s32 %p83, %r232, 0; mov.f32 %f394, 0f40490FDB; sub.f32 %f395, %f394, %f393; selp.f32 %f396, %f395, %f393, %p83; mov.b32 %r233, %f396; mov.b32 %r234, %f50; and.b32 %r235, %r234, -2147483648; or.b32 %r236, %r235, %r233; mov.b32 %f397, %r236; add.f32 %f398, %f52, %f53; setp.le.f32 %p84, %f398, 0f7F800000; selp.f32 %f863, %f397, %f398, %p84; $L__BB0_68: add.f32 %f399, %f863, %f863; setp.eq.f32 %p86, %f49, 0f00000000; selp.f32 %f400, 0f3F800000, 0f3F000000, %p86; fma.rn.f32 %f862, %f400, %f399, %f862; $L__BB0_69: @%p13 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: add.s32 %r250, %r126, -1; min.s32 %r494, %r10, %r250; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r248, %r10, %r126; add.s32 %r249, %r248, %r126; rem.s32 %r494, %r249, %r126; $L__BB0_72: ld.param.u8 %rs7, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs3, %rs7, 1; setp.eq.s16 %p88, %rs3, 0; add.s32 %r49, %r1, 1; @%p88 bra $L__BB0_74; bra.uni $L__BB0_73; $L__BB0_74: add.s32 %r253, %r124, -1; min.s32 %r495, %r49, %r253; bra.uni $L__BB0_75; $L__BB0_73: rem.s32 %r251, %r49, %r124; add.s32 %r252, %r251, %r124; rem.s32 %r495, %r252, %r124; $L__BB0_75: @%p13 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r496, %r18, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r254, %r18, %r126; add.s32 %r255, %r254, %r126; rem.s32 %r496, %r255, %r126; $L__BB0_78: add.s32 %r56, %r1, -1; @%p88 bra $L__BB0_80; bra.uni $L__BB0_79; $L__BB0_80: max.s32 %r497, %r56, 0; bra.uni $L__BB0_81; $L__BB0_79: rem.s32 %r256, %r56, %r124; add.s32 %r257, %r256, %r124; rem.s32 %r497, %r257, %r124; $L__BB0_81: 
mad.lo.s32 %r258, %r494, %r125, %r2; mad.lo.s32 %r259, %r258, %r124, %r1; mul.wide.s32 %rd52, %r259, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f60, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f61, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f62, [%rd55]; add.s32 %r260, %r495, %r5; mul.wide.s32 %rd56, %r260, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f63, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f64, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f65, [%rd59]; mad.lo.s32 %r261, %r496, %r125, %r2; mad.lo.s32 %r262, %r261, %r124, %r1; mul.wide.s32 %rd60, %r262, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f66, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f67, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f68, [%rd63]; add.s32 %r263, %r497, %r5; mul.wide.s32 %rd64, %r263, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f69, [%rd65]; add.s64 %rd66, %rd2, %rd64; ld.global.nc.f32 %f70, [%rd66]; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f71, [%rd67]; setp.ge.s32 %p92, %r10, %r126; and.pred %p93, %p92, %p13; mov.f32 %f868, 0f00000000; @%p93 bra $L__BB0_95; setp.ge.s32 %p94, %r49, %r124; and.pred %p96, %p94, %p88; @%p96 bra $L__BB0_95; @%p13 bra $L__BB0_85; bra.uni $L__BB0_84; $L__BB0_85: add.s32 %r266, %r126, -1; min.s32 %r498, %r10, %r266; bra.uni $L__BB0_86; $L__BB0_84: rem.s32 %r264, %r10, %r126; add.s32 %r265, %r264, %r126; rem.s32 %r498, %r265, %r126; $L__BB0_86: @%p88 bra $L__BB0_88; bra.uni $L__BB0_87; $L__BB0_88: add.s32 %r269, %r124, -1; min.s32 %r499, %r49, %r269; bra.uni $L__BB0_89; $L__BB0_87: rem.s32 %r267, %r49, %r124; add.s32 %r268, %r267, %r124; rem.s32 %r499, %r268, %r124; $L__BB0_89: mad.lo.s32 %r270, %r498, %r125, %r2; mad.lo.s32 %r271, %r270, %r124, %r499; mul.wide.s32 %rd68, %r271, 4; add.s64 %rd69, %rd3, %rd68; add.s64 %rd70, %rd2, %rd68; add.s64 %rd71, %rd1, %rd68; ld.global.nc.f32 %f403, [%rd69]; ld.global.nc.f32 %f404, [%rd70]; mul.f32 %f405, %f404, %f404; 
fma.rn.f32 %f406, %f403, %f403, %f405; ld.global.nc.f32 %f407, [%rd71]; fma.rn.f32 %f72, %f407, %f407, %f406; mul.f32 %f408, %f62, %f64; mul.f32 %f409, %f61, %f65; sub.f32 %f410, %f409, %f408; mul.f32 %f411, %f60, %f65; mul.f32 %f412, %f62, %f63; sub.f32 %f413, %f412, %f411; mul.f32 %f414, %f61, %f63; mul.f32 %f415, %f60, %f64; sub.f32 %f416, %f415, %f414; mul.f32 %f417, %f2, %f413; fma.rn.f32 %f418, %f1, %f410, %f417; fma.rn.f32 %f73, %f3, %f416, %f418; mul.f32 %f419, %f2, %f61; fma.rn.f32 %f420, %f1, %f60, %f419; fma.rn.f32 %f421, %f3, %f62, %f420; add.f32 %f422, %f421, 0f3F800000; mul.f32 %f423, %f2, %f64; fma.rn.f32 %f424, %f1, %f63, %f423; fma.rn.f32 %f425, %f3, %f65, %f424; add.f32 %f426, %f422, %f425; mul.f32 %f427, %f61, %f64; fma.rn.f32 %f428, %f60, %f63, %f427; fma.rn.f32 %f429, %f62, %f65, %f428; add.f32 %f74, %f429, %f426; abs.f32 %f75, %f74; abs.f32 %f76, %f73; setp.eq.f32 %p99, %f75, 0f00000000; setp.eq.f32 %p100, %f76, 0f00000000; and.pred %p101, %p99, %p100; @%p101 bra $L__BB0_93; bra.uni $L__BB0_90; $L__BB0_93: mov.b32 %r282, %f74; shr.s32 %r283, %r282, 31; and.b32 %r284, %r283, 1078530011; mov.b32 %r285, %f73; and.b32 %r286, %r285, -2147483648; or.b32 %r287, %r286, %r284; mov.b32 %f865, %r287; bra.uni $L__BB0_94; $L__BB0_90: setp.eq.f32 %p102, %f75, 0f7F800000; setp.eq.f32 %p103, %f76, 0f7F800000; and.pred %p104, %p102, %p103; @%p104 bra $L__BB0_92; bra.uni $L__BB0_91; $L__BB0_92: mov.b32 %r277, %f74; setp.lt.s32 %p108, %r277, 0; selp.b32 %r278, 1075235812, 1061752795, %p108; mov.b32 %r279, %f73; and.b32 %r280, %r279, -2147483648; or.b32 %r281, %r280, %r278; mov.b32 %f865, %r281; bra.uni $L__BB0_94; $L__BB0_91: max.f32 %f430, %f76, %f75; min.f32 %f431, %f76, %f75; div.rn.f32 %f432, %f431, %f430; mul.rn.f32 %f433, %f432, %f432; mov.f32 %f434, 0fC0B59883; mov.f32 %f435, 0fBF52C7EA; fma.rn.f32 %f436, %f433, %f435, %f434; mov.f32 %f437, 0fC0D21907; fma.rn.f32 %f438, %f436, %f433, %f437; mul.f32 %f439, %f433, %f438; mul.f32 %f440, %f432, %f439; add.f32 
%f441, %f433, 0f41355DC0; mov.f32 %f442, 0f41E6BD60; fma.rn.f32 %f443, %f441, %f433, %f442; mov.f32 %f444, 0f419D92C8; fma.rn.f32 %f445, %f443, %f433, %f444; rcp.rn.f32 %f446, %f445; fma.rn.f32 %f447, %f440, %f446, %f432; mov.f32 %f448, 0f3FC90FDB; sub.f32 %f449, %f448, %f447; setp.gt.f32 %p105, %f76, %f75; selp.f32 %f450, %f449, %f447, %p105; mov.b32 %r272, %f74; setp.lt.s32 %p106, %r272, 0; mov.f32 %f451, 0f40490FDB; sub.f32 %f452, %f451, %f450; selp.f32 %f453, %f452, %f450, %p106; mov.b32 %r273, %f453; mov.b32 %r274, %f73; and.b32 %r275, %r274, -2147483648; or.b32 %r276, %r275, %r273; mov.b32 %f454, %r276; add.f32 %f455, %f75, %f76; setp.le.f32 %p107, %f455, 0f7F800000; selp.f32 %f865, %f454, %f455, %p107; $L__BB0_94: add.f32 %f456, %f865, %f865; setp.eq.f32 %p109, %f72, 0f00000000; selp.f32 %f457, 0f3F800000, 0f3F000000, %p109; fma.rn.f32 %f868, %f457, %f456, 0f00000000; $L__BB0_95: setp.lt.s32 %p110, %r3, 1; and.pred %p112, %p110, %p13; @%p112 bra $L__BB0_109; setp.ge.s32 %p113, %r49, %r124; and.pred %p115, %p113, %p88; @%p115 bra $L__BB0_109; @%p13 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: max.s32 %r500, %r18, 0; bra.uni $L__BB0_100; $L__BB0_98: rem.s32 %r288, %r18, %r126; add.s32 %r289, %r288, %r126; rem.s32 %r500, %r289, %r126; $L__BB0_100: @%p88 bra $L__BB0_102; bra.uni $L__BB0_101; $L__BB0_102: add.s32 %r292, %r124, -1; min.s32 %r501, %r49, %r292; bra.uni $L__BB0_103; $L__BB0_101: rem.s32 %r290, %r49, %r124; add.s32 %r291, %r290, %r124; rem.s32 %r501, %r291, %r124; $L__BB0_103: mad.lo.s32 %r293, %r500, %r125, %r2; mad.lo.s32 %r294, %r293, %r124, %r501; mul.wide.s32 %rd72, %r294, 4; add.s64 %rd73, %rd3, %rd72; add.s64 %rd74, %rd2, %rd72; add.s64 %rd75, %rd1, %rd72; ld.global.nc.f32 %f458, [%rd73]; ld.global.nc.f32 %f459, [%rd74]; mul.f32 %f460, %f459, %f459; fma.rn.f32 %f461, %f458, %f458, %f460; ld.global.nc.f32 %f462, [%rd75]; fma.rn.f32 %f83, %f462, %f462, %f461; mul.f32 %f463, %f65, %f67; mul.f32 %f464, %f64, %f68; sub.f32 %f465, %f464, %f463; 
mul.f32 %f466, %f63, %f68; mul.f32 %f467, %f65, %f66; sub.f32 %f468, %f467, %f466; mul.f32 %f469, %f64, %f66; mul.f32 %f470, %f63, %f67; sub.f32 %f471, %f470, %f469; mul.f32 %f472, %f2, %f468; fma.rn.f32 %f473, %f1, %f465, %f472; fma.rn.f32 %f84, %f3, %f471, %f473; mul.f32 %f474, %f2, %f64; fma.rn.f32 %f475, %f1, %f63, %f474; fma.rn.f32 %f476, %f3, %f65, %f475; add.f32 %f477, %f476, 0f3F800000; mul.f32 %f478, %f2, %f67; fma.rn.f32 %f479, %f1, %f66, %f478; fma.rn.f32 %f480, %f3, %f68, %f479; add.f32 %f481, %f477, %f480; mul.f32 %f482, %f64, %f67; fma.rn.f32 %f483, %f63, %f66, %f482; fma.rn.f32 %f484, %f65, %f68, %f483; add.f32 %f85, %f484, %f481; abs.f32 %f86, %f85; abs.f32 %f87, %f84; setp.eq.f32 %p118, %f86, 0f00000000; setp.eq.f32 %p119, %f87, 0f00000000; and.pred %p120, %p118, %p119; @%p120 bra $L__BB0_107; bra.uni $L__BB0_104; $L__BB0_107: mov.b32 %r305, %f85; shr.s32 %r306, %r305, 31; and.b32 %r307, %r306, 1078530011; mov.b32 %r308, %f84; and.b32 %r309, %r308, -2147483648; or.b32 %r310, %r309, %r307; mov.b32 %f867, %r310; bra.uni $L__BB0_108; $L__BB0_104: setp.eq.f32 %p121, %f86, 0f7F800000; setp.eq.f32 %p122, %f87, 0f7F800000; and.pred %p123, %p121, %p122; @%p123 bra $L__BB0_106; bra.uni $L__BB0_105; $L__BB0_106: mov.b32 %r300, %f85; setp.lt.s32 %p127, %r300, 0; selp.b32 %r301, 1075235812, 1061752795, %p127; mov.b32 %r302, %f84; and.b32 %r303, %r302, -2147483648; or.b32 %r304, %r303, %r301; mov.b32 %f867, %r304; bra.uni $L__BB0_108; $L__BB0_105: max.f32 %f485, %f87, %f86; min.f32 %f486, %f87, %f86; div.rn.f32 %f487, %f486, %f485; mul.rn.f32 %f488, %f487, %f487; mov.f32 %f489, 0fC0B59883; mov.f32 %f490, 0fBF52C7EA; fma.rn.f32 %f491, %f488, %f490, %f489; mov.f32 %f492, 0fC0D21907; fma.rn.f32 %f493, %f491, %f488, %f492; mul.f32 %f494, %f488, %f493; mul.f32 %f495, %f487, %f494; add.f32 %f496, %f488, 0f41355DC0; mov.f32 %f497, 0f41E6BD60; fma.rn.f32 %f498, %f496, %f488, %f497; mov.f32 %f499, 0f419D92C8; fma.rn.f32 %f500, %f498, %f488, %f499; rcp.rn.f32 %f501, 
%f500; fma.rn.f32 %f502, %f495, %f501, %f487; mov.f32 %f503, 0f3FC90FDB; sub.f32 %f504, %f503, %f502; setp.gt.f32 %p124, %f87, %f86; selp.f32 %f505, %f504, %f502, %p124; mov.b32 %r295, %f85; setp.lt.s32 %p125, %r295, 0; mov.f32 %f506, 0f40490FDB; sub.f32 %f507, %f506, %f505; selp.f32 %f508, %f507, %f505, %p125; mov.b32 %r296, %f508; mov.b32 %r297, %f84; and.b32 %r298, %r297, -2147483648; or.b32 %r299, %r298, %r296; mov.b32 %f509, %r299; add.f32 %f510, %f86, %f87; setp.le.f32 %p126, %f510, 0f7F800000; selp.f32 %f867, %f509, %f510, %p126; $L__BB0_108: add.f32 %f511, %f867, %f867; setp.eq.f32 %p128, %f83, 0f00000000; selp.f32 %f512, 0f3F800000, 0f3F000000, %p128; fma.rn.f32 %f868, %f512, %f511, %f868; $L__BB0_109: ld.param.u8 %rs6, [setemergentmagneticfieldsolidangle_param_13]; and.b16 %rs5, %rs6, 1; setp.ne.s16 %p129, %rs5, 0; setp.gt.s32 %p130, %r1, 0; or.pred %p3, %p130, %p129; and.pred %p131, %p3, %p2; not.pred %p132, %p131; @%p132 bra $L__BB0_122; @%p13 bra $L__BB0_112; bra.uni $L__BB0_111; $L__BB0_112: max.s32 %r502, %r18, 0; bra.uni $L__BB0_113; $L__BB0_111: rem.s32 %r311, %r18, %r126; add.s32 %r312, %r311, %r126; rem.s32 %r502, %r312, %r126; $L__BB0_113: @%p88 bra $L__BB0_115; bra.uni $L__BB0_114; $L__BB0_115: max.s32 %r503, %r56, 0; bra.uni $L__BB0_116; $L__BB0_114: rem.s32 %r313, %r56, %r124; add.s32 %r314, %r313, %r124; rem.s32 %r503, %r314, %r124; $L__BB0_116: mad.lo.s32 %r315, %r502, %r125, %r2; mad.lo.s32 %r316, %r315, %r124, %r503; mul.wide.s32 %rd76, %r316, 4; add.s64 %rd77, %rd3, %rd76; add.s64 %rd78, %rd2, %rd76; add.s64 %rd79, %rd1, %rd76; ld.global.nc.f32 %f513, [%rd77]; ld.global.nc.f32 %f514, [%rd78]; mul.f32 %f515, %f514, %f514; fma.rn.f32 %f516, %f513, %f513, %f515; ld.global.nc.f32 %f517, [%rd79]; fma.rn.f32 %f94, %f517, %f517, %f516; mul.f32 %f518, %f68, %f70; mul.f32 %f519, %f67, %f71; sub.f32 %f520, %f519, %f518; mul.f32 %f521, %f66, %f71; mul.f32 %f522, %f68, %f69; sub.f32 %f523, %f522, %f521; mul.f32 %f524, %f67, %f69; mul.f32 %f525, 
%f66, %f70; sub.f32 %f526, %f525, %f524; mul.f32 %f527, %f2, %f523; fma.rn.f32 %f528, %f1, %f520, %f527; fma.rn.f32 %f95, %f3, %f526, %f528; mul.f32 %f529, %f2, %f67; fma.rn.f32 %f530, %f1, %f66, %f529; fma.rn.f32 %f531, %f3, %f68, %f530; add.f32 %f532, %f531, 0f3F800000; mul.f32 %f533, %f2, %f70; fma.rn.f32 %f534, %f1, %f69, %f533; fma.rn.f32 %f535, %f3, %f71, %f534; add.f32 %f536, %f532, %f535; mul.f32 %f537, %f67, %f70; fma.rn.f32 %f538, %f66, %f69, %f537; fma.rn.f32 %f539, %f68, %f71, %f538; add.f32 %f96, %f539, %f536; abs.f32 %f97, %f96; abs.f32 %f98, %f95; setp.eq.f32 %p135, %f97, 0f00000000; setp.eq.f32 %p136, %f98, 0f00000000; and.pred %p137, %p135, %p136; @%p137 bra $L__BB0_120; bra.uni $L__BB0_117; $L__BB0_120: mov.b32 %r327, %f96; shr.s32 %r328, %r327, 31; and.b32 %r329, %r328, 1078530011; mov.b32 %r330, %f95; and.b32 %r331, %r330, -2147483648; or.b32 %r332, %r331, %r329; mov.b32 %f869, %r332; bra.uni $L__BB0_121; $L__BB0_117: setp.eq.f32 %p138, %f97, 0f7F800000; setp.eq.f32 %p139, %f98, 0f7F800000; and.pred %p140, %p138, %p139; @%p140 bra $L__BB0_119; bra.uni $L__BB0_118; $L__BB0_119: mov.b32 %r322, %f96; setp.lt.s32 %p144, %r322, 0; selp.b32 %r323, 1075235812, 1061752795, %p144; mov.b32 %r324, %f95; and.b32 %r325, %r324, -2147483648; or.b32 %r326, %r325, %r323; mov.b32 %f869, %r326; bra.uni $L__BB0_121; $L__BB0_118: max.f32 %f540, %f98, %f97; min.f32 %f541, %f98, %f97; div.rn.f32 %f542, %f541, %f540; mul.rn.f32 %f543, %f542, %f542; mov.f32 %f544, 0fC0B59883; mov.f32 %f545, 0fBF52C7EA; fma.rn.f32 %f546, %f543, %f545, %f544; mov.f32 %f547, 0fC0D21907; fma.rn.f32 %f548, %f546, %f543, %f547; mul.f32 %f549, %f543, %f548; mul.f32 %f550, %f542, %f549; add.f32 %f551, %f543, 0f41355DC0; mov.f32 %f552, 0f41E6BD60; fma.rn.f32 %f553, %f551, %f543, %f552; mov.f32 %f554, 0f419D92C8; fma.rn.f32 %f555, %f553, %f543, %f554; rcp.rn.f32 %f556, %f555; fma.rn.f32 %f557, %f550, %f556, %f542; mov.f32 %f558, 0f3FC90FDB; sub.f32 %f559, %f558, %f557; setp.gt.f32 %p141, %f98, 
%f97; selp.f32 %f560, %f559, %f557, %p141; mov.b32 %r317, %f96; setp.lt.s32 %p142, %r317, 0; mov.f32 %f561, 0f40490FDB; sub.f32 %f562, %f561, %f560; selp.f32 %f563, %f562, %f560, %p142; mov.b32 %r318, %f563; mov.b32 %r319, %f95; and.b32 %r320, %r319, -2147483648; or.b32 %r321, %r320, %r318; mov.b32 %f564, %r321; add.f32 %f565, %f97, %f98; setp.le.f32 %p143, %f565, 0f7F800000; selp.f32 %f869, %f564, %f565, %p143; $L__BB0_121: add.f32 %f566, %f869, %f869; setp.eq.f32 %p145, %f94, 0f00000000; selp.f32 %f567, 0f3F800000, 0f3F000000, %p145; fma.rn.f32 %f868, %f567, %f566, %f868; $L__BB0_122: setp.lt.s32 %p146, %r49, %r124; or.pred %p4, %p146, %p129; and.pred %p5, %p4, %p2; not.pred %p148, %p5; @%p148 bra $L__BB0_135; @%p13 bra $L__BB0_125; bra.uni $L__BB0_124; $L__BB0_125: add.s32 %r335, %r126, -1; min.s32 %r504, %r10, %r335; bra.uni $L__BB0_126; $L__BB0_124: rem.s32 %r333, %r10, %r126; add.s32 %r334, %r333, %r126; rem.s32 %r504, %r334, %r126; $L__BB0_126: mad.lo.s32 %r81, %r504, %r125, %r2; @%p88 bra $L__BB0_128; bra.uni $L__BB0_127; $L__BB0_128: max.s32 %r505, %r56, 0; bra.uni $L__BB0_129; $L__BB0_127: rem.s32 %r336, %r56, %r124; add.s32 %r337, %r336, %r124; rem.s32 %r505, %r337, %r124; $L__BB0_129: mad.lo.s32 %r338, %r81, %r124, %r505; mul.wide.s32 %rd80, %r338, 4; add.s64 %rd81, %rd3, %rd80; add.s64 %rd82, %rd2, %rd80; add.s64 %rd83, %rd1, %rd80; ld.global.nc.f32 %f568, [%rd81]; ld.global.nc.f32 %f569, [%rd82]; mul.f32 %f570, %f569, %f569; fma.rn.f32 %f571, %f568, %f568, %f570; ld.global.nc.f32 %f572, [%rd83]; fma.rn.f32 %f105, %f572, %f572, %f571; mul.f32 %f573, %f61, %f71; mul.f32 %f574, %f62, %f70; sub.f32 %f575, %f574, %f573; mul.f32 %f576, %f62, %f69; mul.f32 %f577, %f60, %f71; sub.f32 %f578, %f577, %f576; mul.f32 %f579, %f60, %f70; mul.f32 %f580, %f61, %f69; sub.f32 %f581, %f580, %f579; mul.f32 %f582, %f2, %f578; fma.rn.f32 %f583, %f1, %f575, %f582; fma.rn.f32 %f106, %f3, %f581, %f583; mul.f32 %f584, %f2, %f70; fma.rn.f32 %f585, %f1, %f69, %f584; fma.rn.f32 
%f586, %f3, %f71, %f585; add.f32 %f587, %f586, 0f3F800000; mul.f32 %f588, %f2, %f61; fma.rn.f32 %f589, %f1, %f60, %f588; fma.rn.f32 %f590, %f3, %f62, %f589; add.f32 %f591, %f590, %f587; mul.f32 %f592, %f61, %f70; fma.rn.f32 %f593, %f60, %f69, %f592; fma.rn.f32 %f594, %f62, %f71, %f593; add.f32 %f107, %f594, %f591; abs.f32 %f108, %f107; abs.f32 %f109, %f106; setp.eq.f32 %p151, %f108, 0f00000000; setp.eq.f32 %p152, %f109, 0f00000000; and.pred %p153, %p151, %p152; @%p153 bra $L__BB0_133; bra.uni $L__BB0_130; $L__BB0_133: mov.b32 %r349, %f107; shr.s32 %r350, %r349, 31; and.b32 %r351, %r350, 1078530011; mov.b32 %r352, %f106; and.b32 %r353, %r352, -2147483648; or.b32 %r354, %r351, %r353; mov.b32 %f871, %r354; bra.uni $L__BB0_134; $L__BB0_130: setp.eq.f32 %p154, %f108, 0f7F800000; setp.eq.f32 %p155, %f109, 0f7F800000; and.pred %p156, %p154, %p155; @%p156 bra $L__BB0_132; bra.uni $L__BB0_131; $L__BB0_132: mov.b32 %r344, %f107; setp.lt.s32 %p160, %r344, 0; selp.b32 %r345, 1075235812, 1061752795, %p160; mov.b32 %r346, %f106; and.b32 %r347, %r346, -2147483648; or.b32 %r348, %r345, %r347; mov.b32 %f871, %r348; bra.uni $L__BB0_134; $L__BB0_131: max.f32 %f595, %f109, %f108; min.f32 %f596, %f109, %f108; div.rn.f32 %f597, %f596, %f595; mul.rn.f32 %f598, %f597, %f597; mov.f32 %f599, 0fC0B59883; mov.f32 %f600, 0fBF52C7EA; fma.rn.f32 %f601, %f598, %f600, %f599; mov.f32 %f602, 0fC0D21907; fma.rn.f32 %f603, %f601, %f598, %f602; mul.f32 %f604, %f598, %f603; mul.f32 %f605, %f597, %f604; add.f32 %f606, %f598, 0f41355DC0; mov.f32 %f607, 0f41E6BD60; fma.rn.f32 %f608, %f606, %f598, %f607; mov.f32 %f609, 0f419D92C8; fma.rn.f32 %f610, %f608, %f598, %f609; rcp.rn.f32 %f611, %f610; fma.rn.f32 %f612, %f605, %f611, %f597; mov.f32 %f613, 0f3FC90FDB; sub.f32 %f614, %f613, %f612; setp.gt.f32 %p157, %f109, %f108; selp.f32 %f615, %f614, %f612, %p157; mov.b32 %r339, %f107; setp.lt.s32 %p158, %r339, 0; mov.f32 %f616, 0f40490FDB; sub.f32 %f617, %f616, %f615; selp.f32 %f618, %f617, %f615, %p158; mov.b32 
%r340, %f618; mov.b32 %r341, %f106; and.b32 %r342, %r341, -2147483648; or.b32 %r343, %r342, %r340; mov.b32 %f619, %r343; add.f32 %f620, %f108, %f109; setp.le.f32 %p159, %f620, 0f7F800000; selp.f32 %f871, %f619, %f620, %p159; $L__BB0_134: add.f32 %f621, %f871, %f871; setp.eq.f32 %p161, %f105, 0f00000000; selp.f32 %f622, 0f3F800000, 0f3F000000, %p161; fma.rn.f32 %f868, %f622, %f621, %f868; $L__BB0_135: @%p88 bra $L__BB0_137; bra.uni $L__BB0_136; $L__BB0_137: add.s32 %r357, %r124, -1; min.s32 %r506, %r49, %r357; bra.uni $L__BB0_138; $L__BB0_136: rem.s32 %r355, %r49, %r124; add.s32 %r356, %r355, %r124; rem.s32 %r506, %r356, %r124; $L__BB0_138: @%p12 bra $L__BB0_140; bra.uni $L__BB0_139; $L__BB0_140: add.s32 %r360, %r125, -1; min.s32 %r507, %r6, %r360; bra.uni $L__BB0_141; $L__BB0_139: rem.s32 %r358, %r6, %r125; add.s32 %r359, %r358, %r125; rem.s32 %r507, %r359, %r125; $L__BB0_141: @%p88 bra $L__BB0_143; bra.uni $L__BB0_142; $L__BB0_143: max.s32 %r508, %r56, 0; bra.uni $L__BB0_144; $L__BB0_142: rem.s32 %r361, %r56, %r124; add.s32 %r362, %r361, %r124; rem.s32 %r508, %r362, %r124; $L__BB0_144: add.s32 %r94, %r508, %r5; add.s32 %r95, %r506, %r5; @%p12 bra $L__BB0_146; bra.uni $L__BB0_145; $L__BB0_146: max.s32 %r509, %r14, 0; bra.uni $L__BB0_147; $L__BB0_145: rem.s32 %r363, %r14, %r125; add.s32 %r364, %r363, %r125; rem.s32 %r509, %r364, %r125; $L__BB0_147: add.s32 %r365, %r509, %r4; mad.lo.s32 %r366, %r365, %r124, %r1; mul.wide.s32 %rd84, %r95, 4; add.s64 %rd85, %rd3, %rd84; ld.global.nc.f32 %f116, [%rd85]; add.s64 %rd86, %rd2, %rd84; ld.global.nc.f32 %f117, [%rd86]; add.s64 %rd87, %rd1, %rd84; ld.global.nc.f32 %f118, [%rd87]; add.s32 %r367, %r507, %r4; mad.lo.s32 %r368, %r367, %r124, %r1; mul.wide.s32 %rd88, %r368, 4; add.s64 %rd89, %rd3, %rd88; ld.global.nc.f32 %f119, [%rd89]; add.s64 %rd90, %rd2, %rd88; ld.global.nc.f32 %f120, [%rd90]; add.s64 %rd91, %rd1, %rd88; ld.global.nc.f32 %f121, [%rd91]; mul.wide.s32 %rd92, %r94, 4; add.s64 %rd93, %rd3, %rd92; ld.global.nc.f32 
%f122, [%rd93]; add.s64 %rd94, %rd2, %rd92; ld.global.nc.f32 %f123, [%rd94]; add.s64 %rd95, %rd1, %rd92; ld.global.nc.f32 %f124, [%rd95]; mul.wide.s32 %rd96, %r366, 4; add.s64 %rd97, %rd3, %rd96; ld.global.nc.f32 %f125, [%rd97]; add.s64 %rd98, %rd2, %rd96; ld.global.nc.f32 %f126, [%rd98]; add.s64 %rd99, %rd1, %rd96; ld.global.nc.f32 %f127, [%rd99]; and.pred %p166, %p4, %p1; mov.f32 %f878, 0f00000000; not.pred %p167, %p166; @%p167 bra $L__BB0_160; @%p12 bra $L__BB0_150; bra.uni $L__BB0_149; $L__BB0_150: add.s32 %r371, %r125, -1; min.s32 %r510, %r6, %r371; bra.uni $L__BB0_151; $L__BB0_149: rem.s32 %r369, %r6, %r125; add.s32 %r370, %r369, %r125; rem.s32 %r510, %r370, %r125; $L__BB0_151: @%p88 bra $L__BB0_153; bra.uni $L__BB0_152; $L__BB0_153: add.s32 %r374, %r124, -1; min.s32 %r511, %r49, %r374; bra.uni $L__BB0_154; $L__BB0_152: rem.s32 %r372, %r49, %r124; add.s32 %r373, %r372, %r124; rem.s32 %r511, %r373, %r124; $L__BB0_154: add.s32 %r375, %r510, %r4; mad.lo.s32 %r376, %r375, %r124, %r511; mul.wide.s32 %rd100, %r376, 4; add.s64 %rd101, %rd3, %rd100; add.s64 %rd102, %rd2, %rd100; add.s64 %rd103, %rd1, %rd100; ld.global.nc.f32 %f624, [%rd101]; ld.global.nc.f32 %f625, [%rd102]; mul.f32 %f626, %f625, %f625; fma.rn.f32 %f627, %f624, %f624, %f626; ld.global.nc.f32 %f628, [%rd103]; fma.rn.f32 %f128, %f628, %f628, %f627; mul.f32 %f629, %f118, %f120; mul.f32 %f630, %f117, %f121; sub.f32 %f631, %f630, %f629; mul.f32 %f632, %f116, %f121; mul.f32 %f633, %f118, %f119; sub.f32 %f634, %f633, %f632; mul.f32 %f635, %f117, %f119; mul.f32 %f636, %f116, %f120; sub.f32 %f637, %f636, %f635; mul.f32 %f638, %f2, %f634; fma.rn.f32 %f639, %f1, %f631, %f638; fma.rn.f32 %f129, %f3, %f637, %f639; mul.f32 %f640, %f2, %f117; fma.rn.f32 %f641, %f1, %f116, %f640; fma.rn.f32 %f642, %f3, %f118, %f641; add.f32 %f643, %f642, 0f3F800000; mul.f32 %f644, %f2, %f120; fma.rn.f32 %f645, %f1, %f119, %f644; fma.rn.f32 %f646, %f3, %f121, %f645; add.f32 %f647, %f643, %f646; mul.f32 %f648, %f117, %f120; fma.rn.f32 
%f649, %f116, %f119, %f648; fma.rn.f32 %f650, %f118, %f121, %f649; add.f32 %f130, %f650, %f647; abs.f32 %f131, %f130; abs.f32 %f132, %f129; setp.eq.f32 %p170, %f131, 0f00000000; setp.eq.f32 %p171, %f132, 0f00000000; and.pred %p172, %p170, %p171; @%p172 bra $L__BB0_158; bra.uni $L__BB0_155; $L__BB0_158: mov.b32 %r387, %f130; shr.s32 %r388, %r387, 31; and.b32 %r389, %r388, 1078530011; mov.b32 %r390, %f129; and.b32 %r391, %r390, -2147483648; or.b32 %r392, %r391, %r389; mov.b32 %f873, %r392; bra.uni $L__BB0_159; $L__BB0_155: setp.eq.f32 %p173, %f131, 0f7F800000; setp.eq.f32 %p174, %f132, 0f7F800000; and.pred %p175, %p173, %p174; @%p175 bra $L__BB0_157; bra.uni $L__BB0_156; $L__BB0_157: mov.b32 %r382, %f130; setp.lt.s32 %p179, %r382, 0; selp.b32 %r383, 1075235812, 1061752795, %p179; mov.b32 %r384, %f129; and.b32 %r385, %r384, -2147483648; or.b32 %r386, %r385, %r383; mov.b32 %f873, %r386; bra.uni $L__BB0_159; $L__BB0_156: max.f32 %f651, %f132, %f131; min.f32 %f652, %f132, %f131; div.rn.f32 %f653, %f652, %f651; mul.rn.f32 %f654, %f653, %f653; mov.f32 %f655, 0fC0B59883; mov.f32 %f656, 0fBF52C7EA; fma.rn.f32 %f657, %f654, %f656, %f655; mov.f32 %f658, 0fC0D21907; fma.rn.f32 %f659, %f657, %f654, %f658; mul.f32 %f660, %f654, %f659; mul.f32 %f661, %f653, %f660; add.f32 %f662, %f654, 0f41355DC0; mov.f32 %f663, 0f41E6BD60; fma.rn.f32 %f664, %f662, %f654, %f663; mov.f32 %f665, 0f419D92C8; fma.rn.f32 %f666, %f664, %f654, %f665; rcp.rn.f32 %f667, %f666; fma.rn.f32 %f668, %f661, %f667, %f653; mov.f32 %f669, 0f3FC90FDB; sub.f32 %f670, %f669, %f668; setp.gt.f32 %p176, %f132, %f131; selp.f32 %f671, %f670, %f668, %p176; mov.b32 %r377, %f130; setp.lt.s32 %p177, %r377, 0; mov.f32 %f672, 0f40490FDB; sub.f32 %f673, %f672, %f671; selp.f32 %f674, %f673, %f671, %p177; mov.b32 %r378, %f674; mov.b32 %r379, %f129; and.b32 %r380, %r379, -2147483648; or.b32 %r381, %r380, %r378; mov.b32 %f675, %r381; add.f32 %f676, %f131, %f132; setp.le.f32 %p178, %f676, 0f7F800000; selp.f32 %f873, %f675, %f676, 
%p178; $L__BB0_159: add.f32 %f677, %f873, %f873; setp.eq.f32 %p180, %f128, 0f00000000; selp.f32 %f678, 0f3F800000, 0f3F000000, %p180; fma.rn.f32 %f878, %f678, %f677, 0f00000000; $L__BB0_160: not.pred %p181, %p3; @%p181 bra $L__BB0_187; @%p18 bra $L__BB0_174; @%p12 bra $L__BB0_164; bra.uni $L__BB0_163; $L__BB0_164: add.s32 %r395, %r125, -1; min.s32 %r512, %r6, %r395; bra.uni $L__BB0_165; $L__BB0_163: rem.s32 %r393, %r6, %r125; add.s32 %r394, %r393, %r125; rem.s32 %r512, %r394, %r125; $L__BB0_165: @%p88 bra $L__BB0_167; bra.uni $L__BB0_166; $L__BB0_167: max.s32 %r513, %r56, 0; bra.uni $L__BB0_168; $L__BB0_166: rem.s32 %r396, %r56, %r124; add.s32 %r397, %r396, %r124; rem.s32 %r513, %r397, %r124; $L__BB0_168: add.s32 %r398, %r512, %r4; mad.lo.s32 %r399, %r398, %r124, %r513; mul.wide.s32 %rd104, %r399, 4; add.s64 %rd105, %rd3, %rd104; add.s64 %rd106, %rd2, %rd104; add.s64 %rd107, %rd1, %rd104; ld.global.nc.f32 %f679, [%rd105]; ld.global.nc.f32 %f680, [%rd106]; mul.f32 %f681, %f680, %f680; fma.rn.f32 %f682, %f679, %f679, %f681; ld.global.nc.f32 %f683, [%rd107]; fma.rn.f32 %f139, %f683, %f683, %f682; mul.f32 %f684, %f121, %f123; mul.f32 %f685, %f120, %f124; sub.f32 %f686, %f685, %f684; mul.f32 %f687, %f119, %f124; mul.f32 %f688, %f121, %f122; sub.f32 %f689, %f688, %f687; mul.f32 %f690, %f120, %f122; mul.f32 %f691, %f119, %f123; sub.f32 %f692, %f691, %f690; mul.f32 %f693, %f2, %f689; fma.rn.f32 %f694, %f1, %f686, %f693; fma.rn.f32 %f140, %f3, %f692, %f694; mul.f32 %f695, %f2, %f120; fma.rn.f32 %f696, %f1, %f119, %f695; fma.rn.f32 %f697, %f3, %f121, %f696; add.f32 %f698, %f697, 0f3F800000; mul.f32 %f699, %f2, %f123; fma.rn.f32 %f700, %f1, %f122, %f699; fma.rn.f32 %f701, %f3, %f124, %f700; add.f32 %f702, %f698, %f701; mul.f32 %f703, %f120, %f123; fma.rn.f32 %f704, %f119, %f122, %f703; fma.rn.f32 %f705, %f121, %f124, %f704; add.f32 %f141, %f705, %f702; abs.f32 %f142, %f141; abs.f32 %f143, %f140; setp.eq.f32 %p185, %f142, 0f00000000; setp.eq.f32 %p186, %f143, 0f00000000; 
and.pred %p187, %p185, %p186; @%p187 bra $L__BB0_172; bra.uni $L__BB0_169; $L__BB0_172: mov.b32 %r410, %f141; shr.s32 %r411, %r410, 31; and.b32 %r412, %r411, 1078530011; mov.b32 %r413, %f140; and.b32 %r414, %r413, -2147483648; or.b32 %r415, %r414, %r412; mov.b32 %f875, %r415; bra.uni $L__BB0_173; $L__BB0_169: setp.eq.f32 %p188, %f142, 0f7F800000; setp.eq.f32 %p189, %f143, 0f7F800000; and.pred %p190, %p188, %p189; @%p190 bra $L__BB0_171; bra.uni $L__BB0_170; $L__BB0_171: mov.b32 %r405, %f141; setp.lt.s32 %p194, %r405, 0; selp.b32 %r406, 1075235812, 1061752795, %p194; mov.b32 %r407, %f140; and.b32 %r408, %r407, -2147483648; or.b32 %r409, %r408, %r406; mov.b32 %f875, %r409; bra.uni $L__BB0_173; $L__BB0_170: max.f32 %f706, %f143, %f142; min.f32 %f707, %f143, %f142; div.rn.f32 %f708, %f707, %f706; mul.rn.f32 %f709, %f708, %f708; mov.f32 %f710, 0fC0B59883; mov.f32 %f711, 0fBF52C7EA; fma.rn.f32 %f712, %f709, %f711, %f710; mov.f32 %f713, 0fC0D21907; fma.rn.f32 %f714, %f712, %f709, %f713; mul.f32 %f715, %f709, %f714; mul.f32 %f716, %f708, %f715; add.f32 %f717, %f709, 0f41355DC0; mov.f32 %f718, 0f41E6BD60; fma.rn.f32 %f719, %f717, %f709, %f718; mov.f32 %f720, 0f419D92C8; fma.rn.f32 %f721, %f719, %f709, %f720; rcp.rn.f32 %f722, %f721; fma.rn.f32 %f723, %f716, %f722, %f708; mov.f32 %f724, 0f3FC90FDB; sub.f32 %f725, %f724, %f723; setp.gt.f32 %p191, %f143, %f142; selp.f32 %f726, %f725, %f723, %p191; mov.b32 %r400, %f141; setp.lt.s32 %p192, %r400, 0; mov.f32 %f727, 0f40490FDB; sub.f32 %f728, %f727, %f726; selp.f32 %f729, %f728, %f726, %p192; mov.b32 %r401, %f729; mov.b32 %r402, %f140; and.b32 %r403, %r402, -2147483648; or.b32 %r404, %r403, %r401; mov.b32 %f730, %r404; add.f32 %f731, %f142, %f143; setp.le.f32 %p193, %f731, 0f7F800000; selp.f32 %f875, %f730, %f731, %p193; $L__BB0_173: add.f32 %f732, %f875, %f875; setp.eq.f32 %p195, %f139, 0f00000000; selp.f32 %f733, 0f3F800000, 0f3F000000, %p195; fma.rn.f32 %f878, %f733, %f732, %f878; $L__BB0_174: not.pred %p224, %p2; @%p224 bra 
$L__BB0_187; @%p12 bra $L__BB0_177; bra.uni $L__BB0_176; $L__BB0_177: max.s32 %r514, %r14, 0; bra.uni $L__BB0_178; $L__BB0_176: rem.s32 %r416, %r14, %r125; add.s32 %r417, %r416, %r125; rem.s32 %r514, %r417, %r125; $L__BB0_178: @%p88 bra $L__BB0_180; bra.uni $L__BB0_179; $L__BB0_180: max.s32 %r515, %r56, 0; bra.uni $L__BB0_181; $L__BB0_179: rem.s32 %r418, %r56, %r124; add.s32 %r419, %r418, %r124; rem.s32 %r515, %r419, %r124; $L__BB0_181: add.s32 %r420, %r514, %r4; mad.lo.s32 %r421, %r420, %r124, %r515; mul.wide.s32 %rd108, %r421, 4; add.s64 %rd109, %rd3, %rd108; add.s64 %rd110, %rd2, %rd108; add.s64 %rd111, %rd1, %rd108; ld.global.nc.f32 %f734, [%rd109]; ld.global.nc.f32 %f735, [%rd110]; mul.f32 %f736, %f735, %f735; fma.rn.f32 %f737, %f734, %f734, %f736; ld.global.nc.f32 %f738, [%rd111]; fma.rn.f32 %f150, %f738, %f738, %f737; mul.f32 %f739, %f124, %f126; mul.f32 %f740, %f123, %f127; sub.f32 %f741, %f740, %f739; mul.f32 %f742, %f122, %f127; mul.f32 %f743, %f124, %f125; sub.f32 %f744, %f743, %f742; mul.f32 %f745, %f123, %f125; mul.f32 %f746, %f122, %f126; sub.f32 %f747, %f746, %f745; mul.f32 %f748, %f2, %f744; fma.rn.f32 %f749, %f1, %f741, %f748; fma.rn.f32 %f151, %f3, %f747, %f749; mul.f32 %f750, %f2, %f123; fma.rn.f32 %f751, %f1, %f122, %f750; fma.rn.f32 %f752, %f3, %f124, %f751; add.f32 %f753, %f752, 0f3F800000; mul.f32 %f754, %f2, %f126; fma.rn.f32 %f755, %f1, %f125, %f754; fma.rn.f32 %f756, %f3, %f127, %f755; add.f32 %f757, %f753, %f756; mul.f32 %f758, %f123, %f126; fma.rn.f32 %f759, %f122, %f125, %f758; fma.rn.f32 %f760, %f124, %f127, %f759; add.f32 %f152, %f760, %f757; abs.f32 %f153, %f152; abs.f32 %f154, %f151; setp.eq.f32 %p199, %f153, 0f00000000; setp.eq.f32 %p200, %f154, 0f00000000; and.pred %p201, %p199, %p200; @%p201 bra $L__BB0_185; bra.uni $L__BB0_182; $L__BB0_185: mov.b32 %r432, %f152; shr.s32 %r433, %r432, 31; and.b32 %r434, %r433, 1078530011; mov.b32 %r435, %f151; and.b32 %r436, %r435, -2147483648; or.b32 %r437, %r436, %r434; mov.b32 %f877, %r437; 
bra.uni $L__BB0_186; $L__BB0_182: setp.eq.f32 %p202, %f153, 0f7F800000; setp.eq.f32 %p203, %f154, 0f7F800000; and.pred %p204, %p202, %p203; @%p204 bra $L__BB0_184; bra.uni $L__BB0_183; $L__BB0_184: mov.b32 %r427, %f152; setp.lt.s32 %p208, %r427, 0; selp.b32 %r428, 1075235812, 1061752795, %p208; mov.b32 %r429, %f151; and.b32 %r430, %r429, -2147483648; or.b32 %r431, %r430, %r428; mov.b32 %f877, %r431; bra.uni $L__BB0_186; $L__BB0_183: max.f32 %f761, %f154, %f153; min.f32 %f762, %f154, %f153; div.rn.f32 %f763, %f762, %f761; mul.rn.f32 %f764, %f763, %f763; mov.f32 %f765, 0fC0B59883; mov.f32 %f766, 0fBF52C7EA; fma.rn.f32 %f767, %f764, %f766, %f765; mov.f32 %f768, 0fC0D21907; fma.rn.f32 %f769, %f767, %f764, %f768; mul.f32 %f770, %f764, %f769; mul.f32 %f771, %f763, %f770; add.f32 %f772, %f764, 0f41355DC0; mov.f32 %f773, 0f41E6BD60; fma.rn.f32 %f774, %f772, %f764, %f773; mov.f32 %f775, 0f419D92C8; fma.rn.f32 %f776, %f774, %f764, %f775; rcp.rn.f32 %f777, %f776; fma.rn.f32 %f778, %f771, %f777, %f763; mov.f32 %f779, 0f3FC90FDB; sub.f32 %f780, %f779, %f778; setp.gt.f32 %p205, %f154, %f153; selp.f32 %f781, %f780, %f778, %p205; mov.b32 %r422, %f152; setp.lt.s32 %p206, %r422, 0; mov.f32 %f782, 0f40490FDB; sub.f32 %f783, %f782, %f781; selp.f32 %f784, %f783, %f781, %p206; mov.b32 %r423, %f784; mov.b32 %r424, %f151; and.b32 %r425, %r424, -2147483648; or.b32 %r426, %r425, %r423; mov.b32 %f785, %r426; add.f32 %f786, %f153, %f154; setp.le.f32 %p207, %f786, 0f7F800000; selp.f32 %f877, %f785, %f786, %p207; $L__BB0_186: add.f32 %f787, %f877, %f877; setp.eq.f32 %p209, %f150, 0f00000000; selp.f32 %f788, 0f3F800000, 0f3F000000, %p209; fma.rn.f32 %f878, %f788, %f787, %f878; $L__BB0_187: @%p148 bra $L__BB0_200; @%p12 bra $L__BB0_190; bra.uni $L__BB0_189; $L__BB0_190: max.s32 %r516, %r14, 0; bra.uni $L__BB0_191; $L__BB0_189: rem.s32 %r438, %r14, %r125; add.s32 %r439, %r438, %r125; rem.s32 %r516, %r439, %r125; $L__BB0_191: add.s32 %r120, %r516, %r4; @%p88 bra $L__BB0_193; bra.uni $L__BB0_192; 
$L__BB0_193: add.s32 %r442, %r124, -1; min.s32 %r517, %r49, %r442; bra.uni $L__BB0_194; $L__BB0_192: rem.s32 %r440, %r49, %r124; add.s32 %r441, %r440, %r124; rem.s32 %r517, %r441, %r124; $L__BB0_194: mad.lo.s32 %r443, %r120, %r124, %r517; mul.wide.s32 %rd112, %r443, 4; add.s64 %rd113, %rd3, %rd112; add.s64 %rd114, %rd2, %rd112; add.s64 %rd115, %rd1, %rd112; ld.global.nc.f32 %f789, [%rd113]; ld.global.nc.f32 %f790, [%rd114]; mul.f32 %f791, %f790, %f790; fma.rn.f32 %f792, %f789, %f789, %f791; ld.global.nc.f32 %f793, [%rd115]; fma.rn.f32 %f161, %f793, %f793, %f792; mul.f32 %f794, %f117, %f127; mul.f32 %f795, %f118, %f126; sub.f32 %f796, %f795, %f794; mul.f32 %f797, %f118, %f125; mul.f32 %f798, %f116, %f127; sub.f32 %f799, %f798, %f797; mul.f32 %f800, %f116, %f126; mul.f32 %f801, %f117, %f125; sub.f32 %f802, %f801, %f800; mul.f32 %f803, %f2, %f799; fma.rn.f32 %f804, %f1, %f796, %f803; fma.rn.f32 %f162, %f3, %f802, %f804; mul.f32 %f805, %f2, %f126; fma.rn.f32 %f806, %f1, %f125, %f805; fma.rn.f32 %f807, %f3, %f127, %f806; add.f32 %f808, %f807, 0f3F800000; mul.f32 %f809, %f2, %f117; fma.rn.f32 %f810, %f1, %f116, %f809; fma.rn.f32 %f811, %f3, %f118, %f810; add.f32 %f812, %f811, %f808; mul.f32 %f813, %f117, %f126; fma.rn.f32 %f814, %f116, %f125, %f813; fma.rn.f32 %f815, %f118, %f127, %f814; add.f32 %f163, %f815, %f812; abs.f32 %f164, %f163; abs.f32 %f165, %f162; setp.eq.f32 %p213, %f164, 0f00000000; setp.eq.f32 %p214, %f165, 0f00000000; and.pred %p215, %p213, %p214; @%p215 bra $L__BB0_198; bra.uni $L__BB0_195; $L__BB0_198: mov.b32 %r454, %f163; shr.s32 %r455, %r454, 31; and.b32 %r456, %r455, 1078530011; mov.b32 %r457, %f162; and.b32 %r458, %r457, -2147483648; or.b32 %r459, %r456, %r458; mov.b32 %f879, %r459; bra.uni $L__BB0_199; $L__BB0_195: setp.eq.f32 %p216, %f164, 0f7F800000; setp.eq.f32 %p217, %f165, 0f7F800000; and.pred %p218, %p216, %p217; @%p218 bra $L__BB0_197; bra.uni $L__BB0_196; $L__BB0_197: mov.b32 %r449, %f163; setp.lt.s32 %p222, %r449, 0; selp.b32 %r450, 
1075235812, 1061752795, %p222; mov.b32 %r451, %f162; and.b32 %r452, %r451, -2147483648; or.b32 %r453, %r450, %r452; mov.b32 %f879, %r453; bra.uni $L__BB0_199; $L__BB0_196: max.f32 %f816, %f165, %f164; min.f32 %f817, %f165, %f164; div.rn.f32 %f818, %f817, %f816; mul.rn.f32 %f819, %f818, %f818; mov.f32 %f820, 0fC0B59883; mov.f32 %f821, 0fBF52C7EA; fma.rn.f32 %f822, %f819, %f821, %f820; mov.f32 %f823, 0fC0D21907; fma.rn.f32 %f824, %f822, %f819, %f823; mul.f32 %f825, %f819, %f824; mul.f32 %f826, %f818, %f825; add.f32 %f827, %f819, 0f41355DC0; mov.f32 %f828, 0f41E6BD60; fma.rn.f32 %f829, %f827, %f819, %f828; mov.f32 %f830, 0f419D92C8; fma.rn.f32 %f831, %f829, %f819, %f830; rcp.rn.f32 %f832, %f831; fma.rn.f32 %f833, %f826, %f832, %f818; mov.f32 %f834, 0f3FC90FDB; sub.f32 %f835, %f834, %f833; setp.gt.f32 %p219, %f165, %f164; selp.f32 %f836, %f835, %f833, %p219; mov.b32 %r444, %f163; setp.lt.s32 %p220, %r444, 0; mov.f32 %f837, 0f40490FDB; sub.f32 %f838, %f837, %f836; selp.f32 %f839, %f838, %f836, %p220; mov.b32 %r445, %f839; mov.b32 %r446, %f162; and.b32 %r447, %r446, -2147483648; or.b32 %r448, %r447, %r445; mov.b32 %f840, %r448; add.f32 %f841, %f164, %f165; setp.le.f32 %p221, %f841, 0f7F800000; selp.f32 %f879, %f840, %f841, %p221; $L__BB0_199: add.f32 %f842, %f879, %f879; setp.eq.f32 %p223, %f161, 0f00000000; selp.f32 %f843, 0f3F800000, 0f3F000000, %p223; fma.rn.f32 %f878, %f843, %f842, %f878; $L__BB0_200: ld.param.u64 %rd123, [setemergentmagneticfieldsolidangle_param_1]; mov.u32 %r481, %tid.x; mov.u32 %r480, %ntid.x; mov.u32 %r479, %ctaid.x; mad.lo.s32 %r478, %r479, %r480, %r481; add.s32 %r477, %r5, %r478; mul.wide.s32 %rd122, %r477, 4; cvta.to.global.u64 %rd121, %rd123; add.s64 %rd120, %rd121, %rd122; ld.param.f32 %f856, [setemergentmagneticfieldsolidangle_param_9]; ld.param.f32 %f855, [setemergentmagneticfieldsolidangle_param_8]; ld.param.u64 %rd119, [setemergentmagneticfieldsolidangle_param_0]; mov.u32 %r476, %tid.z; mov.u32 %r475, %ntid.z; mov.u32 %r474, %ctaid.z; 
mad.lo.s32 %r473, %r474, %r475, %r476; mov.u32 %r472, %tid.y; mov.u32 %r471, %ntid.y; mov.u32 %r470, %ctaid.y; mad.lo.s32 %r469, %r470, %r471, %r472; mul.lo.s32 %r468, %r473, %r125; add.s32 %r467, %r468, %r469; mov.u32 %r466, %tid.x; mov.u32 %r465, %ntid.x; mov.u32 %r464, %ctaid.x; mad.lo.s32 %r463, %r464, %r465, %r466; mul.lo.s32 %r462, %r467, %r124; add.s32 %r461, %r462, %r463; mul.wide.s32 %rd118, %r461, 4; cvta.to.global.u64 %rd117, %rd119; add.s64 %rd116, %rd117, %rd118; ld.param.f32 %f854, [setemergentmagneticfieldsolidangle_param_7]; ld.param.f32 %f853, [setemergentmagneticfieldsolidangle_param_6]; add.f32 %f844, %f853, %f853; mul.f32 %f845, %f844, %f854; mul.f32 %f846, %f845, %f862; st.global.f32 [%rd116], %f846; mul.f32 %f847, %f844, %f855; mul.f32 %f848, %f847, %f868; st.global.f32 [%rd120], %f848; mul.f32 %f849, %f844, %f856; mul.f32 %f850, %f849, %f878; st.global.f32 [%rd6], %f850; $L__BB0_202: ret; } ` )
3-3.11.1/cuda/hopf-emergentmagneticfieldfivepoint.cu000066400000000000000000000200241503346766200224160ustar00rootroot00000000000000
#include <stdint.h>
#include "float3.h"
#include "stencil.h"

// Returns the magnetization stored at linear cell index i, or the zero vector
// when `inside` is false (the neighbor lies outside the grid and the axis is
// not periodic). The stencil selection in fivepoint_diff treats a zero vector
// as "no usable neighbor", so missing cells and empty cells are handled alike.
inline __device__ float3 fivepoint_load(const float* mx, const float* my, const float* mz,
                                        int i, bool inside)
{
    if (!inside)
    {
        return make_float3(0.0f, 0.0f, 0.0f);
    }
    return make_float3(mx[i], my[i], mz[i]);
}

// Finite-difference derivative of m along one axis, in units of the cell size
// (the caller applies the inverse cell sizes through its prefactors).
//
// Arguments are the magnetization at offsets -2, -1, 0, +1, +2 along the axis.
// The most accurate stencil that only touches non-zero cells is selected. In
// the pattern strings below the five positions are -2 -1 0 +1 +2, "1" marks a
// non-zero cell and "-" a zero one; the central cell is assumed non-zero
// (the kernel returns early otherwise).
inline __device__ float3 fivepoint_diff(float3 m_m2, float3 m_m1, float3 m0,
                                        float3 m_p1, float3 m_p2)
{
    const bool has_m2 = !is0(m_m2);
    const bool has_m1 = !is0(m_m1);
    const bool has_p1 = !is0(m_p1);
    const bool has_p2 = !is0(m_p2);

    // The order of the tests matters: each one relies on the earlier ones
    // having failed.
    if (!has_p1 && !has_m1)
    {
        return make_float3(0.0f, 0.0f, 0.0f);           // --1--  zero
    }
    if ((!has_m2 || !has_p2) && has_p1 && has_m1)
    {
        return 0.5f * (m_p1 - m_m1);                    // -111-, 1111-, -1111  central difference,  ε ~ h^2
    }
    if (!has_p1 && !has_m2)
    {
        return m0 - m_m1;                               // -11--  backward difference, ε ~ h^1
    }
    if (!has_m1 && !has_p2)
    {
        return -m0 + m_p1;                              // --11-  forward difference,  ε ~ h^1
    }
    if (has_m2 && !has_p1)
    {
        return 0.5f * m_m2 - 2.0f * m_m1 + 1.5f * m0;   // 111--  backward difference, ε ~ h^2
    }
    if (has_p2 && !has_m1)
    {
        return -0.5f * m_p2 + 2.0f * m_p1 - 1.5f * m0;  // --111  forward difference,  ε ~ h^2
    }
    return (2.0f/3.0f)*(m_p1 - m_m1) + (1.0f/12.0f)*(m_m2 - m_p2); // 11111  central difference,  ε ~ h^4
}

// Sets the emergent magnetic field F_i = (1/8π) ε_{ijk} m · (∂m/∂x_j × ∂m/∂x_k)
// See hopfindex-five-point.go
//
// Fx, Fy, Fz : output field components, one value per cell
// mx, my, mz : magnetization components, one value per cell
// prefactor, icycz, iczcx, icxcy : scale factors multiplied into the x, y and z
//              components respectively (each output is 2 * prefactor * ic.. * m·(..×..));
//              presumably ic.. are inverse cell-size products -- confirm against the Go caller
// Nx, Ny, Nz : grid size; also read by the idx/clamp macros from stencil.h
// PBC        : periodic-boundary flags, read through the PBCx/PBCy/PBCz macros
//
// Cells with m == 0 get F = 0. One thread handles one cell.
extern "C" __global__ void
setemergentmagneticfieldfivepoint(float* __restrict__ Fx, float* __restrict__ Fy, float* __restrict__ Fz,
                                  float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
                                  float prefactor, float icycz, float iczcx, float icxcy,
                                  int Nx, int Ny, int Nz, uint8_t PBC)
{
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    if (ix >= Nx || iy >= Ny || iz >= Nz)
    {
        return;
    }

    int I = idx(ix, iy, iz);                      // central cell index
    float3 m0 = make_float3(mx[I], my[I], mz[I]); // +0

    if (is0(m0))
    {
        Fx[I] = 0.0f;
        Fy[I] = 0.0f;
        Fz[I] = 0.0f;
        return;
    }

    // Neighbors are loaded only if they are inside the grid or the axis is
    // periodic; otherwise they stay zero. The clamp macros keep the index in
    // range either way.

    // ∂m/∂x (along length)
    float3 dmdx = fivepoint_diff(
        fivepoint_load(mx, my, mz, idx(lclampx(ix-2), iy, iz), ix-2 >= 0 || PBCx), // -2
        fivepoint_load(mx, my, mz, idx(lclampx(ix-1), iy, iz), ix-1 >= 0 || PBCx), // -1
        m0,                                                                        // +0
        fivepoint_load(mx, my, mz, idx(hclampx(ix+1), iy, iz), ix+1 < Nx || PBCx), // +1
        fivepoint_load(mx, my, mz, idx(hclampx(ix+2), iy, iz), ix+2 < Nx || PBCx)  // +2
    );

    // ∂m/∂y (along height)
    float3 dmdy = fivepoint_diff(
        fivepoint_load(mx, my, mz, idx(ix, lclampy(iy-2), iz), iy-2 >= 0 || PBCy),
        fivepoint_load(mx, my, mz, idx(ix, lclampy(iy-1), iz), iy-1 >= 0 || PBCy),
        m0,
        fivepoint_load(mx, my, mz, idx(ix, hclampy(iy+1), iz), iy+1 < Ny || PBCy),
        fivepoint_load(mx, my, mz, idx(ix, hclampy(iy+2), iz), iy+2 < Ny || PBCy)
    );

    // ∂m/∂z (along depth)
    float3 dmdz = fivepoint_diff(
        fivepoint_load(mx, my, mz, idx(ix, iy, lclampz(iz-2)), iz-2 >= 0 || PBCz),
        fivepoint_load(mx, my, mz, idx(ix, iy, lclampz(iz-1)), iz-1 >= 0 || PBCz),
        m0,
        fivepoint_load(mx, my, mz, idx(ix, iy, hclampz(iz+1)), iz+1 < Nz || PBCz),
        fivepoint_load(mx, my, mz, idx(ix, iy, hclampz(iz+2)), iz+2 < Nz || PBCz)
    );

    float3 dmdy_x_dmdz = cross(dmdy, dmdz); // ∂m/∂y × ∂m/∂z
    float3 dmdz_x_dmdx = cross(dmdz, dmdx); // ∂m/∂z × ∂m/∂x
    float3 dmdx_x_dmdy = cross(dmdx, dmdy); // ∂m/∂x × ∂m/∂y

    Fx[I] = 2 * prefactor * icycz * dot(m0, dmdy_x_dmdz);
    Fy[I] = 2 * prefactor * iczcx * dot(m0, dmdz_x_dmdx);
    Fz[I] = 2 * prefactor * icxcy * dot(m0, dmdx_x_dmdy);
}

3-3.11.1/cuda/hopf-emergentmagneticfieldfivepoint_wrapper.go000066400000000000000000012503411503346766200241640ustar00rootroot00000000000000
package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for setemergentmagneticfieldfivepoint kernel
var setemergentmagneticfieldfivepoint_code cu.Function

// Stores the arguments for setemergentmagneticfieldfivepoint kernel invocation
type setemergentmagneticfieldfivepoint_args_t struct {
	arg_Fx        unsafe.Pointer
	arg_Fy        unsafe.Pointer
	arg_Fz        unsafe.Pointer
	arg_mx        unsafe.Pointer
	arg_my        unsafe.Pointer
	arg_mz        unsafe.Pointer
	arg_prefactor float32
	arg_icycz     float32
	arg_iczcx     float32
	arg_icxcy     float32
	arg_Nx        int
	arg_Ny        int
	arg_Nz        int
	arg_PBC       byte
	argptr        [14]unsafe.Pointer // argptr[i] points at the i-th arg_ field above; filled in by init
	sync.Mutex                       // serializes access to the single shared argument buffer
}

// The one shared argument buffer for setemergentmagneticfieldfivepoint kernel invocations
var setemergentmagneticfieldfivepoint_args setemergentmagneticfieldfivepoint_args_t

func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	setemergentmagneticfieldfivepoint_args.argptr[0] = unsafe.Pointer(&setemergentmagneticfieldfivepoint_args.arg_Fx)
	setemergentmagneticfieldfivepoint_args.argptr[1] = unsafe.Pointer(&setemergentmagneticfieldfivepoint_args.arg_Fy)
	setemergentmagneticfieldfivepoint_args.argptr[2] = unsafe.Pointer(&setemergentmagneticfieldfivepoint_args.arg_Fz)
	setemergentmagneticfieldfivepoint_args.argptr[3] = unsafe.Pointer(&setemergentmagneticfieldfivepoint_args.arg_mx)
	setemergentmagneticfieldfivepoint_args.argptr[4] = unsafe.Pointer(&setemergentmagneticfieldfivepoint_args.arg_my)
	setemergentmagneticfieldfivepoint_args.argptr[5] = unsafe.Pointer(&setemergentmagneticfieldfivepoint_args.arg_mz)
	setemergentmagneticfieldfivepoint_args.argptr[6] = unsafe.Pointer(&setemergentmagneticfieldfivepoint_args.arg_prefactor)
	setemergentmagneticfieldfivepoint_args.argptr[7] = unsafe.Pointer(&setemergentmagneticfieldfivepoint_args.arg_icycz)
	setemergentmagneticfieldfivepoint_args.argptr[8] = unsafe.Pointer(&setemergentmagneticfieldfivepoint_args.arg_iczcx)
	setemergentmagneticfieldfivepoint_args.argptr[9] = unsafe.Pointer(&setemergentmagneticfieldfivepoint_args.arg_icxcy)
	setemergentmagneticfieldfivepoint_args.argptr[10] = unsafe.Pointer(&setemergentmagneticfieldfivepoint_args.arg_Nx)
	setemergentmagneticfieldfivepoint_args.argptr[11] = unsafe.Pointer(&setemergentmagneticfieldfivepoint_args.arg_Ny)
	setemergentmagneticfieldfivepoint_args.argptr[12] = unsafe.Pointer(&setemergentmagneticfieldfivepoint_args.arg_Nz)
	setemergentmagneticfieldfivepoint_args.argptr[13] = unsafe.Pointer(&setemergentmagneticfieldfivepoint_args.arg_PBC)
}

// Wrapper for setemergentmagneticfieldfivepoint CUDA kernel, asynchronous.
func k_setemergentmagneticfieldfivepoint_async(Fx unsafe.Pointer, Fy unsafe.Pointer, Fz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, prefactor float32, icycz float32, iczcx float32, icxcy float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("setemergentmagneticfieldfivepoint") } setemergentmagneticfieldfivepoint_args.Lock() defer setemergentmagneticfieldfivepoint_args.Unlock() if setemergentmagneticfieldfivepoint_code == 0 { setemergentmagneticfieldfivepoint_code = fatbinLoad(setemergentmagneticfieldfivepoint_map, "setemergentmagneticfieldfivepoint") } setemergentmagneticfieldfivepoint_args.arg_Fx = Fx setemergentmagneticfieldfivepoint_args.arg_Fy = Fy setemergentmagneticfieldfivepoint_args.arg_Fz = Fz setemergentmagneticfieldfivepoint_args.arg_mx = mx setemergentmagneticfieldfivepoint_args.arg_my = my setemergentmagneticfieldfivepoint_args.arg_mz = mz setemergentmagneticfieldfivepoint_args.arg_prefactor = prefactor setemergentmagneticfieldfivepoint_args.arg_icycz = icycz setemergentmagneticfieldfivepoint_args.arg_iczcx = iczcx setemergentmagneticfieldfivepoint_args.arg_icxcy = icxcy setemergentmagneticfieldfivepoint_args.arg_Nx = Nx setemergentmagneticfieldfivepoint_args.arg_Ny = Ny setemergentmagneticfieldfivepoint_args.arg_Nz = Nz setemergentmagneticfieldfivepoint_args.arg_PBC = PBC args := setemergentmagneticfieldfivepoint_args.argptr[:] cu.LaunchKernel(setemergentmagneticfieldfivepoint_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("setemergentmagneticfieldfivepoint") } } // maps compute capability on PTX code for setemergentmagneticfieldfivepoint kernel. 
var setemergentmagneticfieldfivepoint_map = map[int]string{0: "", 50: setemergentmagneticfieldfivepoint_ptx_50, 52: setemergentmagneticfieldfivepoint_ptx_52, 53: setemergentmagneticfieldfivepoint_ptx_53, 60: setemergentmagneticfieldfivepoint_ptx_60, 61: setemergentmagneticfieldfivepoint_ptx_61, 62: setemergentmagneticfieldfivepoint_ptx_62, 70: setemergentmagneticfieldfivepoint_ptx_70, 72: setemergentmagneticfieldfivepoint_ptx_72, 75: setemergentmagneticfieldfivepoint_ptx_75, 80: setemergentmagneticfieldfivepoint_ptx_80, 86: setemergentmagneticfieldfivepoint_ptx_86, 87: setemergentmagneticfieldfivepoint_ptx_87, 89: setemergentmagneticfieldfivepoint_ptx_89, 90: setemergentmagneticfieldfivepoint_ptx_90} // setemergentmagneticfieldfivepoint PTX code for various compute capabilities. const ( setemergentmagneticfieldfivepoint_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl setemergentmagneticfieldfivepoint .visible .entry setemergentmagneticfieldfivepoint( .param .u64 setemergentmagneticfieldfivepoint_param_0, .param .u64 setemergentmagneticfieldfivepoint_param_1, .param .u64 setemergentmagneticfieldfivepoint_param_2, .param .u64 setemergentmagneticfieldfivepoint_param_3, .param .u64 setemergentmagneticfieldfivepoint_param_4, .param .u64 setemergentmagneticfieldfivepoint_param_5, .param .f32 setemergentmagneticfieldfivepoint_param_6, .param .f32 setemergentmagneticfieldfivepoint_param_7, .param .f32 setemergentmagneticfieldfivepoint_param_8, .param .f32 setemergentmagneticfieldfivepoint_param_9, .param .u32 setemergentmagneticfieldfivepoint_param_10, .param .u32 setemergentmagneticfieldfivepoint_param_11, .param .u32 setemergentmagneticfieldfivepoint_param_12, .param .u8 setemergentmagneticfieldfivepoint_param_13 ) { .reg .pred %p<115>; .reg .b16 %rs<5>; .reg .f32 %f<495>; .reg .b32 %r<131>; .reg .b64 %rd<68>; ld.param.u8 %rs4, [setemergentmagneticfieldfivepoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldfivepoint_param_0]; ld.param.u64 %rd8, 
[setemergentmagneticfieldfivepoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldfivepoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldfivepoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldfivepoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldfivepoint_param_5]; ld.param.f32 %f205, [setemergentmagneticfieldfivepoint_param_6]; ld.param.f32 %f206, [setemergentmagneticfieldfivepoint_param_7]; ld.param.f32 %f207, [setemergentmagneticfieldfivepoint_param_8]; ld.param.f32 %f208, [setemergentmagneticfieldfivepoint_param_9]; ld.param.u32 %r57, [setemergentmagneticfieldfivepoint_param_10]; ld.param.u32 %r58, [setemergentmagneticfieldfivepoint_param_11]; ld.param.u32 %r59, [setemergentmagneticfieldfivepoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_106; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f209, %f2, %f2; fma.rn.f32 %f210, %f1, %f1, %f209; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f211, %f3, %f3, %f210; setp.eq.f32 %p6, %f211, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_105; bra.uni $L__BB0_2; $L__BB0_105: mov.u32 
%r118, 0; st.global.u32 [%rd4], %r118; st.global.u32 [%rd5], %r118; st.global.u32 [%rd6], %r118; bra.uni $L__BB0_106; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r119, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r119, %r72, %r57; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r73, %r119, %r5; mul.wide.s32 %rd20, %r73, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f9, [%rd23]; ld.global.nc.f32 %f8, [%rd22]; ld.global.nc.f32 %f7, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r120, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r120, %r75, %r57; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r76, %r120, %r5; mul.wide.s32 %rd24, %r76, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f38, [%rd27]; ld.global.nc.f32 %f39, [%rd26]; ld.global.nc.f32 %f40, [%rd25]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r79, %r57, -1; min.s32 %r121, %r14, %r79; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r121, %r78, %r57; $L__BB0_15: setp.ge.s32 %p16, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r80, %r121, %r5; mul.wide.s32 %rd28, %r80, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f21, [%rd31]; ld.global.nc.f32 %f20, [%rd30]; ld.global.nc.f32 %f19, 
[%rd29]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r83, %r57, -1; min.s32 %r122, %r18, %r83; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r122, %r82, %r57; $L__BB0_20: add.s32 %r22, %r122, %r5; setp.ge.s32 %p20, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd32, %r22, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f27, [%rd35]; ld.global.nc.f32 %f26, [%rd34]; ld.global.nc.f32 %f25, [%rd33]; $L__BB0_22: mul.f32 %f224, %f20, %f20; fma.rn.f32 %f225, %f19, %f19, %f224; fma.rn.f32 %f31, %f21, %f21, %f225; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f229, %f39, %f39; fma.rn.f32 %f230, %f40, %f40, %f229; fma.rn.f32 %f231, %f38, %f38, %f230; setp.eq.f32 %p24, %f231, 0f00000000; mov.f32 %f456, 0f00000000; mov.f32 %f457, %f456; mov.f32 %f458, %f456; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f232, %f8, %f8; fma.rn.f32 %f233, %f7, %f7, %f232; fma.rn.f32 %f44, %f9, %f9, %f233; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f234, %f26, %f26; fma.rn.f32 %f235, %f25, %f25, %f234; fma.rn.f32 %f48, %f27, %f27, %f235; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f236, %f39, %f39; fma.rn.f32 %f237, %f40, %f40, %f236; fma.rn.f32 %f238, %f38, %f38, %f237; setp.neu.f32 %p30, %f238, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f271, %f19, %f40; sub.f32 %f272, %f20, %f39; sub.f32 %f273, %f21, %f38; mul.f32 %f458, %f273, 0f3F000000; mul.f32 %f457, %f272, 0f3F000000; mul.f32 %f456, %f271, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f458, %f3, %f38; sub.f32 
%f457, %f2, %f39; sub.f32 %f456, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f239, %f39, %f39; fma.rn.f32 %f240, %f40, %f40, %f239; fma.rn.f32 %f49, %f38, %f38, %f240; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f458, %f21, %f3; sub.f32 %f457, %f20, %f2; sub.f32 %f456, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f262, %f19, %f40; sub.f32 %f263, %f20, %f39; sub.f32 %f264, %f21, %f38; sub.f32 %f265, %f7, %f25; mul.f32 %f266, %f265, 0f3DAAAAAB; sub.f32 %f267, %f8, %f26; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f9, %f27; mul.f32 %f270, %f269, 0f3DAAAAAB; fma.rn.f32 %f458, %f264, 0f3F2AAAAB, %f270; fma.rn.f32 %f457, %f263, 0f3F2AAAAB, %f268; fma.rn.f32 %f456, %f262, 0f3F2AAAAB, %f266; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f241, %f7, 0f3F000000; add.f32 %f242, %f40, %f40; sub.f32 %f243, %f241, %f242; add.f32 %f244, %f39, %f39; mul.f32 %f245, %f8, 0f3F000000; sub.f32 %f246, %f245, %f244; add.f32 %f247, %f38, %f38; mul.f32 %f248, %f9, 0f3F000000; sub.f32 %f249, %f248, %f247; fma.rn.f32 %f458, %f3, 0f3FC00000, %f249; fma.rn.f32 %f457, %f2, 0f3FC00000, %f246; fma.rn.f32 %f456, %f1, 0f3FC00000, %f243; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f250, %f25, 0f3F000000; add.f32 %f251, %f19, %f19; sub.f32 %f252, %f251, %f250; add.f32 %f253, %f20, %f20; mul.f32 %f254, %f26, 0f3F000000; sub.f32 %f255, %f253, %f254; add.f32 %f256, %f21, %f21; mul.f32 %f257, %f27, 0f3F000000; sub.f32 %f258, %f256, %f257; mul.f32 %f259, %f1, 0f3FC00000; mul.f32 %f260, %f2, 0f3FC00000; mul.f32 %f261, %f3, 0f3FC00000; sub.f32 %f458, %f258, %f261; sub.f32 %f457, %f255, %f260; sub.f32 %f456, %f252, %f259; $L__BB0_36: and.b16 %rs2, 
%rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r123, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r123, %r85, %r58; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r86, %r123, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd36, %r87, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f76, [%rd39]; ld.global.nc.f32 %f75, [%rd38]; ld.global.nc.f32 %f74, [%rd37]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r124, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r124, %r89, %r58; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r90, %r124, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd40, %r91, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f105, [%rd43]; ld.global.nc.f32 %f106, [%rd42]; ld.global.nc.f32 %f107, [%rd41]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r94, %r58, -1; min.s32 %r125, %r31, %r94; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r125, %r93, %r58; $L__BB0_49: setp.ge.s32 %p52, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r95, %r125, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd44, %r96, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f88, [%rd47]; ld.global.nc.f32 %f87, [%rd46]; ld.global.nc.f32 %f86, [%rd45]; 
$L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r99, %r58, -1; min.s32 %r126, %r35, %r99; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r126, %r98, %r58; $L__BB0_54: add.s32 %r39, %r126, %r4; setp.ge.s32 %p56, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f94, [%rd51]; ld.global.nc.f32 %f93, [%rd50]; ld.global.nc.f32 %f92, [%rd49]; $L__BB0_56: mul.f32 %f286, %f87, %f87; fma.rn.f32 %f287, %f86, %f86, %f286; fma.rn.f32 %f98, %f88, %f88, %f287; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f291, %f106, %f106; fma.rn.f32 %f292, %f107, %f107, %f291; fma.rn.f32 %f293, %f105, %f105, %f292; setp.eq.f32 %p60, %f293, 0f00000000; mov.f32 %f474, 0f00000000; mov.f32 %f475, %f474; mov.f32 %f476, %f474; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f294, %f75, %f75; fma.rn.f32 %f295, %f74, %f74, %f294; fma.rn.f32 %f111, %f76, %f76, %f295; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f296, %f93, %f93; fma.rn.f32 %f297, %f92, %f92, %f296; fma.rn.f32 %f115, %f94, %f94, %f297; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f298, %f106, %f106; fma.rn.f32 %f299, %f107, %f107, %f298; fma.rn.f32 %f300, %f105, %f105, %f299; setp.neu.f32 %p66, %f300, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f333, %f86, %f107; sub.f32 %f334, %f87, %f106; sub.f32 %f335, %f88, %f105; mul.f32 %f476, %f335, 0f3F000000; mul.f32 %f475, %f334, 0f3F000000; mul.f32 %f474, %f333, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni 
$L__BB0_61; $L__BB0_68: sub.f32 %f476, %f3, %f105; sub.f32 %f475, %f2, %f106; sub.f32 %f474, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f301, %f106, %f106; fma.rn.f32 %f302, %f107, %f107, %f301; fma.rn.f32 %f116, %f105, %f105, %f302; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f476, %f88, %f3; sub.f32 %f475, %f87, %f2; sub.f32 %f474, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f324, %f86, %f107; sub.f32 %f325, %f87, %f106; sub.f32 %f326, %f88, %f105; sub.f32 %f327, %f74, %f92; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f75, %f93; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f76, %f94; mul.f32 %f332, %f331, 0f3DAAAAAB; fma.rn.f32 %f476, %f326, 0f3F2AAAAB, %f332; fma.rn.f32 %f475, %f325, 0f3F2AAAAB, %f330; fma.rn.f32 %f474, %f324, 0f3F2AAAAB, %f328; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f303, %f74, 0f3F000000; add.f32 %f304, %f107, %f107; sub.f32 %f305, %f303, %f304; add.f32 %f306, %f106, %f106; mul.f32 %f307, %f75, 0f3F000000; sub.f32 %f308, %f307, %f306; add.f32 %f309, %f105, %f105; mul.f32 %f310, %f76, 0f3F000000; sub.f32 %f311, %f310, %f309; fma.rn.f32 %f476, %f3, 0f3FC00000, %f311; fma.rn.f32 %f475, %f2, 0f3FC00000, %f308; fma.rn.f32 %f474, %f1, 0f3FC00000, %f305; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f312, %f92, 0f3F000000; add.f32 %f313, %f86, %f86; sub.f32 %f314, %f313, %f312; add.f32 %f315, %f87, %f87; mul.f32 %f316, %f93, 0f3F000000; sub.f32 %f317, %f315, %f316; add.f32 %f318, %f88, %f88; mul.f32 %f319, %f94, 0f3F000000; sub.f32 %f320, %f318, %f319; mul.f32 %f321, %f1, 0f3FC00000; mul.f32 %f322, %f2, 0f3FC00000; mul.f32 %f323, %f3, 0f3FC00000; sub.f32 %f476, %f320, %f323; 
sub.f32 %f475, %f317, %f322; sub.f32 %f474, %f314, %f321; $L__BB0_70: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p79, %rs3, 0; add.s32 %r40, %r3, -2; @%p79 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: max.s32 %r127, %r40, 0; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r127, %r102, %r59; $L__BB0_73: setp.lt.s32 %p81, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p82, %p81, %p79; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p82 bra $L__BB0_75; mad.lo.s32 %r103, %r127, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd52, %r104, 4; add.s64 %rd53, %rd3, %rd52; add.s64 %rd54, %rd2, %rd52; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f143, [%rd55]; ld.global.nc.f32 %f142, [%rd54]; ld.global.nc.f32 %f141, [%rd53]; $L__BB0_75: add.s32 %r44, %r3, -1; @%p79 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r128, %r44, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r128, %r106, %r59; $L__BB0_78: setp.lt.s32 %p84, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p86, %p84, %p79; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p86 bra $L__BB0_80; mad.lo.s32 %r107, %r128, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd3, %rd56; add.s64 %rd58, %rd2, %rd56; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f172, [%rd59]; ld.global.nc.f32 %f173, [%rd58]; ld.global.nc.f32 %f174, [%rd57]; $L__BB0_80: add.s32 %r48, %r3, 1; @%p79 bra $L__BB0_82; bra.uni $L__BB0_81; $L__BB0_82: add.s32 %r111, %r59, -1; min.s32 %r129, %r48, %r111; bra.uni $L__BB0_83; $L__BB0_81: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r129, %r110, %r59; $L__BB0_83: setp.ge.s32 %p88, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p90, %p88, %p79; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p90 bra $L__BB0_85; mad.lo.s32 %r112, %r129, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd60, %r113, 4; add.s64 %rd61, %rd3, %rd60; 
add.s64 %rd62, %rd2, %rd60; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f155, [%rd63]; ld.global.nc.f32 %f154, [%rd62]; ld.global.nc.f32 %f153, [%rd61]; $L__BB0_85: add.s32 %r52, %r3, 2; @%p79 bra $L__BB0_87; bra.uni $L__BB0_86; $L__BB0_87: add.s32 %r116, %r59, -1; min.s32 %r130, %r52, %r116; bra.uni $L__BB0_88; $L__BB0_86: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r130, %r115, %r59; $L__BB0_88: mad.lo.s32 %r117, %r130, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p92, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p94, %p92, %p79; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p94 bra $L__BB0_90; mul.wide.s32 %rd64, %r56, 4; add.s64 %rd65, %rd3, %rd64; add.s64 %rd66, %rd2, %rd64; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f161, [%rd67]; ld.global.nc.f32 %f160, [%rd66]; ld.global.nc.f32 %f159, [%rd65]; $L__BB0_90: mul.f32 %f348, %f154, %f154; fma.rn.f32 %f349, %f153, %f153, %f348; fma.rn.f32 %f165, %f155, %f155, %f349; setp.eq.f32 %p95, %f165, 0f00000000; @%p95 bra $L__BB0_91; bra.uni $L__BB0_92; $L__BB0_91: mul.f32 %f353, %f173, %f173; fma.rn.f32 %f354, %f174, %f174, %f353; fma.rn.f32 %f355, %f172, %f172, %f354; setp.eq.f32 %p96, %f355, 0f00000000; mov.f32 %f492, 0f00000000; mov.f32 %f493, %f492; mov.f32 %f494, %f492; @%p96 bra $L__BB0_104; $L__BB0_92: mul.f32 %f356, %f142, %f142; fma.rn.f32 %f357, %f141, %f141, %f356; fma.rn.f32 %f178, %f143, %f143, %f357; setp.neu.f32 %p97, %f178, 0f00000000; mul.f32 %f358, %f160, %f160; fma.rn.f32 %f359, %f159, %f159, %f358; fma.rn.f32 %f182, %f161, %f161, %f359; setp.neu.f32 %p98, %f182, 0f00000000; and.pred %p99, %p97, %p98; or.pred %p101, %p95, %p99; @%p101 bra $L__BB0_94; mul.f32 %f360, %f173, %f173; fma.rn.f32 %f361, %f174, %f174, %f360; fma.rn.f32 %f362, %f172, %f172, %f361; setp.neu.f32 %p102, %f362, 0f00000000; @%p102 bra $L__BB0_103; bra.uni $L__BB0_94; $L__BB0_103: sub.f32 %f395, %f153, %f174; sub.f32 %f396, %f154, %f173; sub.f32 %f397, %f155, %f172; mul.f32 %f494, %f397, 
0f3F000000; mul.f32 %f493, %f396, 0f3F000000; mul.f32 %f492, %f395, 0f3F000000; bra.uni $L__BB0_104; $L__BB0_94: setp.eq.f32 %p103, %f178, 0f00000000; and.pred %p105, %p103, %p95; @%p105 bra $L__BB0_102; bra.uni $L__BB0_95; $L__BB0_102: sub.f32 %f494, %f3, %f172; sub.f32 %f493, %f2, %f173; sub.f32 %f492, %f1, %f174; bra.uni $L__BB0_104; $L__BB0_95: setp.eq.f32 %p106, %f182, 0f00000000; mul.f32 %f363, %f173, %f173; fma.rn.f32 %f364, %f174, %f174, %f363; fma.rn.f32 %f183, %f172, %f172, %f364; setp.eq.f32 %p107, %f183, 0f00000000; and.pred %p108, %p107, %p106; @%p108 bra $L__BB0_101; bra.uni $L__BB0_96; $L__BB0_101: sub.f32 %f494, %f155, %f3; sub.f32 %f493, %f154, %f2; sub.f32 %f492, %f153, %f1; bra.uni $L__BB0_104; $L__BB0_96: setp.neu.f32 %p110, %f165, 0f00000000; or.pred %p111, %p103, %p110; @%p111 bra $L__BB0_98; bra.uni $L__BB0_97; $L__BB0_98: setp.neu.f32 %p112, %f183, 0f00000000; or.pred %p114, %p106, %p112; @%p114 bra $L__BB0_100; bra.uni $L__BB0_99; $L__BB0_100: sub.f32 %f386, %f153, %f174; sub.f32 %f387, %f154, %f173; sub.f32 %f388, %f155, %f172; sub.f32 %f389, %f141, %f159; mul.f32 %f390, %f389, 0f3DAAAAAB; sub.f32 %f391, %f142, %f160; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f143, %f161; mul.f32 %f394, %f393, 0f3DAAAAAB; fma.rn.f32 %f494, %f388, 0f3F2AAAAB, %f394; fma.rn.f32 %f493, %f387, 0f3F2AAAAB, %f392; fma.rn.f32 %f492, %f386, 0f3F2AAAAB, %f390; bra.uni $L__BB0_104; $L__BB0_97: mul.f32 %f365, %f141, 0f3F000000; add.f32 %f366, %f174, %f174; sub.f32 %f367, %f365, %f366; add.f32 %f368, %f173, %f173; mul.f32 %f369, %f142, 0f3F000000; sub.f32 %f370, %f369, %f368; add.f32 %f371, %f172, %f172; mul.f32 %f372, %f143, 0f3F000000; sub.f32 %f373, %f372, %f371; fma.rn.f32 %f494, %f3, 0f3FC00000, %f373; fma.rn.f32 %f493, %f2, 0f3FC00000, %f370; fma.rn.f32 %f492, %f1, 0f3FC00000, %f367; bra.uni $L__BB0_104; $L__BB0_99: mul.f32 %f374, %f159, 0f3F000000; add.f32 %f375, %f153, %f153; sub.f32 %f376, %f375, %f374; add.f32 %f377, %f154, %f154; mul.f32 %f378, 
%f160, 0f3F000000; sub.f32 %f379, %f377, %f378; add.f32 %f380, %f155, %f155; mul.f32 %f381, %f161, 0f3F000000; sub.f32 %f382, %f380, %f381; mul.f32 %f383, %f1, 0f3FC00000; mul.f32 %f384, %f2, 0f3FC00000; mul.f32 %f385, %f3, 0f3FC00000; sub.f32 %f494, %f382, %f385; sub.f32 %f493, %f379, %f384; sub.f32 %f492, %f376, %f383; $L__BB0_104: mul.f32 %f398, %f475, %f494; mul.f32 %f399, %f476, %f493; sub.f32 %f400, %f398, %f399; mul.f32 %f401, %f476, %f492; mul.f32 %f402, %f474, %f494; sub.f32 %f403, %f401, %f402; mul.f32 %f404, %f474, %f493; mul.f32 %f405, %f475, %f492; sub.f32 %f406, %f404, %f405; mul.f32 %f407, %f458, %f493; mul.f32 %f408, %f457, %f494; sub.f32 %f409, %f407, %f408; mul.f32 %f410, %f456, %f494; mul.f32 %f411, %f458, %f492; sub.f32 %f412, %f410, %f411; mul.f32 %f413, %f457, %f492; mul.f32 %f414, %f456, %f493; sub.f32 %f415, %f413, %f414; mul.f32 %f416, %f457, %f476; mul.f32 %f417, %f458, %f475; sub.f32 %f418, %f416, %f417; mul.f32 %f419, %f458, %f474; mul.f32 %f420, %f456, %f476; sub.f32 %f421, %f419, %f420; mul.f32 %f422, %f456, %f475; mul.f32 %f423, %f457, %f474; sub.f32 %f424, %f422, %f423; add.f32 %f425, %f205, %f205; mul.f32 %f426, %f425, %f206; mul.f32 %f427, %f2, %f403; fma.rn.f32 %f428, %f1, %f400, %f427; fma.rn.f32 %f429, %f3, %f406, %f428; mul.f32 %f430, %f426, %f429; st.global.f32 [%rd4], %f430; mul.f32 %f431, %f425, %f207; mul.f32 %f432, %f2, %f412; fma.rn.f32 %f433, %f1, %f409, %f432; fma.rn.f32 %f434, %f3, %f415, %f433; mul.f32 %f435, %f431, %f434; st.global.f32 [%rd5], %f435; mul.f32 %f436, %f425, %f208; mul.f32 %f437, %f2, %f421; fma.rn.f32 %f438, %f1, %f418, %f437; fma.rn.f32 %f439, %f3, %f424, %f438; mul.f32 %f440, %f436, %f439; st.global.f32 [%rd6], %f440; $L__BB0_106: ret; } ` setemergentmagneticfieldfivepoint_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl setemergentmagneticfieldfivepoint .visible .entry setemergentmagneticfieldfivepoint( .param .u64 setemergentmagneticfieldfivepoint_param_0, .param .u64 
setemergentmagneticfieldfivepoint_param_1, .param .u64 setemergentmagneticfieldfivepoint_param_2, .param .u64 setemergentmagneticfieldfivepoint_param_3, .param .u64 setemergentmagneticfieldfivepoint_param_4, .param .u64 setemergentmagneticfieldfivepoint_param_5, .param .f32 setemergentmagneticfieldfivepoint_param_6, .param .f32 setemergentmagneticfieldfivepoint_param_7, .param .f32 setemergentmagneticfieldfivepoint_param_8, .param .f32 setemergentmagneticfieldfivepoint_param_9, .param .u32 setemergentmagneticfieldfivepoint_param_10, .param .u32 setemergentmagneticfieldfivepoint_param_11, .param .u32 setemergentmagneticfieldfivepoint_param_12, .param .u8 setemergentmagneticfieldfivepoint_param_13 ) { .reg .pred %p<115>; .reg .b16 %rs<5>; .reg .f32 %f<495>; .reg .b32 %r<131>; .reg .b64 %rd<68>; ld.param.u8 %rs4, [setemergentmagneticfieldfivepoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldfivepoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldfivepoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldfivepoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldfivepoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldfivepoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldfivepoint_param_5]; ld.param.f32 %f205, [setemergentmagneticfieldfivepoint_param_6]; ld.param.f32 %f206, [setemergentmagneticfieldfivepoint_param_7]; ld.param.f32 %f207, [setemergentmagneticfieldfivepoint_param_8]; ld.param.f32 %f208, [setemergentmagneticfieldfivepoint_param_9]; ld.param.u32 %r57, [setemergentmagneticfieldfivepoint_param_10]; ld.param.u32 %r58, [setemergentmagneticfieldfivepoint_param_11]; ld.param.u32 %r59, [setemergentmagneticfieldfivepoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, 
%r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_106; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f209, %f2, %f2; fma.rn.f32 %f210, %f1, %f1, %f209; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f211, %f3, %f3, %f210; setp.eq.f32 %p6, %f211, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_105; bra.uni $L__BB0_2; $L__BB0_105: mov.u32 %r118, 0; st.global.u32 [%rd4], %r118; st.global.u32 [%rd5], %r118; st.global.u32 [%rd6], %r118; bra.uni $L__BB0_106; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r119, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r119, %r72, %r57; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r73, %r119, %r5; mul.wide.s32 %rd20, %r73, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f9, [%rd23]; ld.global.nc.f32 %f8, [%rd22]; ld.global.nc.f32 %f7, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r120, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r120, %r75, %r57; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred 
%p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r76, %r120, %r5; mul.wide.s32 %rd24, %r76, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f38, [%rd27]; ld.global.nc.f32 %f39, [%rd26]; ld.global.nc.f32 %f40, [%rd25]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r79, %r57, -1; min.s32 %r121, %r14, %r79; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r121, %r78, %r57; $L__BB0_15: setp.ge.s32 %p16, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r80, %r121, %r5; mul.wide.s32 %rd28, %r80, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f21, [%rd31]; ld.global.nc.f32 %f20, [%rd30]; ld.global.nc.f32 %f19, [%rd29]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r83, %r57, -1; min.s32 %r122, %r18, %r83; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r122, %r82, %r57; $L__BB0_20: add.s32 %r22, %r122, %r5; setp.ge.s32 %p20, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd32, %r22, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f27, [%rd35]; ld.global.nc.f32 %f26, [%rd34]; ld.global.nc.f32 %f25, [%rd33]; $L__BB0_22: mul.f32 %f224, %f20, %f20; fma.rn.f32 %f225, %f19, %f19, %f224; fma.rn.f32 %f31, %f21, %f21, %f225; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f229, %f39, %f39; fma.rn.f32 %f230, %f40, %f40, %f229; fma.rn.f32 %f231, %f38, %f38, %f230; setp.eq.f32 %p24, %f231, 0f00000000; mov.f32 %f456, 0f00000000; mov.f32 %f457, %f456; mov.f32 %f458, 
%f456; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f232, %f8, %f8; fma.rn.f32 %f233, %f7, %f7, %f232; fma.rn.f32 %f44, %f9, %f9, %f233; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f234, %f26, %f26; fma.rn.f32 %f235, %f25, %f25, %f234; fma.rn.f32 %f48, %f27, %f27, %f235; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f236, %f39, %f39; fma.rn.f32 %f237, %f40, %f40, %f236; fma.rn.f32 %f238, %f38, %f38, %f237; setp.neu.f32 %p30, %f238, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f271, %f19, %f40; sub.f32 %f272, %f20, %f39; sub.f32 %f273, %f21, %f38; mul.f32 %f458, %f273, 0f3F000000; mul.f32 %f457, %f272, 0f3F000000; mul.f32 %f456, %f271, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f458, %f3, %f38; sub.f32 %f457, %f2, %f39; sub.f32 %f456, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f239, %f39, %f39; fma.rn.f32 %f240, %f40, %f40, %f239; fma.rn.f32 %f49, %f38, %f38, %f240; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f458, %f21, %f3; sub.f32 %f457, %f20, %f2; sub.f32 %f456, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f262, %f19, %f40; sub.f32 %f263, %f20, %f39; sub.f32 %f264, %f21, %f38; sub.f32 %f265, %f7, %f25; mul.f32 %f266, %f265, 0f3DAAAAAB; sub.f32 %f267, %f8, %f26; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f9, %f27; mul.f32 %f270, %f269, 0f3DAAAAAB; fma.rn.f32 %f458, %f264, 0f3F2AAAAB, %f270; fma.rn.f32 %f457, %f263, 0f3F2AAAAB, %f268; fma.rn.f32 %f456, %f262, 
0f3F2AAAAB, %f266; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f241, %f7, 0f3F000000; add.f32 %f242, %f40, %f40; sub.f32 %f243, %f241, %f242; add.f32 %f244, %f39, %f39; mul.f32 %f245, %f8, 0f3F000000; sub.f32 %f246, %f245, %f244; add.f32 %f247, %f38, %f38; mul.f32 %f248, %f9, 0f3F000000; sub.f32 %f249, %f248, %f247; fma.rn.f32 %f458, %f3, 0f3FC00000, %f249; fma.rn.f32 %f457, %f2, 0f3FC00000, %f246; fma.rn.f32 %f456, %f1, 0f3FC00000, %f243; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f250, %f25, 0f3F000000; add.f32 %f251, %f19, %f19; sub.f32 %f252, %f251, %f250; add.f32 %f253, %f20, %f20; mul.f32 %f254, %f26, 0f3F000000; sub.f32 %f255, %f253, %f254; add.f32 %f256, %f21, %f21; mul.f32 %f257, %f27, 0f3F000000; sub.f32 %f258, %f256, %f257; mul.f32 %f259, %f1, 0f3FC00000; mul.f32 %f260, %f2, 0f3FC00000; mul.f32 %f261, %f3, 0f3FC00000; sub.f32 %f458, %f258, %f261; sub.f32 %f457, %f255, %f260; sub.f32 %f456, %f252, %f259; $L__BB0_36: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r123, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r123, %r85, %r58; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r86, %r123, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd36, %r87, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f76, [%rd39]; ld.global.nc.f32 %f75, [%rd38]; ld.global.nc.f32 %f74, [%rd37]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r124, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r124, %r89, %r58; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra 
$L__BB0_46; add.s32 %r90, %r124, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd40, %r91, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f105, [%rd43]; ld.global.nc.f32 %f106, [%rd42]; ld.global.nc.f32 %f107, [%rd41]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r94, %r58, -1; min.s32 %r125, %r31, %r94; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r125, %r93, %r58; $L__BB0_49: setp.ge.s32 %p52, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r95, %r125, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd44, %r96, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f88, [%rd47]; ld.global.nc.f32 %f87, [%rd46]; ld.global.nc.f32 %f86, [%rd45]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r99, %r58, -1; min.s32 %r126, %r35, %r99; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r126, %r98, %r58; $L__BB0_54: add.s32 %r39, %r126, %r4; setp.ge.s32 %p56, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f94, [%rd51]; ld.global.nc.f32 %f93, [%rd50]; ld.global.nc.f32 %f92, [%rd49]; $L__BB0_56: mul.f32 %f286, %f87, %f87; fma.rn.f32 %f287, %f86, %f86, %f286; fma.rn.f32 %f98, %f88, %f88, %f287; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f291, %f106, %f106; fma.rn.f32 %f292, %f107, %f107, %f291; fma.rn.f32 %f293, %f105, %f105, %f292; setp.eq.f32 %p60, %f293, 0f00000000; mov.f32 %f474, 
0f00000000; mov.f32 %f475, %f474; mov.f32 %f476, %f474; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f294, %f75, %f75; fma.rn.f32 %f295, %f74, %f74, %f294; fma.rn.f32 %f111, %f76, %f76, %f295; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f296, %f93, %f93; fma.rn.f32 %f297, %f92, %f92, %f296; fma.rn.f32 %f115, %f94, %f94, %f297; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f298, %f106, %f106; fma.rn.f32 %f299, %f107, %f107, %f298; fma.rn.f32 %f300, %f105, %f105, %f299; setp.neu.f32 %p66, %f300, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f333, %f86, %f107; sub.f32 %f334, %f87, %f106; sub.f32 %f335, %f88, %f105; mul.f32 %f476, %f335, 0f3F000000; mul.f32 %f475, %f334, 0f3F000000; mul.f32 %f474, %f333, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f476, %f3, %f105; sub.f32 %f475, %f2, %f106; sub.f32 %f474, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f301, %f106, %f106; fma.rn.f32 %f302, %f107, %f107, %f301; fma.rn.f32 %f116, %f105, %f105, %f302; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f476, %f88, %f3; sub.f32 %f475, %f87, %f2; sub.f32 %f474, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f324, %f86, %f107; sub.f32 %f325, %f87, %f106; sub.f32 %f326, %f88, %f105; sub.f32 %f327, %f74, %f92; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f75, %f93; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f76, %f94; mul.f32 %f332, %f331, 0f3DAAAAAB; fma.rn.f32 %f476, %f326, 
0f3F2AAAAB, %f332; fma.rn.f32 %f475, %f325, 0f3F2AAAAB, %f330; fma.rn.f32 %f474, %f324, 0f3F2AAAAB, %f328; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f303, %f74, 0f3F000000; add.f32 %f304, %f107, %f107; sub.f32 %f305, %f303, %f304; add.f32 %f306, %f106, %f106; mul.f32 %f307, %f75, 0f3F000000; sub.f32 %f308, %f307, %f306; add.f32 %f309, %f105, %f105; mul.f32 %f310, %f76, 0f3F000000; sub.f32 %f311, %f310, %f309; fma.rn.f32 %f476, %f3, 0f3FC00000, %f311; fma.rn.f32 %f475, %f2, 0f3FC00000, %f308; fma.rn.f32 %f474, %f1, 0f3FC00000, %f305; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f312, %f92, 0f3F000000; add.f32 %f313, %f86, %f86; sub.f32 %f314, %f313, %f312; add.f32 %f315, %f87, %f87; mul.f32 %f316, %f93, 0f3F000000; sub.f32 %f317, %f315, %f316; add.f32 %f318, %f88, %f88; mul.f32 %f319, %f94, 0f3F000000; sub.f32 %f320, %f318, %f319; mul.f32 %f321, %f1, 0f3FC00000; mul.f32 %f322, %f2, 0f3FC00000; mul.f32 %f323, %f3, 0f3FC00000; sub.f32 %f476, %f320, %f323; sub.f32 %f475, %f317, %f322; sub.f32 %f474, %f314, %f321; $L__BB0_70: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p79, %rs3, 0; add.s32 %r40, %r3, -2; @%p79 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: max.s32 %r127, %r40, 0; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r127, %r102, %r59; $L__BB0_73: setp.lt.s32 %p81, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p82, %p81, %p79; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p82 bra $L__BB0_75; mad.lo.s32 %r103, %r127, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd52, %r104, 4; add.s64 %rd53, %rd3, %rd52; add.s64 %rd54, %rd2, %rd52; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f143, [%rd55]; ld.global.nc.f32 %f142, [%rd54]; ld.global.nc.f32 %f141, [%rd53]; $L__BB0_75: add.s32 %r44, %r3, -1; @%p79 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r128, %r44, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r128, %r106, %r59; $L__BB0_78: setp.lt.s32 %p84, 
%r3, 1; mov.f32 %f174, 0f00000000; and.pred %p86, %p84, %p79; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p86 bra $L__BB0_80; mad.lo.s32 %r107, %r128, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd3, %rd56; add.s64 %rd58, %rd2, %rd56; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f172, [%rd59]; ld.global.nc.f32 %f173, [%rd58]; ld.global.nc.f32 %f174, [%rd57]; $L__BB0_80: add.s32 %r48, %r3, 1; @%p79 bra $L__BB0_82; bra.uni $L__BB0_81; $L__BB0_82: add.s32 %r111, %r59, -1; min.s32 %r129, %r48, %r111; bra.uni $L__BB0_83; $L__BB0_81: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r129, %r110, %r59; $L__BB0_83: setp.ge.s32 %p88, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p90, %p88, %p79; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p90 bra $L__BB0_85; mad.lo.s32 %r112, %r129, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd60, %r113, 4; add.s64 %rd61, %rd3, %rd60; add.s64 %rd62, %rd2, %rd60; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f155, [%rd63]; ld.global.nc.f32 %f154, [%rd62]; ld.global.nc.f32 %f153, [%rd61]; $L__BB0_85: add.s32 %r52, %r3, 2; @%p79 bra $L__BB0_87; bra.uni $L__BB0_86; $L__BB0_87: add.s32 %r116, %r59, -1; min.s32 %r130, %r52, %r116; bra.uni $L__BB0_88; $L__BB0_86: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r130, %r115, %r59; $L__BB0_88: mad.lo.s32 %r117, %r130, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p92, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p94, %p92, %p79; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p94 bra $L__BB0_90; mul.wide.s32 %rd64, %r56, 4; add.s64 %rd65, %rd3, %rd64; add.s64 %rd66, %rd2, %rd64; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f161, [%rd67]; ld.global.nc.f32 %f160, [%rd66]; ld.global.nc.f32 %f159, [%rd65]; $L__BB0_90: mul.f32 %f348, %f154, %f154; fma.rn.f32 %f349, %f153, %f153, %f348; fma.rn.f32 %f165, %f155, %f155, %f349; setp.eq.f32 %p95, %f165, 0f00000000; @%p95 bra $L__BB0_91; 
bra.uni $L__BB0_92; $L__BB0_91: mul.f32 %f353, %f173, %f173; fma.rn.f32 %f354, %f174, %f174, %f353; fma.rn.f32 %f355, %f172, %f172, %f354; setp.eq.f32 %p96, %f355, 0f00000000; mov.f32 %f492, 0f00000000; mov.f32 %f493, %f492; mov.f32 %f494, %f492; @%p96 bra $L__BB0_104; $L__BB0_92: mul.f32 %f356, %f142, %f142; fma.rn.f32 %f357, %f141, %f141, %f356; fma.rn.f32 %f178, %f143, %f143, %f357; setp.neu.f32 %p97, %f178, 0f00000000; mul.f32 %f358, %f160, %f160; fma.rn.f32 %f359, %f159, %f159, %f358; fma.rn.f32 %f182, %f161, %f161, %f359; setp.neu.f32 %p98, %f182, 0f00000000; and.pred %p99, %p97, %p98; or.pred %p101, %p95, %p99; @%p101 bra $L__BB0_94; mul.f32 %f360, %f173, %f173; fma.rn.f32 %f361, %f174, %f174, %f360; fma.rn.f32 %f362, %f172, %f172, %f361; setp.neu.f32 %p102, %f362, 0f00000000; @%p102 bra $L__BB0_103; bra.uni $L__BB0_94; $L__BB0_103: sub.f32 %f395, %f153, %f174; sub.f32 %f396, %f154, %f173; sub.f32 %f397, %f155, %f172; mul.f32 %f494, %f397, 0f3F000000; mul.f32 %f493, %f396, 0f3F000000; mul.f32 %f492, %f395, 0f3F000000; bra.uni $L__BB0_104; $L__BB0_94: setp.eq.f32 %p103, %f178, 0f00000000; and.pred %p105, %p103, %p95; @%p105 bra $L__BB0_102; bra.uni $L__BB0_95; $L__BB0_102: sub.f32 %f494, %f3, %f172; sub.f32 %f493, %f2, %f173; sub.f32 %f492, %f1, %f174; bra.uni $L__BB0_104; $L__BB0_95: setp.eq.f32 %p106, %f182, 0f00000000; mul.f32 %f363, %f173, %f173; fma.rn.f32 %f364, %f174, %f174, %f363; fma.rn.f32 %f183, %f172, %f172, %f364; setp.eq.f32 %p107, %f183, 0f00000000; and.pred %p108, %p107, %p106; @%p108 bra $L__BB0_101; bra.uni $L__BB0_96; $L__BB0_101: sub.f32 %f494, %f155, %f3; sub.f32 %f493, %f154, %f2; sub.f32 %f492, %f153, %f1; bra.uni $L__BB0_104; $L__BB0_96: setp.neu.f32 %p110, %f165, 0f00000000; or.pred %p111, %p103, %p110; @%p111 bra $L__BB0_98; bra.uni $L__BB0_97; $L__BB0_98: setp.neu.f32 %p112, %f183, 0f00000000; or.pred %p114, %p106, %p112; @%p114 bra $L__BB0_100; bra.uni $L__BB0_99; $L__BB0_100: sub.f32 %f386, %f153, %f174; sub.f32 %f387, %f154, 
%f173; sub.f32 %f388, %f155, %f172; sub.f32 %f389, %f141, %f159; mul.f32 %f390, %f389, 0f3DAAAAAB; sub.f32 %f391, %f142, %f160; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f143, %f161; mul.f32 %f394, %f393, 0f3DAAAAAB; fma.rn.f32 %f494, %f388, 0f3F2AAAAB, %f394; fma.rn.f32 %f493, %f387, 0f3F2AAAAB, %f392; fma.rn.f32 %f492, %f386, 0f3F2AAAAB, %f390; bra.uni $L__BB0_104; $L__BB0_97: mul.f32 %f365, %f141, 0f3F000000; add.f32 %f366, %f174, %f174; sub.f32 %f367, %f365, %f366; add.f32 %f368, %f173, %f173; mul.f32 %f369, %f142, 0f3F000000; sub.f32 %f370, %f369, %f368; add.f32 %f371, %f172, %f172; mul.f32 %f372, %f143, 0f3F000000; sub.f32 %f373, %f372, %f371; fma.rn.f32 %f494, %f3, 0f3FC00000, %f373; fma.rn.f32 %f493, %f2, 0f3FC00000, %f370; fma.rn.f32 %f492, %f1, 0f3FC00000, %f367; bra.uni $L__BB0_104; $L__BB0_99: mul.f32 %f374, %f159, 0f3F000000; add.f32 %f375, %f153, %f153; sub.f32 %f376, %f375, %f374; add.f32 %f377, %f154, %f154; mul.f32 %f378, %f160, 0f3F000000; sub.f32 %f379, %f377, %f378; add.f32 %f380, %f155, %f155; mul.f32 %f381, %f161, 0f3F000000; sub.f32 %f382, %f380, %f381; mul.f32 %f383, %f1, 0f3FC00000; mul.f32 %f384, %f2, 0f3FC00000; mul.f32 %f385, %f3, 0f3FC00000; sub.f32 %f494, %f382, %f385; sub.f32 %f493, %f379, %f384; sub.f32 %f492, %f376, %f383; $L__BB0_104: mul.f32 %f398, %f475, %f494; mul.f32 %f399, %f476, %f493; sub.f32 %f400, %f398, %f399; mul.f32 %f401, %f476, %f492; mul.f32 %f402, %f474, %f494; sub.f32 %f403, %f401, %f402; mul.f32 %f404, %f474, %f493; mul.f32 %f405, %f475, %f492; sub.f32 %f406, %f404, %f405; mul.f32 %f407, %f458, %f493; mul.f32 %f408, %f457, %f494; sub.f32 %f409, %f407, %f408; mul.f32 %f410, %f456, %f494; mul.f32 %f411, %f458, %f492; sub.f32 %f412, %f410, %f411; mul.f32 %f413, %f457, %f492; mul.f32 %f414, %f456, %f493; sub.f32 %f415, %f413, %f414; mul.f32 %f416, %f457, %f476; mul.f32 %f417, %f458, %f475; sub.f32 %f418, %f416, %f417; mul.f32 %f419, %f458, %f474; mul.f32 %f420, %f456, %f476; sub.f32 %f421, %f419, %f420; 
mul.f32 %f422, %f456, %f475; mul.f32 %f423, %f457, %f474; sub.f32 %f424, %f422, %f423; add.f32 %f425, %f205, %f205; mul.f32 %f426, %f425, %f206; mul.f32 %f427, %f2, %f403; fma.rn.f32 %f428, %f1, %f400, %f427; fma.rn.f32 %f429, %f3, %f406, %f428; mul.f32 %f430, %f426, %f429; st.global.f32 [%rd4], %f430; mul.f32 %f431, %f425, %f207; mul.f32 %f432, %f2, %f412; fma.rn.f32 %f433, %f1, %f409, %f432; fma.rn.f32 %f434, %f3, %f415, %f433; mul.f32 %f435, %f431, %f434; st.global.f32 [%rd5], %f435; mul.f32 %f436, %f425, %f208; mul.f32 %f437, %f2, %f421; fma.rn.f32 %f438, %f1, %f418, %f437; fma.rn.f32 %f439, %f3, %f424, %f438; mul.f32 %f440, %f436, %f439; st.global.f32 [%rd6], %f440; $L__BB0_106: ret; } ` setemergentmagneticfieldfivepoint_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl setemergentmagneticfieldfivepoint .visible .entry setemergentmagneticfieldfivepoint( .param .u64 setemergentmagneticfieldfivepoint_param_0, .param .u64 setemergentmagneticfieldfivepoint_param_1, .param .u64 setemergentmagneticfieldfivepoint_param_2, .param .u64 setemergentmagneticfieldfivepoint_param_3, .param .u64 setemergentmagneticfieldfivepoint_param_4, .param .u64 setemergentmagneticfieldfivepoint_param_5, .param .f32 setemergentmagneticfieldfivepoint_param_6, .param .f32 setemergentmagneticfieldfivepoint_param_7, .param .f32 setemergentmagneticfieldfivepoint_param_8, .param .f32 setemergentmagneticfieldfivepoint_param_9, .param .u32 setemergentmagneticfieldfivepoint_param_10, .param .u32 setemergentmagneticfieldfivepoint_param_11, .param .u32 setemergentmagneticfieldfivepoint_param_12, .param .u8 setemergentmagneticfieldfivepoint_param_13 ) { .reg .pred %p<115>; .reg .b16 %rs<5>; .reg .f32 %f<495>; .reg .b32 %r<131>; .reg .b64 %rd<68>; ld.param.u8 %rs4, [setemergentmagneticfieldfivepoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldfivepoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldfivepoint_param_1]; ld.param.u64 %rd9, 
[setemergentmagneticfieldfivepoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldfivepoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldfivepoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldfivepoint_param_5]; ld.param.f32 %f205, [setemergentmagneticfieldfivepoint_param_6]; ld.param.f32 %f206, [setemergentmagneticfieldfivepoint_param_7]; ld.param.f32 %f207, [setemergentmagneticfieldfivepoint_param_8]; ld.param.f32 %f208, [setemergentmagneticfieldfivepoint_param_9]; ld.param.u32 %r57, [setemergentmagneticfieldfivepoint_param_10]; ld.param.u32 %r58, [setemergentmagneticfieldfivepoint_param_11]; ld.param.u32 %r59, [setemergentmagneticfieldfivepoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_106; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f209, %f2, %f2; fma.rn.f32 %f210, %f1, %f1, %f209; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f211, %f3, %f3, %f210; setp.eq.f32 %p6, %f211, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_105; bra.uni $L__BB0_2; $L__BB0_105: mov.u32 %r118, 0; st.global.u32 [%rd4], %r118; st.global.u32 [%rd5], 
%r118; st.global.u32 [%rd6], %r118; bra.uni $L__BB0_106; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r119, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r119, %r72, %r57; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r73, %r119, %r5; mul.wide.s32 %rd20, %r73, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f9, [%rd23]; ld.global.nc.f32 %f8, [%rd22]; ld.global.nc.f32 %f7, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r120, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r120, %r75, %r57; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r76, %r120, %r5; mul.wide.s32 %rd24, %r76, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f38, [%rd27]; ld.global.nc.f32 %f39, [%rd26]; ld.global.nc.f32 %f40, [%rd25]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r79, %r57, -1; min.s32 %r121, %r14, %r79; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r121, %r78, %r57; $L__BB0_15: setp.ge.s32 %p16, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r80, %r121, %r5; mul.wide.s32 %rd28, %r80, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f21, [%rd31]; ld.global.nc.f32 %f20, [%rd30]; ld.global.nc.f32 %f19, [%rd29]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; 
bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r83, %r57, -1; min.s32 %r122, %r18, %r83; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r122, %r82, %r57; $L__BB0_20: add.s32 %r22, %r122, %r5; setp.ge.s32 %p20, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd32, %r22, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f27, [%rd35]; ld.global.nc.f32 %f26, [%rd34]; ld.global.nc.f32 %f25, [%rd33]; $L__BB0_22: mul.f32 %f224, %f20, %f20; fma.rn.f32 %f225, %f19, %f19, %f224; fma.rn.f32 %f31, %f21, %f21, %f225; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f229, %f39, %f39; fma.rn.f32 %f230, %f40, %f40, %f229; fma.rn.f32 %f231, %f38, %f38, %f230; setp.eq.f32 %p24, %f231, 0f00000000; mov.f32 %f456, 0f00000000; mov.f32 %f457, %f456; mov.f32 %f458, %f456; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f232, %f8, %f8; fma.rn.f32 %f233, %f7, %f7, %f232; fma.rn.f32 %f44, %f9, %f9, %f233; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f234, %f26, %f26; fma.rn.f32 %f235, %f25, %f25, %f234; fma.rn.f32 %f48, %f27, %f27, %f235; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f236, %f39, %f39; fma.rn.f32 %f237, %f40, %f40, %f236; fma.rn.f32 %f238, %f38, %f38, %f237; setp.neu.f32 %p30, %f238, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f271, %f19, %f40; sub.f32 %f272, %f20, %f39; sub.f32 %f273, %f21, %f38; mul.f32 %f458, %f273, 0f3F000000; mul.f32 %f457, %f272, 0f3F000000; mul.f32 %f456, %f271, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f458, %f3, %f38; sub.f32 %f457, %f2, %f39; sub.f32 %f456, %f1, %f40; bra.uni $L__BB0_36; 
$L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f239, %f39, %f39; fma.rn.f32 %f240, %f40, %f40, %f239; fma.rn.f32 %f49, %f38, %f38, %f240; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f458, %f21, %f3; sub.f32 %f457, %f20, %f2; sub.f32 %f456, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f262, %f19, %f40; sub.f32 %f263, %f20, %f39; sub.f32 %f264, %f21, %f38; sub.f32 %f265, %f7, %f25; mul.f32 %f266, %f265, 0f3DAAAAAB; sub.f32 %f267, %f8, %f26; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f9, %f27; mul.f32 %f270, %f269, 0f3DAAAAAB; fma.rn.f32 %f458, %f264, 0f3F2AAAAB, %f270; fma.rn.f32 %f457, %f263, 0f3F2AAAAB, %f268; fma.rn.f32 %f456, %f262, 0f3F2AAAAB, %f266; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f241, %f7, 0f3F000000; add.f32 %f242, %f40, %f40; sub.f32 %f243, %f241, %f242; add.f32 %f244, %f39, %f39; mul.f32 %f245, %f8, 0f3F000000; sub.f32 %f246, %f245, %f244; add.f32 %f247, %f38, %f38; mul.f32 %f248, %f9, 0f3F000000; sub.f32 %f249, %f248, %f247; fma.rn.f32 %f458, %f3, 0f3FC00000, %f249; fma.rn.f32 %f457, %f2, 0f3FC00000, %f246; fma.rn.f32 %f456, %f1, 0f3FC00000, %f243; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f250, %f25, 0f3F000000; add.f32 %f251, %f19, %f19; sub.f32 %f252, %f251, %f250; add.f32 %f253, %f20, %f20; mul.f32 %f254, %f26, 0f3F000000; sub.f32 %f255, %f253, %f254; add.f32 %f256, %f21, %f21; mul.f32 %f257, %f27, 0f3F000000; sub.f32 %f258, %f256, %f257; mul.f32 %f259, %f1, 0f3FC00000; mul.f32 %f260, %f2, 0f3FC00000; mul.f32 %f261, %f3, 0f3FC00000; sub.f32 %f458, %f258, %f261; sub.f32 %f457, %f255, %f260; sub.f32 %f456, %f252, %f259; $L__BB0_36: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 
bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r123, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r123, %r85, %r58; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r86, %r123, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd36, %r87, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f76, [%rd39]; ld.global.nc.f32 %f75, [%rd38]; ld.global.nc.f32 %f74, [%rd37]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r124, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r124, %r89, %r58; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r90, %r124, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd40, %r91, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f105, [%rd43]; ld.global.nc.f32 %f106, [%rd42]; ld.global.nc.f32 %f107, [%rd41]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r94, %r58, -1; min.s32 %r125, %r31, %r94; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r125, %r93, %r58; $L__BB0_49: setp.ge.s32 %p52, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r95, %r125, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd44, %r96, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f88, [%rd47]; ld.global.nc.f32 %f87, [%rd46]; ld.global.nc.f32 %f86, [%rd45]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni 
$L__BB0_52; $L__BB0_53: add.s32 %r99, %r58, -1; min.s32 %r126, %r35, %r99; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r126, %r98, %r58; $L__BB0_54: add.s32 %r39, %r126, %r4; setp.ge.s32 %p56, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f94, [%rd51]; ld.global.nc.f32 %f93, [%rd50]; ld.global.nc.f32 %f92, [%rd49]; $L__BB0_56: mul.f32 %f286, %f87, %f87; fma.rn.f32 %f287, %f86, %f86, %f286; fma.rn.f32 %f98, %f88, %f88, %f287; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f291, %f106, %f106; fma.rn.f32 %f292, %f107, %f107, %f291; fma.rn.f32 %f293, %f105, %f105, %f292; setp.eq.f32 %p60, %f293, 0f00000000; mov.f32 %f474, 0f00000000; mov.f32 %f475, %f474; mov.f32 %f476, %f474; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f294, %f75, %f75; fma.rn.f32 %f295, %f74, %f74, %f294; fma.rn.f32 %f111, %f76, %f76, %f295; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f296, %f93, %f93; fma.rn.f32 %f297, %f92, %f92, %f296; fma.rn.f32 %f115, %f94, %f94, %f297; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f298, %f106, %f106; fma.rn.f32 %f299, %f107, %f107, %f298; fma.rn.f32 %f300, %f105, %f105, %f299; setp.neu.f32 %p66, %f300, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f333, %f86, %f107; sub.f32 %f334, %f87, %f106; sub.f32 %f335, %f88, %f105; mul.f32 %f476, %f335, 0f3F000000; mul.f32 %f475, %f334, 0f3F000000; mul.f32 %f474, %f333, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f476, %f3, %f105; sub.f32 %f475, %f2, 
%f106; sub.f32 %f474, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f301, %f106, %f106; fma.rn.f32 %f302, %f107, %f107, %f301; fma.rn.f32 %f116, %f105, %f105, %f302; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f476, %f88, %f3; sub.f32 %f475, %f87, %f2; sub.f32 %f474, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f324, %f86, %f107; sub.f32 %f325, %f87, %f106; sub.f32 %f326, %f88, %f105; sub.f32 %f327, %f74, %f92; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f75, %f93; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f76, %f94; mul.f32 %f332, %f331, 0f3DAAAAAB; fma.rn.f32 %f476, %f326, 0f3F2AAAAB, %f332; fma.rn.f32 %f475, %f325, 0f3F2AAAAB, %f330; fma.rn.f32 %f474, %f324, 0f3F2AAAAB, %f328; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f303, %f74, 0f3F000000; add.f32 %f304, %f107, %f107; sub.f32 %f305, %f303, %f304; add.f32 %f306, %f106, %f106; mul.f32 %f307, %f75, 0f3F000000; sub.f32 %f308, %f307, %f306; add.f32 %f309, %f105, %f105; mul.f32 %f310, %f76, 0f3F000000; sub.f32 %f311, %f310, %f309; fma.rn.f32 %f476, %f3, 0f3FC00000, %f311; fma.rn.f32 %f475, %f2, 0f3FC00000, %f308; fma.rn.f32 %f474, %f1, 0f3FC00000, %f305; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f312, %f92, 0f3F000000; add.f32 %f313, %f86, %f86; sub.f32 %f314, %f313, %f312; add.f32 %f315, %f87, %f87; mul.f32 %f316, %f93, 0f3F000000; sub.f32 %f317, %f315, %f316; add.f32 %f318, %f88, %f88; mul.f32 %f319, %f94, 0f3F000000; sub.f32 %f320, %f318, %f319; mul.f32 %f321, %f1, 0f3FC00000; mul.f32 %f322, %f2, 0f3FC00000; mul.f32 %f323, %f3, 0f3FC00000; sub.f32 %f476, %f320, %f323; sub.f32 %f475, %f317, %f322; sub.f32 %f474, %f314, %f321; $L__BB0_70: 
and.b16 %rs3, %rs4, 4; setp.eq.s16 %p79, %rs3, 0; add.s32 %r40, %r3, -2; @%p79 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: max.s32 %r127, %r40, 0; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r127, %r102, %r59; $L__BB0_73: setp.lt.s32 %p81, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p82, %p81, %p79; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p82 bra $L__BB0_75; mad.lo.s32 %r103, %r127, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd52, %r104, 4; add.s64 %rd53, %rd3, %rd52; add.s64 %rd54, %rd2, %rd52; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f143, [%rd55]; ld.global.nc.f32 %f142, [%rd54]; ld.global.nc.f32 %f141, [%rd53]; $L__BB0_75: add.s32 %r44, %r3, -1; @%p79 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r128, %r44, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r128, %r106, %r59; $L__BB0_78: setp.lt.s32 %p84, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p86, %p84, %p79; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p86 bra $L__BB0_80; mad.lo.s32 %r107, %r128, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd3, %rd56; add.s64 %rd58, %rd2, %rd56; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f172, [%rd59]; ld.global.nc.f32 %f173, [%rd58]; ld.global.nc.f32 %f174, [%rd57]; $L__BB0_80: add.s32 %r48, %r3, 1; @%p79 bra $L__BB0_82; bra.uni $L__BB0_81; $L__BB0_82: add.s32 %r111, %r59, -1; min.s32 %r129, %r48, %r111; bra.uni $L__BB0_83; $L__BB0_81: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r129, %r110, %r59; $L__BB0_83: setp.ge.s32 %p88, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p90, %p88, %p79; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p90 bra $L__BB0_85; mad.lo.s32 %r112, %r129, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd60, %r113, 4; add.s64 %rd61, %rd3, %rd60; add.s64 %rd62, %rd2, %rd60; add.s64 %rd63, %rd1, %rd60; 
ld.global.nc.f32 %f155, [%rd63]; ld.global.nc.f32 %f154, [%rd62]; ld.global.nc.f32 %f153, [%rd61]; $L__BB0_85: add.s32 %r52, %r3, 2; @%p79 bra $L__BB0_87; bra.uni $L__BB0_86; $L__BB0_87: add.s32 %r116, %r59, -1; min.s32 %r130, %r52, %r116; bra.uni $L__BB0_88; $L__BB0_86: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r130, %r115, %r59; $L__BB0_88: mad.lo.s32 %r117, %r130, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p92, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p94, %p92, %p79; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p94 bra $L__BB0_90; mul.wide.s32 %rd64, %r56, 4; add.s64 %rd65, %rd3, %rd64; add.s64 %rd66, %rd2, %rd64; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f161, [%rd67]; ld.global.nc.f32 %f160, [%rd66]; ld.global.nc.f32 %f159, [%rd65]; $L__BB0_90: mul.f32 %f348, %f154, %f154; fma.rn.f32 %f349, %f153, %f153, %f348; fma.rn.f32 %f165, %f155, %f155, %f349; setp.eq.f32 %p95, %f165, 0f00000000; @%p95 bra $L__BB0_91; bra.uni $L__BB0_92; $L__BB0_91: mul.f32 %f353, %f173, %f173; fma.rn.f32 %f354, %f174, %f174, %f353; fma.rn.f32 %f355, %f172, %f172, %f354; setp.eq.f32 %p96, %f355, 0f00000000; mov.f32 %f492, 0f00000000; mov.f32 %f493, %f492; mov.f32 %f494, %f492; @%p96 bra $L__BB0_104; $L__BB0_92: mul.f32 %f356, %f142, %f142; fma.rn.f32 %f357, %f141, %f141, %f356; fma.rn.f32 %f178, %f143, %f143, %f357; setp.neu.f32 %p97, %f178, 0f00000000; mul.f32 %f358, %f160, %f160; fma.rn.f32 %f359, %f159, %f159, %f358; fma.rn.f32 %f182, %f161, %f161, %f359; setp.neu.f32 %p98, %f182, 0f00000000; and.pred %p99, %p97, %p98; or.pred %p101, %p95, %p99; @%p101 bra $L__BB0_94; mul.f32 %f360, %f173, %f173; fma.rn.f32 %f361, %f174, %f174, %f360; fma.rn.f32 %f362, %f172, %f172, %f361; setp.neu.f32 %p102, %f362, 0f00000000; @%p102 bra $L__BB0_103; bra.uni $L__BB0_94; $L__BB0_103: sub.f32 %f395, %f153, %f174; sub.f32 %f396, %f154, %f173; sub.f32 %f397, %f155, %f172; mul.f32 %f494, %f397, 0f3F000000; mul.f32 %f493, %f396, 0f3F000000; mul.f32 %f492, 
%f395, 0f3F000000; bra.uni $L__BB0_104; $L__BB0_94: setp.eq.f32 %p103, %f178, 0f00000000; and.pred %p105, %p103, %p95; @%p105 bra $L__BB0_102; bra.uni $L__BB0_95; $L__BB0_102: sub.f32 %f494, %f3, %f172; sub.f32 %f493, %f2, %f173; sub.f32 %f492, %f1, %f174; bra.uni $L__BB0_104; $L__BB0_95: setp.eq.f32 %p106, %f182, 0f00000000; mul.f32 %f363, %f173, %f173; fma.rn.f32 %f364, %f174, %f174, %f363; fma.rn.f32 %f183, %f172, %f172, %f364; setp.eq.f32 %p107, %f183, 0f00000000; and.pred %p108, %p107, %p106; @%p108 bra $L__BB0_101; bra.uni $L__BB0_96; $L__BB0_101: sub.f32 %f494, %f155, %f3; sub.f32 %f493, %f154, %f2; sub.f32 %f492, %f153, %f1; bra.uni $L__BB0_104; $L__BB0_96: setp.neu.f32 %p110, %f165, 0f00000000; or.pred %p111, %p103, %p110; @%p111 bra $L__BB0_98; bra.uni $L__BB0_97; $L__BB0_98: setp.neu.f32 %p112, %f183, 0f00000000; or.pred %p114, %p106, %p112; @%p114 bra $L__BB0_100; bra.uni $L__BB0_99; $L__BB0_100: sub.f32 %f386, %f153, %f174; sub.f32 %f387, %f154, %f173; sub.f32 %f388, %f155, %f172; sub.f32 %f389, %f141, %f159; mul.f32 %f390, %f389, 0f3DAAAAAB; sub.f32 %f391, %f142, %f160; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f143, %f161; mul.f32 %f394, %f393, 0f3DAAAAAB; fma.rn.f32 %f494, %f388, 0f3F2AAAAB, %f394; fma.rn.f32 %f493, %f387, 0f3F2AAAAB, %f392; fma.rn.f32 %f492, %f386, 0f3F2AAAAB, %f390; bra.uni $L__BB0_104; $L__BB0_97: mul.f32 %f365, %f141, 0f3F000000; add.f32 %f366, %f174, %f174; sub.f32 %f367, %f365, %f366; add.f32 %f368, %f173, %f173; mul.f32 %f369, %f142, 0f3F000000; sub.f32 %f370, %f369, %f368; add.f32 %f371, %f172, %f172; mul.f32 %f372, %f143, 0f3F000000; sub.f32 %f373, %f372, %f371; fma.rn.f32 %f494, %f3, 0f3FC00000, %f373; fma.rn.f32 %f493, %f2, 0f3FC00000, %f370; fma.rn.f32 %f492, %f1, 0f3FC00000, %f367; bra.uni $L__BB0_104; $L__BB0_99: mul.f32 %f374, %f159, 0f3F000000; add.f32 %f375, %f153, %f153; sub.f32 %f376, %f375, %f374; add.f32 %f377, %f154, %f154; mul.f32 %f378, %f160, 0f3F000000; sub.f32 %f379, %f377, %f378; add.f32 %f380, 
%f155, %f155; mul.f32 %f381, %f161, 0f3F000000; sub.f32 %f382, %f380, %f381; mul.f32 %f383, %f1, 0f3FC00000; mul.f32 %f384, %f2, 0f3FC00000; mul.f32 %f385, %f3, 0f3FC00000; sub.f32 %f494, %f382, %f385; sub.f32 %f493, %f379, %f384; sub.f32 %f492, %f376, %f383; $L__BB0_104: mul.f32 %f398, %f475, %f494; mul.f32 %f399, %f476, %f493; sub.f32 %f400, %f398, %f399; mul.f32 %f401, %f476, %f492; mul.f32 %f402, %f474, %f494; sub.f32 %f403, %f401, %f402; mul.f32 %f404, %f474, %f493; mul.f32 %f405, %f475, %f492; sub.f32 %f406, %f404, %f405; mul.f32 %f407, %f458, %f493; mul.f32 %f408, %f457, %f494; sub.f32 %f409, %f407, %f408; mul.f32 %f410, %f456, %f494; mul.f32 %f411, %f458, %f492; sub.f32 %f412, %f410, %f411; mul.f32 %f413, %f457, %f492; mul.f32 %f414, %f456, %f493; sub.f32 %f415, %f413, %f414; mul.f32 %f416, %f457, %f476; mul.f32 %f417, %f458, %f475; sub.f32 %f418, %f416, %f417; mul.f32 %f419, %f458, %f474; mul.f32 %f420, %f456, %f476; sub.f32 %f421, %f419, %f420; mul.f32 %f422, %f456, %f475; mul.f32 %f423, %f457, %f474; sub.f32 %f424, %f422, %f423; add.f32 %f425, %f205, %f205; mul.f32 %f426, %f425, %f206; mul.f32 %f427, %f2, %f403; fma.rn.f32 %f428, %f1, %f400, %f427; fma.rn.f32 %f429, %f3, %f406, %f428; mul.f32 %f430, %f426, %f429; st.global.f32 [%rd4], %f430; mul.f32 %f431, %f425, %f207; mul.f32 %f432, %f2, %f412; fma.rn.f32 %f433, %f1, %f409, %f432; fma.rn.f32 %f434, %f3, %f415, %f433; mul.f32 %f435, %f431, %f434; st.global.f32 [%rd5], %f435; mul.f32 %f436, %f425, %f208; mul.f32 %f437, %f2, %f421; fma.rn.f32 %f438, %f1, %f418, %f437; fma.rn.f32 %f439, %f3, %f424, %f438; mul.f32 %f440, %f436, %f439; st.global.f32 [%rd6], %f440; $L__BB0_106: ret; } ` setemergentmagneticfieldfivepoint_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl setemergentmagneticfieldfivepoint .visible .entry setemergentmagneticfieldfivepoint( .param .u64 setemergentmagneticfieldfivepoint_param_0, .param .u64 setemergentmagneticfieldfivepoint_param_1, .param .u64 
setemergentmagneticfieldfivepoint_param_2, .param .u64 setemergentmagneticfieldfivepoint_param_3, .param .u64 setemergentmagneticfieldfivepoint_param_4, .param .u64 setemergentmagneticfieldfivepoint_param_5, .param .f32 setemergentmagneticfieldfivepoint_param_6, .param .f32 setemergentmagneticfieldfivepoint_param_7, .param .f32 setemergentmagneticfieldfivepoint_param_8, .param .f32 setemergentmagneticfieldfivepoint_param_9, .param .u32 setemergentmagneticfieldfivepoint_param_10, .param .u32 setemergentmagneticfieldfivepoint_param_11, .param .u32 setemergentmagneticfieldfivepoint_param_12, .param .u8 setemergentmagneticfieldfivepoint_param_13 ) { .reg .pred %p<115>; .reg .b16 %rs<5>; .reg .f32 %f<495>; .reg .b32 %r<131>; .reg .b64 %rd<68>; ld.param.u8 %rs4, [setemergentmagneticfieldfivepoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldfivepoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldfivepoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldfivepoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldfivepoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldfivepoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldfivepoint_param_5]; ld.param.f32 %f205, [setemergentmagneticfieldfivepoint_param_6]; ld.param.f32 %f206, [setemergentmagneticfieldfivepoint_param_7]; ld.param.f32 %f207, [setemergentmagneticfieldfivepoint_param_8]; ld.param.f32 %f208, [setemergentmagneticfieldfivepoint_param_9]; ld.param.u32 %r57, [setemergentmagneticfieldfivepoint_param_10]; ld.param.u32 %r58, [setemergentmagneticfieldfivepoint_param_11]; ld.param.u32 %r59, [setemergentmagneticfieldfivepoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, 
%ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_106; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f209, %f2, %f2; fma.rn.f32 %f210, %f1, %f1, %f209; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f211, %f3, %f3, %f210; setp.eq.f32 %p6, %f211, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_105; bra.uni $L__BB0_2; $L__BB0_105: mov.u32 %r118, 0; st.global.u32 [%rd4], %r118; st.global.u32 [%rd5], %r118; st.global.u32 [%rd6], %r118; bra.uni $L__BB0_106; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r119, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r119, %r72, %r57; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r73, %r119, %r5; mul.wide.s32 %rd20, %r73, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f9, [%rd23]; ld.global.nc.f32 %f8, [%rd22]; ld.global.nc.f32 %f7, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r120, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r120, %r75, %r57; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, 
%f40; @%p14 bra $L__BB0_12; add.s32 %r76, %r120, %r5; mul.wide.s32 %rd24, %r76, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f38, [%rd27]; ld.global.nc.f32 %f39, [%rd26]; ld.global.nc.f32 %f40, [%rd25]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r79, %r57, -1; min.s32 %r121, %r14, %r79; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r121, %r78, %r57; $L__BB0_15: setp.ge.s32 %p16, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r80, %r121, %r5; mul.wide.s32 %rd28, %r80, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f21, [%rd31]; ld.global.nc.f32 %f20, [%rd30]; ld.global.nc.f32 %f19, [%rd29]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r83, %r57, -1; min.s32 %r122, %r18, %r83; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r122, %r82, %r57; $L__BB0_20: add.s32 %r22, %r122, %r5; setp.ge.s32 %p20, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd32, %r22, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f27, [%rd35]; ld.global.nc.f32 %f26, [%rd34]; ld.global.nc.f32 %f25, [%rd33]; $L__BB0_22: mul.f32 %f224, %f20, %f20; fma.rn.f32 %f225, %f19, %f19, %f224; fma.rn.f32 %f31, %f21, %f21, %f225; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f229, %f39, %f39; fma.rn.f32 %f230, %f40, %f40, %f229; fma.rn.f32 %f231, %f38, %f38, %f230; setp.eq.f32 %p24, %f231, 0f00000000; mov.f32 %f456, 0f00000000; mov.f32 %f457, %f456; mov.f32 %f458, %f456; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 
%f232, %f8, %f8; fma.rn.f32 %f233, %f7, %f7, %f232; fma.rn.f32 %f44, %f9, %f9, %f233; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f234, %f26, %f26; fma.rn.f32 %f235, %f25, %f25, %f234; fma.rn.f32 %f48, %f27, %f27, %f235; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f236, %f39, %f39; fma.rn.f32 %f237, %f40, %f40, %f236; fma.rn.f32 %f238, %f38, %f38, %f237; setp.neu.f32 %p30, %f238, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f271, %f19, %f40; sub.f32 %f272, %f20, %f39; sub.f32 %f273, %f21, %f38; mul.f32 %f458, %f273, 0f3F000000; mul.f32 %f457, %f272, 0f3F000000; mul.f32 %f456, %f271, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f458, %f3, %f38; sub.f32 %f457, %f2, %f39; sub.f32 %f456, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f239, %f39, %f39; fma.rn.f32 %f240, %f40, %f40, %f239; fma.rn.f32 %f49, %f38, %f38, %f240; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f458, %f21, %f3; sub.f32 %f457, %f20, %f2; sub.f32 %f456, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f262, %f19, %f40; sub.f32 %f263, %f20, %f39; sub.f32 %f264, %f21, %f38; sub.f32 %f265, %f7, %f25; mul.f32 %f266, %f265, 0f3DAAAAAB; sub.f32 %f267, %f8, %f26; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f9, %f27; mul.f32 %f270, %f269, 0f3DAAAAAB; fma.rn.f32 %f458, %f264, 0f3F2AAAAB, %f270; fma.rn.f32 %f457, %f263, 0f3F2AAAAB, %f268; fma.rn.f32 %f456, %f262, 0f3F2AAAAB, %f266; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 
%f241, %f7, 0f3F000000; add.f32 %f242, %f40, %f40; sub.f32 %f243, %f241, %f242; add.f32 %f244, %f39, %f39; mul.f32 %f245, %f8, 0f3F000000; sub.f32 %f246, %f245, %f244; add.f32 %f247, %f38, %f38; mul.f32 %f248, %f9, 0f3F000000; sub.f32 %f249, %f248, %f247; fma.rn.f32 %f458, %f3, 0f3FC00000, %f249; fma.rn.f32 %f457, %f2, 0f3FC00000, %f246; fma.rn.f32 %f456, %f1, 0f3FC00000, %f243; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f250, %f25, 0f3F000000; add.f32 %f251, %f19, %f19; sub.f32 %f252, %f251, %f250; add.f32 %f253, %f20, %f20; mul.f32 %f254, %f26, 0f3F000000; sub.f32 %f255, %f253, %f254; add.f32 %f256, %f21, %f21; mul.f32 %f257, %f27, 0f3F000000; sub.f32 %f258, %f256, %f257; mul.f32 %f259, %f1, 0f3FC00000; mul.f32 %f260, %f2, 0f3FC00000; mul.f32 %f261, %f3, 0f3FC00000; sub.f32 %f458, %f258, %f261; sub.f32 %f457, %f255, %f260; sub.f32 %f456, %f252, %f259; $L__BB0_36: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r123, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r123, %r85, %r58; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r86, %r123, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd36, %r87, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f76, [%rd39]; ld.global.nc.f32 %f75, [%rd38]; ld.global.nc.f32 %f74, [%rd37]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r124, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r124, %r89, %r58; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r90, %r124, %r4; mad.lo.s32 %r91, %r90, %r57, 
%r1; mul.wide.s32 %rd40, %r91, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f105, [%rd43]; ld.global.nc.f32 %f106, [%rd42]; ld.global.nc.f32 %f107, [%rd41]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r94, %r58, -1; min.s32 %r125, %r31, %r94; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r125, %r93, %r58; $L__BB0_49: setp.ge.s32 %p52, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r95, %r125, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd44, %r96, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f88, [%rd47]; ld.global.nc.f32 %f87, [%rd46]; ld.global.nc.f32 %f86, [%rd45]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r99, %r58, -1; min.s32 %r126, %r35, %r99; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r126, %r98, %r58; $L__BB0_54: add.s32 %r39, %r126, %r4; setp.ge.s32 %p56, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f94, [%rd51]; ld.global.nc.f32 %f93, [%rd50]; ld.global.nc.f32 %f92, [%rd49]; $L__BB0_56: mul.f32 %f286, %f87, %f87; fma.rn.f32 %f287, %f86, %f86, %f286; fma.rn.f32 %f98, %f88, %f88, %f287; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f291, %f106, %f106; fma.rn.f32 %f292, %f107, %f107, %f291; fma.rn.f32 %f293, %f105, %f105, %f292; setp.eq.f32 %p60, %f293, 0f00000000; mov.f32 %f474, 0f00000000; mov.f32 %f475, %f474; mov.f32 %f476, %f474; @%p60 bra 
$L__BB0_70; $L__BB0_58: mul.f32 %f294, %f75, %f75; fma.rn.f32 %f295, %f74, %f74, %f294; fma.rn.f32 %f111, %f76, %f76, %f295; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f296, %f93, %f93; fma.rn.f32 %f297, %f92, %f92, %f296; fma.rn.f32 %f115, %f94, %f94, %f297; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f298, %f106, %f106; fma.rn.f32 %f299, %f107, %f107, %f298; fma.rn.f32 %f300, %f105, %f105, %f299; setp.neu.f32 %p66, %f300, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f333, %f86, %f107; sub.f32 %f334, %f87, %f106; sub.f32 %f335, %f88, %f105; mul.f32 %f476, %f335, 0f3F000000; mul.f32 %f475, %f334, 0f3F000000; mul.f32 %f474, %f333, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f476, %f3, %f105; sub.f32 %f475, %f2, %f106; sub.f32 %f474, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f301, %f106, %f106; fma.rn.f32 %f302, %f107, %f107, %f301; fma.rn.f32 %f116, %f105, %f105, %f302; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f476, %f88, %f3; sub.f32 %f475, %f87, %f2; sub.f32 %f474, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f324, %f86, %f107; sub.f32 %f325, %f87, %f106; sub.f32 %f326, %f88, %f105; sub.f32 %f327, %f74, %f92; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f75, %f93; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f76, %f94; mul.f32 %f332, %f331, 0f3DAAAAAB; fma.rn.f32 %f476, %f326, 0f3F2AAAAB, %f332; fma.rn.f32 %f475, %f325, 0f3F2AAAAB, %f330; fma.rn.f32 
%f474, %f324, 0f3F2AAAAB, %f328; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f303, %f74, 0f3F000000; add.f32 %f304, %f107, %f107; sub.f32 %f305, %f303, %f304; add.f32 %f306, %f106, %f106; mul.f32 %f307, %f75, 0f3F000000; sub.f32 %f308, %f307, %f306; add.f32 %f309, %f105, %f105; mul.f32 %f310, %f76, 0f3F000000; sub.f32 %f311, %f310, %f309; fma.rn.f32 %f476, %f3, 0f3FC00000, %f311; fma.rn.f32 %f475, %f2, 0f3FC00000, %f308; fma.rn.f32 %f474, %f1, 0f3FC00000, %f305; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f312, %f92, 0f3F000000; add.f32 %f313, %f86, %f86; sub.f32 %f314, %f313, %f312; add.f32 %f315, %f87, %f87; mul.f32 %f316, %f93, 0f3F000000; sub.f32 %f317, %f315, %f316; add.f32 %f318, %f88, %f88; mul.f32 %f319, %f94, 0f3F000000; sub.f32 %f320, %f318, %f319; mul.f32 %f321, %f1, 0f3FC00000; mul.f32 %f322, %f2, 0f3FC00000; mul.f32 %f323, %f3, 0f3FC00000; sub.f32 %f476, %f320, %f323; sub.f32 %f475, %f317, %f322; sub.f32 %f474, %f314, %f321; $L__BB0_70: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p79, %rs3, 0; add.s32 %r40, %r3, -2; @%p79 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: max.s32 %r127, %r40, 0; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r127, %r102, %r59; $L__BB0_73: setp.lt.s32 %p81, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p82, %p81, %p79; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p82 bra $L__BB0_75; mad.lo.s32 %r103, %r127, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd52, %r104, 4; add.s64 %rd53, %rd3, %rd52; add.s64 %rd54, %rd2, %rd52; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f143, [%rd55]; ld.global.nc.f32 %f142, [%rd54]; ld.global.nc.f32 %f141, [%rd53]; $L__BB0_75: add.s32 %r44, %r3, -1; @%p79 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r128, %r44, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r128, %r106, %r59; $L__BB0_78: setp.lt.s32 %p84, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p86, %p84, %p79; mov.f32 
%f173, %f174; mov.f32 %f172, %f174; @%p86 bra $L__BB0_80; mad.lo.s32 %r107, %r128, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd3, %rd56; add.s64 %rd58, %rd2, %rd56; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f172, [%rd59]; ld.global.nc.f32 %f173, [%rd58]; ld.global.nc.f32 %f174, [%rd57]; $L__BB0_80: add.s32 %r48, %r3, 1; @%p79 bra $L__BB0_82; bra.uni $L__BB0_81; $L__BB0_82: add.s32 %r111, %r59, -1; min.s32 %r129, %r48, %r111; bra.uni $L__BB0_83; $L__BB0_81: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r129, %r110, %r59; $L__BB0_83: setp.ge.s32 %p88, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p90, %p88, %p79; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p90 bra $L__BB0_85; mad.lo.s32 %r112, %r129, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd60, %r113, 4; add.s64 %rd61, %rd3, %rd60; add.s64 %rd62, %rd2, %rd60; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f155, [%rd63]; ld.global.nc.f32 %f154, [%rd62]; ld.global.nc.f32 %f153, [%rd61]; $L__BB0_85: add.s32 %r52, %r3, 2; @%p79 bra $L__BB0_87; bra.uni $L__BB0_86; $L__BB0_87: add.s32 %r116, %r59, -1; min.s32 %r130, %r52, %r116; bra.uni $L__BB0_88; $L__BB0_86: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r130, %r115, %r59; $L__BB0_88: mad.lo.s32 %r117, %r130, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p92, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p94, %p92, %p79; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p94 bra $L__BB0_90; mul.wide.s32 %rd64, %r56, 4; add.s64 %rd65, %rd3, %rd64; add.s64 %rd66, %rd2, %rd64; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f161, [%rd67]; ld.global.nc.f32 %f160, [%rd66]; ld.global.nc.f32 %f159, [%rd65]; $L__BB0_90: mul.f32 %f348, %f154, %f154; fma.rn.f32 %f349, %f153, %f153, %f348; fma.rn.f32 %f165, %f155, %f155, %f349; setp.eq.f32 %p95, %f165, 0f00000000; @%p95 bra $L__BB0_91; bra.uni $L__BB0_92; $L__BB0_91: mul.f32 %f353, %f173, %f173; fma.rn.f32 
%f354, %f174, %f174, %f353; fma.rn.f32 %f355, %f172, %f172, %f354; setp.eq.f32 %p96, %f355, 0f00000000; mov.f32 %f492, 0f00000000; mov.f32 %f493, %f492; mov.f32 %f494, %f492; @%p96 bra $L__BB0_104; $L__BB0_92: mul.f32 %f356, %f142, %f142; fma.rn.f32 %f357, %f141, %f141, %f356; fma.rn.f32 %f178, %f143, %f143, %f357; setp.neu.f32 %p97, %f178, 0f00000000; mul.f32 %f358, %f160, %f160; fma.rn.f32 %f359, %f159, %f159, %f358; fma.rn.f32 %f182, %f161, %f161, %f359; setp.neu.f32 %p98, %f182, 0f00000000; and.pred %p99, %p97, %p98; or.pred %p101, %p95, %p99; @%p101 bra $L__BB0_94; mul.f32 %f360, %f173, %f173; fma.rn.f32 %f361, %f174, %f174, %f360; fma.rn.f32 %f362, %f172, %f172, %f361; setp.neu.f32 %p102, %f362, 0f00000000; @%p102 bra $L__BB0_103; bra.uni $L__BB0_94; $L__BB0_103: sub.f32 %f395, %f153, %f174; sub.f32 %f396, %f154, %f173; sub.f32 %f397, %f155, %f172; mul.f32 %f494, %f397, 0f3F000000; mul.f32 %f493, %f396, 0f3F000000; mul.f32 %f492, %f395, 0f3F000000; bra.uni $L__BB0_104; $L__BB0_94: setp.eq.f32 %p103, %f178, 0f00000000; and.pred %p105, %p103, %p95; @%p105 bra $L__BB0_102; bra.uni $L__BB0_95; $L__BB0_102: sub.f32 %f494, %f3, %f172; sub.f32 %f493, %f2, %f173; sub.f32 %f492, %f1, %f174; bra.uni $L__BB0_104; $L__BB0_95: setp.eq.f32 %p106, %f182, 0f00000000; mul.f32 %f363, %f173, %f173; fma.rn.f32 %f364, %f174, %f174, %f363; fma.rn.f32 %f183, %f172, %f172, %f364; setp.eq.f32 %p107, %f183, 0f00000000; and.pred %p108, %p107, %p106; @%p108 bra $L__BB0_101; bra.uni $L__BB0_96; $L__BB0_101: sub.f32 %f494, %f155, %f3; sub.f32 %f493, %f154, %f2; sub.f32 %f492, %f153, %f1; bra.uni $L__BB0_104; $L__BB0_96: setp.neu.f32 %p110, %f165, 0f00000000; or.pred %p111, %p103, %p110; @%p111 bra $L__BB0_98; bra.uni $L__BB0_97; $L__BB0_98: setp.neu.f32 %p112, %f183, 0f00000000; or.pred %p114, %p106, %p112; @%p114 bra $L__BB0_100; bra.uni $L__BB0_99; $L__BB0_100: sub.f32 %f386, %f153, %f174; sub.f32 %f387, %f154, %f173; sub.f32 %f388, %f155, %f172; sub.f32 %f389, %f141, %f159; mul.f32 
%f390, %f389, 0f3DAAAAAB; sub.f32 %f391, %f142, %f160; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f143, %f161; mul.f32 %f394, %f393, 0f3DAAAAAB; fma.rn.f32 %f494, %f388, 0f3F2AAAAB, %f394; fma.rn.f32 %f493, %f387, 0f3F2AAAAB, %f392; fma.rn.f32 %f492, %f386, 0f3F2AAAAB, %f390; bra.uni $L__BB0_104; $L__BB0_97: mul.f32 %f365, %f141, 0f3F000000; add.f32 %f366, %f174, %f174; sub.f32 %f367, %f365, %f366; add.f32 %f368, %f173, %f173; mul.f32 %f369, %f142, 0f3F000000; sub.f32 %f370, %f369, %f368; add.f32 %f371, %f172, %f172; mul.f32 %f372, %f143, 0f3F000000; sub.f32 %f373, %f372, %f371; fma.rn.f32 %f494, %f3, 0f3FC00000, %f373; fma.rn.f32 %f493, %f2, 0f3FC00000, %f370; fma.rn.f32 %f492, %f1, 0f3FC00000, %f367; bra.uni $L__BB0_104; $L__BB0_99: mul.f32 %f374, %f159, 0f3F000000; add.f32 %f375, %f153, %f153; sub.f32 %f376, %f375, %f374; add.f32 %f377, %f154, %f154; mul.f32 %f378, %f160, 0f3F000000; sub.f32 %f379, %f377, %f378; add.f32 %f380, %f155, %f155; mul.f32 %f381, %f161, 0f3F000000; sub.f32 %f382, %f380, %f381; mul.f32 %f383, %f1, 0f3FC00000; mul.f32 %f384, %f2, 0f3FC00000; mul.f32 %f385, %f3, 0f3FC00000; sub.f32 %f494, %f382, %f385; sub.f32 %f493, %f379, %f384; sub.f32 %f492, %f376, %f383; $L__BB0_104: mul.f32 %f398, %f475, %f494; mul.f32 %f399, %f476, %f493; sub.f32 %f400, %f398, %f399; mul.f32 %f401, %f476, %f492; mul.f32 %f402, %f474, %f494; sub.f32 %f403, %f401, %f402; mul.f32 %f404, %f474, %f493; mul.f32 %f405, %f475, %f492; sub.f32 %f406, %f404, %f405; mul.f32 %f407, %f458, %f493; mul.f32 %f408, %f457, %f494; sub.f32 %f409, %f407, %f408; mul.f32 %f410, %f456, %f494; mul.f32 %f411, %f458, %f492; sub.f32 %f412, %f410, %f411; mul.f32 %f413, %f457, %f492; mul.f32 %f414, %f456, %f493; sub.f32 %f415, %f413, %f414; mul.f32 %f416, %f457, %f476; mul.f32 %f417, %f458, %f475; sub.f32 %f418, %f416, %f417; mul.f32 %f419, %f458, %f474; mul.f32 %f420, %f456, %f476; sub.f32 %f421, %f419, %f420; mul.f32 %f422, %f456, %f475; mul.f32 %f423, %f457, %f474; sub.f32 %f424, %f422, 
%f423; add.f32 %f425, %f205, %f205; mul.f32 %f426, %f425, %f206; mul.f32 %f427, %f2, %f403; fma.rn.f32 %f428, %f1, %f400, %f427; fma.rn.f32 %f429, %f3, %f406, %f428; mul.f32 %f430, %f426, %f429; st.global.f32 [%rd4], %f430; mul.f32 %f431, %f425, %f207; mul.f32 %f432, %f2, %f412; fma.rn.f32 %f433, %f1, %f409, %f432; fma.rn.f32 %f434, %f3, %f415, %f433; mul.f32 %f435, %f431, %f434; st.global.f32 [%rd5], %f435; mul.f32 %f436, %f425, %f208; mul.f32 %f437, %f2, %f421; fma.rn.f32 %f438, %f1, %f418, %f437; fma.rn.f32 %f439, %f3, %f424, %f438; mul.f32 %f440, %f436, %f439; st.global.f32 [%rd6], %f440; $L__BB0_106: ret; } ` setemergentmagneticfieldfivepoint_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl setemergentmagneticfieldfivepoint .visible .entry setemergentmagneticfieldfivepoint( .param .u64 setemergentmagneticfieldfivepoint_param_0, .param .u64 setemergentmagneticfieldfivepoint_param_1, .param .u64 setemergentmagneticfieldfivepoint_param_2, .param .u64 setemergentmagneticfieldfivepoint_param_3, .param .u64 setemergentmagneticfieldfivepoint_param_4, .param .u64 setemergentmagneticfieldfivepoint_param_5, .param .f32 setemergentmagneticfieldfivepoint_param_6, .param .f32 setemergentmagneticfieldfivepoint_param_7, .param .f32 setemergentmagneticfieldfivepoint_param_8, .param .f32 setemergentmagneticfieldfivepoint_param_9, .param .u32 setemergentmagneticfieldfivepoint_param_10, .param .u32 setemergentmagneticfieldfivepoint_param_11, .param .u32 setemergentmagneticfieldfivepoint_param_12, .param .u8 setemergentmagneticfieldfivepoint_param_13 ) { .reg .pred %p<115>; .reg .b16 %rs<5>; .reg .f32 %f<495>; .reg .b32 %r<131>; .reg .b64 %rd<68>; ld.param.u8 %rs4, [setemergentmagneticfieldfivepoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldfivepoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldfivepoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldfivepoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldfivepoint_param_3]; 
ld.param.u64 %rd11, [setemergentmagneticfieldfivepoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldfivepoint_param_5]; ld.param.f32 %f205, [setemergentmagneticfieldfivepoint_param_6]; ld.param.f32 %f206, [setemergentmagneticfieldfivepoint_param_7]; ld.param.f32 %f207, [setemergentmagneticfieldfivepoint_param_8]; ld.param.f32 %f208, [setemergentmagneticfieldfivepoint_param_9]; ld.param.u32 %r57, [setemergentmagneticfieldfivepoint_param_10]; ld.param.u32 %r58, [setemergentmagneticfieldfivepoint_param_11]; ld.param.u32 %r59, [setemergentmagneticfieldfivepoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_106; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f209, %f2, %f2; fma.rn.f32 %f210, %f1, %f1, %f209; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f211, %f3, %f3, %f210; setp.eq.f32 %p6, %f211, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_105; bra.uni $L__BB0_2; $L__BB0_105: mov.u32 %r118, 0; st.global.u32 [%rd4], %r118; st.global.u32 [%rd5], %r118; st.global.u32 [%rd6], %r118; bra.uni $L__BB0_106; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 
0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r119, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r119, %r72, %r57; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r73, %r119, %r5; mul.wide.s32 %rd20, %r73, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f9, [%rd23]; ld.global.nc.f32 %f8, [%rd22]; ld.global.nc.f32 %f7, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r120, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r120, %r75, %r57; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r76, %r120, %r5; mul.wide.s32 %rd24, %r76, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f38, [%rd27]; ld.global.nc.f32 %f39, [%rd26]; ld.global.nc.f32 %f40, [%rd25]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r79, %r57, -1; min.s32 %r121, %r14, %r79; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r121, %r78, %r57; $L__BB0_15: setp.ge.s32 %p16, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r80, %r121, %r5; mul.wide.s32 %rd28, %r80, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f21, [%rd31]; ld.global.nc.f32 %f20, [%rd30]; ld.global.nc.f32 %f19, [%rd29]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r83, %r57, -1; min.s32 %r122, %r18, %r83; bra.uni $L__BB0_20; $L__BB0_18: 
rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r122, %r82, %r57; $L__BB0_20: add.s32 %r22, %r122, %r5; setp.ge.s32 %p20, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd32, %r22, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f27, [%rd35]; ld.global.nc.f32 %f26, [%rd34]; ld.global.nc.f32 %f25, [%rd33]; $L__BB0_22: mul.f32 %f224, %f20, %f20; fma.rn.f32 %f225, %f19, %f19, %f224; fma.rn.f32 %f31, %f21, %f21, %f225; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f229, %f39, %f39; fma.rn.f32 %f230, %f40, %f40, %f229; fma.rn.f32 %f231, %f38, %f38, %f230; setp.eq.f32 %p24, %f231, 0f00000000; mov.f32 %f456, 0f00000000; mov.f32 %f457, %f456; mov.f32 %f458, %f456; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f232, %f8, %f8; fma.rn.f32 %f233, %f7, %f7, %f232; fma.rn.f32 %f44, %f9, %f9, %f233; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f234, %f26, %f26; fma.rn.f32 %f235, %f25, %f25, %f234; fma.rn.f32 %f48, %f27, %f27, %f235; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f236, %f39, %f39; fma.rn.f32 %f237, %f40, %f40, %f236; fma.rn.f32 %f238, %f38, %f38, %f237; setp.neu.f32 %p30, %f238, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f271, %f19, %f40; sub.f32 %f272, %f20, %f39; sub.f32 %f273, %f21, %f38; mul.f32 %f458, %f273, 0f3F000000; mul.f32 %f457, %f272, 0f3F000000; mul.f32 %f456, %f271, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f458, %f3, %f38; sub.f32 %f457, %f2, %f39; sub.f32 %f456, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f239, %f39, %f39; fma.rn.f32 %f240, %f40, %f40, %f239; 
fma.rn.f32 %f49, %f38, %f38, %f240; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f458, %f21, %f3; sub.f32 %f457, %f20, %f2; sub.f32 %f456, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f262, %f19, %f40; sub.f32 %f263, %f20, %f39; sub.f32 %f264, %f21, %f38; sub.f32 %f265, %f7, %f25; mul.f32 %f266, %f265, 0f3DAAAAAB; sub.f32 %f267, %f8, %f26; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f9, %f27; mul.f32 %f270, %f269, 0f3DAAAAAB; fma.rn.f32 %f458, %f264, 0f3F2AAAAB, %f270; fma.rn.f32 %f457, %f263, 0f3F2AAAAB, %f268; fma.rn.f32 %f456, %f262, 0f3F2AAAAB, %f266; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f241, %f7, 0f3F000000; add.f32 %f242, %f40, %f40; sub.f32 %f243, %f241, %f242; add.f32 %f244, %f39, %f39; mul.f32 %f245, %f8, 0f3F000000; sub.f32 %f246, %f245, %f244; add.f32 %f247, %f38, %f38; mul.f32 %f248, %f9, 0f3F000000; sub.f32 %f249, %f248, %f247; fma.rn.f32 %f458, %f3, 0f3FC00000, %f249; fma.rn.f32 %f457, %f2, 0f3FC00000, %f246; fma.rn.f32 %f456, %f1, 0f3FC00000, %f243; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f250, %f25, 0f3F000000; add.f32 %f251, %f19, %f19; sub.f32 %f252, %f251, %f250; add.f32 %f253, %f20, %f20; mul.f32 %f254, %f26, 0f3F000000; sub.f32 %f255, %f253, %f254; add.f32 %f256, %f21, %f21; mul.f32 %f257, %f27, 0f3F000000; sub.f32 %f258, %f256, %f257; mul.f32 %f259, %f1, 0f3FC00000; mul.f32 %f260, %f2, 0f3FC00000; mul.f32 %f261, %f3, 0f3FC00000; sub.f32 %f458, %f258, %f261; sub.f32 %f457, %f255, %f260; sub.f32 %f456, %f252, %f259; $L__BB0_36: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r123, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 
%r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r123, %r85, %r58; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r86, %r123, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd36, %r87, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f76, [%rd39]; ld.global.nc.f32 %f75, [%rd38]; ld.global.nc.f32 %f74, [%rd37]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r124, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r124, %r89, %r58; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r90, %r124, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd40, %r91, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f105, [%rd43]; ld.global.nc.f32 %f106, [%rd42]; ld.global.nc.f32 %f107, [%rd41]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r94, %r58, -1; min.s32 %r125, %r31, %r94; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r125, %r93, %r58; $L__BB0_49: setp.ge.s32 %p52, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r95, %r125, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd44, %r96, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f88, [%rd47]; ld.global.nc.f32 %f87, [%rd46]; ld.global.nc.f32 %f86, [%rd45]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r99, %r58, -1; min.s32 %r126, %r35, %r99; bra.uni $L__BB0_54; $L__BB0_52: 
rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r126, %r98, %r58; $L__BB0_54: add.s32 %r39, %r126, %r4; setp.ge.s32 %p56, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f94, [%rd51]; ld.global.nc.f32 %f93, [%rd50]; ld.global.nc.f32 %f92, [%rd49]; $L__BB0_56: mul.f32 %f286, %f87, %f87; fma.rn.f32 %f287, %f86, %f86, %f286; fma.rn.f32 %f98, %f88, %f88, %f287; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f291, %f106, %f106; fma.rn.f32 %f292, %f107, %f107, %f291; fma.rn.f32 %f293, %f105, %f105, %f292; setp.eq.f32 %p60, %f293, 0f00000000; mov.f32 %f474, 0f00000000; mov.f32 %f475, %f474; mov.f32 %f476, %f474; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f294, %f75, %f75; fma.rn.f32 %f295, %f74, %f74, %f294; fma.rn.f32 %f111, %f76, %f76, %f295; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f296, %f93, %f93; fma.rn.f32 %f297, %f92, %f92, %f296; fma.rn.f32 %f115, %f94, %f94, %f297; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f298, %f106, %f106; fma.rn.f32 %f299, %f107, %f107, %f298; fma.rn.f32 %f300, %f105, %f105, %f299; setp.neu.f32 %p66, %f300, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f333, %f86, %f107; sub.f32 %f334, %f87, %f106; sub.f32 %f335, %f88, %f105; mul.f32 %f476, %f335, 0f3F000000; mul.f32 %f475, %f334, 0f3F000000; mul.f32 %f474, %f333, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f476, %f3, %f105; sub.f32 %f475, %f2, %f106; sub.f32 %f474, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; 
mul.f32 %f301, %f106, %f106; fma.rn.f32 %f302, %f107, %f107, %f301; fma.rn.f32 %f116, %f105, %f105, %f302; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f476, %f88, %f3; sub.f32 %f475, %f87, %f2; sub.f32 %f474, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f324, %f86, %f107; sub.f32 %f325, %f87, %f106; sub.f32 %f326, %f88, %f105; sub.f32 %f327, %f74, %f92; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f75, %f93; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f76, %f94; mul.f32 %f332, %f331, 0f3DAAAAAB; fma.rn.f32 %f476, %f326, 0f3F2AAAAB, %f332; fma.rn.f32 %f475, %f325, 0f3F2AAAAB, %f330; fma.rn.f32 %f474, %f324, 0f3F2AAAAB, %f328; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f303, %f74, 0f3F000000; add.f32 %f304, %f107, %f107; sub.f32 %f305, %f303, %f304; add.f32 %f306, %f106, %f106; mul.f32 %f307, %f75, 0f3F000000; sub.f32 %f308, %f307, %f306; add.f32 %f309, %f105, %f105; mul.f32 %f310, %f76, 0f3F000000; sub.f32 %f311, %f310, %f309; fma.rn.f32 %f476, %f3, 0f3FC00000, %f311; fma.rn.f32 %f475, %f2, 0f3FC00000, %f308; fma.rn.f32 %f474, %f1, 0f3FC00000, %f305; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f312, %f92, 0f3F000000; add.f32 %f313, %f86, %f86; sub.f32 %f314, %f313, %f312; add.f32 %f315, %f87, %f87; mul.f32 %f316, %f93, 0f3F000000; sub.f32 %f317, %f315, %f316; add.f32 %f318, %f88, %f88; mul.f32 %f319, %f94, 0f3F000000; sub.f32 %f320, %f318, %f319; mul.f32 %f321, %f1, 0f3FC00000; mul.f32 %f322, %f2, 0f3FC00000; mul.f32 %f323, %f3, 0f3FC00000; sub.f32 %f476, %f320, %f323; sub.f32 %f475, %f317, %f322; sub.f32 %f474, %f314, %f321; $L__BB0_70: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p79, %rs3, 0; add.s32 %r40, %r3, -2; @%p79 bra $L__BB0_72; bra.uni 
$L__BB0_71; $L__BB0_72: max.s32 %r127, %r40, 0; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r127, %r102, %r59; $L__BB0_73: setp.lt.s32 %p81, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p82, %p81, %p79; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p82 bra $L__BB0_75; mad.lo.s32 %r103, %r127, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd52, %r104, 4; add.s64 %rd53, %rd3, %rd52; add.s64 %rd54, %rd2, %rd52; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f143, [%rd55]; ld.global.nc.f32 %f142, [%rd54]; ld.global.nc.f32 %f141, [%rd53]; $L__BB0_75: add.s32 %r44, %r3, -1; @%p79 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r128, %r44, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r128, %r106, %r59; $L__BB0_78: setp.lt.s32 %p84, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p86, %p84, %p79; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p86 bra $L__BB0_80; mad.lo.s32 %r107, %r128, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd3, %rd56; add.s64 %rd58, %rd2, %rd56; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f172, [%rd59]; ld.global.nc.f32 %f173, [%rd58]; ld.global.nc.f32 %f174, [%rd57]; $L__BB0_80: add.s32 %r48, %r3, 1; @%p79 bra $L__BB0_82; bra.uni $L__BB0_81; $L__BB0_82: add.s32 %r111, %r59, -1; min.s32 %r129, %r48, %r111; bra.uni $L__BB0_83; $L__BB0_81: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r129, %r110, %r59; $L__BB0_83: setp.ge.s32 %p88, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p90, %p88, %p79; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p90 bra $L__BB0_85; mad.lo.s32 %r112, %r129, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd60, %r113, 4; add.s64 %rd61, %rd3, %rd60; add.s64 %rd62, %rd2, %rd60; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f155, [%rd63]; ld.global.nc.f32 %f154, [%rd62]; ld.global.nc.f32 %f153, [%rd61]; $L__BB0_85: add.s32 
%r52, %r3, 2; @%p79 bra $L__BB0_87; bra.uni $L__BB0_86; $L__BB0_87: add.s32 %r116, %r59, -1; min.s32 %r130, %r52, %r116; bra.uni $L__BB0_88; $L__BB0_86: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r130, %r115, %r59; $L__BB0_88: mad.lo.s32 %r117, %r130, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p92, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p94, %p92, %p79; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p94 bra $L__BB0_90; mul.wide.s32 %rd64, %r56, 4; add.s64 %rd65, %rd3, %rd64; add.s64 %rd66, %rd2, %rd64; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f161, [%rd67]; ld.global.nc.f32 %f160, [%rd66]; ld.global.nc.f32 %f159, [%rd65]; $L__BB0_90: mul.f32 %f348, %f154, %f154; fma.rn.f32 %f349, %f153, %f153, %f348; fma.rn.f32 %f165, %f155, %f155, %f349; setp.eq.f32 %p95, %f165, 0f00000000; @%p95 bra $L__BB0_91; bra.uni $L__BB0_92; $L__BB0_91: mul.f32 %f353, %f173, %f173; fma.rn.f32 %f354, %f174, %f174, %f353; fma.rn.f32 %f355, %f172, %f172, %f354; setp.eq.f32 %p96, %f355, 0f00000000; mov.f32 %f492, 0f00000000; mov.f32 %f493, %f492; mov.f32 %f494, %f492; @%p96 bra $L__BB0_104; $L__BB0_92: mul.f32 %f356, %f142, %f142; fma.rn.f32 %f357, %f141, %f141, %f356; fma.rn.f32 %f178, %f143, %f143, %f357; setp.neu.f32 %p97, %f178, 0f00000000; mul.f32 %f358, %f160, %f160; fma.rn.f32 %f359, %f159, %f159, %f358; fma.rn.f32 %f182, %f161, %f161, %f359; setp.neu.f32 %p98, %f182, 0f00000000; and.pred %p99, %p97, %p98; or.pred %p101, %p95, %p99; @%p101 bra $L__BB0_94; mul.f32 %f360, %f173, %f173; fma.rn.f32 %f361, %f174, %f174, %f360; fma.rn.f32 %f362, %f172, %f172, %f361; setp.neu.f32 %p102, %f362, 0f00000000; @%p102 bra $L__BB0_103; bra.uni $L__BB0_94; $L__BB0_103: sub.f32 %f395, %f153, %f174; sub.f32 %f396, %f154, %f173; sub.f32 %f397, %f155, %f172; mul.f32 %f494, %f397, 0f3F000000; mul.f32 %f493, %f396, 0f3F000000; mul.f32 %f492, %f395, 0f3F000000; bra.uni $L__BB0_104; $L__BB0_94: setp.eq.f32 %p103, %f178, 0f00000000; and.pred %p105, %p103, %p95; 
@%p105 bra $L__BB0_102; bra.uni $L__BB0_95; $L__BB0_102: sub.f32 %f494, %f3, %f172; sub.f32 %f493, %f2, %f173; sub.f32 %f492, %f1, %f174; bra.uni $L__BB0_104; $L__BB0_95: setp.eq.f32 %p106, %f182, 0f00000000; mul.f32 %f363, %f173, %f173; fma.rn.f32 %f364, %f174, %f174, %f363; fma.rn.f32 %f183, %f172, %f172, %f364; setp.eq.f32 %p107, %f183, 0f00000000; and.pred %p108, %p107, %p106; @%p108 bra $L__BB0_101; bra.uni $L__BB0_96; $L__BB0_101: sub.f32 %f494, %f155, %f3; sub.f32 %f493, %f154, %f2; sub.f32 %f492, %f153, %f1; bra.uni $L__BB0_104; $L__BB0_96: setp.neu.f32 %p110, %f165, 0f00000000; or.pred %p111, %p103, %p110; @%p111 bra $L__BB0_98; bra.uni $L__BB0_97; $L__BB0_98: setp.neu.f32 %p112, %f183, 0f00000000; or.pred %p114, %p106, %p112; @%p114 bra $L__BB0_100; bra.uni $L__BB0_99; $L__BB0_100: sub.f32 %f386, %f153, %f174; sub.f32 %f387, %f154, %f173; sub.f32 %f388, %f155, %f172; sub.f32 %f389, %f141, %f159; mul.f32 %f390, %f389, 0f3DAAAAAB; sub.f32 %f391, %f142, %f160; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f143, %f161; mul.f32 %f394, %f393, 0f3DAAAAAB; fma.rn.f32 %f494, %f388, 0f3F2AAAAB, %f394; fma.rn.f32 %f493, %f387, 0f3F2AAAAB, %f392; fma.rn.f32 %f492, %f386, 0f3F2AAAAB, %f390; bra.uni $L__BB0_104; $L__BB0_97: mul.f32 %f365, %f141, 0f3F000000; add.f32 %f366, %f174, %f174; sub.f32 %f367, %f365, %f366; add.f32 %f368, %f173, %f173; mul.f32 %f369, %f142, 0f3F000000; sub.f32 %f370, %f369, %f368; add.f32 %f371, %f172, %f172; mul.f32 %f372, %f143, 0f3F000000; sub.f32 %f373, %f372, %f371; fma.rn.f32 %f494, %f3, 0f3FC00000, %f373; fma.rn.f32 %f493, %f2, 0f3FC00000, %f370; fma.rn.f32 %f492, %f1, 0f3FC00000, %f367; bra.uni $L__BB0_104; $L__BB0_99: mul.f32 %f374, %f159, 0f3F000000; add.f32 %f375, %f153, %f153; sub.f32 %f376, %f375, %f374; add.f32 %f377, %f154, %f154; mul.f32 %f378, %f160, 0f3F000000; sub.f32 %f379, %f377, %f378; add.f32 %f380, %f155, %f155; mul.f32 %f381, %f161, 0f3F000000; sub.f32 %f382, %f380, %f381; mul.f32 %f383, %f1, 0f3FC00000; mul.f32 
%f384, %f2, 0f3FC00000; mul.f32 %f385, %f3, 0f3FC00000; sub.f32 %f494, %f382, %f385; sub.f32 %f493, %f379, %f384; sub.f32 %f492, %f376, %f383; $L__BB0_104: mul.f32 %f398, %f475, %f494; mul.f32 %f399, %f476, %f493; sub.f32 %f400, %f398, %f399; mul.f32 %f401, %f476, %f492; mul.f32 %f402, %f474, %f494; sub.f32 %f403, %f401, %f402; mul.f32 %f404, %f474, %f493; mul.f32 %f405, %f475, %f492; sub.f32 %f406, %f404, %f405; mul.f32 %f407, %f458, %f493; mul.f32 %f408, %f457, %f494; sub.f32 %f409, %f407, %f408; mul.f32 %f410, %f456, %f494; mul.f32 %f411, %f458, %f492; sub.f32 %f412, %f410, %f411; mul.f32 %f413, %f457, %f492; mul.f32 %f414, %f456, %f493; sub.f32 %f415, %f413, %f414; mul.f32 %f416, %f457, %f476; mul.f32 %f417, %f458, %f475; sub.f32 %f418, %f416, %f417; mul.f32 %f419, %f458, %f474; mul.f32 %f420, %f456, %f476; sub.f32 %f421, %f419, %f420; mul.f32 %f422, %f456, %f475; mul.f32 %f423, %f457, %f474; sub.f32 %f424, %f422, %f423; add.f32 %f425, %f205, %f205; mul.f32 %f426, %f425, %f206; mul.f32 %f427, %f2, %f403; fma.rn.f32 %f428, %f1, %f400, %f427; fma.rn.f32 %f429, %f3, %f406, %f428; mul.f32 %f430, %f426, %f429; st.global.f32 [%rd4], %f430; mul.f32 %f431, %f425, %f207; mul.f32 %f432, %f2, %f412; fma.rn.f32 %f433, %f1, %f409, %f432; fma.rn.f32 %f434, %f3, %f415, %f433; mul.f32 %f435, %f431, %f434; st.global.f32 [%rd5], %f435; mul.f32 %f436, %f425, %f208; mul.f32 %f437, %f2, %f421; fma.rn.f32 %f438, %f1, %f418, %f437; fma.rn.f32 %f439, %f3, %f424, %f438; mul.f32 %f440, %f436, %f439; st.global.f32 [%rd6], %f440; $L__BB0_106: ret; } ` setemergentmagneticfieldfivepoint_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl setemergentmagneticfieldfivepoint .visible .entry setemergentmagneticfieldfivepoint( .param .u64 setemergentmagneticfieldfivepoint_param_0, .param .u64 setemergentmagneticfieldfivepoint_param_1, .param .u64 setemergentmagneticfieldfivepoint_param_2, .param .u64 setemergentmagneticfieldfivepoint_param_3, .param .u64 
setemergentmagneticfieldfivepoint_param_4, .param .u64 setemergentmagneticfieldfivepoint_param_5, .param .f32 setemergentmagneticfieldfivepoint_param_6, .param .f32 setemergentmagneticfieldfivepoint_param_7, .param .f32 setemergentmagneticfieldfivepoint_param_8, .param .f32 setemergentmagneticfieldfivepoint_param_9, .param .u32 setemergentmagneticfieldfivepoint_param_10, .param .u32 setemergentmagneticfieldfivepoint_param_11, .param .u32 setemergentmagneticfieldfivepoint_param_12, .param .u8 setemergentmagneticfieldfivepoint_param_13 ) { .reg .pred %p<115>; .reg .b16 %rs<5>; .reg .f32 %f<495>; .reg .b32 %r<131>; .reg .b64 %rd<68>; ld.param.u8 %rs4, [setemergentmagneticfieldfivepoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldfivepoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldfivepoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldfivepoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldfivepoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldfivepoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldfivepoint_param_5]; ld.param.f32 %f205, [setemergentmagneticfieldfivepoint_param_6]; ld.param.f32 %f206, [setemergentmagneticfieldfivepoint_param_7]; ld.param.f32 %f207, [setemergentmagneticfieldfivepoint_param_8]; ld.param.f32 %f208, [setemergentmagneticfieldfivepoint_param_9]; ld.param.u32 %r57, [setemergentmagneticfieldfivepoint_param_10]; ld.param.u32 %r58, [setemergentmagneticfieldfivepoint_param_11]; ld.param.u32 %r59, [setemergentmagneticfieldfivepoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, 
%r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_106; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f209, %f2, %f2; fma.rn.f32 %f210, %f1, %f1, %f209; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f211, %f3, %f3, %f210; setp.eq.f32 %p6, %f211, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_105; bra.uni $L__BB0_2; $L__BB0_105: mov.u32 %r118, 0; st.global.u32 [%rd4], %r118; st.global.u32 [%rd5], %r118; st.global.u32 [%rd6], %r118; bra.uni $L__BB0_106; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r119, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r119, %r72, %r57; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r73, %r119, %r5; mul.wide.s32 %rd20, %r73, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f9, [%rd23]; ld.global.nc.f32 %f8, [%rd22]; ld.global.nc.f32 %f7, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r120, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r120, %r75, %r57; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r76, %r120, %r5; mul.wide.s32 %rd24, %r76, 4; add.s64 %rd25, %rd3, %rd24; 
add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f38, [%rd27]; ld.global.nc.f32 %f39, [%rd26]; ld.global.nc.f32 %f40, [%rd25]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r79, %r57, -1; min.s32 %r121, %r14, %r79; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r121, %r78, %r57; $L__BB0_15: setp.ge.s32 %p16, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r80, %r121, %r5; mul.wide.s32 %rd28, %r80, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f21, [%rd31]; ld.global.nc.f32 %f20, [%rd30]; ld.global.nc.f32 %f19, [%rd29]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r83, %r57, -1; min.s32 %r122, %r18, %r83; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r122, %r82, %r57; $L__BB0_20: add.s32 %r22, %r122, %r5; setp.ge.s32 %p20, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd32, %r22, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f27, [%rd35]; ld.global.nc.f32 %f26, [%rd34]; ld.global.nc.f32 %f25, [%rd33]; $L__BB0_22: mul.f32 %f224, %f20, %f20; fma.rn.f32 %f225, %f19, %f19, %f224; fma.rn.f32 %f31, %f21, %f21, %f225; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f229, %f39, %f39; fma.rn.f32 %f230, %f40, %f40, %f229; fma.rn.f32 %f231, %f38, %f38, %f230; setp.eq.f32 %p24, %f231, 0f00000000; mov.f32 %f456, 0f00000000; mov.f32 %f457, %f456; mov.f32 %f458, %f456; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f232, %f8, %f8; fma.rn.f32 %f233, %f7, %f7, %f232; fma.rn.f32 %f44, %f9, %f9, %f233; setp.neu.f32 %p25, %f44, 
0f00000000; mul.f32 %f234, %f26, %f26; fma.rn.f32 %f235, %f25, %f25, %f234; fma.rn.f32 %f48, %f27, %f27, %f235; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f236, %f39, %f39; fma.rn.f32 %f237, %f40, %f40, %f236; fma.rn.f32 %f238, %f38, %f38, %f237; setp.neu.f32 %p30, %f238, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f271, %f19, %f40; sub.f32 %f272, %f20, %f39; sub.f32 %f273, %f21, %f38; mul.f32 %f458, %f273, 0f3F000000; mul.f32 %f457, %f272, 0f3F000000; mul.f32 %f456, %f271, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f458, %f3, %f38; sub.f32 %f457, %f2, %f39; sub.f32 %f456, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f239, %f39, %f39; fma.rn.f32 %f240, %f40, %f40, %f239; fma.rn.f32 %f49, %f38, %f38, %f240; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f458, %f21, %f3; sub.f32 %f457, %f20, %f2; sub.f32 %f456, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f262, %f19, %f40; sub.f32 %f263, %f20, %f39; sub.f32 %f264, %f21, %f38; sub.f32 %f265, %f7, %f25; mul.f32 %f266, %f265, 0f3DAAAAAB; sub.f32 %f267, %f8, %f26; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f9, %f27; mul.f32 %f270, %f269, 0f3DAAAAAB; fma.rn.f32 %f458, %f264, 0f3F2AAAAB, %f270; fma.rn.f32 %f457, %f263, 0f3F2AAAAB, %f268; fma.rn.f32 %f456, %f262, 0f3F2AAAAB, %f266; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f241, %f7, 0f3F000000; add.f32 %f242, %f40, %f40; sub.f32 %f243, %f241, %f242; add.f32 %f244, %f39, %f39; 
mul.f32 %f245, %f8, 0f3F000000; sub.f32 %f246, %f245, %f244; add.f32 %f247, %f38, %f38; mul.f32 %f248, %f9, 0f3F000000; sub.f32 %f249, %f248, %f247; fma.rn.f32 %f458, %f3, 0f3FC00000, %f249; fma.rn.f32 %f457, %f2, 0f3FC00000, %f246; fma.rn.f32 %f456, %f1, 0f3FC00000, %f243; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f250, %f25, 0f3F000000; add.f32 %f251, %f19, %f19; sub.f32 %f252, %f251, %f250; add.f32 %f253, %f20, %f20; mul.f32 %f254, %f26, 0f3F000000; sub.f32 %f255, %f253, %f254; add.f32 %f256, %f21, %f21; mul.f32 %f257, %f27, 0f3F000000; sub.f32 %f258, %f256, %f257; mul.f32 %f259, %f1, 0f3FC00000; mul.f32 %f260, %f2, 0f3FC00000; mul.f32 %f261, %f3, 0f3FC00000; sub.f32 %f458, %f258, %f261; sub.f32 %f457, %f255, %f260; sub.f32 %f456, %f252, %f259; $L__BB0_36: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r123, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r123, %r85, %r58; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r86, %r123, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd36, %r87, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f76, [%rd39]; ld.global.nc.f32 %f75, [%rd38]; ld.global.nc.f32 %f74, [%rd37]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r124, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r124, %r89, %r58; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r90, %r124, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd40, %r91, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, 
%rd1, %rd40; ld.global.nc.f32 %f105, [%rd43]; ld.global.nc.f32 %f106, [%rd42]; ld.global.nc.f32 %f107, [%rd41]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r94, %r58, -1; min.s32 %r125, %r31, %r94; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r125, %r93, %r58; $L__BB0_49: setp.ge.s32 %p52, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r95, %r125, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd44, %r96, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f88, [%rd47]; ld.global.nc.f32 %f87, [%rd46]; ld.global.nc.f32 %f86, [%rd45]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r99, %r58, -1; min.s32 %r126, %r35, %r99; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r126, %r98, %r58; $L__BB0_54: add.s32 %r39, %r126, %r4; setp.ge.s32 %p56, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f94, [%rd51]; ld.global.nc.f32 %f93, [%rd50]; ld.global.nc.f32 %f92, [%rd49]; $L__BB0_56: mul.f32 %f286, %f87, %f87; fma.rn.f32 %f287, %f86, %f86, %f286; fma.rn.f32 %f98, %f88, %f88, %f287; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f291, %f106, %f106; fma.rn.f32 %f292, %f107, %f107, %f291; fma.rn.f32 %f293, %f105, %f105, %f292; setp.eq.f32 %p60, %f293, 0f00000000; mov.f32 %f474, 0f00000000; mov.f32 %f475, %f474; mov.f32 %f476, %f474; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f294, %f75, %f75; fma.rn.f32 %f295, %f74, %f74, %f294; fma.rn.f32 %f111, 
%f76, %f76, %f295; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f296, %f93, %f93; fma.rn.f32 %f297, %f92, %f92, %f296; fma.rn.f32 %f115, %f94, %f94, %f297; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f298, %f106, %f106; fma.rn.f32 %f299, %f107, %f107, %f298; fma.rn.f32 %f300, %f105, %f105, %f299; setp.neu.f32 %p66, %f300, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f333, %f86, %f107; sub.f32 %f334, %f87, %f106; sub.f32 %f335, %f88, %f105; mul.f32 %f476, %f335, 0f3F000000; mul.f32 %f475, %f334, 0f3F000000; mul.f32 %f474, %f333, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f476, %f3, %f105; sub.f32 %f475, %f2, %f106; sub.f32 %f474, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f301, %f106, %f106; fma.rn.f32 %f302, %f107, %f107, %f301; fma.rn.f32 %f116, %f105, %f105, %f302; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f476, %f88, %f3; sub.f32 %f475, %f87, %f2; sub.f32 %f474, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f324, %f86, %f107; sub.f32 %f325, %f87, %f106; sub.f32 %f326, %f88, %f105; sub.f32 %f327, %f74, %f92; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f75, %f93; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f76, %f94; mul.f32 %f332, %f331, 0f3DAAAAAB; fma.rn.f32 %f476, %f326, 0f3F2AAAAB, %f332; fma.rn.f32 %f475, %f325, 0f3F2AAAAB, %f330; fma.rn.f32 %f474, %f324, 0f3F2AAAAB, %f328; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f303, %f74, 0f3F000000; add.f32 
%f304, %f107, %f107; sub.f32 %f305, %f303, %f304; add.f32 %f306, %f106, %f106; mul.f32 %f307, %f75, 0f3F000000; sub.f32 %f308, %f307, %f306; add.f32 %f309, %f105, %f105; mul.f32 %f310, %f76, 0f3F000000; sub.f32 %f311, %f310, %f309; fma.rn.f32 %f476, %f3, 0f3FC00000, %f311; fma.rn.f32 %f475, %f2, 0f3FC00000, %f308; fma.rn.f32 %f474, %f1, 0f3FC00000, %f305; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f312, %f92, 0f3F000000; add.f32 %f313, %f86, %f86; sub.f32 %f314, %f313, %f312; add.f32 %f315, %f87, %f87; mul.f32 %f316, %f93, 0f3F000000; sub.f32 %f317, %f315, %f316; add.f32 %f318, %f88, %f88; mul.f32 %f319, %f94, 0f3F000000; sub.f32 %f320, %f318, %f319; mul.f32 %f321, %f1, 0f3FC00000; mul.f32 %f322, %f2, 0f3FC00000; mul.f32 %f323, %f3, 0f3FC00000; sub.f32 %f476, %f320, %f323; sub.f32 %f475, %f317, %f322; sub.f32 %f474, %f314, %f321; $L__BB0_70: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p79, %rs3, 0; add.s32 %r40, %r3, -2; @%p79 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: max.s32 %r127, %r40, 0; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r127, %r102, %r59; $L__BB0_73: setp.lt.s32 %p81, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p82, %p81, %p79; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p82 bra $L__BB0_75; mad.lo.s32 %r103, %r127, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd52, %r104, 4; add.s64 %rd53, %rd3, %rd52; add.s64 %rd54, %rd2, %rd52; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f143, [%rd55]; ld.global.nc.f32 %f142, [%rd54]; ld.global.nc.f32 %f141, [%rd53]; $L__BB0_75: add.s32 %r44, %r3, -1; @%p79 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r128, %r44, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r128, %r106, %r59; $L__BB0_78: setp.lt.s32 %p84, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p86, %p84, %p79; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p86 bra $L__BB0_80; mad.lo.s32 %r107, %r128, %r58, %r2; mad.lo.s32 
%r108, %r107, %r57, %r1; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd3, %rd56; add.s64 %rd58, %rd2, %rd56; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f172, [%rd59]; ld.global.nc.f32 %f173, [%rd58]; ld.global.nc.f32 %f174, [%rd57]; $L__BB0_80: add.s32 %r48, %r3, 1; @%p79 bra $L__BB0_82; bra.uni $L__BB0_81; $L__BB0_82: add.s32 %r111, %r59, -1; min.s32 %r129, %r48, %r111; bra.uni $L__BB0_83; $L__BB0_81: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r129, %r110, %r59; $L__BB0_83: setp.ge.s32 %p88, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p90, %p88, %p79; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p90 bra $L__BB0_85; mad.lo.s32 %r112, %r129, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd60, %r113, 4; add.s64 %rd61, %rd3, %rd60; add.s64 %rd62, %rd2, %rd60; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f155, [%rd63]; ld.global.nc.f32 %f154, [%rd62]; ld.global.nc.f32 %f153, [%rd61]; $L__BB0_85: add.s32 %r52, %r3, 2; @%p79 bra $L__BB0_87; bra.uni $L__BB0_86; $L__BB0_87: add.s32 %r116, %r59, -1; min.s32 %r130, %r52, %r116; bra.uni $L__BB0_88; $L__BB0_86: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r130, %r115, %r59; $L__BB0_88: mad.lo.s32 %r117, %r130, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p92, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p94, %p92, %p79; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p94 bra $L__BB0_90; mul.wide.s32 %rd64, %r56, 4; add.s64 %rd65, %rd3, %rd64; add.s64 %rd66, %rd2, %rd64; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f161, [%rd67]; ld.global.nc.f32 %f160, [%rd66]; ld.global.nc.f32 %f159, [%rd65]; $L__BB0_90: mul.f32 %f348, %f154, %f154; fma.rn.f32 %f349, %f153, %f153, %f348; fma.rn.f32 %f165, %f155, %f155, %f349; setp.eq.f32 %p95, %f165, 0f00000000; @%p95 bra $L__BB0_91; bra.uni $L__BB0_92; $L__BB0_91: mul.f32 %f353, %f173, %f173; fma.rn.f32 %f354, %f174, %f174, %f353; fma.rn.f32 %f355, %f172, %f172, %f354; setp.eq.f32 %p96, %f355, 0f00000000; 
mov.f32 %f492, 0f00000000; mov.f32 %f493, %f492; mov.f32 %f494, %f492; @%p96 bra $L__BB0_104; $L__BB0_92: mul.f32 %f356, %f142, %f142; fma.rn.f32 %f357, %f141, %f141, %f356; fma.rn.f32 %f178, %f143, %f143, %f357; setp.neu.f32 %p97, %f178, 0f00000000; mul.f32 %f358, %f160, %f160; fma.rn.f32 %f359, %f159, %f159, %f358; fma.rn.f32 %f182, %f161, %f161, %f359; setp.neu.f32 %p98, %f182, 0f00000000; and.pred %p99, %p97, %p98; or.pred %p101, %p95, %p99; @%p101 bra $L__BB0_94; mul.f32 %f360, %f173, %f173; fma.rn.f32 %f361, %f174, %f174, %f360; fma.rn.f32 %f362, %f172, %f172, %f361; setp.neu.f32 %p102, %f362, 0f00000000; @%p102 bra $L__BB0_103; bra.uni $L__BB0_94; $L__BB0_103: sub.f32 %f395, %f153, %f174; sub.f32 %f396, %f154, %f173; sub.f32 %f397, %f155, %f172; mul.f32 %f494, %f397, 0f3F000000; mul.f32 %f493, %f396, 0f3F000000; mul.f32 %f492, %f395, 0f3F000000; bra.uni $L__BB0_104; $L__BB0_94: setp.eq.f32 %p103, %f178, 0f00000000; and.pred %p105, %p103, %p95; @%p105 bra $L__BB0_102; bra.uni $L__BB0_95; $L__BB0_102: sub.f32 %f494, %f3, %f172; sub.f32 %f493, %f2, %f173; sub.f32 %f492, %f1, %f174; bra.uni $L__BB0_104; $L__BB0_95: setp.eq.f32 %p106, %f182, 0f00000000; mul.f32 %f363, %f173, %f173; fma.rn.f32 %f364, %f174, %f174, %f363; fma.rn.f32 %f183, %f172, %f172, %f364; setp.eq.f32 %p107, %f183, 0f00000000; and.pred %p108, %p107, %p106; @%p108 bra $L__BB0_101; bra.uni $L__BB0_96; $L__BB0_101: sub.f32 %f494, %f155, %f3; sub.f32 %f493, %f154, %f2; sub.f32 %f492, %f153, %f1; bra.uni $L__BB0_104; $L__BB0_96: setp.neu.f32 %p110, %f165, 0f00000000; or.pred %p111, %p103, %p110; @%p111 bra $L__BB0_98; bra.uni $L__BB0_97; $L__BB0_98: setp.neu.f32 %p112, %f183, 0f00000000; or.pred %p114, %p106, %p112; @%p114 bra $L__BB0_100; bra.uni $L__BB0_99; $L__BB0_100: sub.f32 %f386, %f153, %f174; sub.f32 %f387, %f154, %f173; sub.f32 %f388, %f155, %f172; sub.f32 %f389, %f141, %f159; mul.f32 %f390, %f389, 0f3DAAAAAB; sub.f32 %f391, %f142, %f160; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, 
%f143, %f161; mul.f32 %f394, %f393, 0f3DAAAAAB; fma.rn.f32 %f494, %f388, 0f3F2AAAAB, %f394; fma.rn.f32 %f493, %f387, 0f3F2AAAAB, %f392; fma.rn.f32 %f492, %f386, 0f3F2AAAAB, %f390; bra.uni $L__BB0_104; $L__BB0_97: mul.f32 %f365, %f141, 0f3F000000; add.f32 %f366, %f174, %f174; sub.f32 %f367, %f365, %f366; add.f32 %f368, %f173, %f173; mul.f32 %f369, %f142, 0f3F000000; sub.f32 %f370, %f369, %f368; add.f32 %f371, %f172, %f172; mul.f32 %f372, %f143, 0f3F000000; sub.f32 %f373, %f372, %f371; fma.rn.f32 %f494, %f3, 0f3FC00000, %f373; fma.rn.f32 %f493, %f2, 0f3FC00000, %f370; fma.rn.f32 %f492, %f1, 0f3FC00000, %f367; bra.uni $L__BB0_104; $L__BB0_99: mul.f32 %f374, %f159, 0f3F000000; add.f32 %f375, %f153, %f153; sub.f32 %f376, %f375, %f374; add.f32 %f377, %f154, %f154; mul.f32 %f378, %f160, 0f3F000000; sub.f32 %f379, %f377, %f378; add.f32 %f380, %f155, %f155; mul.f32 %f381, %f161, 0f3F000000; sub.f32 %f382, %f380, %f381; mul.f32 %f383, %f1, 0f3FC00000; mul.f32 %f384, %f2, 0f3FC00000; mul.f32 %f385, %f3, 0f3FC00000; sub.f32 %f494, %f382, %f385; sub.f32 %f493, %f379, %f384; sub.f32 %f492, %f376, %f383; $L__BB0_104: mul.f32 %f398, %f475, %f494; mul.f32 %f399, %f476, %f493; sub.f32 %f400, %f398, %f399; mul.f32 %f401, %f476, %f492; mul.f32 %f402, %f474, %f494; sub.f32 %f403, %f401, %f402; mul.f32 %f404, %f474, %f493; mul.f32 %f405, %f475, %f492; sub.f32 %f406, %f404, %f405; mul.f32 %f407, %f458, %f493; mul.f32 %f408, %f457, %f494; sub.f32 %f409, %f407, %f408; mul.f32 %f410, %f456, %f494; mul.f32 %f411, %f458, %f492; sub.f32 %f412, %f410, %f411; mul.f32 %f413, %f457, %f492; mul.f32 %f414, %f456, %f493; sub.f32 %f415, %f413, %f414; mul.f32 %f416, %f457, %f476; mul.f32 %f417, %f458, %f475; sub.f32 %f418, %f416, %f417; mul.f32 %f419, %f458, %f474; mul.f32 %f420, %f456, %f476; sub.f32 %f421, %f419, %f420; mul.f32 %f422, %f456, %f475; mul.f32 %f423, %f457, %f474; sub.f32 %f424, %f422, %f423; add.f32 %f425, %f205, %f205; mul.f32 %f426, %f425, %f206; mul.f32 %f427, %f2, %f403; fma.rn.f32 
%f428, %f1, %f400, %f427; fma.rn.f32 %f429, %f3, %f406, %f428; mul.f32 %f430, %f426, %f429; st.global.f32 [%rd4], %f430; mul.f32 %f431, %f425, %f207; mul.f32 %f432, %f2, %f412; fma.rn.f32 %f433, %f1, %f409, %f432; fma.rn.f32 %f434, %f3, %f415, %f433; mul.f32 %f435, %f431, %f434; st.global.f32 [%rd5], %f435; mul.f32 %f436, %f425, %f208; mul.f32 %f437, %f2, %f421; fma.rn.f32 %f438, %f1, %f418, %f437; fma.rn.f32 %f439, %f3, %f424, %f438; mul.f32 %f440, %f436, %f439; st.global.f32 [%rd6], %f440; $L__BB0_106: ret; } ` setemergentmagneticfieldfivepoint_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl setemergentmagneticfieldfivepoint .visible .entry setemergentmagneticfieldfivepoint( .param .u64 setemergentmagneticfieldfivepoint_param_0, .param .u64 setemergentmagneticfieldfivepoint_param_1, .param .u64 setemergentmagneticfieldfivepoint_param_2, .param .u64 setemergentmagneticfieldfivepoint_param_3, .param .u64 setemergentmagneticfieldfivepoint_param_4, .param .u64 setemergentmagneticfieldfivepoint_param_5, .param .f32 setemergentmagneticfieldfivepoint_param_6, .param .f32 setemergentmagneticfieldfivepoint_param_7, .param .f32 setemergentmagneticfieldfivepoint_param_8, .param .f32 setemergentmagneticfieldfivepoint_param_9, .param .u32 setemergentmagneticfieldfivepoint_param_10, .param .u32 setemergentmagneticfieldfivepoint_param_11, .param .u32 setemergentmagneticfieldfivepoint_param_12, .param .u8 setemergentmagneticfieldfivepoint_param_13 ) { .reg .pred %p<115>; .reg .b16 %rs<5>; .reg .f32 %f<495>; .reg .b32 %r<131>; .reg .b64 %rd<68>; ld.param.u8 %rs4, [setemergentmagneticfieldfivepoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldfivepoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldfivepoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldfivepoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldfivepoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldfivepoint_param_4]; ld.param.u64 %rd12, 
[setemergentmagneticfieldfivepoint_param_5]; ld.param.f32 %f205, [setemergentmagneticfieldfivepoint_param_6]; ld.param.f32 %f206, [setemergentmagneticfieldfivepoint_param_7]; ld.param.f32 %f207, [setemergentmagneticfieldfivepoint_param_8]; ld.param.f32 %f208, [setemergentmagneticfieldfivepoint_param_9]; ld.param.u32 %r57, [setemergentmagneticfieldfivepoint_param_10]; ld.param.u32 %r58, [setemergentmagneticfieldfivepoint_param_11]; ld.param.u32 %r59, [setemergentmagneticfieldfivepoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_106; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f209, %f2, %f2; fma.rn.f32 %f210, %f1, %f1, %f209; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f211, %f3, %f3, %f210; setp.eq.f32 %p6, %f211, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_105; bra.uni $L__BB0_2; $L__BB0_105: mov.u32 %r118, 0; st.global.u32 [%rd4], %r118; st.global.u32 [%rd5], %r118; st.global.u32 [%rd6], %r118; bra.uni $L__BB0_106; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 
%r119, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r119, %r72, %r57; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r73, %r119, %r5; mul.wide.s32 %rd20, %r73, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f9, [%rd23]; ld.global.nc.f32 %f8, [%rd22]; ld.global.nc.f32 %f7, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r120, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r120, %r75, %r57; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r76, %r120, %r5; mul.wide.s32 %rd24, %r76, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f38, [%rd27]; ld.global.nc.f32 %f39, [%rd26]; ld.global.nc.f32 %f40, [%rd25]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r79, %r57, -1; min.s32 %r121, %r14, %r79; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r121, %r78, %r57; $L__BB0_15: setp.ge.s32 %p16, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r80, %r121, %r5; mul.wide.s32 %rd28, %r80, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f21, [%rd31]; ld.global.nc.f32 %f20, [%rd30]; ld.global.nc.f32 %f19, [%rd29]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r83, %r57, -1; min.s32 %r122, %r18, %r83; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r122, %r82, %r57; 
$L__BB0_20: add.s32 %r22, %r122, %r5; setp.ge.s32 %p20, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd32, %r22, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f27, [%rd35]; ld.global.nc.f32 %f26, [%rd34]; ld.global.nc.f32 %f25, [%rd33]; $L__BB0_22: mul.f32 %f224, %f20, %f20; fma.rn.f32 %f225, %f19, %f19, %f224; fma.rn.f32 %f31, %f21, %f21, %f225; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f229, %f39, %f39; fma.rn.f32 %f230, %f40, %f40, %f229; fma.rn.f32 %f231, %f38, %f38, %f230; setp.eq.f32 %p24, %f231, 0f00000000; mov.f32 %f456, 0f00000000; mov.f32 %f457, %f456; mov.f32 %f458, %f456; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f232, %f8, %f8; fma.rn.f32 %f233, %f7, %f7, %f232; fma.rn.f32 %f44, %f9, %f9, %f233; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f234, %f26, %f26; fma.rn.f32 %f235, %f25, %f25, %f234; fma.rn.f32 %f48, %f27, %f27, %f235; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f236, %f39, %f39; fma.rn.f32 %f237, %f40, %f40, %f236; fma.rn.f32 %f238, %f38, %f38, %f237; setp.neu.f32 %p30, %f238, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f271, %f19, %f40; sub.f32 %f272, %f20, %f39; sub.f32 %f273, %f21, %f38; mul.f32 %f458, %f273, 0f3F000000; mul.f32 %f457, %f272, 0f3F000000; mul.f32 %f456, %f271, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f458, %f3, %f38; sub.f32 %f457, %f2, %f39; sub.f32 %f456, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f239, %f39, %f39; fma.rn.f32 %f240, %f40, %f40, %f239; fma.rn.f32 %f49, %f38, %f38, %f240; setp.eq.f32 %p35, %f49, 0f00000000; and.pred 
%p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f458, %f21, %f3; sub.f32 %f457, %f20, %f2; sub.f32 %f456, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f262, %f19, %f40; sub.f32 %f263, %f20, %f39; sub.f32 %f264, %f21, %f38; sub.f32 %f265, %f7, %f25; mul.f32 %f266, %f265, 0f3DAAAAAB; sub.f32 %f267, %f8, %f26; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f9, %f27; mul.f32 %f270, %f269, 0f3DAAAAAB; fma.rn.f32 %f458, %f264, 0f3F2AAAAB, %f270; fma.rn.f32 %f457, %f263, 0f3F2AAAAB, %f268; fma.rn.f32 %f456, %f262, 0f3F2AAAAB, %f266; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f241, %f7, 0f3F000000; add.f32 %f242, %f40, %f40; sub.f32 %f243, %f241, %f242; add.f32 %f244, %f39, %f39; mul.f32 %f245, %f8, 0f3F000000; sub.f32 %f246, %f245, %f244; add.f32 %f247, %f38, %f38; mul.f32 %f248, %f9, 0f3F000000; sub.f32 %f249, %f248, %f247; fma.rn.f32 %f458, %f3, 0f3FC00000, %f249; fma.rn.f32 %f457, %f2, 0f3FC00000, %f246; fma.rn.f32 %f456, %f1, 0f3FC00000, %f243; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f250, %f25, 0f3F000000; add.f32 %f251, %f19, %f19; sub.f32 %f252, %f251, %f250; add.f32 %f253, %f20, %f20; mul.f32 %f254, %f26, 0f3F000000; sub.f32 %f255, %f253, %f254; add.f32 %f256, %f21, %f21; mul.f32 %f257, %f27, 0f3F000000; sub.f32 %f258, %f256, %f257; mul.f32 %f259, %f1, 0f3FC00000; mul.f32 %f260, %f2, 0f3FC00000; mul.f32 %f261, %f3, 0f3FC00000; sub.f32 %f458, %f258, %f261; sub.f32 %f457, %f255, %f260; sub.f32 %f456, %f252, %f259; $L__BB0_36: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r123, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r123, %r85, %r58; $L__BB0_39: 
setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r86, %r123, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd36, %r87, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f76, [%rd39]; ld.global.nc.f32 %f75, [%rd38]; ld.global.nc.f32 %f74, [%rd37]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r124, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r124, %r89, %r58; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r90, %r124, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd40, %r91, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f105, [%rd43]; ld.global.nc.f32 %f106, [%rd42]; ld.global.nc.f32 %f107, [%rd41]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r94, %r58, -1; min.s32 %r125, %r31, %r94; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r125, %r93, %r58; $L__BB0_49: setp.ge.s32 %p52, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r95, %r125, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd44, %r96, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f88, [%rd47]; ld.global.nc.f32 %f87, [%rd46]; ld.global.nc.f32 %f86, [%rd45]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r99, %r58, -1; min.s32 %r126, %r35, %r99; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r126, %r98, %r58; 
$L__BB0_54: add.s32 %r39, %r126, %r4; setp.ge.s32 %p56, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f94, [%rd51]; ld.global.nc.f32 %f93, [%rd50]; ld.global.nc.f32 %f92, [%rd49]; $L__BB0_56: mul.f32 %f286, %f87, %f87; fma.rn.f32 %f287, %f86, %f86, %f286; fma.rn.f32 %f98, %f88, %f88, %f287; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f291, %f106, %f106; fma.rn.f32 %f292, %f107, %f107, %f291; fma.rn.f32 %f293, %f105, %f105, %f292; setp.eq.f32 %p60, %f293, 0f00000000; mov.f32 %f474, 0f00000000; mov.f32 %f475, %f474; mov.f32 %f476, %f474; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f294, %f75, %f75; fma.rn.f32 %f295, %f74, %f74, %f294; fma.rn.f32 %f111, %f76, %f76, %f295; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f296, %f93, %f93; fma.rn.f32 %f297, %f92, %f92, %f296; fma.rn.f32 %f115, %f94, %f94, %f297; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f298, %f106, %f106; fma.rn.f32 %f299, %f107, %f107, %f298; fma.rn.f32 %f300, %f105, %f105, %f299; setp.neu.f32 %p66, %f300, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f333, %f86, %f107; sub.f32 %f334, %f87, %f106; sub.f32 %f335, %f88, %f105; mul.f32 %f476, %f335, 0f3F000000; mul.f32 %f475, %f334, 0f3F000000; mul.f32 %f474, %f333, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f476, %f3, %f105; sub.f32 %f475, %f2, %f106; sub.f32 %f474, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f301, %f106, %f106; fma.rn.f32 %f302, %f107, %f107, %f301; fma.rn.f32 
%f116, %f105, %f105, %f302; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f476, %f88, %f3; sub.f32 %f475, %f87, %f2; sub.f32 %f474, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f324, %f86, %f107; sub.f32 %f325, %f87, %f106; sub.f32 %f326, %f88, %f105; sub.f32 %f327, %f74, %f92; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f75, %f93; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f76, %f94; mul.f32 %f332, %f331, 0f3DAAAAAB; fma.rn.f32 %f476, %f326, 0f3F2AAAAB, %f332; fma.rn.f32 %f475, %f325, 0f3F2AAAAB, %f330; fma.rn.f32 %f474, %f324, 0f3F2AAAAB, %f328; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f303, %f74, 0f3F000000; add.f32 %f304, %f107, %f107; sub.f32 %f305, %f303, %f304; add.f32 %f306, %f106, %f106; mul.f32 %f307, %f75, 0f3F000000; sub.f32 %f308, %f307, %f306; add.f32 %f309, %f105, %f105; mul.f32 %f310, %f76, 0f3F000000; sub.f32 %f311, %f310, %f309; fma.rn.f32 %f476, %f3, 0f3FC00000, %f311; fma.rn.f32 %f475, %f2, 0f3FC00000, %f308; fma.rn.f32 %f474, %f1, 0f3FC00000, %f305; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f312, %f92, 0f3F000000; add.f32 %f313, %f86, %f86; sub.f32 %f314, %f313, %f312; add.f32 %f315, %f87, %f87; mul.f32 %f316, %f93, 0f3F000000; sub.f32 %f317, %f315, %f316; add.f32 %f318, %f88, %f88; mul.f32 %f319, %f94, 0f3F000000; sub.f32 %f320, %f318, %f319; mul.f32 %f321, %f1, 0f3FC00000; mul.f32 %f322, %f2, 0f3FC00000; mul.f32 %f323, %f3, 0f3FC00000; sub.f32 %f476, %f320, %f323; sub.f32 %f475, %f317, %f322; sub.f32 %f474, %f314, %f321; $L__BB0_70: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p79, %rs3, 0; add.s32 %r40, %r3, -2; @%p79 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: max.s32 %r127, %r40, 0; bra.uni $L__BB0_73; $L__BB0_71: 
rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r127, %r102, %r59; $L__BB0_73: setp.lt.s32 %p81, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p82, %p81, %p79; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p82 bra $L__BB0_75; mad.lo.s32 %r103, %r127, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd52, %r104, 4; add.s64 %rd53, %rd3, %rd52; add.s64 %rd54, %rd2, %rd52; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f143, [%rd55]; ld.global.nc.f32 %f142, [%rd54]; ld.global.nc.f32 %f141, [%rd53]; $L__BB0_75: add.s32 %r44, %r3, -1; @%p79 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r128, %r44, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r128, %r106, %r59; $L__BB0_78: setp.lt.s32 %p84, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p86, %p84, %p79; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p86 bra $L__BB0_80; mad.lo.s32 %r107, %r128, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd3, %rd56; add.s64 %rd58, %rd2, %rd56; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f172, [%rd59]; ld.global.nc.f32 %f173, [%rd58]; ld.global.nc.f32 %f174, [%rd57]; $L__BB0_80: add.s32 %r48, %r3, 1; @%p79 bra $L__BB0_82; bra.uni $L__BB0_81; $L__BB0_82: add.s32 %r111, %r59, -1; min.s32 %r129, %r48, %r111; bra.uni $L__BB0_83; $L__BB0_81: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r129, %r110, %r59; $L__BB0_83: setp.ge.s32 %p88, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p90, %p88, %p79; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p90 bra $L__BB0_85; mad.lo.s32 %r112, %r129, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd60, %r113, 4; add.s64 %rd61, %rd3, %rd60; add.s64 %rd62, %rd2, %rd60; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f155, [%rd63]; ld.global.nc.f32 %f154, [%rd62]; ld.global.nc.f32 %f153, [%rd61]; $L__BB0_85: add.s32 %r52, %r3, 2; @%p79 bra $L__BB0_87; bra.uni $L__BB0_86; $L__BB0_87: add.s32 
%r116, %r59, -1; min.s32 %r130, %r52, %r116; bra.uni $L__BB0_88; $L__BB0_86: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r130, %r115, %r59; $L__BB0_88: mad.lo.s32 %r117, %r130, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p92, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p94, %p92, %p79; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p94 bra $L__BB0_90; mul.wide.s32 %rd64, %r56, 4; add.s64 %rd65, %rd3, %rd64; add.s64 %rd66, %rd2, %rd64; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f161, [%rd67]; ld.global.nc.f32 %f160, [%rd66]; ld.global.nc.f32 %f159, [%rd65]; $L__BB0_90: mul.f32 %f348, %f154, %f154; fma.rn.f32 %f349, %f153, %f153, %f348; fma.rn.f32 %f165, %f155, %f155, %f349; setp.eq.f32 %p95, %f165, 0f00000000; @%p95 bra $L__BB0_91; bra.uni $L__BB0_92; $L__BB0_91: mul.f32 %f353, %f173, %f173; fma.rn.f32 %f354, %f174, %f174, %f353; fma.rn.f32 %f355, %f172, %f172, %f354; setp.eq.f32 %p96, %f355, 0f00000000; mov.f32 %f492, 0f00000000; mov.f32 %f493, %f492; mov.f32 %f494, %f492; @%p96 bra $L__BB0_104; $L__BB0_92: mul.f32 %f356, %f142, %f142; fma.rn.f32 %f357, %f141, %f141, %f356; fma.rn.f32 %f178, %f143, %f143, %f357; setp.neu.f32 %p97, %f178, 0f00000000; mul.f32 %f358, %f160, %f160; fma.rn.f32 %f359, %f159, %f159, %f358; fma.rn.f32 %f182, %f161, %f161, %f359; setp.neu.f32 %p98, %f182, 0f00000000; and.pred %p99, %p97, %p98; or.pred %p101, %p95, %p99; @%p101 bra $L__BB0_94; mul.f32 %f360, %f173, %f173; fma.rn.f32 %f361, %f174, %f174, %f360; fma.rn.f32 %f362, %f172, %f172, %f361; setp.neu.f32 %p102, %f362, 0f00000000; @%p102 bra $L__BB0_103; bra.uni $L__BB0_94; $L__BB0_103: sub.f32 %f395, %f153, %f174; sub.f32 %f396, %f154, %f173; sub.f32 %f397, %f155, %f172; mul.f32 %f494, %f397, 0f3F000000; mul.f32 %f493, %f396, 0f3F000000; mul.f32 %f492, %f395, 0f3F000000; bra.uni $L__BB0_104; $L__BB0_94: setp.eq.f32 %p103, %f178, 0f00000000; and.pred %p105, %p103, %p95; @%p105 bra $L__BB0_102; bra.uni $L__BB0_95; $L__BB0_102: sub.f32 %f494, 
%f3, %f172; sub.f32 %f493, %f2, %f173; sub.f32 %f492, %f1, %f174; bra.uni $L__BB0_104; $L__BB0_95: setp.eq.f32 %p106, %f182, 0f00000000; mul.f32 %f363, %f173, %f173; fma.rn.f32 %f364, %f174, %f174, %f363; fma.rn.f32 %f183, %f172, %f172, %f364; setp.eq.f32 %p107, %f183, 0f00000000; and.pred %p108, %p107, %p106; @%p108 bra $L__BB0_101; bra.uni $L__BB0_96; $L__BB0_101: sub.f32 %f494, %f155, %f3; sub.f32 %f493, %f154, %f2; sub.f32 %f492, %f153, %f1; bra.uni $L__BB0_104; $L__BB0_96: setp.neu.f32 %p110, %f165, 0f00000000; or.pred %p111, %p103, %p110; @%p111 bra $L__BB0_98; bra.uni $L__BB0_97; $L__BB0_98: setp.neu.f32 %p112, %f183, 0f00000000; or.pred %p114, %p106, %p112; @%p114 bra $L__BB0_100; bra.uni $L__BB0_99; $L__BB0_100: sub.f32 %f386, %f153, %f174; sub.f32 %f387, %f154, %f173; sub.f32 %f388, %f155, %f172; sub.f32 %f389, %f141, %f159; mul.f32 %f390, %f389, 0f3DAAAAAB; sub.f32 %f391, %f142, %f160; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f143, %f161; mul.f32 %f394, %f393, 0f3DAAAAAB; fma.rn.f32 %f494, %f388, 0f3F2AAAAB, %f394; fma.rn.f32 %f493, %f387, 0f3F2AAAAB, %f392; fma.rn.f32 %f492, %f386, 0f3F2AAAAB, %f390; bra.uni $L__BB0_104; $L__BB0_97: mul.f32 %f365, %f141, 0f3F000000; add.f32 %f366, %f174, %f174; sub.f32 %f367, %f365, %f366; add.f32 %f368, %f173, %f173; mul.f32 %f369, %f142, 0f3F000000; sub.f32 %f370, %f369, %f368; add.f32 %f371, %f172, %f172; mul.f32 %f372, %f143, 0f3F000000; sub.f32 %f373, %f372, %f371; fma.rn.f32 %f494, %f3, 0f3FC00000, %f373; fma.rn.f32 %f493, %f2, 0f3FC00000, %f370; fma.rn.f32 %f492, %f1, 0f3FC00000, %f367; bra.uni $L__BB0_104; $L__BB0_99: mul.f32 %f374, %f159, 0f3F000000; add.f32 %f375, %f153, %f153; sub.f32 %f376, %f375, %f374; add.f32 %f377, %f154, %f154; mul.f32 %f378, %f160, 0f3F000000; sub.f32 %f379, %f377, %f378; add.f32 %f380, %f155, %f155; mul.f32 %f381, %f161, 0f3F000000; sub.f32 %f382, %f380, %f381; mul.f32 %f383, %f1, 0f3FC00000; mul.f32 %f384, %f2, 0f3FC00000; mul.f32 %f385, %f3, 0f3FC00000; sub.f32 %f494, 
%f382, %f385; sub.f32 %f493, %f379, %f384; sub.f32 %f492, %f376, %f383; $L__BB0_104: mul.f32 %f398, %f475, %f494; mul.f32 %f399, %f476, %f493; sub.f32 %f400, %f398, %f399; mul.f32 %f401, %f476, %f492; mul.f32 %f402, %f474, %f494; sub.f32 %f403, %f401, %f402; mul.f32 %f404, %f474, %f493; mul.f32 %f405, %f475, %f492; sub.f32 %f406, %f404, %f405; mul.f32 %f407, %f458, %f493; mul.f32 %f408, %f457, %f494; sub.f32 %f409, %f407, %f408; mul.f32 %f410, %f456, %f494; mul.f32 %f411, %f458, %f492; sub.f32 %f412, %f410, %f411; mul.f32 %f413, %f457, %f492; mul.f32 %f414, %f456, %f493; sub.f32 %f415, %f413, %f414; mul.f32 %f416, %f457, %f476; mul.f32 %f417, %f458, %f475; sub.f32 %f418, %f416, %f417; mul.f32 %f419, %f458, %f474; mul.f32 %f420, %f456, %f476; sub.f32 %f421, %f419, %f420; mul.f32 %f422, %f456, %f475; mul.f32 %f423, %f457, %f474; sub.f32 %f424, %f422, %f423; add.f32 %f425, %f205, %f205; mul.f32 %f426, %f425, %f206; mul.f32 %f427, %f2, %f403; fma.rn.f32 %f428, %f1, %f400, %f427; fma.rn.f32 %f429, %f3, %f406, %f428; mul.f32 %f430, %f426, %f429; st.global.f32 [%rd4], %f430; mul.f32 %f431, %f425, %f207; mul.f32 %f432, %f2, %f412; fma.rn.f32 %f433, %f1, %f409, %f432; fma.rn.f32 %f434, %f3, %f415, %f433; mul.f32 %f435, %f431, %f434; st.global.f32 [%rd5], %f435; mul.f32 %f436, %f425, %f208; mul.f32 %f437, %f2, %f421; fma.rn.f32 %f438, %f1, %f418, %f437; fma.rn.f32 %f439, %f3, %f424, %f438; mul.f32 %f440, %f436, %f439; st.global.f32 [%rd6], %f440; $L__BB0_106: ret; } ` setemergentmagneticfieldfivepoint_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl setemergentmagneticfieldfivepoint .visible .entry setemergentmagneticfieldfivepoint( .param .u64 setemergentmagneticfieldfivepoint_param_0, .param .u64 setemergentmagneticfieldfivepoint_param_1, .param .u64 setemergentmagneticfieldfivepoint_param_2, .param .u64 setemergentmagneticfieldfivepoint_param_3, .param .u64 setemergentmagneticfieldfivepoint_param_4, .param .u64 setemergentmagneticfieldfivepoint_param_5, 
.param .f32 setemergentmagneticfieldfivepoint_param_6, .param .f32 setemergentmagneticfieldfivepoint_param_7, .param .f32 setemergentmagneticfieldfivepoint_param_8, .param .f32 setemergentmagneticfieldfivepoint_param_9, .param .u32 setemergentmagneticfieldfivepoint_param_10, .param .u32 setemergentmagneticfieldfivepoint_param_11, .param .u32 setemergentmagneticfieldfivepoint_param_12, .param .u8 setemergentmagneticfieldfivepoint_param_13 ) { .reg .pred %p<115>; .reg .b16 %rs<5>; .reg .f32 %f<495>; .reg .b32 %r<131>; .reg .b64 %rd<68>; ld.param.u8 %rs4, [setemergentmagneticfieldfivepoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldfivepoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldfivepoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldfivepoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldfivepoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldfivepoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldfivepoint_param_5]; ld.param.f32 %f205, [setemergentmagneticfieldfivepoint_param_6]; ld.param.f32 %f206, [setemergentmagneticfieldfivepoint_param_7]; ld.param.f32 %f207, [setemergentmagneticfieldfivepoint_param_8]; ld.param.f32 %f208, [setemergentmagneticfieldfivepoint_param_9]; ld.param.u32 %r57, [setemergentmagneticfieldfivepoint_param_10]; ld.param.u32 %r58, [setemergentmagneticfieldfivepoint_param_11]; ld.param.u32 %r59, [setemergentmagneticfieldfivepoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra 
$L__BB0_106; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f209, %f2, %f2; fma.rn.f32 %f210, %f1, %f1, %f209; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f211, %f3, %f3, %f210; setp.eq.f32 %p6, %f211, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_105; bra.uni $L__BB0_2; $L__BB0_105: mov.u32 %r118, 0; st.global.u32 [%rd4], %r118; st.global.u32 [%rd5], %r118; st.global.u32 [%rd6], %r118; bra.uni $L__BB0_106; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r119, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r119, %r72, %r57; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r73, %r119, %r5; mul.wide.s32 %rd20, %r73, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f9, [%rd23]; ld.global.nc.f32 %f8, [%rd22]; ld.global.nc.f32 %f7, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r120, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r120, %r75, %r57; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r76, %r120, %r5; mul.wide.s32 %rd24, %r76, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f38, [%rd27]; 
ld.global.nc.f32 %f39, [%rd26]; ld.global.nc.f32 %f40, [%rd25]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r79, %r57, -1; min.s32 %r121, %r14, %r79; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r121, %r78, %r57; $L__BB0_15: setp.ge.s32 %p16, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r80, %r121, %r5; mul.wide.s32 %rd28, %r80, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f21, [%rd31]; ld.global.nc.f32 %f20, [%rd30]; ld.global.nc.f32 %f19, [%rd29]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r83, %r57, -1; min.s32 %r122, %r18, %r83; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r122, %r82, %r57; $L__BB0_20: add.s32 %r22, %r122, %r5; setp.ge.s32 %p20, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd32, %r22, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f27, [%rd35]; ld.global.nc.f32 %f26, [%rd34]; ld.global.nc.f32 %f25, [%rd33]; $L__BB0_22: mul.f32 %f224, %f20, %f20; fma.rn.f32 %f225, %f19, %f19, %f224; fma.rn.f32 %f31, %f21, %f21, %f225; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f229, %f39, %f39; fma.rn.f32 %f230, %f40, %f40, %f229; fma.rn.f32 %f231, %f38, %f38, %f230; setp.eq.f32 %p24, %f231, 0f00000000; mov.f32 %f456, 0f00000000; mov.f32 %f457, %f456; mov.f32 %f458, %f456; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f232, %f8, %f8; fma.rn.f32 %f233, %f7, %f7, %f232; fma.rn.f32 %f44, %f9, %f9, %f233; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f234, %f26, %f26; fma.rn.f32 %f235, %f25, %f25, %f234; fma.rn.f32 
%f48, %f27, %f27, %f235; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f236, %f39, %f39; fma.rn.f32 %f237, %f40, %f40, %f236; fma.rn.f32 %f238, %f38, %f38, %f237; setp.neu.f32 %p30, %f238, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f271, %f19, %f40; sub.f32 %f272, %f20, %f39; sub.f32 %f273, %f21, %f38; mul.f32 %f458, %f273, 0f3F000000; mul.f32 %f457, %f272, 0f3F000000; mul.f32 %f456, %f271, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f458, %f3, %f38; sub.f32 %f457, %f2, %f39; sub.f32 %f456, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f239, %f39, %f39; fma.rn.f32 %f240, %f40, %f40, %f239; fma.rn.f32 %f49, %f38, %f38, %f240; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f458, %f21, %f3; sub.f32 %f457, %f20, %f2; sub.f32 %f456, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f262, %f19, %f40; sub.f32 %f263, %f20, %f39; sub.f32 %f264, %f21, %f38; sub.f32 %f265, %f7, %f25; mul.f32 %f266, %f265, 0f3DAAAAAB; sub.f32 %f267, %f8, %f26; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f9, %f27; mul.f32 %f270, %f269, 0f3DAAAAAB; fma.rn.f32 %f458, %f264, 0f3F2AAAAB, %f270; fma.rn.f32 %f457, %f263, 0f3F2AAAAB, %f268; fma.rn.f32 %f456, %f262, 0f3F2AAAAB, %f266; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f241, %f7, 0f3F000000; add.f32 %f242, %f40, %f40; sub.f32 %f243, %f241, %f242; add.f32 %f244, %f39, %f39; mul.f32 %f245, %f8, 0f3F000000; sub.f32 %f246, %f245, %f244; add.f32 %f247, %f38, %f38; 
mul.f32 %f248, %f9, 0f3F000000; sub.f32 %f249, %f248, %f247; fma.rn.f32 %f458, %f3, 0f3FC00000, %f249; fma.rn.f32 %f457, %f2, 0f3FC00000, %f246; fma.rn.f32 %f456, %f1, 0f3FC00000, %f243; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f250, %f25, 0f3F000000; add.f32 %f251, %f19, %f19; sub.f32 %f252, %f251, %f250; add.f32 %f253, %f20, %f20; mul.f32 %f254, %f26, 0f3F000000; sub.f32 %f255, %f253, %f254; add.f32 %f256, %f21, %f21; mul.f32 %f257, %f27, 0f3F000000; sub.f32 %f258, %f256, %f257; mul.f32 %f259, %f1, 0f3FC00000; mul.f32 %f260, %f2, 0f3FC00000; mul.f32 %f261, %f3, 0f3FC00000; sub.f32 %f458, %f258, %f261; sub.f32 %f457, %f255, %f260; sub.f32 %f456, %f252, %f259; $L__BB0_36: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r123, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r123, %r85, %r58; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r86, %r123, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd36, %r87, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f76, [%rd39]; ld.global.nc.f32 %f75, [%rd38]; ld.global.nc.f32 %f74, [%rd37]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r124, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r124, %r89, %r58; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r90, %r124, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd40, %r91, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f105, [%rd43]; ld.global.nc.f32 %f106, [%rd42]; 
ld.global.nc.f32 %f107, [%rd41]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r94, %r58, -1; min.s32 %r125, %r31, %r94; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r125, %r93, %r58; $L__BB0_49: setp.ge.s32 %p52, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r95, %r125, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd44, %r96, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f88, [%rd47]; ld.global.nc.f32 %f87, [%rd46]; ld.global.nc.f32 %f86, [%rd45]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r99, %r58, -1; min.s32 %r126, %r35, %r99; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r126, %r98, %r58; $L__BB0_54: add.s32 %r39, %r126, %r4; setp.ge.s32 %p56, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f94, [%rd51]; ld.global.nc.f32 %f93, [%rd50]; ld.global.nc.f32 %f92, [%rd49]; $L__BB0_56: mul.f32 %f286, %f87, %f87; fma.rn.f32 %f287, %f86, %f86, %f286; fma.rn.f32 %f98, %f88, %f88, %f287; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f291, %f106, %f106; fma.rn.f32 %f292, %f107, %f107, %f291; fma.rn.f32 %f293, %f105, %f105, %f292; setp.eq.f32 %p60, %f293, 0f00000000; mov.f32 %f474, 0f00000000; mov.f32 %f475, %f474; mov.f32 %f476, %f474; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f294, %f75, %f75; fma.rn.f32 %f295, %f74, %f74, %f294; fma.rn.f32 %f111, %f76, %f76, %f295; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f296, %f93, 
%f93; fma.rn.f32 %f297, %f92, %f92, %f296; fma.rn.f32 %f115, %f94, %f94, %f297; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f298, %f106, %f106; fma.rn.f32 %f299, %f107, %f107, %f298; fma.rn.f32 %f300, %f105, %f105, %f299; setp.neu.f32 %p66, %f300, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f333, %f86, %f107; sub.f32 %f334, %f87, %f106; sub.f32 %f335, %f88, %f105; mul.f32 %f476, %f335, 0f3F000000; mul.f32 %f475, %f334, 0f3F000000; mul.f32 %f474, %f333, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f476, %f3, %f105; sub.f32 %f475, %f2, %f106; sub.f32 %f474, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f301, %f106, %f106; fma.rn.f32 %f302, %f107, %f107, %f301; fma.rn.f32 %f116, %f105, %f105, %f302; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f476, %f88, %f3; sub.f32 %f475, %f87, %f2; sub.f32 %f474, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f324, %f86, %f107; sub.f32 %f325, %f87, %f106; sub.f32 %f326, %f88, %f105; sub.f32 %f327, %f74, %f92; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f75, %f93; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f76, %f94; mul.f32 %f332, %f331, 0f3DAAAAAB; fma.rn.f32 %f476, %f326, 0f3F2AAAAB, %f332; fma.rn.f32 %f475, %f325, 0f3F2AAAAB, %f330; fma.rn.f32 %f474, %f324, 0f3F2AAAAB, %f328; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f303, %f74, 0f3F000000; add.f32 %f304, %f107, %f107; sub.f32 %f305, %f303, %f304; add.f32 %f306, %f106, %f106; 
mul.f32 %f307, %f75, 0f3F000000; sub.f32 %f308, %f307, %f306; add.f32 %f309, %f105, %f105; mul.f32 %f310, %f76, 0f3F000000; sub.f32 %f311, %f310, %f309; fma.rn.f32 %f476, %f3, 0f3FC00000, %f311; fma.rn.f32 %f475, %f2, 0f3FC00000, %f308; fma.rn.f32 %f474, %f1, 0f3FC00000, %f305; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f312, %f92, 0f3F000000; add.f32 %f313, %f86, %f86; sub.f32 %f314, %f313, %f312; add.f32 %f315, %f87, %f87; mul.f32 %f316, %f93, 0f3F000000; sub.f32 %f317, %f315, %f316; add.f32 %f318, %f88, %f88; mul.f32 %f319, %f94, 0f3F000000; sub.f32 %f320, %f318, %f319; mul.f32 %f321, %f1, 0f3FC00000; mul.f32 %f322, %f2, 0f3FC00000; mul.f32 %f323, %f3, 0f3FC00000; sub.f32 %f476, %f320, %f323; sub.f32 %f475, %f317, %f322; sub.f32 %f474, %f314, %f321; $L__BB0_70: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p79, %rs3, 0; add.s32 %r40, %r3, -2; @%p79 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: max.s32 %r127, %r40, 0; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r127, %r102, %r59; $L__BB0_73: setp.lt.s32 %p81, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p82, %p81, %p79; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p82 bra $L__BB0_75; mad.lo.s32 %r103, %r127, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd52, %r104, 4; add.s64 %rd53, %rd3, %rd52; add.s64 %rd54, %rd2, %rd52; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f143, [%rd55]; ld.global.nc.f32 %f142, [%rd54]; ld.global.nc.f32 %f141, [%rd53]; $L__BB0_75: add.s32 %r44, %r3, -1; @%p79 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r128, %r44, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r128, %r106, %r59; $L__BB0_78: setp.lt.s32 %p84, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p86, %p84, %p79; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p86 bra $L__BB0_80; mad.lo.s32 %r107, %r128, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd3, %rd56; 
add.s64 %rd58, %rd2, %rd56; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f172, [%rd59]; ld.global.nc.f32 %f173, [%rd58]; ld.global.nc.f32 %f174, [%rd57]; $L__BB0_80: add.s32 %r48, %r3, 1; @%p79 bra $L__BB0_82; bra.uni $L__BB0_81; $L__BB0_82: add.s32 %r111, %r59, -1; min.s32 %r129, %r48, %r111; bra.uni $L__BB0_83; $L__BB0_81: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r129, %r110, %r59; $L__BB0_83: setp.ge.s32 %p88, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p90, %p88, %p79; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p90 bra $L__BB0_85; mad.lo.s32 %r112, %r129, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd60, %r113, 4; add.s64 %rd61, %rd3, %rd60; add.s64 %rd62, %rd2, %rd60; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f155, [%rd63]; ld.global.nc.f32 %f154, [%rd62]; ld.global.nc.f32 %f153, [%rd61]; $L__BB0_85: add.s32 %r52, %r3, 2; @%p79 bra $L__BB0_87; bra.uni $L__BB0_86; $L__BB0_87: add.s32 %r116, %r59, -1; min.s32 %r130, %r52, %r116; bra.uni $L__BB0_88; $L__BB0_86: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r130, %r115, %r59; $L__BB0_88: mad.lo.s32 %r117, %r130, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p92, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p94, %p92, %p79; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p94 bra $L__BB0_90; mul.wide.s32 %rd64, %r56, 4; add.s64 %rd65, %rd3, %rd64; add.s64 %rd66, %rd2, %rd64; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f161, [%rd67]; ld.global.nc.f32 %f160, [%rd66]; ld.global.nc.f32 %f159, [%rd65]; $L__BB0_90: mul.f32 %f348, %f154, %f154; fma.rn.f32 %f349, %f153, %f153, %f348; fma.rn.f32 %f165, %f155, %f155, %f349; setp.eq.f32 %p95, %f165, 0f00000000; @%p95 bra $L__BB0_91; bra.uni $L__BB0_92; $L__BB0_91: mul.f32 %f353, %f173, %f173; fma.rn.f32 %f354, %f174, %f174, %f353; fma.rn.f32 %f355, %f172, %f172, %f354; setp.eq.f32 %p96, %f355, 0f00000000; mov.f32 %f492, 0f00000000; mov.f32 %f493, %f492; mov.f32 %f494, %f492; @%p96 bra 
$L__BB0_104; $L__BB0_92: mul.f32 %f356, %f142, %f142; fma.rn.f32 %f357, %f141, %f141, %f356; fma.rn.f32 %f178, %f143, %f143, %f357; setp.neu.f32 %p97, %f178, 0f00000000; mul.f32 %f358, %f160, %f160; fma.rn.f32 %f359, %f159, %f159, %f358; fma.rn.f32 %f182, %f161, %f161, %f359; setp.neu.f32 %p98, %f182, 0f00000000; and.pred %p99, %p97, %p98; or.pred %p101, %p95, %p99; @%p101 bra $L__BB0_94; mul.f32 %f360, %f173, %f173; fma.rn.f32 %f361, %f174, %f174, %f360; fma.rn.f32 %f362, %f172, %f172, %f361; setp.neu.f32 %p102, %f362, 0f00000000; @%p102 bra $L__BB0_103; bra.uni $L__BB0_94; $L__BB0_103: sub.f32 %f395, %f153, %f174; sub.f32 %f396, %f154, %f173; sub.f32 %f397, %f155, %f172; mul.f32 %f494, %f397, 0f3F000000; mul.f32 %f493, %f396, 0f3F000000; mul.f32 %f492, %f395, 0f3F000000; bra.uni $L__BB0_104; $L__BB0_94: setp.eq.f32 %p103, %f178, 0f00000000; and.pred %p105, %p103, %p95; @%p105 bra $L__BB0_102; bra.uni $L__BB0_95; $L__BB0_102: sub.f32 %f494, %f3, %f172; sub.f32 %f493, %f2, %f173; sub.f32 %f492, %f1, %f174; bra.uni $L__BB0_104; $L__BB0_95: setp.eq.f32 %p106, %f182, 0f00000000; mul.f32 %f363, %f173, %f173; fma.rn.f32 %f364, %f174, %f174, %f363; fma.rn.f32 %f183, %f172, %f172, %f364; setp.eq.f32 %p107, %f183, 0f00000000; and.pred %p108, %p107, %p106; @%p108 bra $L__BB0_101; bra.uni $L__BB0_96; $L__BB0_101: sub.f32 %f494, %f155, %f3; sub.f32 %f493, %f154, %f2; sub.f32 %f492, %f153, %f1; bra.uni $L__BB0_104; $L__BB0_96: setp.neu.f32 %p110, %f165, 0f00000000; or.pred %p111, %p103, %p110; @%p111 bra $L__BB0_98; bra.uni $L__BB0_97; $L__BB0_98: setp.neu.f32 %p112, %f183, 0f00000000; or.pred %p114, %p106, %p112; @%p114 bra $L__BB0_100; bra.uni $L__BB0_99; $L__BB0_100: sub.f32 %f386, %f153, %f174; sub.f32 %f387, %f154, %f173; sub.f32 %f388, %f155, %f172; sub.f32 %f389, %f141, %f159; mul.f32 %f390, %f389, 0f3DAAAAAB; sub.f32 %f391, %f142, %f160; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f143, %f161; mul.f32 %f394, %f393, 0f3DAAAAAB; fma.rn.f32 %f494, %f388, 0f3F2AAAAB, 
%f394; fma.rn.f32 %f493, %f387, 0f3F2AAAAB, %f392; fma.rn.f32 %f492, %f386, 0f3F2AAAAB, %f390; bra.uni $L__BB0_104; $L__BB0_97: mul.f32 %f365, %f141, 0f3F000000; add.f32 %f366, %f174, %f174; sub.f32 %f367, %f365, %f366; add.f32 %f368, %f173, %f173; mul.f32 %f369, %f142, 0f3F000000; sub.f32 %f370, %f369, %f368; add.f32 %f371, %f172, %f172; mul.f32 %f372, %f143, 0f3F000000; sub.f32 %f373, %f372, %f371; fma.rn.f32 %f494, %f3, 0f3FC00000, %f373; fma.rn.f32 %f493, %f2, 0f3FC00000, %f370; fma.rn.f32 %f492, %f1, 0f3FC00000, %f367; bra.uni $L__BB0_104; $L__BB0_99: mul.f32 %f374, %f159, 0f3F000000; add.f32 %f375, %f153, %f153; sub.f32 %f376, %f375, %f374; add.f32 %f377, %f154, %f154; mul.f32 %f378, %f160, 0f3F000000; sub.f32 %f379, %f377, %f378; add.f32 %f380, %f155, %f155; mul.f32 %f381, %f161, 0f3F000000; sub.f32 %f382, %f380, %f381; mul.f32 %f383, %f1, 0f3FC00000; mul.f32 %f384, %f2, 0f3FC00000; mul.f32 %f385, %f3, 0f3FC00000; sub.f32 %f494, %f382, %f385; sub.f32 %f493, %f379, %f384; sub.f32 %f492, %f376, %f383; $L__BB0_104: mul.f32 %f398, %f475, %f494; mul.f32 %f399, %f476, %f493; sub.f32 %f400, %f398, %f399; mul.f32 %f401, %f476, %f492; mul.f32 %f402, %f474, %f494; sub.f32 %f403, %f401, %f402; mul.f32 %f404, %f474, %f493; mul.f32 %f405, %f475, %f492; sub.f32 %f406, %f404, %f405; mul.f32 %f407, %f458, %f493; mul.f32 %f408, %f457, %f494; sub.f32 %f409, %f407, %f408; mul.f32 %f410, %f456, %f494; mul.f32 %f411, %f458, %f492; sub.f32 %f412, %f410, %f411; mul.f32 %f413, %f457, %f492; mul.f32 %f414, %f456, %f493; sub.f32 %f415, %f413, %f414; mul.f32 %f416, %f457, %f476; mul.f32 %f417, %f458, %f475; sub.f32 %f418, %f416, %f417; mul.f32 %f419, %f458, %f474; mul.f32 %f420, %f456, %f476; sub.f32 %f421, %f419, %f420; mul.f32 %f422, %f456, %f475; mul.f32 %f423, %f457, %f474; sub.f32 %f424, %f422, %f423; add.f32 %f425, %f205, %f205; mul.f32 %f426, %f425, %f206; mul.f32 %f427, %f2, %f403; fma.rn.f32 %f428, %f1, %f400, %f427; fma.rn.f32 %f429, %f3, %f406, %f428; mul.f32 %f430, %f426, 
%f429; st.global.f32 [%rd4], %f430; mul.f32 %f431, %f425, %f207; mul.f32 %f432, %f2, %f412; fma.rn.f32 %f433, %f1, %f409, %f432; fma.rn.f32 %f434, %f3, %f415, %f433; mul.f32 %f435, %f431, %f434; st.global.f32 [%rd5], %f435; mul.f32 %f436, %f425, %f208; mul.f32 %f437, %f2, %f421; fma.rn.f32 %f438, %f1, %f418, %f437; fma.rn.f32 %f439, %f3, %f424, %f438; mul.f32 %f440, %f436, %f439; st.global.f32 [%rd6], %f440; $L__BB0_106: ret; } ` setemergentmagneticfieldfivepoint_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl setemergentmagneticfieldfivepoint .visible .entry setemergentmagneticfieldfivepoint( .param .u64 setemergentmagneticfieldfivepoint_param_0, .param .u64 setemergentmagneticfieldfivepoint_param_1, .param .u64 setemergentmagneticfieldfivepoint_param_2, .param .u64 setemergentmagneticfieldfivepoint_param_3, .param .u64 setemergentmagneticfieldfivepoint_param_4, .param .u64 setemergentmagneticfieldfivepoint_param_5, .param .f32 setemergentmagneticfieldfivepoint_param_6, .param .f32 setemergentmagneticfieldfivepoint_param_7, .param .f32 setemergentmagneticfieldfivepoint_param_8, .param .f32 setemergentmagneticfieldfivepoint_param_9, .param .u32 setemergentmagneticfieldfivepoint_param_10, .param .u32 setemergentmagneticfieldfivepoint_param_11, .param .u32 setemergentmagneticfieldfivepoint_param_12, .param .u8 setemergentmagneticfieldfivepoint_param_13 ) { .reg .pred %p<115>; .reg .b16 %rs<5>; .reg .f32 %f<495>; .reg .b32 %r<131>; .reg .b64 %rd<68>; ld.param.u8 %rs4, [setemergentmagneticfieldfivepoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldfivepoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldfivepoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldfivepoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldfivepoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldfivepoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldfivepoint_param_5]; ld.param.f32 %f205, 
[setemergentmagneticfieldfivepoint_param_6]; ld.param.f32 %f206, [setemergentmagneticfieldfivepoint_param_7]; ld.param.f32 %f207, [setemergentmagneticfieldfivepoint_param_8]; ld.param.f32 %f208, [setemergentmagneticfieldfivepoint_param_9]; ld.param.u32 %r57, [setemergentmagneticfieldfivepoint_param_10]; ld.param.u32 %r58, [setemergentmagneticfieldfivepoint_param_11]; ld.param.u32 %r59, [setemergentmagneticfieldfivepoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_106; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f209, %f2, %f2; fma.rn.f32 %f210, %f1, %f1, %f209; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f211, %f3, %f3, %f210; setp.eq.f32 %p6, %f211, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_105; bra.uni $L__BB0_2; $L__BB0_105: mov.u32 %r118, 0; st.global.u32 [%rd4], %r118; st.global.u32 [%rd5], %r118; st.global.u32 [%rd6], %r118; bra.uni $L__BB0_106; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r119, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r71, %r6, 
%r57; add.s32 %r72, %r71, %r57; rem.s32 %r119, %r72, %r57; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r73, %r119, %r5; mul.wide.s32 %rd20, %r73, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f9, [%rd23]; ld.global.nc.f32 %f8, [%rd22]; ld.global.nc.f32 %f7, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r120, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r120, %r75, %r57; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r76, %r120, %r5; mul.wide.s32 %rd24, %r76, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f38, [%rd27]; ld.global.nc.f32 %f39, [%rd26]; ld.global.nc.f32 %f40, [%rd25]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r79, %r57, -1; min.s32 %r121, %r14, %r79; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r121, %r78, %r57; $L__BB0_15: setp.ge.s32 %p16, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r80, %r121, %r5; mul.wide.s32 %rd28, %r80, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f21, [%rd31]; ld.global.nc.f32 %f20, [%rd30]; ld.global.nc.f32 %f19, [%rd29]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r83, %r57, -1; min.s32 %r122, %r18, %r83; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r122, %r82, %r57; $L__BB0_20: add.s32 %r22, %r122, %r5; setp.ge.s32 %p20, %r18, %r57; 
mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd32, %r22, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f27, [%rd35]; ld.global.nc.f32 %f26, [%rd34]; ld.global.nc.f32 %f25, [%rd33]; $L__BB0_22: mul.f32 %f224, %f20, %f20; fma.rn.f32 %f225, %f19, %f19, %f224; fma.rn.f32 %f31, %f21, %f21, %f225; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f229, %f39, %f39; fma.rn.f32 %f230, %f40, %f40, %f229; fma.rn.f32 %f231, %f38, %f38, %f230; setp.eq.f32 %p24, %f231, 0f00000000; mov.f32 %f456, 0f00000000; mov.f32 %f457, %f456; mov.f32 %f458, %f456; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f232, %f8, %f8; fma.rn.f32 %f233, %f7, %f7, %f232; fma.rn.f32 %f44, %f9, %f9, %f233; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f234, %f26, %f26; fma.rn.f32 %f235, %f25, %f25, %f234; fma.rn.f32 %f48, %f27, %f27, %f235; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f236, %f39, %f39; fma.rn.f32 %f237, %f40, %f40, %f236; fma.rn.f32 %f238, %f38, %f38, %f237; setp.neu.f32 %p30, %f238, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f271, %f19, %f40; sub.f32 %f272, %f20, %f39; sub.f32 %f273, %f21, %f38; mul.f32 %f458, %f273, 0f3F000000; mul.f32 %f457, %f272, 0f3F000000; mul.f32 %f456, %f271, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f458, %f3, %f38; sub.f32 %f457, %f2, %f39; sub.f32 %f456, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f239, %f39, %f39; fma.rn.f32 %f240, %f40, %f40, %f239; fma.rn.f32 %f49, %f38, %f38, %f240; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: 
sub.f32 %f458, %f21, %f3; sub.f32 %f457, %f20, %f2; sub.f32 %f456, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f262, %f19, %f40; sub.f32 %f263, %f20, %f39; sub.f32 %f264, %f21, %f38; sub.f32 %f265, %f7, %f25; mul.f32 %f266, %f265, 0f3DAAAAAB; sub.f32 %f267, %f8, %f26; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f9, %f27; mul.f32 %f270, %f269, 0f3DAAAAAB; fma.rn.f32 %f458, %f264, 0f3F2AAAAB, %f270; fma.rn.f32 %f457, %f263, 0f3F2AAAAB, %f268; fma.rn.f32 %f456, %f262, 0f3F2AAAAB, %f266; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f241, %f7, 0f3F000000; add.f32 %f242, %f40, %f40; sub.f32 %f243, %f241, %f242; add.f32 %f244, %f39, %f39; mul.f32 %f245, %f8, 0f3F000000; sub.f32 %f246, %f245, %f244; add.f32 %f247, %f38, %f38; mul.f32 %f248, %f9, 0f3F000000; sub.f32 %f249, %f248, %f247; fma.rn.f32 %f458, %f3, 0f3FC00000, %f249; fma.rn.f32 %f457, %f2, 0f3FC00000, %f246; fma.rn.f32 %f456, %f1, 0f3FC00000, %f243; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f250, %f25, 0f3F000000; add.f32 %f251, %f19, %f19; sub.f32 %f252, %f251, %f250; add.f32 %f253, %f20, %f20; mul.f32 %f254, %f26, 0f3F000000; sub.f32 %f255, %f253, %f254; add.f32 %f256, %f21, %f21; mul.f32 %f257, %f27, 0f3F000000; sub.f32 %f258, %f256, %f257; mul.f32 %f259, %f1, 0f3FC00000; mul.f32 %f260, %f2, 0f3FC00000; mul.f32 %f261, %f3, 0f3FC00000; sub.f32 %f458, %f258, %f261; sub.f32 %f457, %f255, %f260; sub.f32 %f456, %f252, %f259; $L__BB0_36: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r123, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r123, %r85, %r58; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, 
%p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r86, %r123, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd36, %r87, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f76, [%rd39]; ld.global.nc.f32 %f75, [%rd38]; ld.global.nc.f32 %f74, [%rd37]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r124, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r124, %r89, %r58; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r90, %r124, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd40, %r91, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f105, [%rd43]; ld.global.nc.f32 %f106, [%rd42]; ld.global.nc.f32 %f107, [%rd41]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r94, %r58, -1; min.s32 %r125, %r31, %r94; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r125, %r93, %r58; $L__BB0_49: setp.ge.s32 %p52, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r95, %r125, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd44, %r96, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f88, [%rd47]; ld.global.nc.f32 %f87, [%rd46]; ld.global.nc.f32 %f86, [%rd45]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r99, %r58, -1; min.s32 %r126, %r35, %r99; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r126, %r98, %r58; $L__BB0_54: add.s32 %r39, %r126, %r4; setp.ge.s32 %p56, %r35, %r58; mov.f32 %f92, 
0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f94, [%rd51]; ld.global.nc.f32 %f93, [%rd50]; ld.global.nc.f32 %f92, [%rd49]; $L__BB0_56: mul.f32 %f286, %f87, %f87; fma.rn.f32 %f287, %f86, %f86, %f286; fma.rn.f32 %f98, %f88, %f88, %f287; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f291, %f106, %f106; fma.rn.f32 %f292, %f107, %f107, %f291; fma.rn.f32 %f293, %f105, %f105, %f292; setp.eq.f32 %p60, %f293, 0f00000000; mov.f32 %f474, 0f00000000; mov.f32 %f475, %f474; mov.f32 %f476, %f474; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f294, %f75, %f75; fma.rn.f32 %f295, %f74, %f74, %f294; fma.rn.f32 %f111, %f76, %f76, %f295; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f296, %f93, %f93; fma.rn.f32 %f297, %f92, %f92, %f296; fma.rn.f32 %f115, %f94, %f94, %f297; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f298, %f106, %f106; fma.rn.f32 %f299, %f107, %f107, %f298; fma.rn.f32 %f300, %f105, %f105, %f299; setp.neu.f32 %p66, %f300, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f333, %f86, %f107; sub.f32 %f334, %f87, %f106; sub.f32 %f335, %f88, %f105; mul.f32 %f476, %f335, 0f3F000000; mul.f32 %f475, %f334, 0f3F000000; mul.f32 %f474, %f333, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f476, %f3, %f105; sub.f32 %f475, %f2, %f106; sub.f32 %f474, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f301, %f106, %f106; fma.rn.f32 %f302, %f107, %f107, %f301; fma.rn.f32 %f116, %f105, %f105, %f302; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, 
%p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f476, %f88, %f3; sub.f32 %f475, %f87, %f2; sub.f32 %f474, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f324, %f86, %f107; sub.f32 %f325, %f87, %f106; sub.f32 %f326, %f88, %f105; sub.f32 %f327, %f74, %f92; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f75, %f93; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f76, %f94; mul.f32 %f332, %f331, 0f3DAAAAAB; fma.rn.f32 %f476, %f326, 0f3F2AAAAB, %f332; fma.rn.f32 %f475, %f325, 0f3F2AAAAB, %f330; fma.rn.f32 %f474, %f324, 0f3F2AAAAB, %f328; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f303, %f74, 0f3F000000; add.f32 %f304, %f107, %f107; sub.f32 %f305, %f303, %f304; add.f32 %f306, %f106, %f106; mul.f32 %f307, %f75, 0f3F000000; sub.f32 %f308, %f307, %f306; add.f32 %f309, %f105, %f105; mul.f32 %f310, %f76, 0f3F000000; sub.f32 %f311, %f310, %f309; fma.rn.f32 %f476, %f3, 0f3FC00000, %f311; fma.rn.f32 %f475, %f2, 0f3FC00000, %f308; fma.rn.f32 %f474, %f1, 0f3FC00000, %f305; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f312, %f92, 0f3F000000; add.f32 %f313, %f86, %f86; sub.f32 %f314, %f313, %f312; add.f32 %f315, %f87, %f87; mul.f32 %f316, %f93, 0f3F000000; sub.f32 %f317, %f315, %f316; add.f32 %f318, %f88, %f88; mul.f32 %f319, %f94, 0f3F000000; sub.f32 %f320, %f318, %f319; mul.f32 %f321, %f1, 0f3FC00000; mul.f32 %f322, %f2, 0f3FC00000; mul.f32 %f323, %f3, 0f3FC00000; sub.f32 %f476, %f320, %f323; sub.f32 %f475, %f317, %f322; sub.f32 %f474, %f314, %f321; $L__BB0_70: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p79, %rs3, 0; add.s32 %r40, %r3, -2; @%p79 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: max.s32 %r127, %r40, 0; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r127, %r102, %r59; 
$L__BB0_73: setp.lt.s32 %p81, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p82, %p81, %p79; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p82 bra $L__BB0_75; mad.lo.s32 %r103, %r127, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd52, %r104, 4; add.s64 %rd53, %rd3, %rd52; add.s64 %rd54, %rd2, %rd52; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f143, [%rd55]; ld.global.nc.f32 %f142, [%rd54]; ld.global.nc.f32 %f141, [%rd53]; $L__BB0_75: add.s32 %r44, %r3, -1; @%p79 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r128, %r44, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r128, %r106, %r59; $L__BB0_78: setp.lt.s32 %p84, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p86, %p84, %p79; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p86 bra $L__BB0_80; mad.lo.s32 %r107, %r128, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd3, %rd56; add.s64 %rd58, %rd2, %rd56; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f172, [%rd59]; ld.global.nc.f32 %f173, [%rd58]; ld.global.nc.f32 %f174, [%rd57]; $L__BB0_80: add.s32 %r48, %r3, 1; @%p79 bra $L__BB0_82; bra.uni $L__BB0_81; $L__BB0_82: add.s32 %r111, %r59, -1; min.s32 %r129, %r48, %r111; bra.uni $L__BB0_83; $L__BB0_81: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r129, %r110, %r59; $L__BB0_83: setp.ge.s32 %p88, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p90, %p88, %p79; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p90 bra $L__BB0_85; mad.lo.s32 %r112, %r129, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd60, %r113, 4; add.s64 %rd61, %rd3, %rd60; add.s64 %rd62, %rd2, %rd60; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f155, [%rd63]; ld.global.nc.f32 %f154, [%rd62]; ld.global.nc.f32 %f153, [%rd61]; $L__BB0_85: add.s32 %r52, %r3, 2; @%p79 bra $L__BB0_87; bra.uni $L__BB0_86; $L__BB0_87: add.s32 %r116, %r59, -1; min.s32 %r130, %r52, %r116; bra.uni $L__BB0_88; $L__BB0_86: rem.s32 
%r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r130, %r115, %r59; $L__BB0_88: mad.lo.s32 %r117, %r130, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p92, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p94, %p92, %p79; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p94 bra $L__BB0_90; mul.wide.s32 %rd64, %r56, 4; add.s64 %rd65, %rd3, %rd64; add.s64 %rd66, %rd2, %rd64; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f161, [%rd67]; ld.global.nc.f32 %f160, [%rd66]; ld.global.nc.f32 %f159, [%rd65]; $L__BB0_90: mul.f32 %f348, %f154, %f154; fma.rn.f32 %f349, %f153, %f153, %f348; fma.rn.f32 %f165, %f155, %f155, %f349; setp.eq.f32 %p95, %f165, 0f00000000; @%p95 bra $L__BB0_91; bra.uni $L__BB0_92; $L__BB0_91: mul.f32 %f353, %f173, %f173; fma.rn.f32 %f354, %f174, %f174, %f353; fma.rn.f32 %f355, %f172, %f172, %f354; setp.eq.f32 %p96, %f355, 0f00000000; mov.f32 %f492, 0f00000000; mov.f32 %f493, %f492; mov.f32 %f494, %f492; @%p96 bra $L__BB0_104; $L__BB0_92: mul.f32 %f356, %f142, %f142; fma.rn.f32 %f357, %f141, %f141, %f356; fma.rn.f32 %f178, %f143, %f143, %f357; setp.neu.f32 %p97, %f178, 0f00000000; mul.f32 %f358, %f160, %f160; fma.rn.f32 %f359, %f159, %f159, %f358; fma.rn.f32 %f182, %f161, %f161, %f359; setp.neu.f32 %p98, %f182, 0f00000000; and.pred %p99, %p97, %p98; or.pred %p101, %p95, %p99; @%p101 bra $L__BB0_94; mul.f32 %f360, %f173, %f173; fma.rn.f32 %f361, %f174, %f174, %f360; fma.rn.f32 %f362, %f172, %f172, %f361; setp.neu.f32 %p102, %f362, 0f00000000; @%p102 bra $L__BB0_103; bra.uni $L__BB0_94; $L__BB0_103: sub.f32 %f395, %f153, %f174; sub.f32 %f396, %f154, %f173; sub.f32 %f397, %f155, %f172; mul.f32 %f494, %f397, 0f3F000000; mul.f32 %f493, %f396, 0f3F000000; mul.f32 %f492, %f395, 0f3F000000; bra.uni $L__BB0_104; $L__BB0_94: setp.eq.f32 %p103, %f178, 0f00000000; and.pred %p105, %p103, %p95; @%p105 bra $L__BB0_102; bra.uni $L__BB0_95; $L__BB0_102: sub.f32 %f494, %f3, %f172; sub.f32 %f493, %f2, %f173; sub.f32 %f492, %f1, %f174; bra.uni $L__BB0_104; 
$L__BB0_95: setp.eq.f32 %p106, %f182, 0f00000000; mul.f32 %f363, %f173, %f173; fma.rn.f32 %f364, %f174, %f174, %f363; fma.rn.f32 %f183, %f172, %f172, %f364; setp.eq.f32 %p107, %f183, 0f00000000; and.pred %p108, %p107, %p106; @%p108 bra $L__BB0_101; bra.uni $L__BB0_96; $L__BB0_101: sub.f32 %f494, %f155, %f3; sub.f32 %f493, %f154, %f2; sub.f32 %f492, %f153, %f1; bra.uni $L__BB0_104; $L__BB0_96: setp.neu.f32 %p110, %f165, 0f00000000; or.pred %p111, %p103, %p110; @%p111 bra $L__BB0_98; bra.uni $L__BB0_97; $L__BB0_98: setp.neu.f32 %p112, %f183, 0f00000000; or.pred %p114, %p106, %p112; @%p114 bra $L__BB0_100; bra.uni $L__BB0_99; $L__BB0_100: sub.f32 %f386, %f153, %f174; sub.f32 %f387, %f154, %f173; sub.f32 %f388, %f155, %f172; sub.f32 %f389, %f141, %f159; mul.f32 %f390, %f389, 0f3DAAAAAB; sub.f32 %f391, %f142, %f160; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f143, %f161; mul.f32 %f394, %f393, 0f3DAAAAAB; fma.rn.f32 %f494, %f388, 0f3F2AAAAB, %f394; fma.rn.f32 %f493, %f387, 0f3F2AAAAB, %f392; fma.rn.f32 %f492, %f386, 0f3F2AAAAB, %f390; bra.uni $L__BB0_104; $L__BB0_97: mul.f32 %f365, %f141, 0f3F000000; add.f32 %f366, %f174, %f174; sub.f32 %f367, %f365, %f366; add.f32 %f368, %f173, %f173; mul.f32 %f369, %f142, 0f3F000000; sub.f32 %f370, %f369, %f368; add.f32 %f371, %f172, %f172; mul.f32 %f372, %f143, 0f3F000000; sub.f32 %f373, %f372, %f371; fma.rn.f32 %f494, %f3, 0f3FC00000, %f373; fma.rn.f32 %f493, %f2, 0f3FC00000, %f370; fma.rn.f32 %f492, %f1, 0f3FC00000, %f367; bra.uni $L__BB0_104; $L__BB0_99: mul.f32 %f374, %f159, 0f3F000000; add.f32 %f375, %f153, %f153; sub.f32 %f376, %f375, %f374; add.f32 %f377, %f154, %f154; mul.f32 %f378, %f160, 0f3F000000; sub.f32 %f379, %f377, %f378; add.f32 %f380, %f155, %f155; mul.f32 %f381, %f161, 0f3F000000; sub.f32 %f382, %f380, %f381; mul.f32 %f383, %f1, 0f3FC00000; mul.f32 %f384, %f2, 0f3FC00000; mul.f32 %f385, %f3, 0f3FC00000; sub.f32 %f494, %f382, %f385; sub.f32 %f493, %f379, %f384; sub.f32 %f492, %f376, %f383; $L__BB0_104: mul.f32 
%f398, %f475, %f494; mul.f32 %f399, %f476, %f493; sub.f32 %f400, %f398, %f399; mul.f32 %f401, %f476, %f492; mul.f32 %f402, %f474, %f494; sub.f32 %f403, %f401, %f402; mul.f32 %f404, %f474, %f493; mul.f32 %f405, %f475, %f492; sub.f32 %f406, %f404, %f405; mul.f32 %f407, %f458, %f493; mul.f32 %f408, %f457, %f494; sub.f32 %f409, %f407, %f408; mul.f32 %f410, %f456, %f494; mul.f32 %f411, %f458, %f492; sub.f32 %f412, %f410, %f411; mul.f32 %f413, %f457, %f492; mul.f32 %f414, %f456, %f493; sub.f32 %f415, %f413, %f414; mul.f32 %f416, %f457, %f476; mul.f32 %f417, %f458, %f475; sub.f32 %f418, %f416, %f417; mul.f32 %f419, %f458, %f474; mul.f32 %f420, %f456, %f476; sub.f32 %f421, %f419, %f420; mul.f32 %f422, %f456, %f475; mul.f32 %f423, %f457, %f474; sub.f32 %f424, %f422, %f423; add.f32 %f425, %f205, %f205; mul.f32 %f426, %f425, %f206; mul.f32 %f427, %f2, %f403; fma.rn.f32 %f428, %f1, %f400, %f427; fma.rn.f32 %f429, %f3, %f406, %f428; mul.f32 %f430, %f426, %f429; st.global.f32 [%rd4], %f430; mul.f32 %f431, %f425, %f207; mul.f32 %f432, %f2, %f412; fma.rn.f32 %f433, %f1, %f409, %f432; fma.rn.f32 %f434, %f3, %f415, %f433; mul.f32 %f435, %f431, %f434; st.global.f32 [%rd5], %f435; mul.f32 %f436, %f425, %f208; mul.f32 %f437, %f2, %f421; fma.rn.f32 %f438, %f1, %f418, %f437; fma.rn.f32 %f439, %f3, %f424, %f438; mul.f32 %f440, %f436, %f439; st.global.f32 [%rd6], %f440; $L__BB0_106: ret; } ` setemergentmagneticfieldfivepoint_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl setemergentmagneticfieldfivepoint .visible .entry setemergentmagneticfieldfivepoint( .param .u64 setemergentmagneticfieldfivepoint_param_0, .param .u64 setemergentmagneticfieldfivepoint_param_1, .param .u64 setemergentmagneticfieldfivepoint_param_2, .param .u64 setemergentmagneticfieldfivepoint_param_3, .param .u64 setemergentmagneticfieldfivepoint_param_4, .param .u64 setemergentmagneticfieldfivepoint_param_5, .param .f32 setemergentmagneticfieldfivepoint_param_6, .param .f32 
setemergentmagneticfieldfivepoint_param_7, .param .f32 setemergentmagneticfieldfivepoint_param_8, .param .f32 setemergentmagneticfieldfivepoint_param_9, .param .u32 setemergentmagneticfieldfivepoint_param_10, .param .u32 setemergentmagneticfieldfivepoint_param_11, .param .u32 setemergentmagneticfieldfivepoint_param_12, .param .u8 setemergentmagneticfieldfivepoint_param_13 ) { .reg .pred %p<115>; .reg .b16 %rs<5>; .reg .f32 %f<495>; .reg .b32 %r<131>; .reg .b64 %rd<68>; ld.param.u8 %rs4, [setemergentmagneticfieldfivepoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldfivepoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldfivepoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldfivepoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldfivepoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldfivepoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldfivepoint_param_5]; ld.param.f32 %f205, [setemergentmagneticfieldfivepoint_param_6]; ld.param.f32 %f206, [setemergentmagneticfieldfivepoint_param_7]; ld.param.f32 %f207, [setemergentmagneticfieldfivepoint_param_8]; ld.param.f32 %f208, [setemergentmagneticfieldfivepoint_param_9]; ld.param.u32 %r57, [setemergentmagneticfieldfivepoint_param_10]; ld.param.u32 %r58, [setemergentmagneticfieldfivepoint_param_11]; ld.param.u32 %r59, [setemergentmagneticfieldfivepoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_106; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 
%r5, %r69, %r57; add.s32 %r70, %r5, %r1; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f209, %f2, %f2; fma.rn.f32 %f210, %f1, %f1, %f209; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f211, %f3, %f3, %f210; setp.eq.f32 %p6, %f211, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_105; bra.uni $L__BB0_2; $L__BB0_105: mov.u32 %r118, 0; st.global.u32 [%rd4], %r118; st.global.u32 [%rd5], %r118; st.global.u32 [%rd6], %r118; bra.uni $L__BB0_106; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r119, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r119, %r72, %r57; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r73, %r119, %r5; mul.wide.s32 %rd20, %r73, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f9, [%rd23]; ld.global.nc.f32 %f8, [%rd22]; ld.global.nc.f32 %f7, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r120, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r120, %r75, %r57; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r76, %r120, %r5; mul.wide.s32 %rd24, %r76, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f38, [%rd27]; ld.global.nc.f32 %f39, [%rd26]; ld.global.nc.f32 %f40, [%rd25]; $L__BB0_12: 
add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r79, %r57, -1; min.s32 %r121, %r14, %r79; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r121, %r78, %r57; $L__BB0_15: setp.ge.s32 %p16, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r80, %r121, %r5; mul.wide.s32 %rd28, %r80, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f21, [%rd31]; ld.global.nc.f32 %f20, [%rd30]; ld.global.nc.f32 %f19, [%rd29]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r83, %r57, -1; min.s32 %r122, %r18, %r83; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r122, %r82, %r57; $L__BB0_20: add.s32 %r22, %r122, %r5; setp.ge.s32 %p20, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd32, %r22, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f27, [%rd35]; ld.global.nc.f32 %f26, [%rd34]; ld.global.nc.f32 %f25, [%rd33]; $L__BB0_22: mul.f32 %f224, %f20, %f20; fma.rn.f32 %f225, %f19, %f19, %f224; fma.rn.f32 %f31, %f21, %f21, %f225; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f229, %f39, %f39; fma.rn.f32 %f230, %f40, %f40, %f229; fma.rn.f32 %f231, %f38, %f38, %f230; setp.eq.f32 %p24, %f231, 0f00000000; mov.f32 %f456, 0f00000000; mov.f32 %f457, %f456; mov.f32 %f458, %f456; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f232, %f8, %f8; fma.rn.f32 %f233, %f7, %f7, %f232; fma.rn.f32 %f44, %f9, %f9, %f233; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f234, %f26, %f26; fma.rn.f32 %f235, %f25, %f25, %f234; fma.rn.f32 %f48, %f27, %f27, %f235; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, 
%p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f236, %f39, %f39; fma.rn.f32 %f237, %f40, %f40, %f236; fma.rn.f32 %f238, %f38, %f38, %f237; setp.neu.f32 %p30, %f238, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f271, %f19, %f40; sub.f32 %f272, %f20, %f39; sub.f32 %f273, %f21, %f38; mul.f32 %f458, %f273, 0f3F000000; mul.f32 %f457, %f272, 0f3F000000; mul.f32 %f456, %f271, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f458, %f3, %f38; sub.f32 %f457, %f2, %f39; sub.f32 %f456, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f239, %f39, %f39; fma.rn.f32 %f240, %f40, %f40, %f239; fma.rn.f32 %f49, %f38, %f38, %f240; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f458, %f21, %f3; sub.f32 %f457, %f20, %f2; sub.f32 %f456, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f262, %f19, %f40; sub.f32 %f263, %f20, %f39; sub.f32 %f264, %f21, %f38; sub.f32 %f265, %f7, %f25; mul.f32 %f266, %f265, 0f3DAAAAAB; sub.f32 %f267, %f8, %f26; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f9, %f27; mul.f32 %f270, %f269, 0f3DAAAAAB; fma.rn.f32 %f458, %f264, 0f3F2AAAAB, %f270; fma.rn.f32 %f457, %f263, 0f3F2AAAAB, %f268; fma.rn.f32 %f456, %f262, 0f3F2AAAAB, %f266; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f241, %f7, 0f3F000000; add.f32 %f242, %f40, %f40; sub.f32 %f243, %f241, %f242; add.f32 %f244, %f39, %f39; mul.f32 %f245, %f8, 0f3F000000; sub.f32 %f246, %f245, %f244; add.f32 %f247, %f38, %f38; mul.f32 %f248, %f9, 0f3F000000; sub.f32 %f249, %f248, %f247; fma.rn.f32 %f458, 
%f3, 0f3FC00000, %f249; fma.rn.f32 %f457, %f2, 0f3FC00000, %f246; fma.rn.f32 %f456, %f1, 0f3FC00000, %f243; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f250, %f25, 0f3F000000; add.f32 %f251, %f19, %f19; sub.f32 %f252, %f251, %f250; add.f32 %f253, %f20, %f20; mul.f32 %f254, %f26, 0f3F000000; sub.f32 %f255, %f253, %f254; add.f32 %f256, %f21, %f21; mul.f32 %f257, %f27, 0f3F000000; sub.f32 %f258, %f256, %f257; mul.f32 %f259, %f1, 0f3FC00000; mul.f32 %f260, %f2, 0f3FC00000; mul.f32 %f261, %f3, 0f3FC00000; sub.f32 %f458, %f258, %f261; sub.f32 %f457, %f255, %f260; sub.f32 %f456, %f252, %f259; $L__BB0_36: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r123, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r123, %r85, %r58; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r86, %r123, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd36, %r87, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f76, [%rd39]; ld.global.nc.f32 %f75, [%rd38]; ld.global.nc.f32 %f74, [%rd37]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r124, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r124, %r89, %r58; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r90, %r124, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd40, %r91, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f105, [%rd43]; ld.global.nc.f32 %f106, [%rd42]; ld.global.nc.f32 %f107, [%rd41]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; 
bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r94, %r58, -1; min.s32 %r125, %r31, %r94; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r125, %r93, %r58; $L__BB0_49: setp.ge.s32 %p52, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r95, %r125, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd44, %r96, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f88, [%rd47]; ld.global.nc.f32 %f87, [%rd46]; ld.global.nc.f32 %f86, [%rd45]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r99, %r58, -1; min.s32 %r126, %r35, %r99; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r126, %r98, %r58; $L__BB0_54: add.s32 %r39, %r126, %r4; setp.ge.s32 %p56, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f94, [%rd51]; ld.global.nc.f32 %f93, [%rd50]; ld.global.nc.f32 %f92, [%rd49]; $L__BB0_56: mul.f32 %f286, %f87, %f87; fma.rn.f32 %f287, %f86, %f86, %f286; fma.rn.f32 %f98, %f88, %f88, %f287; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f291, %f106, %f106; fma.rn.f32 %f292, %f107, %f107, %f291; fma.rn.f32 %f293, %f105, %f105, %f292; setp.eq.f32 %p60, %f293, 0f00000000; mov.f32 %f474, 0f00000000; mov.f32 %f475, %f474; mov.f32 %f476, %f474; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f294, %f75, %f75; fma.rn.f32 %f295, %f74, %f74, %f294; fma.rn.f32 %f111, %f76, %f76, %f295; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f296, %f93, %f93; fma.rn.f32 %f297, %f92, %f92, %f296; fma.rn.f32 %f115, %f94, %f94, %f297; 
setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f298, %f106, %f106; fma.rn.f32 %f299, %f107, %f107, %f298; fma.rn.f32 %f300, %f105, %f105, %f299; setp.neu.f32 %p66, %f300, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f333, %f86, %f107; sub.f32 %f334, %f87, %f106; sub.f32 %f335, %f88, %f105; mul.f32 %f476, %f335, 0f3F000000; mul.f32 %f475, %f334, 0f3F000000; mul.f32 %f474, %f333, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f476, %f3, %f105; sub.f32 %f475, %f2, %f106; sub.f32 %f474, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f301, %f106, %f106; fma.rn.f32 %f302, %f107, %f107, %f301; fma.rn.f32 %f116, %f105, %f105, %f302; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f476, %f88, %f3; sub.f32 %f475, %f87, %f2; sub.f32 %f474, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f324, %f86, %f107; sub.f32 %f325, %f87, %f106; sub.f32 %f326, %f88, %f105; sub.f32 %f327, %f74, %f92; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f75, %f93; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f76, %f94; mul.f32 %f332, %f331, 0f3DAAAAAB; fma.rn.f32 %f476, %f326, 0f3F2AAAAB, %f332; fma.rn.f32 %f475, %f325, 0f3F2AAAAB, %f330; fma.rn.f32 %f474, %f324, 0f3F2AAAAB, %f328; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f303, %f74, 0f3F000000; add.f32 %f304, %f107, %f107; sub.f32 %f305, %f303, %f304; add.f32 %f306, %f106, %f106; mul.f32 %f307, %f75, 0f3F000000; sub.f32 %f308, %f307, %f306; add.f32 %f309, 
%f105, %f105; mul.f32 %f310, %f76, 0f3F000000; sub.f32 %f311, %f310, %f309; fma.rn.f32 %f476, %f3, 0f3FC00000, %f311; fma.rn.f32 %f475, %f2, 0f3FC00000, %f308; fma.rn.f32 %f474, %f1, 0f3FC00000, %f305; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f312, %f92, 0f3F000000; add.f32 %f313, %f86, %f86; sub.f32 %f314, %f313, %f312; add.f32 %f315, %f87, %f87; mul.f32 %f316, %f93, 0f3F000000; sub.f32 %f317, %f315, %f316; add.f32 %f318, %f88, %f88; mul.f32 %f319, %f94, 0f3F000000; sub.f32 %f320, %f318, %f319; mul.f32 %f321, %f1, 0f3FC00000; mul.f32 %f322, %f2, 0f3FC00000; mul.f32 %f323, %f3, 0f3FC00000; sub.f32 %f476, %f320, %f323; sub.f32 %f475, %f317, %f322; sub.f32 %f474, %f314, %f321; $L__BB0_70: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p79, %rs3, 0; add.s32 %r40, %r3, -2; @%p79 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: max.s32 %r127, %r40, 0; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r127, %r102, %r59; $L__BB0_73: setp.lt.s32 %p81, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p82, %p81, %p79; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p82 bra $L__BB0_75; mad.lo.s32 %r103, %r127, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd52, %r104, 4; add.s64 %rd53, %rd3, %rd52; add.s64 %rd54, %rd2, %rd52; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f143, [%rd55]; ld.global.nc.f32 %f142, [%rd54]; ld.global.nc.f32 %f141, [%rd53]; $L__BB0_75: add.s32 %r44, %r3, -1; @%p79 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r128, %r44, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r128, %r106, %r59; $L__BB0_78: setp.lt.s32 %p84, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p86, %p84, %p79; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p86 bra $L__BB0_80; mad.lo.s32 %r107, %r128, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd3, %rd56; add.s64 %rd58, %rd2, %rd56; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 
%f172, [%rd59]; ld.global.nc.f32 %f173, [%rd58]; ld.global.nc.f32 %f174, [%rd57]; $L__BB0_80: add.s32 %r48, %r3, 1; @%p79 bra $L__BB0_82; bra.uni $L__BB0_81; $L__BB0_82: add.s32 %r111, %r59, -1; min.s32 %r129, %r48, %r111; bra.uni $L__BB0_83; $L__BB0_81: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r129, %r110, %r59; $L__BB0_83: setp.ge.s32 %p88, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p90, %p88, %p79; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p90 bra $L__BB0_85; mad.lo.s32 %r112, %r129, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd60, %r113, 4; add.s64 %rd61, %rd3, %rd60; add.s64 %rd62, %rd2, %rd60; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f155, [%rd63]; ld.global.nc.f32 %f154, [%rd62]; ld.global.nc.f32 %f153, [%rd61]; $L__BB0_85: add.s32 %r52, %r3, 2; @%p79 bra $L__BB0_87; bra.uni $L__BB0_86; $L__BB0_87: add.s32 %r116, %r59, -1; min.s32 %r130, %r52, %r116; bra.uni $L__BB0_88; $L__BB0_86: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r130, %r115, %r59; $L__BB0_88: mad.lo.s32 %r117, %r130, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p92, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p94, %p92, %p79; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p94 bra $L__BB0_90; mul.wide.s32 %rd64, %r56, 4; add.s64 %rd65, %rd3, %rd64; add.s64 %rd66, %rd2, %rd64; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f161, [%rd67]; ld.global.nc.f32 %f160, [%rd66]; ld.global.nc.f32 %f159, [%rd65]; $L__BB0_90: mul.f32 %f348, %f154, %f154; fma.rn.f32 %f349, %f153, %f153, %f348; fma.rn.f32 %f165, %f155, %f155, %f349; setp.eq.f32 %p95, %f165, 0f00000000; @%p95 bra $L__BB0_91; bra.uni $L__BB0_92; $L__BB0_91: mul.f32 %f353, %f173, %f173; fma.rn.f32 %f354, %f174, %f174, %f353; fma.rn.f32 %f355, %f172, %f172, %f354; setp.eq.f32 %p96, %f355, 0f00000000; mov.f32 %f492, 0f00000000; mov.f32 %f493, %f492; mov.f32 %f494, %f492; @%p96 bra $L__BB0_104; $L__BB0_92: mul.f32 %f356, %f142, %f142; fma.rn.f32 %f357, %f141, 
%f141, %f356; fma.rn.f32 %f178, %f143, %f143, %f357; setp.neu.f32 %p97, %f178, 0f00000000; mul.f32 %f358, %f160, %f160; fma.rn.f32 %f359, %f159, %f159, %f358; fma.rn.f32 %f182, %f161, %f161, %f359; setp.neu.f32 %p98, %f182, 0f00000000; and.pred %p99, %p97, %p98; or.pred %p101, %p95, %p99; @%p101 bra $L__BB0_94; mul.f32 %f360, %f173, %f173; fma.rn.f32 %f361, %f174, %f174, %f360; fma.rn.f32 %f362, %f172, %f172, %f361; setp.neu.f32 %p102, %f362, 0f00000000; @%p102 bra $L__BB0_103; bra.uni $L__BB0_94; $L__BB0_103: sub.f32 %f395, %f153, %f174; sub.f32 %f396, %f154, %f173; sub.f32 %f397, %f155, %f172; mul.f32 %f494, %f397, 0f3F000000; mul.f32 %f493, %f396, 0f3F000000; mul.f32 %f492, %f395, 0f3F000000; bra.uni $L__BB0_104; $L__BB0_94: setp.eq.f32 %p103, %f178, 0f00000000; and.pred %p105, %p103, %p95; @%p105 bra $L__BB0_102; bra.uni $L__BB0_95; $L__BB0_102: sub.f32 %f494, %f3, %f172; sub.f32 %f493, %f2, %f173; sub.f32 %f492, %f1, %f174; bra.uni $L__BB0_104; $L__BB0_95: setp.eq.f32 %p106, %f182, 0f00000000; mul.f32 %f363, %f173, %f173; fma.rn.f32 %f364, %f174, %f174, %f363; fma.rn.f32 %f183, %f172, %f172, %f364; setp.eq.f32 %p107, %f183, 0f00000000; and.pred %p108, %p107, %p106; @%p108 bra $L__BB0_101; bra.uni $L__BB0_96; $L__BB0_101: sub.f32 %f494, %f155, %f3; sub.f32 %f493, %f154, %f2; sub.f32 %f492, %f153, %f1; bra.uni $L__BB0_104; $L__BB0_96: setp.neu.f32 %p110, %f165, 0f00000000; or.pred %p111, %p103, %p110; @%p111 bra $L__BB0_98; bra.uni $L__BB0_97; $L__BB0_98: setp.neu.f32 %p112, %f183, 0f00000000; or.pred %p114, %p106, %p112; @%p114 bra $L__BB0_100; bra.uni $L__BB0_99; $L__BB0_100: sub.f32 %f386, %f153, %f174; sub.f32 %f387, %f154, %f173; sub.f32 %f388, %f155, %f172; sub.f32 %f389, %f141, %f159; mul.f32 %f390, %f389, 0f3DAAAAAB; sub.f32 %f391, %f142, %f160; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f143, %f161; mul.f32 %f394, %f393, 0f3DAAAAAB; fma.rn.f32 %f494, %f388, 0f3F2AAAAB, %f394; fma.rn.f32 %f493, %f387, 0f3F2AAAAB, %f392; fma.rn.f32 %f492, %f386, 
0f3F2AAAAB, %f390; bra.uni $L__BB0_104; $L__BB0_97: mul.f32 %f365, %f141, 0f3F000000; add.f32 %f366, %f174, %f174; sub.f32 %f367, %f365, %f366; add.f32 %f368, %f173, %f173; mul.f32 %f369, %f142, 0f3F000000; sub.f32 %f370, %f369, %f368; add.f32 %f371, %f172, %f172; mul.f32 %f372, %f143, 0f3F000000; sub.f32 %f373, %f372, %f371; fma.rn.f32 %f494, %f3, 0f3FC00000, %f373; fma.rn.f32 %f493, %f2, 0f3FC00000, %f370; fma.rn.f32 %f492, %f1, 0f3FC00000, %f367; bra.uni $L__BB0_104; $L__BB0_99: mul.f32 %f374, %f159, 0f3F000000; add.f32 %f375, %f153, %f153; sub.f32 %f376, %f375, %f374; add.f32 %f377, %f154, %f154; mul.f32 %f378, %f160, 0f3F000000; sub.f32 %f379, %f377, %f378; add.f32 %f380, %f155, %f155; mul.f32 %f381, %f161, 0f3F000000; sub.f32 %f382, %f380, %f381; mul.f32 %f383, %f1, 0f3FC00000; mul.f32 %f384, %f2, 0f3FC00000; mul.f32 %f385, %f3, 0f3FC00000; sub.f32 %f494, %f382, %f385; sub.f32 %f493, %f379, %f384; sub.f32 %f492, %f376, %f383; $L__BB0_104: mul.f32 %f398, %f475, %f494; mul.f32 %f399, %f476, %f493; sub.f32 %f400, %f398, %f399; mul.f32 %f401, %f476, %f492; mul.f32 %f402, %f474, %f494; sub.f32 %f403, %f401, %f402; mul.f32 %f404, %f474, %f493; mul.f32 %f405, %f475, %f492; sub.f32 %f406, %f404, %f405; mul.f32 %f407, %f458, %f493; mul.f32 %f408, %f457, %f494; sub.f32 %f409, %f407, %f408; mul.f32 %f410, %f456, %f494; mul.f32 %f411, %f458, %f492; sub.f32 %f412, %f410, %f411; mul.f32 %f413, %f457, %f492; mul.f32 %f414, %f456, %f493; sub.f32 %f415, %f413, %f414; mul.f32 %f416, %f457, %f476; mul.f32 %f417, %f458, %f475; sub.f32 %f418, %f416, %f417; mul.f32 %f419, %f458, %f474; mul.f32 %f420, %f456, %f476; sub.f32 %f421, %f419, %f420; mul.f32 %f422, %f456, %f475; mul.f32 %f423, %f457, %f474; sub.f32 %f424, %f422, %f423; add.f32 %f425, %f205, %f205; mul.f32 %f426, %f425, %f206; mul.f32 %f427, %f2, %f403; fma.rn.f32 %f428, %f1, %f400, %f427; fma.rn.f32 %f429, %f3, %f406, %f428; mul.f32 %f430, %f426, %f429; st.global.f32 [%rd4], %f430; mul.f32 %f431, %f425, %f207; mul.f32 
%f432, %f2, %f412; fma.rn.f32 %f433, %f1, %f409, %f432; fma.rn.f32 %f434, %f3, %f415, %f433; mul.f32 %f435, %f431, %f434; st.global.f32 [%rd5], %f435; mul.f32 %f436, %f425, %f208; mul.f32 %f437, %f2, %f421; fma.rn.f32 %f438, %f1, %f418, %f437; fma.rn.f32 %f439, %f3, %f424, %f438; mul.f32 %f440, %f436, %f439; st.global.f32 [%rd6], %f440; $L__BB0_106: ret; } ` setemergentmagneticfieldfivepoint_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl setemergentmagneticfieldfivepoint .visible .entry setemergentmagneticfieldfivepoint( .param .u64 setemergentmagneticfieldfivepoint_param_0, .param .u64 setemergentmagneticfieldfivepoint_param_1, .param .u64 setemergentmagneticfieldfivepoint_param_2, .param .u64 setemergentmagneticfieldfivepoint_param_3, .param .u64 setemergentmagneticfieldfivepoint_param_4, .param .u64 setemergentmagneticfieldfivepoint_param_5, .param .f32 setemergentmagneticfieldfivepoint_param_6, .param .f32 setemergentmagneticfieldfivepoint_param_7, .param .f32 setemergentmagneticfieldfivepoint_param_8, .param .f32 setemergentmagneticfieldfivepoint_param_9, .param .u32 setemergentmagneticfieldfivepoint_param_10, .param .u32 setemergentmagneticfieldfivepoint_param_11, .param .u32 setemergentmagneticfieldfivepoint_param_12, .param .u8 setemergentmagneticfieldfivepoint_param_13 ) { .reg .pred %p<115>; .reg .b16 %rs<5>; .reg .f32 %f<495>; .reg .b32 %r<131>; .reg .b64 %rd<68>; ld.param.u8 %rs4, [setemergentmagneticfieldfivepoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldfivepoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldfivepoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldfivepoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldfivepoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldfivepoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldfivepoint_param_5]; ld.param.f32 %f205, [setemergentmagneticfieldfivepoint_param_6]; ld.param.f32 %f206, [setemergentmagneticfieldfivepoint_param_7]; 
ld.param.f32 %f207, [setemergentmagneticfieldfivepoint_param_8]; ld.param.f32 %f208, [setemergentmagneticfieldfivepoint_param_9]; ld.param.u32 %r57, [setemergentmagneticfieldfivepoint_param_10]; ld.param.u32 %r58, [setemergentmagneticfieldfivepoint_param_11]; ld.param.u32 %r59, [setemergentmagneticfieldfivepoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_106; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f209, %f2, %f2; fma.rn.f32 %f210, %f1, %f1, %f209; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f211, %f3, %f3, %f210; setp.eq.f32 %p6, %f211, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_105; bra.uni $L__BB0_2; $L__BB0_105: mov.u32 %r118, 0; st.global.u32 [%rd4], %r118; st.global.u32 [%rd5], %r118; st.global.u32 [%rd6], %r118; bra.uni $L__BB0_106; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r119, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r119, %r72, %r57; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 
0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r73, %r119, %r5; mul.wide.s32 %rd20, %r73, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f9, [%rd23]; ld.global.nc.f32 %f8, [%rd22]; ld.global.nc.f32 %f7, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r120, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r120, %r75, %r57; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r76, %r120, %r5; mul.wide.s32 %rd24, %r76, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f38, [%rd27]; ld.global.nc.f32 %f39, [%rd26]; ld.global.nc.f32 %f40, [%rd25]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r79, %r57, -1; min.s32 %r121, %r14, %r79; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r121, %r78, %r57; $L__BB0_15: setp.ge.s32 %p16, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r80, %r121, %r5; mul.wide.s32 %rd28, %r80, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f21, [%rd31]; ld.global.nc.f32 %f20, [%rd30]; ld.global.nc.f32 %f19, [%rd29]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r83, %r57, -1; min.s32 %r122, %r18, %r83; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r122, %r82, %r57; $L__BB0_20: add.s32 %r22, %r122, %r5; setp.ge.s32 %p20, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra 
$L__BB0_22; mul.wide.s32 %rd32, %r22, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f27, [%rd35]; ld.global.nc.f32 %f26, [%rd34]; ld.global.nc.f32 %f25, [%rd33]; $L__BB0_22: mul.f32 %f224, %f20, %f20; fma.rn.f32 %f225, %f19, %f19, %f224; fma.rn.f32 %f31, %f21, %f21, %f225; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f229, %f39, %f39; fma.rn.f32 %f230, %f40, %f40, %f229; fma.rn.f32 %f231, %f38, %f38, %f230; setp.eq.f32 %p24, %f231, 0f00000000; mov.f32 %f456, 0f00000000; mov.f32 %f457, %f456; mov.f32 %f458, %f456; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f232, %f8, %f8; fma.rn.f32 %f233, %f7, %f7, %f232; fma.rn.f32 %f44, %f9, %f9, %f233; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f234, %f26, %f26; fma.rn.f32 %f235, %f25, %f25, %f234; fma.rn.f32 %f48, %f27, %f27, %f235; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f236, %f39, %f39; fma.rn.f32 %f237, %f40, %f40, %f236; fma.rn.f32 %f238, %f38, %f38, %f237; setp.neu.f32 %p30, %f238, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f271, %f19, %f40; sub.f32 %f272, %f20, %f39; sub.f32 %f273, %f21, %f38; mul.f32 %f458, %f273, 0f3F000000; mul.f32 %f457, %f272, 0f3F000000; mul.f32 %f456, %f271, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f458, %f3, %f38; sub.f32 %f457, %f2, %f39; sub.f32 %f456, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f239, %f39, %f39; fma.rn.f32 %f240, %f40, %f40, %f239; fma.rn.f32 %f49, %f38, %f38, %f240; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f458, %f21, %f3; sub.f32 %f457, %f20, %f2; sub.f32 %f456, %f19, %f1; bra.uni $L__BB0_36; 
$L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f262, %f19, %f40; sub.f32 %f263, %f20, %f39; sub.f32 %f264, %f21, %f38; sub.f32 %f265, %f7, %f25; mul.f32 %f266, %f265, 0f3DAAAAAB; sub.f32 %f267, %f8, %f26; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f9, %f27; mul.f32 %f270, %f269, 0f3DAAAAAB; fma.rn.f32 %f458, %f264, 0f3F2AAAAB, %f270; fma.rn.f32 %f457, %f263, 0f3F2AAAAB, %f268; fma.rn.f32 %f456, %f262, 0f3F2AAAAB, %f266; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f241, %f7, 0f3F000000; add.f32 %f242, %f40, %f40; sub.f32 %f243, %f241, %f242; add.f32 %f244, %f39, %f39; mul.f32 %f245, %f8, 0f3F000000; sub.f32 %f246, %f245, %f244; add.f32 %f247, %f38, %f38; mul.f32 %f248, %f9, 0f3F000000; sub.f32 %f249, %f248, %f247; fma.rn.f32 %f458, %f3, 0f3FC00000, %f249; fma.rn.f32 %f457, %f2, 0f3FC00000, %f246; fma.rn.f32 %f456, %f1, 0f3FC00000, %f243; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f250, %f25, 0f3F000000; add.f32 %f251, %f19, %f19; sub.f32 %f252, %f251, %f250; add.f32 %f253, %f20, %f20; mul.f32 %f254, %f26, 0f3F000000; sub.f32 %f255, %f253, %f254; add.f32 %f256, %f21, %f21; mul.f32 %f257, %f27, 0f3F000000; sub.f32 %f258, %f256, %f257; mul.f32 %f259, %f1, 0f3FC00000; mul.f32 %f260, %f2, 0f3FC00000; mul.f32 %f261, %f3, 0f3FC00000; sub.f32 %f458, %f258, %f261; sub.f32 %f457, %f255, %f260; sub.f32 %f456, %f252, %f259; $L__BB0_36: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r123, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r123, %r85, %r58; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r86, %r123, %r4; 
mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd36, %r87, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f76, [%rd39]; ld.global.nc.f32 %f75, [%rd38]; ld.global.nc.f32 %f74, [%rd37]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r124, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r124, %r89, %r58; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r90, %r124, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd40, %r91, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f105, [%rd43]; ld.global.nc.f32 %f106, [%rd42]; ld.global.nc.f32 %f107, [%rd41]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r94, %r58, -1; min.s32 %r125, %r31, %r94; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r125, %r93, %r58; $L__BB0_49: setp.ge.s32 %p52, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r95, %r125, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd44, %r96, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f88, [%rd47]; ld.global.nc.f32 %f87, [%rd46]; ld.global.nc.f32 %f86, [%rd45]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r99, %r58, -1; min.s32 %r126, %r35, %r99; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r126, %r98, %r58; $L__BB0_54: add.s32 %r39, %r126, %r4; setp.ge.s32 %p56, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra 
$L__BB0_56; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f94, [%rd51]; ld.global.nc.f32 %f93, [%rd50]; ld.global.nc.f32 %f92, [%rd49]; $L__BB0_56: mul.f32 %f286, %f87, %f87; fma.rn.f32 %f287, %f86, %f86, %f286; fma.rn.f32 %f98, %f88, %f88, %f287; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f291, %f106, %f106; fma.rn.f32 %f292, %f107, %f107, %f291; fma.rn.f32 %f293, %f105, %f105, %f292; setp.eq.f32 %p60, %f293, 0f00000000; mov.f32 %f474, 0f00000000; mov.f32 %f475, %f474; mov.f32 %f476, %f474; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f294, %f75, %f75; fma.rn.f32 %f295, %f74, %f74, %f294; fma.rn.f32 %f111, %f76, %f76, %f295; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f296, %f93, %f93; fma.rn.f32 %f297, %f92, %f92, %f296; fma.rn.f32 %f115, %f94, %f94, %f297; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f298, %f106, %f106; fma.rn.f32 %f299, %f107, %f107, %f298; fma.rn.f32 %f300, %f105, %f105, %f299; setp.neu.f32 %p66, %f300, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f333, %f86, %f107; sub.f32 %f334, %f87, %f106; sub.f32 %f335, %f88, %f105; mul.f32 %f476, %f335, 0f3F000000; mul.f32 %f475, %f334, 0f3F000000; mul.f32 %f474, %f333, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f476, %f3, %f105; sub.f32 %f475, %f2, %f106; sub.f32 %f474, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f301, %f106, %f106; fma.rn.f32 %f302, %f107, %f107, %f301; fma.rn.f32 %f116, %f105, %f105, %f302; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f476, %f88, %f3; 
sub.f32 %f475, %f87, %f2; sub.f32 %f474, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f324, %f86, %f107; sub.f32 %f325, %f87, %f106; sub.f32 %f326, %f88, %f105; sub.f32 %f327, %f74, %f92; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f75, %f93; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f76, %f94; mul.f32 %f332, %f331, 0f3DAAAAAB; fma.rn.f32 %f476, %f326, 0f3F2AAAAB, %f332; fma.rn.f32 %f475, %f325, 0f3F2AAAAB, %f330; fma.rn.f32 %f474, %f324, 0f3F2AAAAB, %f328; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f303, %f74, 0f3F000000; add.f32 %f304, %f107, %f107; sub.f32 %f305, %f303, %f304; add.f32 %f306, %f106, %f106; mul.f32 %f307, %f75, 0f3F000000; sub.f32 %f308, %f307, %f306; add.f32 %f309, %f105, %f105; mul.f32 %f310, %f76, 0f3F000000; sub.f32 %f311, %f310, %f309; fma.rn.f32 %f476, %f3, 0f3FC00000, %f311; fma.rn.f32 %f475, %f2, 0f3FC00000, %f308; fma.rn.f32 %f474, %f1, 0f3FC00000, %f305; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f312, %f92, 0f3F000000; add.f32 %f313, %f86, %f86; sub.f32 %f314, %f313, %f312; add.f32 %f315, %f87, %f87; mul.f32 %f316, %f93, 0f3F000000; sub.f32 %f317, %f315, %f316; add.f32 %f318, %f88, %f88; mul.f32 %f319, %f94, 0f3F000000; sub.f32 %f320, %f318, %f319; mul.f32 %f321, %f1, 0f3FC00000; mul.f32 %f322, %f2, 0f3FC00000; mul.f32 %f323, %f3, 0f3FC00000; sub.f32 %f476, %f320, %f323; sub.f32 %f475, %f317, %f322; sub.f32 %f474, %f314, %f321; $L__BB0_70: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p79, %rs3, 0; add.s32 %r40, %r3, -2; @%p79 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: max.s32 %r127, %r40, 0; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r127, %r102, %r59; $L__BB0_73: setp.lt.s32 %p81, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p82, %p81, 
%p79; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p82 bra $L__BB0_75; mad.lo.s32 %r103, %r127, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd52, %r104, 4; add.s64 %rd53, %rd3, %rd52; add.s64 %rd54, %rd2, %rd52; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f143, [%rd55]; ld.global.nc.f32 %f142, [%rd54]; ld.global.nc.f32 %f141, [%rd53]; $L__BB0_75: add.s32 %r44, %r3, -1; @%p79 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r128, %r44, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r128, %r106, %r59; $L__BB0_78: setp.lt.s32 %p84, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p86, %p84, %p79; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p86 bra $L__BB0_80; mad.lo.s32 %r107, %r128, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd3, %rd56; add.s64 %rd58, %rd2, %rd56; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f172, [%rd59]; ld.global.nc.f32 %f173, [%rd58]; ld.global.nc.f32 %f174, [%rd57]; $L__BB0_80: add.s32 %r48, %r3, 1; @%p79 bra $L__BB0_82; bra.uni $L__BB0_81; $L__BB0_82: add.s32 %r111, %r59, -1; min.s32 %r129, %r48, %r111; bra.uni $L__BB0_83; $L__BB0_81: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r129, %r110, %r59; $L__BB0_83: setp.ge.s32 %p88, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p90, %p88, %p79; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p90 bra $L__BB0_85; mad.lo.s32 %r112, %r129, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd60, %r113, 4; add.s64 %rd61, %rd3, %rd60; add.s64 %rd62, %rd2, %rd60; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f155, [%rd63]; ld.global.nc.f32 %f154, [%rd62]; ld.global.nc.f32 %f153, [%rd61]; $L__BB0_85: add.s32 %r52, %r3, 2; @%p79 bra $L__BB0_87; bra.uni $L__BB0_86; $L__BB0_87: add.s32 %r116, %r59, -1; min.s32 %r130, %r52, %r116; bra.uni $L__BB0_88; $L__BB0_86: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r130, %r115, %r59; $L__BB0_88: 
mad.lo.s32 %r117, %r130, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p92, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p94, %p92, %p79; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p94 bra $L__BB0_90; mul.wide.s32 %rd64, %r56, 4; add.s64 %rd65, %rd3, %rd64; add.s64 %rd66, %rd2, %rd64; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f161, [%rd67]; ld.global.nc.f32 %f160, [%rd66]; ld.global.nc.f32 %f159, [%rd65]; $L__BB0_90: mul.f32 %f348, %f154, %f154; fma.rn.f32 %f349, %f153, %f153, %f348; fma.rn.f32 %f165, %f155, %f155, %f349; setp.eq.f32 %p95, %f165, 0f00000000; @%p95 bra $L__BB0_91; bra.uni $L__BB0_92; $L__BB0_91: mul.f32 %f353, %f173, %f173; fma.rn.f32 %f354, %f174, %f174, %f353; fma.rn.f32 %f355, %f172, %f172, %f354; setp.eq.f32 %p96, %f355, 0f00000000; mov.f32 %f492, 0f00000000; mov.f32 %f493, %f492; mov.f32 %f494, %f492; @%p96 bra $L__BB0_104; $L__BB0_92: mul.f32 %f356, %f142, %f142; fma.rn.f32 %f357, %f141, %f141, %f356; fma.rn.f32 %f178, %f143, %f143, %f357; setp.neu.f32 %p97, %f178, 0f00000000; mul.f32 %f358, %f160, %f160; fma.rn.f32 %f359, %f159, %f159, %f358; fma.rn.f32 %f182, %f161, %f161, %f359; setp.neu.f32 %p98, %f182, 0f00000000; and.pred %p99, %p97, %p98; or.pred %p101, %p95, %p99; @%p101 bra $L__BB0_94; mul.f32 %f360, %f173, %f173; fma.rn.f32 %f361, %f174, %f174, %f360; fma.rn.f32 %f362, %f172, %f172, %f361; setp.neu.f32 %p102, %f362, 0f00000000; @%p102 bra $L__BB0_103; bra.uni $L__BB0_94; $L__BB0_103: sub.f32 %f395, %f153, %f174; sub.f32 %f396, %f154, %f173; sub.f32 %f397, %f155, %f172; mul.f32 %f494, %f397, 0f3F000000; mul.f32 %f493, %f396, 0f3F000000; mul.f32 %f492, %f395, 0f3F000000; bra.uni $L__BB0_104; $L__BB0_94: setp.eq.f32 %p103, %f178, 0f00000000; and.pred %p105, %p103, %p95; @%p105 bra $L__BB0_102; bra.uni $L__BB0_95; $L__BB0_102: sub.f32 %f494, %f3, %f172; sub.f32 %f493, %f2, %f173; sub.f32 %f492, %f1, %f174; bra.uni $L__BB0_104; $L__BB0_95: setp.eq.f32 %p106, %f182, 0f00000000; mul.f32 %f363, %f173, %f173; 
fma.rn.f32 %f364, %f174, %f174, %f363; fma.rn.f32 %f183, %f172, %f172, %f364; setp.eq.f32 %p107, %f183, 0f00000000; and.pred %p108, %p107, %p106; @%p108 bra $L__BB0_101; bra.uni $L__BB0_96; $L__BB0_101: sub.f32 %f494, %f155, %f3; sub.f32 %f493, %f154, %f2; sub.f32 %f492, %f153, %f1; bra.uni $L__BB0_104; $L__BB0_96: setp.neu.f32 %p110, %f165, 0f00000000; or.pred %p111, %p103, %p110; @%p111 bra $L__BB0_98; bra.uni $L__BB0_97; $L__BB0_98: setp.neu.f32 %p112, %f183, 0f00000000; or.pred %p114, %p106, %p112; @%p114 bra $L__BB0_100; bra.uni $L__BB0_99; $L__BB0_100: sub.f32 %f386, %f153, %f174; sub.f32 %f387, %f154, %f173; sub.f32 %f388, %f155, %f172; sub.f32 %f389, %f141, %f159; mul.f32 %f390, %f389, 0f3DAAAAAB; sub.f32 %f391, %f142, %f160; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f143, %f161; mul.f32 %f394, %f393, 0f3DAAAAAB; fma.rn.f32 %f494, %f388, 0f3F2AAAAB, %f394; fma.rn.f32 %f493, %f387, 0f3F2AAAAB, %f392; fma.rn.f32 %f492, %f386, 0f3F2AAAAB, %f390; bra.uni $L__BB0_104; $L__BB0_97: mul.f32 %f365, %f141, 0f3F000000; add.f32 %f366, %f174, %f174; sub.f32 %f367, %f365, %f366; add.f32 %f368, %f173, %f173; mul.f32 %f369, %f142, 0f3F000000; sub.f32 %f370, %f369, %f368; add.f32 %f371, %f172, %f172; mul.f32 %f372, %f143, 0f3F000000; sub.f32 %f373, %f372, %f371; fma.rn.f32 %f494, %f3, 0f3FC00000, %f373; fma.rn.f32 %f493, %f2, 0f3FC00000, %f370; fma.rn.f32 %f492, %f1, 0f3FC00000, %f367; bra.uni $L__BB0_104; $L__BB0_99: mul.f32 %f374, %f159, 0f3F000000; add.f32 %f375, %f153, %f153; sub.f32 %f376, %f375, %f374; add.f32 %f377, %f154, %f154; mul.f32 %f378, %f160, 0f3F000000; sub.f32 %f379, %f377, %f378; add.f32 %f380, %f155, %f155; mul.f32 %f381, %f161, 0f3F000000; sub.f32 %f382, %f380, %f381; mul.f32 %f383, %f1, 0f3FC00000; mul.f32 %f384, %f2, 0f3FC00000; mul.f32 %f385, %f3, 0f3FC00000; sub.f32 %f494, %f382, %f385; sub.f32 %f493, %f379, %f384; sub.f32 %f492, %f376, %f383; $L__BB0_104: mul.f32 %f398, %f475, %f494; mul.f32 %f399, %f476, %f493; sub.f32 %f400, %f398, %f399; 
mul.f32 %f401, %f476, %f492; mul.f32 %f402, %f474, %f494; sub.f32 %f403, %f401, %f402; mul.f32 %f404, %f474, %f493; mul.f32 %f405, %f475, %f492; sub.f32 %f406, %f404, %f405; mul.f32 %f407, %f458, %f493; mul.f32 %f408, %f457, %f494; sub.f32 %f409, %f407, %f408; mul.f32 %f410, %f456, %f494; mul.f32 %f411, %f458, %f492; sub.f32 %f412, %f410, %f411; mul.f32 %f413, %f457, %f492; mul.f32 %f414, %f456, %f493; sub.f32 %f415, %f413, %f414; mul.f32 %f416, %f457, %f476; mul.f32 %f417, %f458, %f475; sub.f32 %f418, %f416, %f417; mul.f32 %f419, %f458, %f474; mul.f32 %f420, %f456, %f476; sub.f32 %f421, %f419, %f420; mul.f32 %f422, %f456, %f475; mul.f32 %f423, %f457, %f474; sub.f32 %f424, %f422, %f423; add.f32 %f425, %f205, %f205; mul.f32 %f426, %f425, %f206; mul.f32 %f427, %f2, %f403; fma.rn.f32 %f428, %f1, %f400, %f427; fma.rn.f32 %f429, %f3, %f406, %f428; mul.f32 %f430, %f426, %f429; st.global.f32 [%rd4], %f430; mul.f32 %f431, %f425, %f207; mul.f32 %f432, %f2, %f412; fma.rn.f32 %f433, %f1, %f409, %f432; fma.rn.f32 %f434, %f3, %f415, %f433; mul.f32 %f435, %f431, %f434; st.global.f32 [%rd5], %f435; mul.f32 %f436, %f425, %f208; mul.f32 %f437, %f2, %f421; fma.rn.f32 %f438, %f1, %f418, %f437; fma.rn.f32 %f439, %f3, %f424, %f438; mul.f32 %f440, %f436, %f439; st.global.f32 [%rd6], %f440; $L__BB0_106: ret; } ` setemergentmagneticfieldfivepoint_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl setemergentmagneticfieldfivepoint .visible .entry setemergentmagneticfieldfivepoint( .param .u64 setemergentmagneticfieldfivepoint_param_0, .param .u64 setemergentmagneticfieldfivepoint_param_1, .param .u64 setemergentmagneticfieldfivepoint_param_2, .param .u64 setemergentmagneticfieldfivepoint_param_3, .param .u64 setemergentmagneticfieldfivepoint_param_4, .param .u64 setemergentmagneticfieldfivepoint_param_5, .param .f32 setemergentmagneticfieldfivepoint_param_6, .param .f32 setemergentmagneticfieldfivepoint_param_7, .param .f32 setemergentmagneticfieldfivepoint_param_8, .param 
.f32 setemergentmagneticfieldfivepoint_param_9, .param .u32 setemergentmagneticfieldfivepoint_param_10, .param .u32 setemergentmagneticfieldfivepoint_param_11, .param .u32 setemergentmagneticfieldfivepoint_param_12, .param .u8 setemergentmagneticfieldfivepoint_param_13 ) { .reg .pred %p<115>; .reg .b16 %rs<5>; .reg .f32 %f<495>; .reg .b32 %r<131>; .reg .b64 %rd<68>; ld.param.u8 %rs4, [setemergentmagneticfieldfivepoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldfivepoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldfivepoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldfivepoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldfivepoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldfivepoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldfivepoint_param_5]; ld.param.f32 %f205, [setemergentmagneticfieldfivepoint_param_6]; ld.param.f32 %f206, [setemergentmagneticfieldfivepoint_param_7]; ld.param.f32 %f207, [setemergentmagneticfieldfivepoint_param_8]; ld.param.f32 %f208, [setemergentmagneticfieldfivepoint_param_9]; ld.param.u32 %r57, [setemergentmagneticfieldfivepoint_param_10]; ld.param.u32 %r58, [setemergentmagneticfieldfivepoint_param_11]; ld.param.u32 %r59, [setemergentmagneticfieldfivepoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_106; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; add.s64 
%rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f209, %f2, %f2; fma.rn.f32 %f210, %f1, %f1, %f209; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f211, %f3, %f3, %f210; setp.eq.f32 %p6, %f211, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_105; bra.uni $L__BB0_2; $L__BB0_105: mov.u32 %r118, 0; st.global.u32 [%rd4], %r118; st.global.u32 [%rd5], %r118; st.global.u32 [%rd6], %r118; bra.uni $L__BB0_106; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r119, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r119, %r72, %r57; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r73, %r119, %r5; mul.wide.s32 %rd20, %r73, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f9, [%rd23]; ld.global.nc.f32 %f8, [%rd22]; ld.global.nc.f32 %f7, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r120, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r120, %r75, %r57; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r76, %r120, %r5; mul.wide.s32 %rd24, %r76, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f38, [%rd27]; ld.global.nc.f32 %f39, [%rd26]; ld.global.nc.f32 %f40, [%rd25]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r79, %r57, -1; min.s32 
%r121, %r14, %r79; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r121, %r78, %r57; $L__BB0_15: setp.ge.s32 %p16, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r80, %r121, %r5; mul.wide.s32 %rd28, %r80, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f21, [%rd31]; ld.global.nc.f32 %f20, [%rd30]; ld.global.nc.f32 %f19, [%rd29]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r83, %r57, -1; min.s32 %r122, %r18, %r83; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r122, %r82, %r57; $L__BB0_20: add.s32 %r22, %r122, %r5; setp.ge.s32 %p20, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd32, %r22, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f27, [%rd35]; ld.global.nc.f32 %f26, [%rd34]; ld.global.nc.f32 %f25, [%rd33]; $L__BB0_22: mul.f32 %f224, %f20, %f20; fma.rn.f32 %f225, %f19, %f19, %f224; fma.rn.f32 %f31, %f21, %f21, %f225; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f229, %f39, %f39; fma.rn.f32 %f230, %f40, %f40, %f229; fma.rn.f32 %f231, %f38, %f38, %f230; setp.eq.f32 %p24, %f231, 0f00000000; mov.f32 %f456, 0f00000000; mov.f32 %f457, %f456; mov.f32 %f458, %f456; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f232, %f8, %f8; fma.rn.f32 %f233, %f7, %f7, %f232; fma.rn.f32 %f44, %f9, %f9, %f233; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f234, %f26, %f26; fma.rn.f32 %f235, %f25, %f25, %f234; fma.rn.f32 %f48, %f27, %f27, %f235; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f236, %f39, %f39; fma.rn.f32 %f237, %f40, 
%f40, %f236; fma.rn.f32 %f238, %f38, %f38, %f237; setp.neu.f32 %p30, %f238, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f271, %f19, %f40; sub.f32 %f272, %f20, %f39; sub.f32 %f273, %f21, %f38; mul.f32 %f458, %f273, 0f3F000000; mul.f32 %f457, %f272, 0f3F000000; mul.f32 %f456, %f271, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f458, %f3, %f38; sub.f32 %f457, %f2, %f39; sub.f32 %f456, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f239, %f39, %f39; fma.rn.f32 %f240, %f40, %f40, %f239; fma.rn.f32 %f49, %f38, %f38, %f240; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f458, %f21, %f3; sub.f32 %f457, %f20, %f2; sub.f32 %f456, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f262, %f19, %f40; sub.f32 %f263, %f20, %f39; sub.f32 %f264, %f21, %f38; sub.f32 %f265, %f7, %f25; mul.f32 %f266, %f265, 0f3DAAAAAB; sub.f32 %f267, %f8, %f26; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f9, %f27; mul.f32 %f270, %f269, 0f3DAAAAAB; fma.rn.f32 %f458, %f264, 0f3F2AAAAB, %f270; fma.rn.f32 %f457, %f263, 0f3F2AAAAB, %f268; fma.rn.f32 %f456, %f262, 0f3F2AAAAB, %f266; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f241, %f7, 0f3F000000; add.f32 %f242, %f40, %f40; sub.f32 %f243, %f241, %f242; add.f32 %f244, %f39, %f39; mul.f32 %f245, %f8, 0f3F000000; sub.f32 %f246, %f245, %f244; add.f32 %f247, %f38, %f38; mul.f32 %f248, %f9, 0f3F000000; sub.f32 %f249, %f248, %f247; fma.rn.f32 %f458, %f3, 0f3FC00000, %f249; fma.rn.f32 %f457, %f2, 0f3FC00000, %f246; fma.rn.f32 %f456, %f1, 0f3FC00000, %f243; 
bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f250, %f25, 0f3F000000; add.f32 %f251, %f19, %f19; sub.f32 %f252, %f251, %f250; add.f32 %f253, %f20, %f20; mul.f32 %f254, %f26, 0f3F000000; sub.f32 %f255, %f253, %f254; add.f32 %f256, %f21, %f21; mul.f32 %f257, %f27, 0f3F000000; sub.f32 %f258, %f256, %f257; mul.f32 %f259, %f1, 0f3FC00000; mul.f32 %f260, %f2, 0f3FC00000; mul.f32 %f261, %f3, 0f3FC00000; sub.f32 %f458, %f258, %f261; sub.f32 %f457, %f255, %f260; sub.f32 %f456, %f252, %f259; $L__BB0_36: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r123, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r123, %r85, %r58; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r86, %r123, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd36, %r87, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f76, [%rd39]; ld.global.nc.f32 %f75, [%rd38]; ld.global.nc.f32 %f74, [%rd37]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r124, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r124, %r89, %r58; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r90, %r124, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd40, %r91, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f105, [%rd43]; ld.global.nc.f32 %f106, [%rd42]; ld.global.nc.f32 %f107, [%rd41]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r94, %r58, -1; min.s32 %r125, %r31, %r94; bra.uni $L__BB0_49; 
$L__BB0_47: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r125, %r93, %r58; $L__BB0_49: setp.ge.s32 %p52, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r95, %r125, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd44, %r96, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f88, [%rd47]; ld.global.nc.f32 %f87, [%rd46]; ld.global.nc.f32 %f86, [%rd45]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r99, %r58, -1; min.s32 %r126, %r35, %r99; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r126, %r98, %r58; $L__BB0_54: add.s32 %r39, %r126, %r4; setp.ge.s32 %p56, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f94, [%rd51]; ld.global.nc.f32 %f93, [%rd50]; ld.global.nc.f32 %f92, [%rd49]; $L__BB0_56: mul.f32 %f286, %f87, %f87; fma.rn.f32 %f287, %f86, %f86, %f286; fma.rn.f32 %f98, %f88, %f88, %f287; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f291, %f106, %f106; fma.rn.f32 %f292, %f107, %f107, %f291; fma.rn.f32 %f293, %f105, %f105, %f292; setp.eq.f32 %p60, %f293, 0f00000000; mov.f32 %f474, 0f00000000; mov.f32 %f475, %f474; mov.f32 %f476, %f474; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f294, %f75, %f75; fma.rn.f32 %f295, %f74, %f74, %f294; fma.rn.f32 %f111, %f76, %f76, %f295; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f296, %f93, %f93; fma.rn.f32 %f297, %f92, %f92, %f296; fma.rn.f32 %f115, %f94, %f94, %f297; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; 
mul.f32 %f298, %f106, %f106; fma.rn.f32 %f299, %f107, %f107, %f298; fma.rn.f32 %f300, %f105, %f105, %f299; setp.neu.f32 %p66, %f300, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f333, %f86, %f107; sub.f32 %f334, %f87, %f106; sub.f32 %f335, %f88, %f105; mul.f32 %f476, %f335, 0f3F000000; mul.f32 %f475, %f334, 0f3F000000; mul.f32 %f474, %f333, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f476, %f3, %f105; sub.f32 %f475, %f2, %f106; sub.f32 %f474, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f301, %f106, %f106; fma.rn.f32 %f302, %f107, %f107, %f301; fma.rn.f32 %f116, %f105, %f105, %f302; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f476, %f88, %f3; sub.f32 %f475, %f87, %f2; sub.f32 %f474, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f324, %f86, %f107; sub.f32 %f325, %f87, %f106; sub.f32 %f326, %f88, %f105; sub.f32 %f327, %f74, %f92; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f75, %f93; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f76, %f94; mul.f32 %f332, %f331, 0f3DAAAAAB; fma.rn.f32 %f476, %f326, 0f3F2AAAAB, %f332; fma.rn.f32 %f475, %f325, 0f3F2AAAAB, %f330; fma.rn.f32 %f474, %f324, 0f3F2AAAAB, %f328; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f303, %f74, 0f3F000000; add.f32 %f304, %f107, %f107; sub.f32 %f305, %f303, %f304; add.f32 %f306, %f106, %f106; mul.f32 %f307, %f75, 0f3F000000; sub.f32 %f308, %f307, %f306; add.f32 %f309, %f105, %f105; mul.f32 %f310, %f76, 0f3F000000; sub.f32 %f311, %f310, %f309; fma.rn.f32 %f476, %f3, 0f3FC00000, 
%f311; fma.rn.f32 %f475, %f2, 0f3FC00000, %f308; fma.rn.f32 %f474, %f1, 0f3FC00000, %f305; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f312, %f92, 0f3F000000; add.f32 %f313, %f86, %f86; sub.f32 %f314, %f313, %f312; add.f32 %f315, %f87, %f87; mul.f32 %f316, %f93, 0f3F000000; sub.f32 %f317, %f315, %f316; add.f32 %f318, %f88, %f88; mul.f32 %f319, %f94, 0f3F000000; sub.f32 %f320, %f318, %f319; mul.f32 %f321, %f1, 0f3FC00000; mul.f32 %f322, %f2, 0f3FC00000; mul.f32 %f323, %f3, 0f3FC00000; sub.f32 %f476, %f320, %f323; sub.f32 %f475, %f317, %f322; sub.f32 %f474, %f314, %f321; $L__BB0_70: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p79, %rs3, 0; add.s32 %r40, %r3, -2; @%p79 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: max.s32 %r127, %r40, 0; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r127, %r102, %r59; $L__BB0_73: setp.lt.s32 %p81, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p82, %p81, %p79; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p82 bra $L__BB0_75; mad.lo.s32 %r103, %r127, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd52, %r104, 4; add.s64 %rd53, %rd3, %rd52; add.s64 %rd54, %rd2, %rd52; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f143, [%rd55]; ld.global.nc.f32 %f142, [%rd54]; ld.global.nc.f32 %f141, [%rd53]; $L__BB0_75: add.s32 %r44, %r3, -1; @%p79 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r128, %r44, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r128, %r106, %r59; $L__BB0_78: setp.lt.s32 %p84, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p86, %p84, %p79; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p86 bra $L__BB0_80; mad.lo.s32 %r107, %r128, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd3, %rd56; add.s64 %rd58, %rd2, %rd56; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f172, [%rd59]; ld.global.nc.f32 %f173, [%rd58]; ld.global.nc.f32 %f174, [%rd57]; $L__BB0_80: add.s32 %r48, %r3, 1; 
@%p79 bra $L__BB0_82; bra.uni $L__BB0_81; $L__BB0_82: add.s32 %r111, %r59, -1; min.s32 %r129, %r48, %r111; bra.uni $L__BB0_83; $L__BB0_81: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r129, %r110, %r59; $L__BB0_83: setp.ge.s32 %p88, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p90, %p88, %p79; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p90 bra $L__BB0_85; mad.lo.s32 %r112, %r129, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd60, %r113, 4; add.s64 %rd61, %rd3, %rd60; add.s64 %rd62, %rd2, %rd60; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f155, [%rd63]; ld.global.nc.f32 %f154, [%rd62]; ld.global.nc.f32 %f153, [%rd61]; $L__BB0_85: add.s32 %r52, %r3, 2; @%p79 bra $L__BB0_87; bra.uni $L__BB0_86; $L__BB0_87: add.s32 %r116, %r59, -1; min.s32 %r130, %r52, %r116; bra.uni $L__BB0_88; $L__BB0_86: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r130, %r115, %r59; $L__BB0_88: mad.lo.s32 %r117, %r130, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p92, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p94, %p92, %p79; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p94 bra $L__BB0_90; mul.wide.s32 %rd64, %r56, 4; add.s64 %rd65, %rd3, %rd64; add.s64 %rd66, %rd2, %rd64; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f161, [%rd67]; ld.global.nc.f32 %f160, [%rd66]; ld.global.nc.f32 %f159, [%rd65]; $L__BB0_90: mul.f32 %f348, %f154, %f154; fma.rn.f32 %f349, %f153, %f153, %f348; fma.rn.f32 %f165, %f155, %f155, %f349; setp.eq.f32 %p95, %f165, 0f00000000; @%p95 bra $L__BB0_91; bra.uni $L__BB0_92; $L__BB0_91: mul.f32 %f353, %f173, %f173; fma.rn.f32 %f354, %f174, %f174, %f353; fma.rn.f32 %f355, %f172, %f172, %f354; setp.eq.f32 %p96, %f355, 0f00000000; mov.f32 %f492, 0f00000000; mov.f32 %f493, %f492; mov.f32 %f494, %f492; @%p96 bra $L__BB0_104; $L__BB0_92: mul.f32 %f356, %f142, %f142; fma.rn.f32 %f357, %f141, %f141, %f356; fma.rn.f32 %f178, %f143, %f143, %f357; setp.neu.f32 %p97, %f178, 0f00000000; mul.f32 %f358, %f160, 
%f160; fma.rn.f32 %f359, %f159, %f159, %f358; fma.rn.f32 %f182, %f161, %f161, %f359; setp.neu.f32 %p98, %f182, 0f00000000; and.pred %p99, %p97, %p98; or.pred %p101, %p95, %p99; @%p101 bra $L__BB0_94; mul.f32 %f360, %f173, %f173; fma.rn.f32 %f361, %f174, %f174, %f360; fma.rn.f32 %f362, %f172, %f172, %f361; setp.neu.f32 %p102, %f362, 0f00000000; @%p102 bra $L__BB0_103; bra.uni $L__BB0_94; $L__BB0_103: sub.f32 %f395, %f153, %f174; sub.f32 %f396, %f154, %f173; sub.f32 %f397, %f155, %f172; mul.f32 %f494, %f397, 0f3F000000; mul.f32 %f493, %f396, 0f3F000000; mul.f32 %f492, %f395, 0f3F000000; bra.uni $L__BB0_104; $L__BB0_94: setp.eq.f32 %p103, %f178, 0f00000000; and.pred %p105, %p103, %p95; @%p105 bra $L__BB0_102; bra.uni $L__BB0_95; $L__BB0_102: sub.f32 %f494, %f3, %f172; sub.f32 %f493, %f2, %f173; sub.f32 %f492, %f1, %f174; bra.uni $L__BB0_104; $L__BB0_95: setp.eq.f32 %p106, %f182, 0f00000000; mul.f32 %f363, %f173, %f173; fma.rn.f32 %f364, %f174, %f174, %f363; fma.rn.f32 %f183, %f172, %f172, %f364; setp.eq.f32 %p107, %f183, 0f00000000; and.pred %p108, %p107, %p106; @%p108 bra $L__BB0_101; bra.uni $L__BB0_96; $L__BB0_101: sub.f32 %f494, %f155, %f3; sub.f32 %f493, %f154, %f2; sub.f32 %f492, %f153, %f1; bra.uni $L__BB0_104; $L__BB0_96: setp.neu.f32 %p110, %f165, 0f00000000; or.pred %p111, %p103, %p110; @%p111 bra $L__BB0_98; bra.uni $L__BB0_97; $L__BB0_98: setp.neu.f32 %p112, %f183, 0f00000000; or.pred %p114, %p106, %p112; @%p114 bra $L__BB0_100; bra.uni $L__BB0_99; $L__BB0_100: sub.f32 %f386, %f153, %f174; sub.f32 %f387, %f154, %f173; sub.f32 %f388, %f155, %f172; sub.f32 %f389, %f141, %f159; mul.f32 %f390, %f389, 0f3DAAAAAB; sub.f32 %f391, %f142, %f160; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f143, %f161; mul.f32 %f394, %f393, 0f3DAAAAAB; fma.rn.f32 %f494, %f388, 0f3F2AAAAB, %f394; fma.rn.f32 %f493, %f387, 0f3F2AAAAB, %f392; fma.rn.f32 %f492, %f386, 0f3F2AAAAB, %f390; bra.uni $L__BB0_104; $L__BB0_97: mul.f32 %f365, %f141, 0f3F000000; add.f32 %f366, %f174, %f174; 
sub.f32 %f367, %f365, %f366; add.f32 %f368, %f173, %f173; mul.f32 %f369, %f142, 0f3F000000; sub.f32 %f370, %f369, %f368; add.f32 %f371, %f172, %f172; mul.f32 %f372, %f143, 0f3F000000; sub.f32 %f373, %f372, %f371; fma.rn.f32 %f494, %f3, 0f3FC00000, %f373; fma.rn.f32 %f493, %f2, 0f3FC00000, %f370; fma.rn.f32 %f492, %f1, 0f3FC00000, %f367; bra.uni $L__BB0_104; $L__BB0_99: mul.f32 %f374, %f159, 0f3F000000; add.f32 %f375, %f153, %f153; sub.f32 %f376, %f375, %f374; add.f32 %f377, %f154, %f154; mul.f32 %f378, %f160, 0f3F000000; sub.f32 %f379, %f377, %f378; add.f32 %f380, %f155, %f155; mul.f32 %f381, %f161, 0f3F000000; sub.f32 %f382, %f380, %f381; mul.f32 %f383, %f1, 0f3FC00000; mul.f32 %f384, %f2, 0f3FC00000; mul.f32 %f385, %f3, 0f3FC00000; sub.f32 %f494, %f382, %f385; sub.f32 %f493, %f379, %f384; sub.f32 %f492, %f376, %f383; $L__BB0_104: mul.f32 %f398, %f475, %f494; mul.f32 %f399, %f476, %f493; sub.f32 %f400, %f398, %f399; mul.f32 %f401, %f476, %f492; mul.f32 %f402, %f474, %f494; sub.f32 %f403, %f401, %f402; mul.f32 %f404, %f474, %f493; mul.f32 %f405, %f475, %f492; sub.f32 %f406, %f404, %f405; mul.f32 %f407, %f458, %f493; mul.f32 %f408, %f457, %f494; sub.f32 %f409, %f407, %f408; mul.f32 %f410, %f456, %f494; mul.f32 %f411, %f458, %f492; sub.f32 %f412, %f410, %f411; mul.f32 %f413, %f457, %f492; mul.f32 %f414, %f456, %f493; sub.f32 %f415, %f413, %f414; mul.f32 %f416, %f457, %f476; mul.f32 %f417, %f458, %f475; sub.f32 %f418, %f416, %f417; mul.f32 %f419, %f458, %f474; mul.f32 %f420, %f456, %f476; sub.f32 %f421, %f419, %f420; mul.f32 %f422, %f456, %f475; mul.f32 %f423, %f457, %f474; sub.f32 %f424, %f422, %f423; add.f32 %f425, %f205, %f205; mul.f32 %f426, %f425, %f206; mul.f32 %f427, %f2, %f403; fma.rn.f32 %f428, %f1, %f400, %f427; fma.rn.f32 %f429, %f3, %f406, %f428; mul.f32 %f430, %f426, %f429; st.global.f32 [%rd4], %f430; mul.f32 %f431, %f425, %f207; mul.f32 %f432, %f2, %f412; fma.rn.f32 %f433, %f1, %f409, %f432; fma.rn.f32 %f434, %f3, %f415, %f433; mul.f32 %f435, %f431, 
%f434; st.global.f32 [%rd5], %f435; mul.f32 %f436, %f425, %f208; mul.f32 %f437, %f2, %f421; fma.rn.f32 %f438, %f1, %f418, %f437; fma.rn.f32 %f439, %f3, %f424, %f438; mul.f32 %f440, %f436, %f439; st.global.f32 [%rd6], %f440; $L__BB0_106: ret; } ` setemergentmagneticfieldfivepoint_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl setemergentmagneticfieldfivepoint .visible .entry setemergentmagneticfieldfivepoint( .param .u64 setemergentmagneticfieldfivepoint_param_0, .param .u64 setemergentmagneticfieldfivepoint_param_1, .param .u64 setemergentmagneticfieldfivepoint_param_2, .param .u64 setemergentmagneticfieldfivepoint_param_3, .param .u64 setemergentmagneticfieldfivepoint_param_4, .param .u64 setemergentmagneticfieldfivepoint_param_5, .param .f32 setemergentmagneticfieldfivepoint_param_6, .param .f32 setemergentmagneticfieldfivepoint_param_7, .param .f32 setemergentmagneticfieldfivepoint_param_8, .param .f32 setemergentmagneticfieldfivepoint_param_9, .param .u32 setemergentmagneticfieldfivepoint_param_10, .param .u32 setemergentmagneticfieldfivepoint_param_11, .param .u32 setemergentmagneticfieldfivepoint_param_12, .param .u8 setemergentmagneticfieldfivepoint_param_13 ) { .reg .pred %p<115>; .reg .b16 %rs<5>; .reg .f32 %f<495>; .reg .b32 %r<131>; .reg .b64 %rd<68>; ld.param.u8 %rs4, [setemergentmagneticfieldfivepoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldfivepoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldfivepoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldfivepoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldfivepoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldfivepoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldfivepoint_param_5]; ld.param.f32 %f205, [setemergentmagneticfieldfivepoint_param_6]; ld.param.f32 %f206, [setemergentmagneticfieldfivepoint_param_7]; ld.param.f32 %f207, [setemergentmagneticfieldfivepoint_param_8]; ld.param.f32 %f208, 
[setemergentmagneticfieldfivepoint_param_9]; ld.param.u32 %r57, [setemergentmagneticfieldfivepoint_param_10]; ld.param.u32 %r58, [setemergentmagneticfieldfivepoint_param_11]; ld.param.u32 %r59, [setemergentmagneticfieldfivepoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_106; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f209, %f2, %f2; fma.rn.f32 %f210, %f1, %f1, %f209; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f211, %f3, %f3, %f210; setp.eq.f32 %p6, %f211, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_105; bra.uni $L__BB0_2; $L__BB0_105: mov.u32 %r118, 0; st.global.u32 [%rd4], %r118; st.global.u32 [%rd5], %r118; st.global.u32 [%rd6], %r118; bra.uni $L__BB0_106; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r119, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r119, %r72, %r57; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra 
$L__BB0_7; add.s32 %r73, %r119, %r5; mul.wide.s32 %rd20, %r73, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f9, [%rd23]; ld.global.nc.f32 %f8, [%rd22]; ld.global.nc.f32 %f7, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r120, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r120, %r75, %r57; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r76, %r120, %r5; mul.wide.s32 %rd24, %r76, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f38, [%rd27]; ld.global.nc.f32 %f39, [%rd26]; ld.global.nc.f32 %f40, [%rd25]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r79, %r57, -1; min.s32 %r121, %r14, %r79; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r121, %r78, %r57; $L__BB0_15: setp.ge.s32 %p16, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r80, %r121, %r5; mul.wide.s32 %rd28, %r80, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f21, [%rd31]; ld.global.nc.f32 %f20, [%rd30]; ld.global.nc.f32 %f19, [%rd29]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r83, %r57, -1; min.s32 %r122, %r18, %r83; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r122, %r82, %r57; $L__BB0_20: add.s32 %r22, %r122, %r5; setp.ge.s32 %p20, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd32, %r22, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, 
%rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f27, [%rd35]; ld.global.nc.f32 %f26, [%rd34]; ld.global.nc.f32 %f25, [%rd33]; $L__BB0_22: mul.f32 %f224, %f20, %f20; fma.rn.f32 %f225, %f19, %f19, %f224; fma.rn.f32 %f31, %f21, %f21, %f225; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f229, %f39, %f39; fma.rn.f32 %f230, %f40, %f40, %f229; fma.rn.f32 %f231, %f38, %f38, %f230; setp.eq.f32 %p24, %f231, 0f00000000; mov.f32 %f456, 0f00000000; mov.f32 %f457, %f456; mov.f32 %f458, %f456; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f232, %f8, %f8; fma.rn.f32 %f233, %f7, %f7, %f232; fma.rn.f32 %f44, %f9, %f9, %f233; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f234, %f26, %f26; fma.rn.f32 %f235, %f25, %f25, %f234; fma.rn.f32 %f48, %f27, %f27, %f235; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f236, %f39, %f39; fma.rn.f32 %f237, %f40, %f40, %f236; fma.rn.f32 %f238, %f38, %f38, %f237; setp.neu.f32 %p30, %f238, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f271, %f19, %f40; sub.f32 %f272, %f20, %f39; sub.f32 %f273, %f21, %f38; mul.f32 %f458, %f273, 0f3F000000; mul.f32 %f457, %f272, 0f3F000000; mul.f32 %f456, %f271, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f458, %f3, %f38; sub.f32 %f457, %f2, %f39; sub.f32 %f456, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f239, %f39, %f39; fma.rn.f32 %f240, %f40, %f40, %f239; fma.rn.f32 %f49, %f38, %f38, %f240; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f458, %f21, %f3; sub.f32 %f457, %f20, %f2; sub.f32 %f456, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra 
$L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f262, %f19, %f40; sub.f32 %f263, %f20, %f39; sub.f32 %f264, %f21, %f38; sub.f32 %f265, %f7, %f25; mul.f32 %f266, %f265, 0f3DAAAAAB; sub.f32 %f267, %f8, %f26; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f9, %f27; mul.f32 %f270, %f269, 0f3DAAAAAB; fma.rn.f32 %f458, %f264, 0f3F2AAAAB, %f270; fma.rn.f32 %f457, %f263, 0f3F2AAAAB, %f268; fma.rn.f32 %f456, %f262, 0f3F2AAAAB, %f266; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f241, %f7, 0f3F000000; add.f32 %f242, %f40, %f40; sub.f32 %f243, %f241, %f242; add.f32 %f244, %f39, %f39; mul.f32 %f245, %f8, 0f3F000000; sub.f32 %f246, %f245, %f244; add.f32 %f247, %f38, %f38; mul.f32 %f248, %f9, 0f3F000000; sub.f32 %f249, %f248, %f247; fma.rn.f32 %f458, %f3, 0f3FC00000, %f249; fma.rn.f32 %f457, %f2, 0f3FC00000, %f246; fma.rn.f32 %f456, %f1, 0f3FC00000, %f243; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f250, %f25, 0f3F000000; add.f32 %f251, %f19, %f19; sub.f32 %f252, %f251, %f250; add.f32 %f253, %f20, %f20; mul.f32 %f254, %f26, 0f3F000000; sub.f32 %f255, %f253, %f254; add.f32 %f256, %f21, %f21; mul.f32 %f257, %f27, 0f3F000000; sub.f32 %f258, %f256, %f257; mul.f32 %f259, %f1, 0f3FC00000; mul.f32 %f260, %f2, 0f3FC00000; mul.f32 %f261, %f3, 0f3FC00000; sub.f32 %f458, %f258, %f261; sub.f32 %f457, %f255, %f260; sub.f32 %f456, %f252, %f259; $L__BB0_36: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r123, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r123, %r85, %r58; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r86, %r123, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd36, %r87, 4; add.s64 %rd37, %rd3, 
%rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f76, [%rd39]; ld.global.nc.f32 %f75, [%rd38]; ld.global.nc.f32 %f74, [%rd37]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r124, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r124, %r89, %r58; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r90, %r124, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd40, %r91, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f105, [%rd43]; ld.global.nc.f32 %f106, [%rd42]; ld.global.nc.f32 %f107, [%rd41]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r94, %r58, -1; min.s32 %r125, %r31, %r94; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r125, %r93, %r58; $L__BB0_49: setp.ge.s32 %p52, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r95, %r125, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd44, %r96, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f88, [%rd47]; ld.global.nc.f32 %f87, [%rd46]; ld.global.nc.f32 %f86, [%rd45]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r99, %r58, -1; min.s32 %r126, %r35, %r99; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r126, %r98, %r58; $L__BB0_54: add.s32 %r39, %r126, %r4; setp.ge.s32 %p56, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd48, %r100, 4; add.s64 
%rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f94, [%rd51]; ld.global.nc.f32 %f93, [%rd50]; ld.global.nc.f32 %f92, [%rd49]; $L__BB0_56: mul.f32 %f286, %f87, %f87; fma.rn.f32 %f287, %f86, %f86, %f286; fma.rn.f32 %f98, %f88, %f88, %f287; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f291, %f106, %f106; fma.rn.f32 %f292, %f107, %f107, %f291; fma.rn.f32 %f293, %f105, %f105, %f292; setp.eq.f32 %p60, %f293, 0f00000000; mov.f32 %f474, 0f00000000; mov.f32 %f475, %f474; mov.f32 %f476, %f474; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f294, %f75, %f75; fma.rn.f32 %f295, %f74, %f74, %f294; fma.rn.f32 %f111, %f76, %f76, %f295; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f296, %f93, %f93; fma.rn.f32 %f297, %f92, %f92, %f296; fma.rn.f32 %f115, %f94, %f94, %f297; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f298, %f106, %f106; fma.rn.f32 %f299, %f107, %f107, %f298; fma.rn.f32 %f300, %f105, %f105, %f299; setp.neu.f32 %p66, %f300, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f333, %f86, %f107; sub.f32 %f334, %f87, %f106; sub.f32 %f335, %f88, %f105; mul.f32 %f476, %f335, 0f3F000000; mul.f32 %f475, %f334, 0f3F000000; mul.f32 %f474, %f333, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f476, %f3, %f105; sub.f32 %f475, %f2, %f106; sub.f32 %f474, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f301, %f106, %f106; fma.rn.f32 %f302, %f107, %f107, %f301; fma.rn.f32 %f116, %f105, %f105, %f302; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f476, %f88, %f3; sub.f32 %f475, %f87, %f2; sub.f32 %f474, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: 
setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f324, %f86, %f107; sub.f32 %f325, %f87, %f106; sub.f32 %f326, %f88, %f105; sub.f32 %f327, %f74, %f92; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f75, %f93; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f76, %f94; mul.f32 %f332, %f331, 0f3DAAAAAB; fma.rn.f32 %f476, %f326, 0f3F2AAAAB, %f332; fma.rn.f32 %f475, %f325, 0f3F2AAAAB, %f330; fma.rn.f32 %f474, %f324, 0f3F2AAAAB, %f328; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f303, %f74, 0f3F000000; add.f32 %f304, %f107, %f107; sub.f32 %f305, %f303, %f304; add.f32 %f306, %f106, %f106; mul.f32 %f307, %f75, 0f3F000000; sub.f32 %f308, %f307, %f306; add.f32 %f309, %f105, %f105; mul.f32 %f310, %f76, 0f3F000000; sub.f32 %f311, %f310, %f309; fma.rn.f32 %f476, %f3, 0f3FC00000, %f311; fma.rn.f32 %f475, %f2, 0f3FC00000, %f308; fma.rn.f32 %f474, %f1, 0f3FC00000, %f305; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f312, %f92, 0f3F000000; add.f32 %f313, %f86, %f86; sub.f32 %f314, %f313, %f312; add.f32 %f315, %f87, %f87; mul.f32 %f316, %f93, 0f3F000000; sub.f32 %f317, %f315, %f316; add.f32 %f318, %f88, %f88; mul.f32 %f319, %f94, 0f3F000000; sub.f32 %f320, %f318, %f319; mul.f32 %f321, %f1, 0f3FC00000; mul.f32 %f322, %f2, 0f3FC00000; mul.f32 %f323, %f3, 0f3FC00000; sub.f32 %f476, %f320, %f323; sub.f32 %f475, %f317, %f322; sub.f32 %f474, %f314, %f321; $L__BB0_70: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p79, %rs3, 0; add.s32 %r40, %r3, -2; @%p79 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: max.s32 %r127, %r40, 0; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r127, %r102, %r59; $L__BB0_73: setp.lt.s32 %p81, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p82, %p81, %p79; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p82 bra $L__BB0_75; mad.lo.s32 
%r103, %r127, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd52, %r104, 4; add.s64 %rd53, %rd3, %rd52; add.s64 %rd54, %rd2, %rd52; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f143, [%rd55]; ld.global.nc.f32 %f142, [%rd54]; ld.global.nc.f32 %f141, [%rd53]; $L__BB0_75: add.s32 %r44, %r3, -1; @%p79 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r128, %r44, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r128, %r106, %r59; $L__BB0_78: setp.lt.s32 %p84, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p86, %p84, %p79; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p86 bra $L__BB0_80; mad.lo.s32 %r107, %r128, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd3, %rd56; add.s64 %rd58, %rd2, %rd56; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f172, [%rd59]; ld.global.nc.f32 %f173, [%rd58]; ld.global.nc.f32 %f174, [%rd57]; $L__BB0_80: add.s32 %r48, %r3, 1; @%p79 bra $L__BB0_82; bra.uni $L__BB0_81; $L__BB0_82: add.s32 %r111, %r59, -1; min.s32 %r129, %r48, %r111; bra.uni $L__BB0_83; $L__BB0_81: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r129, %r110, %r59; $L__BB0_83: setp.ge.s32 %p88, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p90, %p88, %p79; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p90 bra $L__BB0_85; mad.lo.s32 %r112, %r129, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd60, %r113, 4; add.s64 %rd61, %rd3, %rd60; add.s64 %rd62, %rd2, %rd60; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f155, [%rd63]; ld.global.nc.f32 %f154, [%rd62]; ld.global.nc.f32 %f153, [%rd61]; $L__BB0_85: add.s32 %r52, %r3, 2; @%p79 bra $L__BB0_87; bra.uni $L__BB0_86; $L__BB0_87: add.s32 %r116, %r59, -1; min.s32 %r130, %r52, %r116; bra.uni $L__BB0_88; $L__BB0_86: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r130, %r115, %r59; $L__BB0_88: mad.lo.s32 %r117, %r130, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 
%p92, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p94, %p92, %p79; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p94 bra $L__BB0_90; mul.wide.s32 %rd64, %r56, 4; add.s64 %rd65, %rd3, %rd64; add.s64 %rd66, %rd2, %rd64; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f161, [%rd67]; ld.global.nc.f32 %f160, [%rd66]; ld.global.nc.f32 %f159, [%rd65]; $L__BB0_90: mul.f32 %f348, %f154, %f154; fma.rn.f32 %f349, %f153, %f153, %f348; fma.rn.f32 %f165, %f155, %f155, %f349; setp.eq.f32 %p95, %f165, 0f00000000; @%p95 bra $L__BB0_91; bra.uni $L__BB0_92; $L__BB0_91: mul.f32 %f353, %f173, %f173; fma.rn.f32 %f354, %f174, %f174, %f353; fma.rn.f32 %f355, %f172, %f172, %f354; setp.eq.f32 %p96, %f355, 0f00000000; mov.f32 %f492, 0f00000000; mov.f32 %f493, %f492; mov.f32 %f494, %f492; @%p96 bra $L__BB0_104; $L__BB0_92: mul.f32 %f356, %f142, %f142; fma.rn.f32 %f357, %f141, %f141, %f356; fma.rn.f32 %f178, %f143, %f143, %f357; setp.neu.f32 %p97, %f178, 0f00000000; mul.f32 %f358, %f160, %f160; fma.rn.f32 %f359, %f159, %f159, %f358; fma.rn.f32 %f182, %f161, %f161, %f359; setp.neu.f32 %p98, %f182, 0f00000000; and.pred %p99, %p97, %p98; or.pred %p101, %p95, %p99; @%p101 bra $L__BB0_94; mul.f32 %f360, %f173, %f173; fma.rn.f32 %f361, %f174, %f174, %f360; fma.rn.f32 %f362, %f172, %f172, %f361; setp.neu.f32 %p102, %f362, 0f00000000; @%p102 bra $L__BB0_103; bra.uni $L__BB0_94; $L__BB0_103: sub.f32 %f395, %f153, %f174; sub.f32 %f396, %f154, %f173; sub.f32 %f397, %f155, %f172; mul.f32 %f494, %f397, 0f3F000000; mul.f32 %f493, %f396, 0f3F000000; mul.f32 %f492, %f395, 0f3F000000; bra.uni $L__BB0_104; $L__BB0_94: setp.eq.f32 %p103, %f178, 0f00000000; and.pred %p105, %p103, %p95; @%p105 bra $L__BB0_102; bra.uni $L__BB0_95; $L__BB0_102: sub.f32 %f494, %f3, %f172; sub.f32 %f493, %f2, %f173; sub.f32 %f492, %f1, %f174; bra.uni $L__BB0_104; $L__BB0_95: setp.eq.f32 %p106, %f182, 0f00000000; mul.f32 %f363, %f173, %f173; fma.rn.f32 %f364, %f174, %f174, %f363; fma.rn.f32 %f183, %f172, %f172, %f364; setp.eq.f32 
%p107, %f183, 0f00000000; and.pred %p108, %p107, %p106; @%p108 bra $L__BB0_101; bra.uni $L__BB0_96; $L__BB0_101: sub.f32 %f494, %f155, %f3; sub.f32 %f493, %f154, %f2; sub.f32 %f492, %f153, %f1; bra.uni $L__BB0_104; $L__BB0_96: setp.neu.f32 %p110, %f165, 0f00000000; or.pred %p111, %p103, %p110; @%p111 bra $L__BB0_98; bra.uni $L__BB0_97; $L__BB0_98: setp.neu.f32 %p112, %f183, 0f00000000; or.pred %p114, %p106, %p112; @%p114 bra $L__BB0_100; bra.uni $L__BB0_99; $L__BB0_100: sub.f32 %f386, %f153, %f174; sub.f32 %f387, %f154, %f173; sub.f32 %f388, %f155, %f172; sub.f32 %f389, %f141, %f159; mul.f32 %f390, %f389, 0f3DAAAAAB; sub.f32 %f391, %f142, %f160; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f143, %f161; mul.f32 %f394, %f393, 0f3DAAAAAB; fma.rn.f32 %f494, %f388, 0f3F2AAAAB, %f394; fma.rn.f32 %f493, %f387, 0f3F2AAAAB, %f392; fma.rn.f32 %f492, %f386, 0f3F2AAAAB, %f390; bra.uni $L__BB0_104; $L__BB0_97: mul.f32 %f365, %f141, 0f3F000000; add.f32 %f366, %f174, %f174; sub.f32 %f367, %f365, %f366; add.f32 %f368, %f173, %f173; mul.f32 %f369, %f142, 0f3F000000; sub.f32 %f370, %f369, %f368; add.f32 %f371, %f172, %f172; mul.f32 %f372, %f143, 0f3F000000; sub.f32 %f373, %f372, %f371; fma.rn.f32 %f494, %f3, 0f3FC00000, %f373; fma.rn.f32 %f493, %f2, 0f3FC00000, %f370; fma.rn.f32 %f492, %f1, 0f3FC00000, %f367; bra.uni $L__BB0_104; $L__BB0_99: mul.f32 %f374, %f159, 0f3F000000; add.f32 %f375, %f153, %f153; sub.f32 %f376, %f375, %f374; add.f32 %f377, %f154, %f154; mul.f32 %f378, %f160, 0f3F000000; sub.f32 %f379, %f377, %f378; add.f32 %f380, %f155, %f155; mul.f32 %f381, %f161, 0f3F000000; sub.f32 %f382, %f380, %f381; mul.f32 %f383, %f1, 0f3FC00000; mul.f32 %f384, %f2, 0f3FC00000; mul.f32 %f385, %f3, 0f3FC00000; sub.f32 %f494, %f382, %f385; sub.f32 %f493, %f379, %f384; sub.f32 %f492, %f376, %f383; $L__BB0_104: mul.f32 %f398, %f475, %f494; mul.f32 %f399, %f476, %f493; sub.f32 %f400, %f398, %f399; mul.f32 %f401, %f476, %f492; mul.f32 %f402, %f474, %f494; sub.f32 %f403, %f401, %f402; 
mul.f32 %f404, %f474, %f493; mul.f32 %f405, %f475, %f492; sub.f32 %f406, %f404, %f405; mul.f32 %f407, %f458, %f493; mul.f32 %f408, %f457, %f494; sub.f32 %f409, %f407, %f408; mul.f32 %f410, %f456, %f494; mul.f32 %f411, %f458, %f492; sub.f32 %f412, %f410, %f411; mul.f32 %f413, %f457, %f492; mul.f32 %f414, %f456, %f493; sub.f32 %f415, %f413, %f414; mul.f32 %f416, %f457, %f476; mul.f32 %f417, %f458, %f475; sub.f32 %f418, %f416, %f417; mul.f32 %f419, %f458, %f474; mul.f32 %f420, %f456, %f476; sub.f32 %f421, %f419, %f420; mul.f32 %f422, %f456, %f475; mul.f32 %f423, %f457, %f474; sub.f32 %f424, %f422, %f423; add.f32 %f425, %f205, %f205; mul.f32 %f426, %f425, %f206; mul.f32 %f427, %f2, %f403; fma.rn.f32 %f428, %f1, %f400, %f427; fma.rn.f32 %f429, %f3, %f406, %f428; mul.f32 %f430, %f426, %f429; st.global.f32 [%rd4], %f430; mul.f32 %f431, %f425, %f207; mul.f32 %f432, %f2, %f412; fma.rn.f32 %f433, %f1, %f409, %f432; fma.rn.f32 %f434, %f3, %f415, %f433; mul.f32 %f435, %f431, %f434; st.global.f32 [%rd5], %f435; mul.f32 %f436, %f425, %f208; mul.f32 %f437, %f2, %f421; fma.rn.f32 %f438, %f1, %f418, %f437; fma.rn.f32 %f439, %f3, %f424, %f438; mul.f32 %f440, %f436, %f439; st.global.f32 [%rd6], %f440; $L__BB0_106: ret; } ` setemergentmagneticfieldfivepoint_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl setemergentmagneticfieldfivepoint .visible .entry setemergentmagneticfieldfivepoint( .param .u64 setemergentmagneticfieldfivepoint_param_0, .param .u64 setemergentmagneticfieldfivepoint_param_1, .param .u64 setemergentmagneticfieldfivepoint_param_2, .param .u64 setemergentmagneticfieldfivepoint_param_3, .param .u64 setemergentmagneticfieldfivepoint_param_4, .param .u64 setemergentmagneticfieldfivepoint_param_5, .param .f32 setemergentmagneticfieldfivepoint_param_6, .param .f32 setemergentmagneticfieldfivepoint_param_7, .param .f32 setemergentmagneticfieldfivepoint_param_8, .param .f32 setemergentmagneticfieldfivepoint_param_9, .param .u32 
setemergentmagneticfieldfivepoint_param_10, .param .u32 setemergentmagneticfieldfivepoint_param_11, .param .u32 setemergentmagneticfieldfivepoint_param_12, .param .u8 setemergentmagneticfieldfivepoint_param_13 ) { .reg .pred %p<115>; .reg .b16 %rs<5>; .reg .f32 %f<495>; .reg .b32 %r<131>; .reg .b64 %rd<68>; ld.param.u8 %rs4, [setemergentmagneticfieldfivepoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldfivepoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldfivepoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldfivepoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldfivepoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldfivepoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldfivepoint_param_5]; ld.param.f32 %f205, [setemergentmagneticfieldfivepoint_param_6]; ld.param.f32 %f206, [setemergentmagneticfieldfivepoint_param_7]; ld.param.f32 %f207, [setemergentmagneticfieldfivepoint_param_8]; ld.param.f32 %f208, [setemergentmagneticfieldfivepoint_param_9]; ld.param.u32 %r57, [setemergentmagneticfieldfivepoint_param_10]; ld.param.u32 %r58, [setemergentmagneticfieldfivepoint_param_11]; ld.param.u32 %r59, [setemergentmagneticfieldfivepoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_106; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; 
ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f209, %f2, %f2; fma.rn.f32 %f210, %f1, %f1, %f209; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f211, %f3, %f3, %f210; setp.eq.f32 %p6, %f211, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_105; bra.uni $L__BB0_2; $L__BB0_105: mov.u32 %r118, 0; st.global.u32 [%rd4], %r118; st.global.u32 [%rd5], %r118; st.global.u32 [%rd6], %r118; bra.uni $L__BB0_106; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r119, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r119, %r72, %r57; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r73, %r119, %r5; mul.wide.s32 %rd20, %r73, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f9, [%rd23]; ld.global.nc.f32 %f8, [%rd22]; ld.global.nc.f32 %f7, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r120, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r120, %r75, %r57; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r76, %r120, %r5; mul.wide.s32 %rd24, %r76, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f38, [%rd27]; ld.global.nc.f32 %f39, [%rd26]; ld.global.nc.f32 %f40, [%rd25]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r79, %r57, -1; min.s32 %r121, %r14, %r79; bra.uni $L__BB0_15; $L__BB0_13: 
rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r121, %r78, %r57; $L__BB0_15: setp.ge.s32 %p16, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r80, %r121, %r5; mul.wide.s32 %rd28, %r80, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f21, [%rd31]; ld.global.nc.f32 %f20, [%rd30]; ld.global.nc.f32 %f19, [%rd29]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r83, %r57, -1; min.s32 %r122, %r18, %r83; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r122, %r82, %r57; $L__BB0_20: add.s32 %r22, %r122, %r5; setp.ge.s32 %p20, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd32, %r22, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f27, [%rd35]; ld.global.nc.f32 %f26, [%rd34]; ld.global.nc.f32 %f25, [%rd33]; $L__BB0_22: mul.f32 %f224, %f20, %f20; fma.rn.f32 %f225, %f19, %f19, %f224; fma.rn.f32 %f31, %f21, %f21, %f225; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f229, %f39, %f39; fma.rn.f32 %f230, %f40, %f40, %f229; fma.rn.f32 %f231, %f38, %f38, %f230; setp.eq.f32 %p24, %f231, 0f00000000; mov.f32 %f456, 0f00000000; mov.f32 %f457, %f456; mov.f32 %f458, %f456; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f232, %f8, %f8; fma.rn.f32 %f233, %f7, %f7, %f232; fma.rn.f32 %f44, %f9, %f9, %f233; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f234, %f26, %f26; fma.rn.f32 %f235, %f25, %f25, %f234; fma.rn.f32 %f48, %f27, %f27, %f235; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f236, %f39, %f39; fma.rn.f32 %f237, %f40, %f40, %f236; fma.rn.f32 %f238, %f38, %f38, %f237; 
setp.neu.f32 %p30, %f238, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f271, %f19, %f40; sub.f32 %f272, %f20, %f39; sub.f32 %f273, %f21, %f38; mul.f32 %f458, %f273, 0f3F000000; mul.f32 %f457, %f272, 0f3F000000; mul.f32 %f456, %f271, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f458, %f3, %f38; sub.f32 %f457, %f2, %f39; sub.f32 %f456, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f239, %f39, %f39; fma.rn.f32 %f240, %f40, %f40, %f239; fma.rn.f32 %f49, %f38, %f38, %f240; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f458, %f21, %f3; sub.f32 %f457, %f20, %f2; sub.f32 %f456, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f262, %f19, %f40; sub.f32 %f263, %f20, %f39; sub.f32 %f264, %f21, %f38; sub.f32 %f265, %f7, %f25; mul.f32 %f266, %f265, 0f3DAAAAAB; sub.f32 %f267, %f8, %f26; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f9, %f27; mul.f32 %f270, %f269, 0f3DAAAAAB; fma.rn.f32 %f458, %f264, 0f3F2AAAAB, %f270; fma.rn.f32 %f457, %f263, 0f3F2AAAAB, %f268; fma.rn.f32 %f456, %f262, 0f3F2AAAAB, %f266; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f241, %f7, 0f3F000000; add.f32 %f242, %f40, %f40; sub.f32 %f243, %f241, %f242; add.f32 %f244, %f39, %f39; mul.f32 %f245, %f8, 0f3F000000; sub.f32 %f246, %f245, %f244; add.f32 %f247, %f38, %f38; mul.f32 %f248, %f9, 0f3F000000; sub.f32 %f249, %f248, %f247; fma.rn.f32 %f458, %f3, 0f3FC00000, %f249; fma.rn.f32 %f457, %f2, 0f3FC00000, %f246; fma.rn.f32 %f456, %f1, 0f3FC00000, %f243; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f250, %f25, 
0f3F000000; add.f32 %f251, %f19, %f19; sub.f32 %f252, %f251, %f250; add.f32 %f253, %f20, %f20; mul.f32 %f254, %f26, 0f3F000000; sub.f32 %f255, %f253, %f254; add.f32 %f256, %f21, %f21; mul.f32 %f257, %f27, 0f3F000000; sub.f32 %f258, %f256, %f257; mul.f32 %f259, %f1, 0f3FC00000; mul.f32 %f260, %f2, 0f3FC00000; mul.f32 %f261, %f3, 0f3FC00000; sub.f32 %f458, %f258, %f261; sub.f32 %f457, %f255, %f260; sub.f32 %f456, %f252, %f259; $L__BB0_36: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r123, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r123, %r85, %r58; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r86, %r123, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd36, %r87, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f76, [%rd39]; ld.global.nc.f32 %f75, [%rd38]; ld.global.nc.f32 %f74, [%rd37]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r124, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r124, %r89, %r58; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r90, %r124, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd40, %r91, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f105, [%rd43]; ld.global.nc.f32 %f106, [%rd42]; ld.global.nc.f32 %f107, [%rd41]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r94, %r58, -1; min.s32 %r125, %r31, %r94; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, 
%r58; rem.s32 %r125, %r93, %r58; $L__BB0_49: setp.ge.s32 %p52, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r95, %r125, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd44, %r96, 4; add.s64 %rd45, %rd3, %rd44; add.s64 %rd46, %rd2, %rd44; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f88, [%rd47]; ld.global.nc.f32 %f87, [%rd46]; ld.global.nc.f32 %f86, [%rd45]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r99, %r58, -1; min.s32 %r126, %r35, %r99; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r126, %r98, %r58; $L__BB0_54: add.s32 %r39, %r126, %r4; setp.ge.s32 %p56, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd3, %rd48; add.s64 %rd50, %rd2, %rd48; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f94, [%rd51]; ld.global.nc.f32 %f93, [%rd50]; ld.global.nc.f32 %f92, [%rd49]; $L__BB0_56: mul.f32 %f286, %f87, %f87; fma.rn.f32 %f287, %f86, %f86, %f286; fma.rn.f32 %f98, %f88, %f88, %f287; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f291, %f106, %f106; fma.rn.f32 %f292, %f107, %f107, %f291; fma.rn.f32 %f293, %f105, %f105, %f292; setp.eq.f32 %p60, %f293, 0f00000000; mov.f32 %f474, 0f00000000; mov.f32 %f475, %f474; mov.f32 %f476, %f474; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f294, %f75, %f75; fma.rn.f32 %f295, %f74, %f74, %f294; fma.rn.f32 %f111, %f76, %f76, %f295; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f296, %f93, %f93; fma.rn.f32 %f297, %f92, %f92, %f296; fma.rn.f32 %f115, %f94, %f94, %f297; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f298, %f106, %f106; fma.rn.f32 %f299, %f107, 
%f107, %f298; fma.rn.f32 %f300, %f105, %f105, %f299; setp.neu.f32 %p66, %f300, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f333, %f86, %f107; sub.f32 %f334, %f87, %f106; sub.f32 %f335, %f88, %f105; mul.f32 %f476, %f335, 0f3F000000; mul.f32 %f475, %f334, 0f3F000000; mul.f32 %f474, %f333, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f476, %f3, %f105; sub.f32 %f475, %f2, %f106; sub.f32 %f474, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f301, %f106, %f106; fma.rn.f32 %f302, %f107, %f107, %f301; fma.rn.f32 %f116, %f105, %f105, %f302; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f476, %f88, %f3; sub.f32 %f475, %f87, %f2; sub.f32 %f474, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f324, %f86, %f107; sub.f32 %f325, %f87, %f106; sub.f32 %f326, %f88, %f105; sub.f32 %f327, %f74, %f92; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f75, %f93; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f76, %f94; mul.f32 %f332, %f331, 0f3DAAAAAB; fma.rn.f32 %f476, %f326, 0f3F2AAAAB, %f332; fma.rn.f32 %f475, %f325, 0f3F2AAAAB, %f330; fma.rn.f32 %f474, %f324, 0f3F2AAAAB, %f328; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f303, %f74, 0f3F000000; add.f32 %f304, %f107, %f107; sub.f32 %f305, %f303, %f304; add.f32 %f306, %f106, %f106; mul.f32 %f307, %f75, 0f3F000000; sub.f32 %f308, %f307, %f306; add.f32 %f309, %f105, %f105; mul.f32 %f310, %f76, 0f3F000000; sub.f32 %f311, %f310, %f309; fma.rn.f32 %f476, %f3, 0f3FC00000, %f311; fma.rn.f32 %f475, %f2, 0f3FC00000, %f308; fma.rn.f32 
%f474, %f1, 0f3FC00000, %f305; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f312, %f92, 0f3F000000; add.f32 %f313, %f86, %f86; sub.f32 %f314, %f313, %f312; add.f32 %f315, %f87, %f87; mul.f32 %f316, %f93, 0f3F000000; sub.f32 %f317, %f315, %f316; add.f32 %f318, %f88, %f88; mul.f32 %f319, %f94, 0f3F000000; sub.f32 %f320, %f318, %f319; mul.f32 %f321, %f1, 0f3FC00000; mul.f32 %f322, %f2, 0f3FC00000; mul.f32 %f323, %f3, 0f3FC00000; sub.f32 %f476, %f320, %f323; sub.f32 %f475, %f317, %f322; sub.f32 %f474, %f314, %f321; $L__BB0_70: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p79, %rs3, 0; add.s32 %r40, %r3, -2; @%p79 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: max.s32 %r127, %r40, 0; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r127, %r102, %r59; $L__BB0_73: setp.lt.s32 %p81, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p82, %p81, %p79; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p82 bra $L__BB0_75; mad.lo.s32 %r103, %r127, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd52, %r104, 4; add.s64 %rd53, %rd3, %rd52; add.s64 %rd54, %rd2, %rd52; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f143, [%rd55]; ld.global.nc.f32 %f142, [%rd54]; ld.global.nc.f32 %f141, [%rd53]; $L__BB0_75: add.s32 %r44, %r3, -1; @%p79 bra $L__BB0_77; bra.uni $L__BB0_76; $L__BB0_77: max.s32 %r128, %r44, 0; bra.uni $L__BB0_78; $L__BB0_76: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r128, %r106, %r59; $L__BB0_78: setp.lt.s32 %p84, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p86, %p84, %p79; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p86 bra $L__BB0_80; mad.lo.s32 %r107, %r128, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd3, %rd56; add.s64 %rd58, %rd2, %rd56; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f172, [%rd59]; ld.global.nc.f32 %f173, [%rd58]; ld.global.nc.f32 %f174, [%rd57]; $L__BB0_80: add.s32 %r48, %r3, 1; @%p79 bra $L__BB0_82; bra.uni $L__BB0_81; $L__BB0_82: 
add.s32 %r111, %r59, -1; min.s32 %r129, %r48, %r111; bra.uni $L__BB0_83; $L__BB0_81: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r129, %r110, %r59; $L__BB0_83: setp.ge.s32 %p88, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p90, %p88, %p79; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p90 bra $L__BB0_85; mad.lo.s32 %r112, %r129, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd60, %r113, 4; add.s64 %rd61, %rd3, %rd60; add.s64 %rd62, %rd2, %rd60; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f155, [%rd63]; ld.global.nc.f32 %f154, [%rd62]; ld.global.nc.f32 %f153, [%rd61]; $L__BB0_85: add.s32 %r52, %r3, 2; @%p79 bra $L__BB0_87; bra.uni $L__BB0_86; $L__BB0_87: add.s32 %r116, %r59, -1; min.s32 %r130, %r52, %r116; bra.uni $L__BB0_88; $L__BB0_86: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r130, %r115, %r59; $L__BB0_88: mad.lo.s32 %r117, %r130, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p92, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p94, %p92, %p79; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p94 bra $L__BB0_90; mul.wide.s32 %rd64, %r56, 4; add.s64 %rd65, %rd3, %rd64; add.s64 %rd66, %rd2, %rd64; add.s64 %rd67, %rd1, %rd64; ld.global.nc.f32 %f161, [%rd67]; ld.global.nc.f32 %f160, [%rd66]; ld.global.nc.f32 %f159, [%rd65]; $L__BB0_90: mul.f32 %f348, %f154, %f154; fma.rn.f32 %f349, %f153, %f153, %f348; fma.rn.f32 %f165, %f155, %f155, %f349; setp.eq.f32 %p95, %f165, 0f00000000; @%p95 bra $L__BB0_91; bra.uni $L__BB0_92; $L__BB0_91: mul.f32 %f353, %f173, %f173; fma.rn.f32 %f354, %f174, %f174, %f353; fma.rn.f32 %f355, %f172, %f172, %f354; setp.eq.f32 %p96, %f355, 0f00000000; mov.f32 %f492, 0f00000000; mov.f32 %f493, %f492; mov.f32 %f494, %f492; @%p96 bra $L__BB0_104; $L__BB0_92: mul.f32 %f356, %f142, %f142; fma.rn.f32 %f357, %f141, %f141, %f356; fma.rn.f32 %f178, %f143, %f143, %f357; setp.neu.f32 %p97, %f178, 0f00000000; mul.f32 %f358, %f160, %f160; fma.rn.f32 %f359, %f159, %f159, %f358; fma.rn.f32 
%f182, %f161, %f161, %f359; setp.neu.f32 %p98, %f182, 0f00000000; and.pred %p99, %p97, %p98; or.pred %p101, %p95, %p99; @%p101 bra $L__BB0_94; mul.f32 %f360, %f173, %f173; fma.rn.f32 %f361, %f174, %f174, %f360; fma.rn.f32 %f362, %f172, %f172, %f361; setp.neu.f32 %p102, %f362, 0f00000000; @%p102 bra $L__BB0_103; bra.uni $L__BB0_94; $L__BB0_103: sub.f32 %f395, %f153, %f174; sub.f32 %f396, %f154, %f173; sub.f32 %f397, %f155, %f172; mul.f32 %f494, %f397, 0f3F000000; mul.f32 %f493, %f396, 0f3F000000; mul.f32 %f492, %f395, 0f3F000000; bra.uni $L__BB0_104; $L__BB0_94: setp.eq.f32 %p103, %f178, 0f00000000; and.pred %p105, %p103, %p95; @%p105 bra $L__BB0_102; bra.uni $L__BB0_95; $L__BB0_102: sub.f32 %f494, %f3, %f172; sub.f32 %f493, %f2, %f173; sub.f32 %f492, %f1, %f174; bra.uni $L__BB0_104; $L__BB0_95: setp.eq.f32 %p106, %f182, 0f00000000; mul.f32 %f363, %f173, %f173; fma.rn.f32 %f364, %f174, %f174, %f363; fma.rn.f32 %f183, %f172, %f172, %f364; setp.eq.f32 %p107, %f183, 0f00000000; and.pred %p108, %p107, %p106; @%p108 bra $L__BB0_101; bra.uni $L__BB0_96; $L__BB0_101: sub.f32 %f494, %f155, %f3; sub.f32 %f493, %f154, %f2; sub.f32 %f492, %f153, %f1; bra.uni $L__BB0_104; $L__BB0_96: setp.neu.f32 %p110, %f165, 0f00000000; or.pred %p111, %p103, %p110; @%p111 bra $L__BB0_98; bra.uni $L__BB0_97; $L__BB0_98: setp.neu.f32 %p112, %f183, 0f00000000; or.pred %p114, %p106, %p112; @%p114 bra $L__BB0_100; bra.uni $L__BB0_99; $L__BB0_100: sub.f32 %f386, %f153, %f174; sub.f32 %f387, %f154, %f173; sub.f32 %f388, %f155, %f172; sub.f32 %f389, %f141, %f159; mul.f32 %f390, %f389, 0f3DAAAAAB; sub.f32 %f391, %f142, %f160; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f143, %f161; mul.f32 %f394, %f393, 0f3DAAAAAB; fma.rn.f32 %f494, %f388, 0f3F2AAAAB, %f394; fma.rn.f32 %f493, %f387, 0f3F2AAAAB, %f392; fma.rn.f32 %f492, %f386, 0f3F2AAAAB, %f390; bra.uni $L__BB0_104; $L__BB0_97: mul.f32 %f365, %f141, 0f3F000000; add.f32 %f366, %f174, %f174; sub.f32 %f367, %f365, %f366; add.f32 %f368, %f173, %f173; 
mul.f32 %f369, %f142, 0f3F000000; sub.f32 %f370, %f369, %f368; add.f32 %f371, %f172, %f172; mul.f32 %f372, %f143, 0f3F000000; sub.f32 %f373, %f372, %f371; fma.rn.f32 %f494, %f3, 0f3FC00000, %f373; fma.rn.f32 %f493, %f2, 0f3FC00000, %f370; fma.rn.f32 %f492, %f1, 0f3FC00000, %f367; bra.uni $L__BB0_104; $L__BB0_99: mul.f32 %f374, %f159, 0f3F000000; add.f32 %f375, %f153, %f153; sub.f32 %f376, %f375, %f374; add.f32 %f377, %f154, %f154; mul.f32 %f378, %f160, 0f3F000000; sub.f32 %f379, %f377, %f378; add.f32 %f380, %f155, %f155; mul.f32 %f381, %f161, 0f3F000000; sub.f32 %f382, %f380, %f381; mul.f32 %f383, %f1, 0f3FC00000; mul.f32 %f384, %f2, 0f3FC00000; mul.f32 %f385, %f3, 0f3FC00000; sub.f32 %f494, %f382, %f385; sub.f32 %f493, %f379, %f384; sub.f32 %f492, %f376, %f383; $L__BB0_104: mul.f32 %f398, %f475, %f494; mul.f32 %f399, %f476, %f493; sub.f32 %f400, %f398, %f399; mul.f32 %f401, %f476, %f492; mul.f32 %f402, %f474, %f494; sub.f32 %f403, %f401, %f402; mul.f32 %f404, %f474, %f493; mul.f32 %f405, %f475, %f492; sub.f32 %f406, %f404, %f405; mul.f32 %f407, %f458, %f493; mul.f32 %f408, %f457, %f494; sub.f32 %f409, %f407, %f408; mul.f32 %f410, %f456, %f494; mul.f32 %f411, %f458, %f492; sub.f32 %f412, %f410, %f411; mul.f32 %f413, %f457, %f492; mul.f32 %f414, %f456, %f493; sub.f32 %f415, %f413, %f414; mul.f32 %f416, %f457, %f476; mul.f32 %f417, %f458, %f475; sub.f32 %f418, %f416, %f417; mul.f32 %f419, %f458, %f474; mul.f32 %f420, %f456, %f476; sub.f32 %f421, %f419, %f420; mul.f32 %f422, %f456, %f475; mul.f32 %f423, %f457, %f474; sub.f32 %f424, %f422, %f423; add.f32 %f425, %f205, %f205; mul.f32 %f426, %f425, %f206; mul.f32 %f427, %f2, %f403; fma.rn.f32 %f428, %f1, %f400, %f427; fma.rn.f32 %f429, %f3, %f406, %f428; mul.f32 %f430, %f426, %f429; st.global.f32 [%rd4], %f430; mul.f32 %f431, %f425, %f207; mul.f32 %f432, %f2, %f412; fma.rn.f32 %f433, %f1, %f409, %f432; fma.rn.f32 %f434, %f3, %f415, %f433; mul.f32 %f435, %f431, %f434; st.global.f32 [%rd5], %f435; mul.f32 %f436, %f425, 
#include <stdint.h>
#include "float3.h"
#include "stencil.h"

// Two-point finite-difference estimate of the change of m along one axis,
// built from the central value m0 and its -1 / +1 neighbors on that axis.
// A neighbor whose magnetization is zero is treated as absent:
//   both absent   -> zero derivative (system is one cell thick)
//   +1 absent     -> backward difference  m0 - m_m1
//   -1 absent     -> forward difference  -m0 + m_p1
//   both present  -> central difference   0.5 * (m_p1 - m_m1)
// The result is NOT divided by a cell size here.
// NOTE(review): the icycz/iczcx/icxcy kernel arguments presumably carry the
// inverse cell sizes -- confirm against the Go caller.
inline __device__ float3 twopointderivative(float3 m_m1, float3 m0, float3 m_p1) {
    if (is0(m_p1) && is0(m_m1)) {
        return make_float3(0.0f, 0.0f, 0.0f);
    }
    if (is0(m_p1)) {
        return m0 - m_m1;
    }
    if (is0(m_m1)) {
        return -m0 + m_p1;
    }
    return 0.5f * (m_p1 - m_m1);
}

// Sets the emergent magnetic field F_i = (1/8π) ε_{ijk} m · (∂m/∂x_j × ∂m/∂x_k)
// See hopfindex-two-point.go
//
// Fx, Fy, Fz:          output components, one value per cell
// mx, my, mz:          magnetization components (only read)
// prefactor:           common scale factor; every component is multiplied by 2*prefactor
// icycz, iczcx, icxcy: additional per-component scale factors for Fx, Fy, Fz
// Nx, Ny, Nz:          grid size; threads outside the grid return immediately
// PBC:                 periodic-boundary bit mask, consumed through the
//                      PBCx/PBCy/PBCz and clamp macros of stencil.h
//
// Cells with m == 0 get F = 0.
extern "C" __global__ void
setemergentmagneticfieldtwopoint(float* __restrict__ Fx, float* __restrict__ Fy, float* __restrict__ Fz,
                                 float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
                                 float prefactor, float icycz, float iczcx, float icxcy,
                                 int Nx, int Ny, int Nz, uint8_t PBC) {

    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    if (ix >= Nx || iy >= Ny || iz >= Nz) {
        return;
    }

    int I = idx(ix, iy, iz);                      // central cell index
    float3 m0 = make_float3(mx[I], my[I], mz[I]); // +0

    // Empty cell: no field.
    if (is0(m0)) {
        Fx[I] = 0.0f;
        Fy[I] = 0.0f;
        Fz[I] = 0.0f;
        return;
    }

    float3 dmdx; // ∂m/∂x
    float3 dmdy; // ∂m/∂y
    float3 dmdz; // ∂m/∂z
    int i_;      // neighbor index

    // x derivatives (along length)
    {
        float3 m_m1 = make_float3(0.0f, 0.0f, 0.0f); // -1
        i_ = idx(lclampx(ix-1), iy, iz);
        // load neighbor m if inside grid (or wrapped by PBC), keep 0 otherwise
        if (ix-1 >= 0 || PBCx) {
            m_m1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_p1 = make_float3(0.0f, 0.0f, 0.0f); // +1
        i_ = idx(hclampx(ix+1), iy, iz);
        if (ix+1 < Nx || PBCx) {
            m_p1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        dmdx = twopointderivative(m_m1, m0, m_p1);
    }

    // y derivatives (along height)
    {
        float3 m_m1 = make_float3(0.0f, 0.0f, 0.0f); // -1
        i_ = idx(ix, lclampy(iy-1), iz);
        if (iy-1 >= 0 || PBCy) {
            m_m1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_p1 = make_float3(0.0f, 0.0f, 0.0f); // +1
        i_ = idx(ix, hclampy(iy+1), iz);
        if (iy+1 < Ny || PBCy) {
            m_p1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        dmdy = twopointderivative(m_m1, m0, m_p1);
    }

    // z derivatives (along depth)
    {
        float3 m_m1 = make_float3(0.0f, 0.0f, 0.0f); // -1
        i_ = idx(ix, iy, lclampz(iz-1));
        if (iz-1 >= 0 || PBCz) {
            m_m1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_p1 = make_float3(0.0f, 0.0f, 0.0f); // +1
        i_ = idx(ix, iy, hclampz(iz+1));
        if (iz+1 < Nz || PBCz) {
            m_p1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        dmdz = twopointderivative(m_m1, m0, m_p1);
    }

    float3 dmdy_x_dmdz = cross(dmdy, dmdz); // ∂m/∂y × ∂m/∂z
    float3 dmdz_x_dmdx = cross(dmdz, dmdx); // ∂m/∂z × ∂m/∂x
    float3 dmdx_x_dmdy = cross(dmdx, dmdy); // ∂m/∂x × ∂m/∂y

    Fx[I] = 2 * prefactor * icycz * dot(m0, dmdy_x_dmdz);
    Fy[I] = 2 * prefactor * iczcx * dot(m0, dmdz_x_dmdx);
    Fz[I] = 2 * prefactor * icxcy * dot(m0, dmdx_x_dmdy);
}
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for setemergentmagneticfieldtwopoint kernel var setemergentmagneticfieldtwopoint_code cu.Function // Stores the arguments for setemergentmagneticfieldtwopoint kernel invocation type setemergentmagneticfieldtwopoint_args_t struct { arg_Fx unsafe.Pointer arg_Fy unsafe.Pointer arg_Fz unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_prefactor float32 arg_icycz float32 arg_iczcx float32 arg_icxcy float32 arg_Nx int arg_Ny int arg_Nz int arg_PBC byte argptr [14]unsafe.Pointer sync.Mutex } // Stores the arguments for setemergentmagneticfieldtwopoint kernel invocation var setemergentmagneticfieldtwopoint_args setemergentmagneticfieldtwopoint_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. setemergentmagneticfieldtwopoint_args.argptr[0] = unsafe.Pointer(&setemergentmagneticfieldtwopoint_args.arg_Fx) setemergentmagneticfieldtwopoint_args.argptr[1] = unsafe.Pointer(&setemergentmagneticfieldtwopoint_args.arg_Fy) setemergentmagneticfieldtwopoint_args.argptr[2] = unsafe.Pointer(&setemergentmagneticfieldtwopoint_args.arg_Fz) setemergentmagneticfieldtwopoint_args.argptr[3] = unsafe.Pointer(&setemergentmagneticfieldtwopoint_args.arg_mx) setemergentmagneticfieldtwopoint_args.argptr[4] = unsafe.Pointer(&setemergentmagneticfieldtwopoint_args.arg_my) setemergentmagneticfieldtwopoint_args.argptr[5] = unsafe.Pointer(&setemergentmagneticfieldtwopoint_args.arg_mz) setemergentmagneticfieldtwopoint_args.argptr[6] = unsafe.Pointer(&setemergentmagneticfieldtwopoint_args.arg_prefactor) setemergentmagneticfieldtwopoint_args.argptr[7] = unsafe.Pointer(&setemergentmagneticfieldtwopoint_args.arg_icycz) setemergentmagneticfieldtwopoint_args.argptr[8] = unsafe.Pointer(&setemergentmagneticfieldtwopoint_args.arg_iczcx) setemergentmagneticfieldtwopoint_args.argptr[9] = 
unsafe.Pointer(&setemergentmagneticfieldtwopoint_args.arg_icxcy) setemergentmagneticfieldtwopoint_args.argptr[10] = unsafe.Pointer(&setemergentmagneticfieldtwopoint_args.arg_Nx) setemergentmagneticfieldtwopoint_args.argptr[11] = unsafe.Pointer(&setemergentmagneticfieldtwopoint_args.arg_Ny) setemergentmagneticfieldtwopoint_args.argptr[12] = unsafe.Pointer(&setemergentmagneticfieldtwopoint_args.arg_Nz) setemergentmagneticfieldtwopoint_args.argptr[13] = unsafe.Pointer(&setemergentmagneticfieldtwopoint_args.arg_PBC) } // Wrapper for setemergentmagneticfieldtwopoint CUDA kernel, asynchronous. func k_setemergentmagneticfieldtwopoint_async(Fx unsafe.Pointer, Fy unsafe.Pointer, Fz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, prefactor float32, icycz float32, iczcx float32, icxcy float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("setemergentmagneticfieldtwopoint") } setemergentmagneticfieldtwopoint_args.Lock() defer setemergentmagneticfieldtwopoint_args.Unlock() if setemergentmagneticfieldtwopoint_code == 0 { setemergentmagneticfieldtwopoint_code = fatbinLoad(setemergentmagneticfieldtwopoint_map, "setemergentmagneticfieldtwopoint") } setemergentmagneticfieldtwopoint_args.arg_Fx = Fx setemergentmagneticfieldtwopoint_args.arg_Fy = Fy setemergentmagneticfieldtwopoint_args.arg_Fz = Fz setemergentmagneticfieldtwopoint_args.arg_mx = mx setemergentmagneticfieldtwopoint_args.arg_my = my setemergentmagneticfieldtwopoint_args.arg_mz = mz setemergentmagneticfieldtwopoint_args.arg_prefactor = prefactor setemergentmagneticfieldtwopoint_args.arg_icycz = icycz setemergentmagneticfieldtwopoint_args.arg_iczcx = iczcx setemergentmagneticfieldtwopoint_args.arg_icxcy = icxcy setemergentmagneticfieldtwopoint_args.arg_Nx = Nx setemergentmagneticfieldtwopoint_args.arg_Ny = Ny setemergentmagneticfieldtwopoint_args.arg_Nz = Nz setemergentmagneticfieldtwopoint_args.arg_PBC = PBC args := 
setemergentmagneticfieldtwopoint_args.argptr[:] cu.LaunchKernel(setemergentmagneticfieldtwopoint_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("setemergentmagneticfieldtwopoint") } } // maps compute capability on PTX code for setemergentmagneticfieldtwopoint kernel. var setemergentmagneticfieldtwopoint_map = map[int]string{0: "", 50: setemergentmagneticfieldtwopoint_ptx_50, 52: setemergentmagneticfieldtwopoint_ptx_52, 53: setemergentmagneticfieldtwopoint_ptx_53, 60: setemergentmagneticfieldtwopoint_ptx_60, 61: setemergentmagneticfieldtwopoint_ptx_61, 62: setemergentmagneticfieldtwopoint_ptx_62, 70: setemergentmagneticfieldtwopoint_ptx_70, 72: setemergentmagneticfieldtwopoint_ptx_72, 75: setemergentmagneticfieldtwopoint_ptx_75, 80: setemergentmagneticfieldtwopoint_ptx_80, 86: setemergentmagneticfieldtwopoint_ptx_86, 87: setemergentmagneticfieldtwopoint_ptx_87, 89: setemergentmagneticfieldtwopoint_ptx_89, 90: setemergentmagneticfieldtwopoint_ptx_90} // setemergentmagneticfieldtwopoint PTX code for various compute capabilities. 
const ( setemergentmagneticfieldtwopoint_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl setemergentmagneticfieldtwopoint .visible .entry setemergentmagneticfieldtwopoint( .param .u64 setemergentmagneticfieldtwopoint_param_0, .param .u64 setemergentmagneticfieldtwopoint_param_1, .param .u64 setemergentmagneticfieldtwopoint_param_2, .param .u64 setemergentmagneticfieldtwopoint_param_3, .param .u64 setemergentmagneticfieldtwopoint_param_4, .param .u64 setemergentmagneticfieldtwopoint_param_5, .param .f32 setemergentmagneticfieldtwopoint_param_6, .param .f32 setemergentmagneticfieldtwopoint_param_7, .param .f32 setemergentmagneticfieldtwopoint_param_8, .param .f32 setemergentmagneticfieldtwopoint_param_9, .param .u32 setemergentmagneticfieldtwopoint_param_10, .param .u32 setemergentmagneticfieldtwopoint_param_11, .param .u32 setemergentmagneticfieldtwopoint_param_12, .param .u8 setemergentmagneticfieldtwopoint_param_13 ) { .reg .pred %p<40>; .reg .b16 %rs<5>; .reg .f32 %f<225>; .reg .b32 %r<76>; .reg .b64 %rd<44>; ld.param.u8 %rs4, [setemergentmagneticfieldtwopoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldtwopoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldtwopoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldtwopoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldtwopoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldtwopoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldtwopoint_param_5]; ld.param.f32 %f97, [setemergentmagneticfieldtwopoint_param_6]; ld.param.f32 %f98, [setemergentmagneticfieldtwopoint_param_7]; ld.param.f32 %f99, [setemergentmagneticfieldtwopoint_param_8]; ld.param.f32 %f100, [setemergentmagneticfieldtwopoint_param_9]; ld.param.u32 %r33, [setemergentmagneticfieldtwopoint_param_10]; ld.param.u32 %r34, [setemergentmagneticfieldtwopoint_param_11]; ld.param.u32 %r35, [setemergentmagneticfieldtwopoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; 
cvta.to.global.u64 %rd3, %rd10; mov.u32 %r36, %ntid.x; mov.u32 %r37, %ctaid.x; mov.u32 %r38, %tid.x; mad.lo.s32 %r1, %r37, %r36, %r38; mov.u32 %r39, %ntid.y; mov.u32 %r40, %ctaid.y; mov.u32 %r41, %tid.y; mad.lo.s32 %r2, %r40, %r39, %r41; mov.u32 %r42, %ntid.z; mov.u32 %r43, %ctaid.z; mov.u32 %r44, %tid.z; mad.lo.s32 %r3, %r43, %r42, %r44; setp.ge.s32 %p1, %r1, %r33; setp.ge.s32 %p2, %r2, %r34; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r35; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_52; mul.lo.s32 %r4, %r3, %r34; add.s32 %r45, %r4, %r2; mul.lo.s32 %r5, %r45, %r33; add.s32 %r46, %r5, %r1; mul.wide.s32 %rd13, %r46, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f101, %f2, %f2; fma.rn.f32 %f102, %f1, %f1, %f101; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f103, %f3, %f3, %f102; setp.eq.f32 %p6, %f103, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_51; bra.uni $L__BB0_2; $L__BB0_51: mov.u32 %r69, 0; st.global.u32 [%rd4], %r69; st.global.u32 [%rd5], %r69; st.global.u32 [%rd6], %r69; bra.uni $L__BB0_52; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r70, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r47, %r6, %r33; add.s32 %r48, %r47, %r33; rem.s32 %r70, %r48, %r33; $L__BB0_5: setp.lt.s32 %p9, %r1, 1; mov.f32 %f198, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f199, %f198; mov.f32 %f200, %f198; @%p10 bra $L__BB0_7; add.s32 %r49, %r70, %r5; mul.wide.s32 %rd20, %r49, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f200, [%rd23]; ld.global.nc.f32 %f199, [%rd22]; ld.global.nc.f32 %f198, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni 
$L__BB0_8; $L__BB0_9: add.s32 %r52, %r33, -1; min.s32 %r71, %r10, %r52; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r50, %r10, %r33; add.s32 %r51, %r50, %r33; rem.s32 %r71, %r51, %r33; $L__BB0_10: add.s32 %r14, %r71, %r5; setp.ge.s32 %p12, %r10, %r33; mov.f32 %f201, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f202, %f201; mov.f32 %f203, %f201; @%p14 bra $L__BB0_12; mul.wide.s32 %rd24, %r14, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f203, [%rd27]; ld.global.nc.f32 %f202, [%rd26]; ld.global.nc.f32 %f201, [%rd25]; $L__BB0_12: mul.f32 %f110, %f202, %f202; fma.rn.f32 %f111, %f201, %f201, %f110; fma.rn.f32 %f112, %f203, %f203, %f111; setp.eq.f32 %p15, %f112, 0f00000000; mul.f32 %f113, %f199, %f199; fma.rn.f32 %f114, %f198, %f198, %f113; fma.rn.f32 %f22, %f200, %f200, %f114; @%p15 bra $L__BB0_16; bra.uni $L__BB0_13; $L__BB0_16: setp.eq.f32 %p17, %f22, 0f00000000; mov.f32 %f204, 0f00000000; mov.f32 %f205, %f204; mov.f32 %f206, %f204; @%p17 bra $L__BB0_18; sub.f32 %f206, %f3, %f200; sub.f32 %f205, %f2, %f199; sub.f32 %f204, %f1, %f198; bra.uni $L__BB0_18; $L__BB0_13: setp.eq.f32 %p16, %f22, 0f00000000; @%p16 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: sub.f32 %f206, %f203, %f3; sub.f32 %f205, %f202, %f2; sub.f32 %f204, %f201, %f1; bra.uni $L__BB0_18; $L__BB0_14: sub.f32 %f115, %f201, %f198; sub.f32 %f116, %f202, %f199; sub.f32 %f117, %f203, %f200; mul.f32 %f206, %f117, 0f3F000000; mul.f32 %f205, %f116, 0f3F000000; mul.f32 %f204, %f115, 0f3F000000; $L__BB0_18: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p18, %rs2, 0; add.s32 %r15, %r2, -1; @%p18 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: max.s32 %r72, %r15, 0; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r53, %r15, %r34; add.s32 %r54, %r53, %r34; rem.s32 %r72, %r54, %r34; $L__BB0_21: setp.lt.s32 %p20, %r2, 1; mov.f32 %f207, 0f00000000; and.pred %p21, %p20, %p18; mov.f32 %f208, %f207; mov.f32 %f209, %f207; @%p21 bra $L__BB0_23; add.s32 %r55, %r72, %r4; mad.lo.s32 
%r56, %r55, %r33, %r1; mul.wide.s32 %rd28, %r56, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f209, [%rd31]; ld.global.nc.f32 %f208, [%rd30]; ld.global.nc.f32 %f207, [%rd29]; $L__BB0_23: add.s32 %r19, %r2, 1; @%p18 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: add.s32 %r59, %r34, -1; min.s32 %r73, %r19, %r59; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r57, %r19, %r34; add.s32 %r58, %r57, %r34; rem.s32 %r73, %r58, %r34; $L__BB0_26: add.s32 %r23, %r73, %r4; setp.ge.s32 %p23, %r19, %r34; mov.f32 %f210, 0f00000000; and.pred %p25, %p23, %p18; mov.f32 %f211, %f210; mov.f32 %f212, %f210; @%p25 bra $L__BB0_28; mad.lo.s32 %r60, %r23, %r33, %r1; mul.wide.s32 %rd32, %r60, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f212, [%rd35]; ld.global.nc.f32 %f211, [%rd34]; ld.global.nc.f32 %f210, [%rd33]; $L__BB0_28: mul.f32 %f127, %f211, %f211; fma.rn.f32 %f128, %f210, %f210, %f127; fma.rn.f32 %f129, %f212, %f212, %f128; setp.eq.f32 %p26, %f129, 0f00000000; mul.f32 %f130, %f208, %f208; fma.rn.f32 %f131, %f207, %f207, %f130; fma.rn.f32 %f53, %f209, %f209, %f131; @%p26 bra $L__BB0_32; bra.uni $L__BB0_29; $L__BB0_32: setp.eq.f32 %p28, %f53, 0f00000000; mov.f32 %f213, 0f00000000; mov.f32 %f214, %f213; mov.f32 %f215, %f213; @%p28 bra $L__BB0_34; sub.f32 %f215, %f3, %f209; sub.f32 %f214, %f2, %f208; sub.f32 %f213, %f1, %f207; bra.uni $L__BB0_34; $L__BB0_29: setp.eq.f32 %p27, %f53, 0f00000000; @%p27 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f215, %f212, %f3; sub.f32 %f214, %f211, %f2; sub.f32 %f213, %f210, %f1; bra.uni $L__BB0_34; $L__BB0_30: sub.f32 %f132, %f210, %f207; sub.f32 %f133, %f211, %f208; sub.f32 %f134, %f212, %f209; mul.f32 %f215, %f134, 0f3F000000; mul.f32 %f214, %f133, 0f3F000000; mul.f32 %f213, %f132, 0f3F000000; $L__BB0_34: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r24, %r3, -1; @%p29 bra $L__BB0_36; bra.uni $L__BB0_35; 
$L__BB0_36: max.s32 %r74, %r24, 0; bra.uni $L__BB0_37; $L__BB0_35: rem.s32 %r61, %r24, %r35; add.s32 %r62, %r61, %r35; rem.s32 %r74, %r62, %r35; $L__BB0_37: setp.lt.s32 %p31, %r3, 1; mov.f32 %f216, 0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f217, %f216; mov.f32 %f218, %f216; @%p32 bra $L__BB0_39; mad.lo.s32 %r63, %r74, %r34, %r2; mad.lo.s32 %r64, %r63, %r33, %r1; mul.wide.s32 %rd36, %r64, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f218, [%rd39]; ld.global.nc.f32 %f217, [%rd38]; ld.global.nc.f32 %f216, [%rd37]; $L__BB0_39: add.s32 %r28, %r3, 1; @%p29 bra $L__BB0_41; bra.uni $L__BB0_40; $L__BB0_41: add.s32 %r67, %r35, -1; min.s32 %r75, %r28, %r67; bra.uni $L__BB0_42; $L__BB0_40: rem.s32 %r65, %r28, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r75, %r66, %r35; $L__BB0_42: mad.lo.s32 %r68, %r75, %r34, %r2; mad.lo.s32 %r32, %r68, %r33, %r1; setp.ge.s32 %p34, %r28, %r35; mov.f32 %f219, 0f00000000; and.pred %p36, %p34, %p29; mov.f32 %f220, %f219; mov.f32 %f221, %f219; @%p36 bra $L__BB0_44; mul.wide.s32 %rd40, %r32, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f221, [%rd43]; ld.global.nc.f32 %f220, [%rd42]; ld.global.nc.f32 %f219, [%rd41]; $L__BB0_44: mul.f32 %f144, %f220, %f220; fma.rn.f32 %f145, %f219, %f219, %f144; fma.rn.f32 %f146, %f221, %f221, %f145; setp.eq.f32 %p37, %f146, 0f00000000; mul.f32 %f147, %f217, %f217; fma.rn.f32 %f148, %f216, %f216, %f147; fma.rn.f32 %f84, %f218, %f218, %f148; @%p37 bra $L__BB0_48; bra.uni $L__BB0_45; $L__BB0_48: setp.eq.f32 %p39, %f84, 0f00000000; mov.f32 %f222, 0f00000000; mov.f32 %f223, %f222; mov.f32 %f224, %f222; @%p39 bra $L__BB0_50; sub.f32 %f224, %f3, %f218; sub.f32 %f223, %f2, %f217; sub.f32 %f222, %f1, %f216; bra.uni $L__BB0_50; $L__BB0_45: setp.eq.f32 %p38, %f84, 0f00000000; @%p38 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: sub.f32 %f224, %f221, %f3; sub.f32 %f223, %f220, %f2; sub.f32 %f222, %f219, 
%f1; bra.uni $L__BB0_50; $L__BB0_46: sub.f32 %f149, %f219, %f216; sub.f32 %f150, %f220, %f217; sub.f32 %f151, %f221, %f218; mul.f32 %f224, %f151, 0f3F000000; mul.f32 %f223, %f150, 0f3F000000; mul.f32 %f222, %f149, 0f3F000000; $L__BB0_50: mul.f32 %f155, %f214, %f224; mul.f32 %f156, %f215, %f223; sub.f32 %f157, %f155, %f156; mul.f32 %f158, %f215, %f222; mul.f32 %f159, %f213, %f224; sub.f32 %f160, %f158, %f159; mul.f32 %f161, %f213, %f223; mul.f32 %f162, %f214, %f222; sub.f32 %f163, %f161, %f162; mul.f32 %f164, %f206, %f223; mul.f32 %f165, %f205, %f224; sub.f32 %f166, %f164, %f165; mul.f32 %f167, %f204, %f224; mul.f32 %f168, %f206, %f222; sub.f32 %f169, %f167, %f168; mul.f32 %f170, %f205, %f222; mul.f32 %f171, %f204, %f223; sub.f32 %f172, %f170, %f171; mul.f32 %f173, %f205, %f215; mul.f32 %f174, %f206, %f214; sub.f32 %f175, %f173, %f174; mul.f32 %f176, %f206, %f213; mul.f32 %f177, %f204, %f215; sub.f32 %f178, %f176, %f177; mul.f32 %f179, %f204, %f214; mul.f32 %f180, %f205, %f213; sub.f32 %f181, %f179, %f180; add.f32 %f182, %f97, %f97; mul.f32 %f183, %f182, %f98; mul.f32 %f184, %f2, %f160; fma.rn.f32 %f185, %f1, %f157, %f184; fma.rn.f32 %f186, %f3, %f163, %f185; mul.f32 %f187, %f183, %f186; st.global.f32 [%rd4], %f187; mul.f32 %f188, %f182, %f99; mul.f32 %f189, %f2, %f169; fma.rn.f32 %f190, %f1, %f166, %f189; fma.rn.f32 %f191, %f3, %f172, %f190; mul.f32 %f192, %f188, %f191; st.global.f32 [%rd5], %f192; mul.f32 %f193, %f182, %f100; mul.f32 %f194, %f2, %f178; fma.rn.f32 %f195, %f1, %f175, %f194; fma.rn.f32 %f196, %f3, %f181, %f195; mul.f32 %f197, %f193, %f196; st.global.f32 [%rd6], %f197; $L__BB0_52: ret; } ` setemergentmagneticfieldtwopoint_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl setemergentmagneticfieldtwopoint .visible .entry setemergentmagneticfieldtwopoint( .param .u64 setemergentmagneticfieldtwopoint_param_0, .param .u64 setemergentmagneticfieldtwopoint_param_1, .param .u64 setemergentmagneticfieldtwopoint_param_2, .param .u64 
setemergentmagneticfieldtwopoint_param_3, .param .u64 setemergentmagneticfieldtwopoint_param_4, .param .u64 setemergentmagneticfieldtwopoint_param_5, .param .f32 setemergentmagneticfieldtwopoint_param_6, .param .f32 setemergentmagneticfieldtwopoint_param_7, .param .f32 setemergentmagneticfieldtwopoint_param_8, .param .f32 setemergentmagneticfieldtwopoint_param_9, .param .u32 setemergentmagneticfieldtwopoint_param_10, .param .u32 setemergentmagneticfieldtwopoint_param_11, .param .u32 setemergentmagneticfieldtwopoint_param_12, .param .u8 setemergentmagneticfieldtwopoint_param_13 ) { .reg .pred %p<40>; .reg .b16 %rs<5>; .reg .f32 %f<225>; .reg .b32 %r<76>; .reg .b64 %rd<44>; ld.param.u8 %rs4, [setemergentmagneticfieldtwopoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldtwopoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldtwopoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldtwopoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldtwopoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldtwopoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldtwopoint_param_5]; ld.param.f32 %f97, [setemergentmagneticfieldtwopoint_param_6]; ld.param.f32 %f98, [setemergentmagneticfieldtwopoint_param_7]; ld.param.f32 %f99, [setemergentmagneticfieldtwopoint_param_8]; ld.param.f32 %f100, [setemergentmagneticfieldtwopoint_param_9]; ld.param.u32 %r33, [setemergentmagneticfieldtwopoint_param_10]; ld.param.u32 %r34, [setemergentmagneticfieldtwopoint_param_11]; ld.param.u32 %r35, [setemergentmagneticfieldtwopoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r36, %ntid.x; mov.u32 %r37, %ctaid.x; mov.u32 %r38, %tid.x; mad.lo.s32 %r1, %r37, %r36, %r38; mov.u32 %r39, %ntid.y; mov.u32 %r40, %ctaid.y; mov.u32 %r41, %tid.y; mad.lo.s32 %r2, %r40, %r39, %r41; mov.u32 %r42, %ntid.z; mov.u32 %r43, %ctaid.z; mov.u32 %r44, %tid.z; mad.lo.s32 %r3, %r43, %r42, %r44; setp.ge.s32 %p1, %r1, 
%r33; setp.ge.s32 %p2, %r2, %r34; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r35; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_52; mul.lo.s32 %r4, %r3, %r34; add.s32 %r45, %r4, %r2; mul.lo.s32 %r5, %r45, %r33; add.s32 %r46, %r5, %r1; mul.wide.s32 %rd13, %r46, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f101, %f2, %f2; fma.rn.f32 %f102, %f1, %f1, %f101; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f103, %f3, %f3, %f102; setp.eq.f32 %p6, %f103, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_51; bra.uni $L__BB0_2; $L__BB0_51: mov.u32 %r69, 0; st.global.u32 [%rd4], %r69; st.global.u32 [%rd5], %r69; st.global.u32 [%rd6], %r69; bra.uni $L__BB0_52; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r70, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r47, %r6, %r33; add.s32 %r48, %r47, %r33; rem.s32 %r70, %r48, %r33; $L__BB0_5: setp.lt.s32 %p9, %r1, 1; mov.f32 %f198, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f199, %f198; mov.f32 %f200, %f198; @%p10 bra $L__BB0_7; add.s32 %r49, %r70, %r5; mul.wide.s32 %rd20, %r49, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f200, [%rd23]; ld.global.nc.f32 %f199, [%rd22]; ld.global.nc.f32 %f198, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r52, %r33, -1; min.s32 %r71, %r10, %r52; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r50, %r10, %r33; add.s32 %r51, %r50, %r33; rem.s32 %r71, %r51, %r33; $L__BB0_10: add.s32 %r14, %r71, %r5; setp.ge.s32 %p12, %r10, %r33; mov.f32 %f201, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f202, %f201; mov.f32 %f203, %f201; @%p14 bra 
$L__BB0_12; mul.wide.s32 %rd24, %r14, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f203, [%rd27]; ld.global.nc.f32 %f202, [%rd26]; ld.global.nc.f32 %f201, [%rd25]; $L__BB0_12: mul.f32 %f110, %f202, %f202; fma.rn.f32 %f111, %f201, %f201, %f110; fma.rn.f32 %f112, %f203, %f203, %f111; setp.eq.f32 %p15, %f112, 0f00000000; mul.f32 %f113, %f199, %f199; fma.rn.f32 %f114, %f198, %f198, %f113; fma.rn.f32 %f22, %f200, %f200, %f114; @%p15 bra $L__BB0_16; bra.uni $L__BB0_13; $L__BB0_16: setp.eq.f32 %p17, %f22, 0f00000000; mov.f32 %f204, 0f00000000; mov.f32 %f205, %f204; mov.f32 %f206, %f204; @%p17 bra $L__BB0_18; sub.f32 %f206, %f3, %f200; sub.f32 %f205, %f2, %f199; sub.f32 %f204, %f1, %f198; bra.uni $L__BB0_18; $L__BB0_13: setp.eq.f32 %p16, %f22, 0f00000000; @%p16 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: sub.f32 %f206, %f203, %f3; sub.f32 %f205, %f202, %f2; sub.f32 %f204, %f201, %f1; bra.uni $L__BB0_18; $L__BB0_14: sub.f32 %f115, %f201, %f198; sub.f32 %f116, %f202, %f199; sub.f32 %f117, %f203, %f200; mul.f32 %f206, %f117, 0f3F000000; mul.f32 %f205, %f116, 0f3F000000; mul.f32 %f204, %f115, 0f3F000000; $L__BB0_18: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p18, %rs2, 0; add.s32 %r15, %r2, -1; @%p18 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: max.s32 %r72, %r15, 0; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r53, %r15, %r34; add.s32 %r54, %r53, %r34; rem.s32 %r72, %r54, %r34; $L__BB0_21: setp.lt.s32 %p20, %r2, 1; mov.f32 %f207, 0f00000000; and.pred %p21, %p20, %p18; mov.f32 %f208, %f207; mov.f32 %f209, %f207; @%p21 bra $L__BB0_23; add.s32 %r55, %r72, %r4; mad.lo.s32 %r56, %r55, %r33, %r1; mul.wide.s32 %rd28, %r56, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f209, [%rd31]; ld.global.nc.f32 %f208, [%rd30]; ld.global.nc.f32 %f207, [%rd29]; $L__BB0_23: add.s32 %r19, %r2, 1; @%p18 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: add.s32 %r59, %r34, -1; min.s32 
%r73, %r19, %r59; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r57, %r19, %r34; add.s32 %r58, %r57, %r34; rem.s32 %r73, %r58, %r34; $L__BB0_26: add.s32 %r23, %r73, %r4; setp.ge.s32 %p23, %r19, %r34; mov.f32 %f210, 0f00000000; and.pred %p25, %p23, %p18; mov.f32 %f211, %f210; mov.f32 %f212, %f210; @%p25 bra $L__BB0_28; mad.lo.s32 %r60, %r23, %r33, %r1; mul.wide.s32 %rd32, %r60, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f212, [%rd35]; ld.global.nc.f32 %f211, [%rd34]; ld.global.nc.f32 %f210, [%rd33]; $L__BB0_28: mul.f32 %f127, %f211, %f211; fma.rn.f32 %f128, %f210, %f210, %f127; fma.rn.f32 %f129, %f212, %f212, %f128; setp.eq.f32 %p26, %f129, 0f00000000; mul.f32 %f130, %f208, %f208; fma.rn.f32 %f131, %f207, %f207, %f130; fma.rn.f32 %f53, %f209, %f209, %f131; @%p26 bra $L__BB0_32; bra.uni $L__BB0_29; $L__BB0_32: setp.eq.f32 %p28, %f53, 0f00000000; mov.f32 %f213, 0f00000000; mov.f32 %f214, %f213; mov.f32 %f215, %f213; @%p28 bra $L__BB0_34; sub.f32 %f215, %f3, %f209; sub.f32 %f214, %f2, %f208; sub.f32 %f213, %f1, %f207; bra.uni $L__BB0_34; $L__BB0_29: setp.eq.f32 %p27, %f53, 0f00000000; @%p27 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f215, %f212, %f3; sub.f32 %f214, %f211, %f2; sub.f32 %f213, %f210, %f1; bra.uni $L__BB0_34; $L__BB0_30: sub.f32 %f132, %f210, %f207; sub.f32 %f133, %f211, %f208; sub.f32 %f134, %f212, %f209; mul.f32 %f215, %f134, 0f3F000000; mul.f32 %f214, %f133, 0f3F000000; mul.f32 %f213, %f132, 0f3F000000; $L__BB0_34: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r24, %r3, -1; @%p29 bra $L__BB0_36; bra.uni $L__BB0_35; $L__BB0_36: max.s32 %r74, %r24, 0; bra.uni $L__BB0_37; $L__BB0_35: rem.s32 %r61, %r24, %r35; add.s32 %r62, %r61, %r35; rem.s32 %r74, %r62, %r35; $L__BB0_37: setp.lt.s32 %p31, %r3, 1; mov.f32 %f216, 0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f217, %f216; mov.f32 %f218, %f216; @%p32 bra $L__BB0_39; mad.lo.s32 %r63, %r74, %r34, %r2; mad.lo.s32 %r64, %r63, 
%r33, %r1; mul.wide.s32 %rd36, %r64, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f218, [%rd39]; ld.global.nc.f32 %f217, [%rd38]; ld.global.nc.f32 %f216, [%rd37]; $L__BB0_39: add.s32 %r28, %r3, 1; @%p29 bra $L__BB0_41; bra.uni $L__BB0_40; $L__BB0_41: add.s32 %r67, %r35, -1; min.s32 %r75, %r28, %r67; bra.uni $L__BB0_42; $L__BB0_40: rem.s32 %r65, %r28, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r75, %r66, %r35; $L__BB0_42: mad.lo.s32 %r68, %r75, %r34, %r2; mad.lo.s32 %r32, %r68, %r33, %r1; setp.ge.s32 %p34, %r28, %r35; mov.f32 %f219, 0f00000000; and.pred %p36, %p34, %p29; mov.f32 %f220, %f219; mov.f32 %f221, %f219; @%p36 bra $L__BB0_44; mul.wide.s32 %rd40, %r32, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f221, [%rd43]; ld.global.nc.f32 %f220, [%rd42]; ld.global.nc.f32 %f219, [%rd41]; $L__BB0_44: mul.f32 %f144, %f220, %f220; fma.rn.f32 %f145, %f219, %f219, %f144; fma.rn.f32 %f146, %f221, %f221, %f145; setp.eq.f32 %p37, %f146, 0f00000000; mul.f32 %f147, %f217, %f217; fma.rn.f32 %f148, %f216, %f216, %f147; fma.rn.f32 %f84, %f218, %f218, %f148; @%p37 bra $L__BB0_48; bra.uni $L__BB0_45; $L__BB0_48: setp.eq.f32 %p39, %f84, 0f00000000; mov.f32 %f222, 0f00000000; mov.f32 %f223, %f222; mov.f32 %f224, %f222; @%p39 bra $L__BB0_50; sub.f32 %f224, %f3, %f218; sub.f32 %f223, %f2, %f217; sub.f32 %f222, %f1, %f216; bra.uni $L__BB0_50; $L__BB0_45: setp.eq.f32 %p38, %f84, 0f00000000; @%p38 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: sub.f32 %f224, %f221, %f3; sub.f32 %f223, %f220, %f2; sub.f32 %f222, %f219, %f1; bra.uni $L__BB0_50; $L__BB0_46: sub.f32 %f149, %f219, %f216; sub.f32 %f150, %f220, %f217; sub.f32 %f151, %f221, %f218; mul.f32 %f224, %f151, 0f3F000000; mul.f32 %f223, %f150, 0f3F000000; mul.f32 %f222, %f149, 0f3F000000; $L__BB0_50: mul.f32 %f155, %f214, %f224; mul.f32 %f156, %f215, %f223; sub.f32 %f157, %f155, %f156; mul.f32 %f158, %f215, %f222; mul.f32 
%f159, %f213, %f224; sub.f32 %f160, %f158, %f159; mul.f32 %f161, %f213, %f223; mul.f32 %f162, %f214, %f222; sub.f32 %f163, %f161, %f162; mul.f32 %f164, %f206, %f223; mul.f32 %f165, %f205, %f224; sub.f32 %f166, %f164, %f165; mul.f32 %f167, %f204, %f224; mul.f32 %f168, %f206, %f222; sub.f32 %f169, %f167, %f168; mul.f32 %f170, %f205, %f222; mul.f32 %f171, %f204, %f223; sub.f32 %f172, %f170, %f171; mul.f32 %f173, %f205, %f215; mul.f32 %f174, %f206, %f214; sub.f32 %f175, %f173, %f174; mul.f32 %f176, %f206, %f213; mul.f32 %f177, %f204, %f215; sub.f32 %f178, %f176, %f177; mul.f32 %f179, %f204, %f214; mul.f32 %f180, %f205, %f213; sub.f32 %f181, %f179, %f180; add.f32 %f182, %f97, %f97; mul.f32 %f183, %f182, %f98; mul.f32 %f184, %f2, %f160; fma.rn.f32 %f185, %f1, %f157, %f184; fma.rn.f32 %f186, %f3, %f163, %f185; mul.f32 %f187, %f183, %f186; st.global.f32 [%rd4], %f187; mul.f32 %f188, %f182, %f99; mul.f32 %f189, %f2, %f169; fma.rn.f32 %f190, %f1, %f166, %f189; fma.rn.f32 %f191, %f3, %f172, %f190; mul.f32 %f192, %f188, %f191; st.global.f32 [%rd5], %f192; mul.f32 %f193, %f182, %f100; mul.f32 %f194, %f2, %f178; fma.rn.f32 %f195, %f1, %f175, %f194; fma.rn.f32 %f196, %f3, %f181, %f195; mul.f32 %f197, %f193, %f196; st.global.f32 [%rd6], %f197; $L__BB0_52: ret; } ` setemergentmagneticfieldtwopoint_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl setemergentmagneticfieldtwopoint .visible .entry setemergentmagneticfieldtwopoint( .param .u64 setemergentmagneticfieldtwopoint_param_0, .param .u64 setemergentmagneticfieldtwopoint_param_1, .param .u64 setemergentmagneticfieldtwopoint_param_2, .param .u64 setemergentmagneticfieldtwopoint_param_3, .param .u64 setemergentmagneticfieldtwopoint_param_4, .param .u64 setemergentmagneticfieldtwopoint_param_5, .param .f32 setemergentmagneticfieldtwopoint_param_6, .param .f32 setemergentmagneticfieldtwopoint_param_7, .param .f32 setemergentmagneticfieldtwopoint_param_8, .param .f32 setemergentmagneticfieldtwopoint_param_9, .param 
.u32 setemergentmagneticfieldtwopoint_param_10, .param .u32 setemergentmagneticfieldtwopoint_param_11, .param .u32 setemergentmagneticfieldtwopoint_param_12, .param .u8 setemergentmagneticfieldtwopoint_param_13 ) { .reg .pred %p<40>; .reg .b16 %rs<5>; .reg .f32 %f<225>; .reg .b32 %r<76>; .reg .b64 %rd<44>; ld.param.u8 %rs4, [setemergentmagneticfieldtwopoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldtwopoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldtwopoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldtwopoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldtwopoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldtwopoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldtwopoint_param_5]; ld.param.f32 %f97, [setemergentmagneticfieldtwopoint_param_6]; ld.param.f32 %f98, [setemergentmagneticfieldtwopoint_param_7]; ld.param.f32 %f99, [setemergentmagneticfieldtwopoint_param_8]; ld.param.f32 %f100, [setemergentmagneticfieldtwopoint_param_9]; ld.param.u32 %r33, [setemergentmagneticfieldtwopoint_param_10]; ld.param.u32 %r34, [setemergentmagneticfieldtwopoint_param_11]; ld.param.u32 %r35, [setemergentmagneticfieldtwopoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r36, %ntid.x; mov.u32 %r37, %ctaid.x; mov.u32 %r38, %tid.x; mad.lo.s32 %r1, %r37, %r36, %r38; mov.u32 %r39, %ntid.y; mov.u32 %r40, %ctaid.y; mov.u32 %r41, %tid.y; mad.lo.s32 %r2, %r40, %r39, %r41; mov.u32 %r42, %ntid.z; mov.u32 %r43, %ctaid.z; mov.u32 %r44, %tid.z; mad.lo.s32 %r3, %r43, %r42, %r44; setp.ge.s32 %p1, %r1, %r33; setp.ge.s32 %p2, %r2, %r34; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r35; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_52; mul.lo.s32 %r4, %r3, %r34; add.s32 %r45, %r4, %r2; mul.lo.s32 %r5, %r45, %r33; add.s32 %r46, %r5, %r1; mul.wide.s32 %rd13, %r46, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; 
ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f101, %f2, %f2; fma.rn.f32 %f102, %f1, %f1, %f101; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f103, %f3, %f3, %f102; setp.eq.f32 %p6, %f103, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_51; bra.uni $L__BB0_2; $L__BB0_51: mov.u32 %r69, 0; st.global.u32 [%rd4], %r69; st.global.u32 [%rd5], %r69; st.global.u32 [%rd6], %r69; bra.uni $L__BB0_52; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r70, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r47, %r6, %r33; add.s32 %r48, %r47, %r33; rem.s32 %r70, %r48, %r33; $L__BB0_5: setp.lt.s32 %p9, %r1, 1; mov.f32 %f198, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f199, %f198; mov.f32 %f200, %f198; @%p10 bra $L__BB0_7; add.s32 %r49, %r70, %r5; mul.wide.s32 %rd20, %r49, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f200, [%rd23]; ld.global.nc.f32 %f199, [%rd22]; ld.global.nc.f32 %f198, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r52, %r33, -1; min.s32 %r71, %r10, %r52; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r50, %r10, %r33; add.s32 %r51, %r50, %r33; rem.s32 %r71, %r51, %r33; $L__BB0_10: add.s32 %r14, %r71, %r5; setp.ge.s32 %p12, %r10, %r33; mov.f32 %f201, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f202, %f201; mov.f32 %f203, %f201; @%p14 bra $L__BB0_12; mul.wide.s32 %rd24, %r14, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f203, [%rd27]; ld.global.nc.f32 %f202, [%rd26]; ld.global.nc.f32 %f201, [%rd25]; $L__BB0_12: mul.f32 %f110, %f202, %f202; fma.rn.f32 %f111, %f201, %f201, %f110; fma.rn.f32 %f112, %f203, %f203, %f111; setp.eq.f32 %p15, %f112, 0f00000000; 
mul.f32 %f113, %f199, %f199; fma.rn.f32 %f114, %f198, %f198, %f113; fma.rn.f32 %f22, %f200, %f200, %f114; @%p15 bra $L__BB0_16; bra.uni $L__BB0_13; $L__BB0_16: setp.eq.f32 %p17, %f22, 0f00000000; mov.f32 %f204, 0f00000000; mov.f32 %f205, %f204; mov.f32 %f206, %f204; @%p17 bra $L__BB0_18; sub.f32 %f206, %f3, %f200; sub.f32 %f205, %f2, %f199; sub.f32 %f204, %f1, %f198; bra.uni $L__BB0_18; $L__BB0_13: setp.eq.f32 %p16, %f22, 0f00000000; @%p16 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: sub.f32 %f206, %f203, %f3; sub.f32 %f205, %f202, %f2; sub.f32 %f204, %f201, %f1; bra.uni $L__BB0_18; $L__BB0_14: sub.f32 %f115, %f201, %f198; sub.f32 %f116, %f202, %f199; sub.f32 %f117, %f203, %f200; mul.f32 %f206, %f117, 0f3F000000; mul.f32 %f205, %f116, 0f3F000000; mul.f32 %f204, %f115, 0f3F000000; $L__BB0_18: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p18, %rs2, 0; add.s32 %r15, %r2, -1; @%p18 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: max.s32 %r72, %r15, 0; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r53, %r15, %r34; add.s32 %r54, %r53, %r34; rem.s32 %r72, %r54, %r34; $L__BB0_21: setp.lt.s32 %p20, %r2, 1; mov.f32 %f207, 0f00000000; and.pred %p21, %p20, %p18; mov.f32 %f208, %f207; mov.f32 %f209, %f207; @%p21 bra $L__BB0_23; add.s32 %r55, %r72, %r4; mad.lo.s32 %r56, %r55, %r33, %r1; mul.wide.s32 %rd28, %r56, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f209, [%rd31]; ld.global.nc.f32 %f208, [%rd30]; ld.global.nc.f32 %f207, [%rd29]; $L__BB0_23: add.s32 %r19, %r2, 1; @%p18 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: add.s32 %r59, %r34, -1; min.s32 %r73, %r19, %r59; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r57, %r19, %r34; add.s32 %r58, %r57, %r34; rem.s32 %r73, %r58, %r34; $L__BB0_26: add.s32 %r23, %r73, %r4; setp.ge.s32 %p23, %r19, %r34; mov.f32 %f210, 0f00000000; and.pred %p25, %p23, %p18; mov.f32 %f211, %f210; mov.f32 %f212, %f210; @%p25 bra $L__BB0_28; mad.lo.s32 %r60, %r23, %r33, %r1; mul.wide.s32 %rd32, %r60, 4; 
add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f212, [%rd35]; ld.global.nc.f32 %f211, [%rd34]; ld.global.nc.f32 %f210, [%rd33]; $L__BB0_28: mul.f32 %f127, %f211, %f211; fma.rn.f32 %f128, %f210, %f210, %f127; fma.rn.f32 %f129, %f212, %f212, %f128; setp.eq.f32 %p26, %f129, 0f00000000; mul.f32 %f130, %f208, %f208; fma.rn.f32 %f131, %f207, %f207, %f130; fma.rn.f32 %f53, %f209, %f209, %f131; @%p26 bra $L__BB0_32; bra.uni $L__BB0_29; $L__BB0_32: setp.eq.f32 %p28, %f53, 0f00000000; mov.f32 %f213, 0f00000000; mov.f32 %f214, %f213; mov.f32 %f215, %f213; @%p28 bra $L__BB0_34; sub.f32 %f215, %f3, %f209; sub.f32 %f214, %f2, %f208; sub.f32 %f213, %f1, %f207; bra.uni $L__BB0_34; $L__BB0_29: setp.eq.f32 %p27, %f53, 0f00000000; @%p27 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f215, %f212, %f3; sub.f32 %f214, %f211, %f2; sub.f32 %f213, %f210, %f1; bra.uni $L__BB0_34; $L__BB0_30: sub.f32 %f132, %f210, %f207; sub.f32 %f133, %f211, %f208; sub.f32 %f134, %f212, %f209; mul.f32 %f215, %f134, 0f3F000000; mul.f32 %f214, %f133, 0f3F000000; mul.f32 %f213, %f132, 0f3F000000; $L__BB0_34: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r24, %r3, -1; @%p29 bra $L__BB0_36; bra.uni $L__BB0_35; $L__BB0_36: max.s32 %r74, %r24, 0; bra.uni $L__BB0_37; $L__BB0_35: rem.s32 %r61, %r24, %r35; add.s32 %r62, %r61, %r35; rem.s32 %r74, %r62, %r35; $L__BB0_37: setp.lt.s32 %p31, %r3, 1; mov.f32 %f216, 0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f217, %f216; mov.f32 %f218, %f216; @%p32 bra $L__BB0_39; mad.lo.s32 %r63, %r74, %r34, %r2; mad.lo.s32 %r64, %r63, %r33, %r1; mul.wide.s32 %rd36, %r64, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f218, [%rd39]; ld.global.nc.f32 %f217, [%rd38]; ld.global.nc.f32 %f216, [%rd37]; $L__BB0_39: add.s32 %r28, %r3, 1; @%p29 bra $L__BB0_41; bra.uni $L__BB0_40; $L__BB0_41: add.s32 %r67, %r35, -1; min.s32 %r75, %r28, %r67; bra.uni 
$L__BB0_42; $L__BB0_40: rem.s32 %r65, %r28, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r75, %r66, %r35; $L__BB0_42: mad.lo.s32 %r68, %r75, %r34, %r2; mad.lo.s32 %r32, %r68, %r33, %r1; setp.ge.s32 %p34, %r28, %r35; mov.f32 %f219, 0f00000000; and.pred %p36, %p34, %p29; mov.f32 %f220, %f219; mov.f32 %f221, %f219; @%p36 bra $L__BB0_44; mul.wide.s32 %rd40, %r32, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f221, [%rd43]; ld.global.nc.f32 %f220, [%rd42]; ld.global.nc.f32 %f219, [%rd41]; $L__BB0_44: mul.f32 %f144, %f220, %f220; fma.rn.f32 %f145, %f219, %f219, %f144; fma.rn.f32 %f146, %f221, %f221, %f145; setp.eq.f32 %p37, %f146, 0f00000000; mul.f32 %f147, %f217, %f217; fma.rn.f32 %f148, %f216, %f216, %f147; fma.rn.f32 %f84, %f218, %f218, %f148; @%p37 bra $L__BB0_48; bra.uni $L__BB0_45; $L__BB0_48: setp.eq.f32 %p39, %f84, 0f00000000; mov.f32 %f222, 0f00000000; mov.f32 %f223, %f222; mov.f32 %f224, %f222; @%p39 bra $L__BB0_50; sub.f32 %f224, %f3, %f218; sub.f32 %f223, %f2, %f217; sub.f32 %f222, %f1, %f216; bra.uni $L__BB0_50; $L__BB0_45: setp.eq.f32 %p38, %f84, 0f00000000; @%p38 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: sub.f32 %f224, %f221, %f3; sub.f32 %f223, %f220, %f2; sub.f32 %f222, %f219, %f1; bra.uni $L__BB0_50; $L__BB0_46: sub.f32 %f149, %f219, %f216; sub.f32 %f150, %f220, %f217; sub.f32 %f151, %f221, %f218; mul.f32 %f224, %f151, 0f3F000000; mul.f32 %f223, %f150, 0f3F000000; mul.f32 %f222, %f149, 0f3F000000; $L__BB0_50: mul.f32 %f155, %f214, %f224; mul.f32 %f156, %f215, %f223; sub.f32 %f157, %f155, %f156; mul.f32 %f158, %f215, %f222; mul.f32 %f159, %f213, %f224; sub.f32 %f160, %f158, %f159; mul.f32 %f161, %f213, %f223; mul.f32 %f162, %f214, %f222; sub.f32 %f163, %f161, %f162; mul.f32 %f164, %f206, %f223; mul.f32 %f165, %f205, %f224; sub.f32 %f166, %f164, %f165; mul.f32 %f167, %f204, %f224; mul.f32 %f168, %f206, %f222; sub.f32 %f169, %f167, %f168; mul.f32 %f170, %f205, %f222; mul.f32 %f171, %f204, %f223; 
sub.f32 %f172, %f170, %f171; mul.f32 %f173, %f205, %f215; mul.f32 %f174, %f206, %f214; sub.f32 %f175, %f173, %f174; mul.f32 %f176, %f206, %f213; mul.f32 %f177, %f204, %f215; sub.f32 %f178, %f176, %f177; mul.f32 %f179, %f204, %f214; mul.f32 %f180, %f205, %f213; sub.f32 %f181, %f179, %f180; add.f32 %f182, %f97, %f97; mul.f32 %f183, %f182, %f98; mul.f32 %f184, %f2, %f160; fma.rn.f32 %f185, %f1, %f157, %f184; fma.rn.f32 %f186, %f3, %f163, %f185; mul.f32 %f187, %f183, %f186; st.global.f32 [%rd4], %f187; mul.f32 %f188, %f182, %f99; mul.f32 %f189, %f2, %f169; fma.rn.f32 %f190, %f1, %f166, %f189; fma.rn.f32 %f191, %f3, %f172, %f190; mul.f32 %f192, %f188, %f191; st.global.f32 [%rd5], %f192; mul.f32 %f193, %f182, %f100; mul.f32 %f194, %f2, %f178; fma.rn.f32 %f195, %f1, %f175, %f194; fma.rn.f32 %f196, %f3, %f181, %f195; mul.f32 %f197, %f193, %f196; st.global.f32 [%rd6], %f197; $L__BB0_52: ret; } ` setemergentmagneticfieldtwopoint_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl setemergentmagneticfieldtwopoint .visible .entry setemergentmagneticfieldtwopoint( .param .u64 setemergentmagneticfieldtwopoint_param_0, .param .u64 setemergentmagneticfieldtwopoint_param_1, .param .u64 setemergentmagneticfieldtwopoint_param_2, .param .u64 setemergentmagneticfieldtwopoint_param_3, .param .u64 setemergentmagneticfieldtwopoint_param_4, .param .u64 setemergentmagneticfieldtwopoint_param_5, .param .f32 setemergentmagneticfieldtwopoint_param_6, .param .f32 setemergentmagneticfieldtwopoint_param_7, .param .f32 setemergentmagneticfieldtwopoint_param_8, .param .f32 setemergentmagneticfieldtwopoint_param_9, .param .u32 setemergentmagneticfieldtwopoint_param_10, .param .u32 setemergentmagneticfieldtwopoint_param_11, .param .u32 setemergentmagneticfieldtwopoint_param_12, .param .u8 setemergentmagneticfieldtwopoint_param_13 ) { .reg .pred %p<40>; .reg .b16 %rs<5>; .reg .f32 %f<225>; .reg .b32 %r<76>; .reg .b64 %rd<44>; ld.param.u8 %rs4, [setemergentmagneticfieldtwopoint_param_13]; 
ld.param.u64 %rd7, [setemergentmagneticfieldtwopoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldtwopoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldtwopoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldtwopoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldtwopoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldtwopoint_param_5]; ld.param.f32 %f97, [setemergentmagneticfieldtwopoint_param_6]; ld.param.f32 %f98, [setemergentmagneticfieldtwopoint_param_7]; ld.param.f32 %f99, [setemergentmagneticfieldtwopoint_param_8]; ld.param.f32 %f100, [setemergentmagneticfieldtwopoint_param_9]; ld.param.u32 %r33, [setemergentmagneticfieldtwopoint_param_10]; ld.param.u32 %r34, [setemergentmagneticfieldtwopoint_param_11]; ld.param.u32 %r35, [setemergentmagneticfieldtwopoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r36, %ntid.x; mov.u32 %r37, %ctaid.x; mov.u32 %r38, %tid.x; mad.lo.s32 %r1, %r37, %r36, %r38; mov.u32 %r39, %ntid.y; mov.u32 %r40, %ctaid.y; mov.u32 %r41, %tid.y; mad.lo.s32 %r2, %r40, %r39, %r41; mov.u32 %r42, %ntid.z; mov.u32 %r43, %ctaid.z; mov.u32 %r44, %tid.z; mad.lo.s32 %r3, %r43, %r42, %r44; setp.ge.s32 %p1, %r1, %r33; setp.ge.s32 %p2, %r2, %r34; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r35; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_52; mul.lo.s32 %r4, %r3, %r34; add.s32 %r45, %r4, %r2; mul.lo.s32 %r5, %r45, %r33; add.s32 %r46, %r5, %r1; mul.wide.s32 %rd13, %r46, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f101, %f2, %f2; fma.rn.f32 %f102, %f1, %f1, %f101; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f103, %f3, %f3, %f102; setp.eq.f32 %p6, %f103, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, 
%rd13; @%p6 bra $L__BB0_51; bra.uni $L__BB0_2; $L__BB0_51: mov.u32 %r69, 0; st.global.u32 [%rd4], %r69; st.global.u32 [%rd5], %r69; st.global.u32 [%rd6], %r69; bra.uni $L__BB0_52; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r70, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r47, %r6, %r33; add.s32 %r48, %r47, %r33; rem.s32 %r70, %r48, %r33; $L__BB0_5: setp.lt.s32 %p9, %r1, 1; mov.f32 %f198, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f199, %f198; mov.f32 %f200, %f198; @%p10 bra $L__BB0_7; add.s32 %r49, %r70, %r5; mul.wide.s32 %rd20, %r49, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f200, [%rd23]; ld.global.nc.f32 %f199, [%rd22]; ld.global.nc.f32 %f198, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r52, %r33, -1; min.s32 %r71, %r10, %r52; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r50, %r10, %r33; add.s32 %r51, %r50, %r33; rem.s32 %r71, %r51, %r33; $L__BB0_10: add.s32 %r14, %r71, %r5; setp.ge.s32 %p12, %r10, %r33; mov.f32 %f201, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f202, %f201; mov.f32 %f203, %f201; @%p14 bra $L__BB0_12; mul.wide.s32 %rd24, %r14, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f203, [%rd27]; ld.global.nc.f32 %f202, [%rd26]; ld.global.nc.f32 %f201, [%rd25]; $L__BB0_12: mul.f32 %f110, %f202, %f202; fma.rn.f32 %f111, %f201, %f201, %f110; fma.rn.f32 %f112, %f203, %f203, %f111; setp.eq.f32 %p15, %f112, 0f00000000; mul.f32 %f113, %f199, %f199; fma.rn.f32 %f114, %f198, %f198, %f113; fma.rn.f32 %f22, %f200, %f200, %f114; @%p15 bra $L__BB0_16; bra.uni $L__BB0_13; $L__BB0_16: setp.eq.f32 %p17, %f22, 0f00000000; mov.f32 %f204, 0f00000000; mov.f32 %f205, %f204; mov.f32 %f206, %f204; @%p17 bra $L__BB0_18; sub.f32 %f206, %f3, %f200; sub.f32 %f205, %f2, %f199; sub.f32 %f204, %f1, %f198; 
bra.uni $L__BB0_18; $L__BB0_13: setp.eq.f32 %p16, %f22, 0f00000000; @%p16 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: sub.f32 %f206, %f203, %f3; sub.f32 %f205, %f202, %f2; sub.f32 %f204, %f201, %f1; bra.uni $L__BB0_18; $L__BB0_14: sub.f32 %f115, %f201, %f198; sub.f32 %f116, %f202, %f199; sub.f32 %f117, %f203, %f200; mul.f32 %f206, %f117, 0f3F000000; mul.f32 %f205, %f116, 0f3F000000; mul.f32 %f204, %f115, 0f3F000000; $L__BB0_18: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p18, %rs2, 0; add.s32 %r15, %r2, -1; @%p18 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: max.s32 %r72, %r15, 0; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r53, %r15, %r34; add.s32 %r54, %r53, %r34; rem.s32 %r72, %r54, %r34; $L__BB0_21: setp.lt.s32 %p20, %r2, 1; mov.f32 %f207, 0f00000000; and.pred %p21, %p20, %p18; mov.f32 %f208, %f207; mov.f32 %f209, %f207; @%p21 bra $L__BB0_23; add.s32 %r55, %r72, %r4; mad.lo.s32 %r56, %r55, %r33, %r1; mul.wide.s32 %rd28, %r56, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f209, [%rd31]; ld.global.nc.f32 %f208, [%rd30]; ld.global.nc.f32 %f207, [%rd29]; $L__BB0_23: add.s32 %r19, %r2, 1; @%p18 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: add.s32 %r59, %r34, -1; min.s32 %r73, %r19, %r59; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r57, %r19, %r34; add.s32 %r58, %r57, %r34; rem.s32 %r73, %r58, %r34; $L__BB0_26: add.s32 %r23, %r73, %r4; setp.ge.s32 %p23, %r19, %r34; mov.f32 %f210, 0f00000000; and.pred %p25, %p23, %p18; mov.f32 %f211, %f210; mov.f32 %f212, %f210; @%p25 bra $L__BB0_28; mad.lo.s32 %r60, %r23, %r33, %r1; mul.wide.s32 %rd32, %r60, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f212, [%rd35]; ld.global.nc.f32 %f211, [%rd34]; ld.global.nc.f32 %f210, [%rd33]; $L__BB0_28: mul.f32 %f127, %f211, %f211; fma.rn.f32 %f128, %f210, %f210, %f127; fma.rn.f32 %f129, %f212, %f212, %f128; setp.eq.f32 %p26, %f129, 0f00000000; mul.f32 %f130, %f208, %f208; 
fma.rn.f32 %f131, %f207, %f207, %f130; fma.rn.f32 %f53, %f209, %f209, %f131; @%p26 bra $L__BB0_32; bra.uni $L__BB0_29; $L__BB0_32: setp.eq.f32 %p28, %f53, 0f00000000; mov.f32 %f213, 0f00000000; mov.f32 %f214, %f213; mov.f32 %f215, %f213; @%p28 bra $L__BB0_34; sub.f32 %f215, %f3, %f209; sub.f32 %f214, %f2, %f208; sub.f32 %f213, %f1, %f207; bra.uni $L__BB0_34; $L__BB0_29: setp.eq.f32 %p27, %f53, 0f00000000; @%p27 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f215, %f212, %f3; sub.f32 %f214, %f211, %f2; sub.f32 %f213, %f210, %f1; bra.uni $L__BB0_34; $L__BB0_30: sub.f32 %f132, %f210, %f207; sub.f32 %f133, %f211, %f208; sub.f32 %f134, %f212, %f209; mul.f32 %f215, %f134, 0f3F000000; mul.f32 %f214, %f133, 0f3F000000; mul.f32 %f213, %f132, 0f3F000000; $L__BB0_34: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r24, %r3, -1; @%p29 bra $L__BB0_36; bra.uni $L__BB0_35; $L__BB0_36: max.s32 %r74, %r24, 0; bra.uni $L__BB0_37; $L__BB0_35: rem.s32 %r61, %r24, %r35; add.s32 %r62, %r61, %r35; rem.s32 %r74, %r62, %r35; $L__BB0_37: setp.lt.s32 %p31, %r3, 1; mov.f32 %f216, 0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f217, %f216; mov.f32 %f218, %f216; @%p32 bra $L__BB0_39; mad.lo.s32 %r63, %r74, %r34, %r2; mad.lo.s32 %r64, %r63, %r33, %r1; mul.wide.s32 %rd36, %r64, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f218, [%rd39]; ld.global.nc.f32 %f217, [%rd38]; ld.global.nc.f32 %f216, [%rd37]; $L__BB0_39: add.s32 %r28, %r3, 1; @%p29 bra $L__BB0_41; bra.uni $L__BB0_40; $L__BB0_41: add.s32 %r67, %r35, -1; min.s32 %r75, %r28, %r67; bra.uni $L__BB0_42; $L__BB0_40: rem.s32 %r65, %r28, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r75, %r66, %r35; $L__BB0_42: mad.lo.s32 %r68, %r75, %r34, %r2; mad.lo.s32 %r32, %r68, %r33, %r1; setp.ge.s32 %p34, %r28, %r35; mov.f32 %f219, 0f00000000; and.pred %p36, %p34, %p29; mov.f32 %f220, %f219; mov.f32 %f221, %f219; @%p36 bra $L__BB0_44; mul.wide.s32 %rd40, %r32, 4; add.s64 %rd41, 
%rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f221, [%rd43]; ld.global.nc.f32 %f220, [%rd42]; ld.global.nc.f32 %f219, [%rd41]; $L__BB0_44: mul.f32 %f144, %f220, %f220; fma.rn.f32 %f145, %f219, %f219, %f144; fma.rn.f32 %f146, %f221, %f221, %f145; setp.eq.f32 %p37, %f146, 0f00000000; mul.f32 %f147, %f217, %f217; fma.rn.f32 %f148, %f216, %f216, %f147; fma.rn.f32 %f84, %f218, %f218, %f148; @%p37 bra $L__BB0_48; bra.uni $L__BB0_45; $L__BB0_48: setp.eq.f32 %p39, %f84, 0f00000000; mov.f32 %f222, 0f00000000; mov.f32 %f223, %f222; mov.f32 %f224, %f222; @%p39 bra $L__BB0_50; sub.f32 %f224, %f3, %f218; sub.f32 %f223, %f2, %f217; sub.f32 %f222, %f1, %f216; bra.uni $L__BB0_50; $L__BB0_45: setp.eq.f32 %p38, %f84, 0f00000000; @%p38 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: sub.f32 %f224, %f221, %f3; sub.f32 %f223, %f220, %f2; sub.f32 %f222, %f219, %f1; bra.uni $L__BB0_50; $L__BB0_46: sub.f32 %f149, %f219, %f216; sub.f32 %f150, %f220, %f217; sub.f32 %f151, %f221, %f218; mul.f32 %f224, %f151, 0f3F000000; mul.f32 %f223, %f150, 0f3F000000; mul.f32 %f222, %f149, 0f3F000000; $L__BB0_50: mul.f32 %f155, %f214, %f224; mul.f32 %f156, %f215, %f223; sub.f32 %f157, %f155, %f156; mul.f32 %f158, %f215, %f222; mul.f32 %f159, %f213, %f224; sub.f32 %f160, %f158, %f159; mul.f32 %f161, %f213, %f223; mul.f32 %f162, %f214, %f222; sub.f32 %f163, %f161, %f162; mul.f32 %f164, %f206, %f223; mul.f32 %f165, %f205, %f224; sub.f32 %f166, %f164, %f165; mul.f32 %f167, %f204, %f224; mul.f32 %f168, %f206, %f222; sub.f32 %f169, %f167, %f168; mul.f32 %f170, %f205, %f222; mul.f32 %f171, %f204, %f223; sub.f32 %f172, %f170, %f171; mul.f32 %f173, %f205, %f215; mul.f32 %f174, %f206, %f214; sub.f32 %f175, %f173, %f174; mul.f32 %f176, %f206, %f213; mul.f32 %f177, %f204, %f215; sub.f32 %f178, %f176, %f177; mul.f32 %f179, %f204, %f214; mul.f32 %f180, %f205, %f213; sub.f32 %f181, %f179, %f180; add.f32 %f182, %f97, %f97; mul.f32 %f183, %f182, %f98; mul.f32 %f184, %f2, %f160; 
fma.rn.f32 %f185, %f1, %f157, %f184; fma.rn.f32 %f186, %f3, %f163, %f185; mul.f32 %f187, %f183, %f186; st.global.f32 [%rd4], %f187; mul.f32 %f188, %f182, %f99; mul.f32 %f189, %f2, %f169; fma.rn.f32 %f190, %f1, %f166, %f189; fma.rn.f32 %f191, %f3, %f172, %f190; mul.f32 %f192, %f188, %f191; st.global.f32 [%rd5], %f192; mul.f32 %f193, %f182, %f100; mul.f32 %f194, %f2, %f178; fma.rn.f32 %f195, %f1, %f175, %f194; fma.rn.f32 %f196, %f3, %f181, %f195; mul.f32 %f197, %f193, %f196; st.global.f32 [%rd6], %f197; $L__BB0_52: ret; } ` setemergentmagneticfieldtwopoint_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl setemergentmagneticfieldtwopoint .visible .entry setemergentmagneticfieldtwopoint( .param .u64 setemergentmagneticfieldtwopoint_param_0, .param .u64 setemergentmagneticfieldtwopoint_param_1, .param .u64 setemergentmagneticfieldtwopoint_param_2, .param .u64 setemergentmagneticfieldtwopoint_param_3, .param .u64 setemergentmagneticfieldtwopoint_param_4, .param .u64 setemergentmagneticfieldtwopoint_param_5, .param .f32 setemergentmagneticfieldtwopoint_param_6, .param .f32 setemergentmagneticfieldtwopoint_param_7, .param .f32 setemergentmagneticfieldtwopoint_param_8, .param .f32 setemergentmagneticfieldtwopoint_param_9, .param .u32 setemergentmagneticfieldtwopoint_param_10, .param .u32 setemergentmagneticfieldtwopoint_param_11, .param .u32 setemergentmagneticfieldtwopoint_param_12, .param .u8 setemergentmagneticfieldtwopoint_param_13 ) { .reg .pred %p<40>; .reg .b16 %rs<5>; .reg .f32 %f<225>; .reg .b32 %r<76>; .reg .b64 %rd<44>; ld.param.u8 %rs4, [setemergentmagneticfieldtwopoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldtwopoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldtwopoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldtwopoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldtwopoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldtwopoint_param_4]; ld.param.u64 %rd12, 
[setemergentmagneticfieldtwopoint_param_5]; ld.param.f32 %f97, [setemergentmagneticfieldtwopoint_param_6]; ld.param.f32 %f98, [setemergentmagneticfieldtwopoint_param_7]; ld.param.f32 %f99, [setemergentmagneticfieldtwopoint_param_8]; ld.param.f32 %f100, [setemergentmagneticfieldtwopoint_param_9]; ld.param.u32 %r33, [setemergentmagneticfieldtwopoint_param_10]; ld.param.u32 %r34, [setemergentmagneticfieldtwopoint_param_11]; ld.param.u32 %r35, [setemergentmagneticfieldtwopoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r36, %ntid.x; mov.u32 %r37, %ctaid.x; mov.u32 %r38, %tid.x; mad.lo.s32 %r1, %r37, %r36, %r38; mov.u32 %r39, %ntid.y; mov.u32 %r40, %ctaid.y; mov.u32 %r41, %tid.y; mad.lo.s32 %r2, %r40, %r39, %r41; mov.u32 %r42, %ntid.z; mov.u32 %r43, %ctaid.z; mov.u32 %r44, %tid.z; mad.lo.s32 %r3, %r43, %r42, %r44; setp.ge.s32 %p1, %r1, %r33; setp.ge.s32 %p2, %r2, %r34; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r35; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_52; mul.lo.s32 %r4, %r3, %r34; add.s32 %r45, %r4, %r2; mul.lo.s32 %r5, %r45, %r33; add.s32 %r46, %r5, %r1; mul.wide.s32 %rd13, %r46, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f101, %f2, %f2; fma.rn.f32 %f102, %f1, %f1, %f101; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f103, %f3, %f3, %f102; setp.eq.f32 %p6, %f103, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_51; bra.uni $L__BB0_2; $L__BB0_51: mov.u32 %r69, 0; st.global.u32 [%rd4], %r69; st.global.u32 [%rd5], %r69; st.global.u32 [%rd6], %r69; bra.uni $L__BB0_52; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r70, %r6, 0; bra.uni 
$L__BB0_5; $L__BB0_3: rem.s32 %r47, %r6, %r33; add.s32 %r48, %r47, %r33; rem.s32 %r70, %r48, %r33; $L__BB0_5: setp.lt.s32 %p9, %r1, 1; mov.f32 %f198, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f199, %f198; mov.f32 %f200, %f198; @%p10 bra $L__BB0_7; add.s32 %r49, %r70, %r5; mul.wide.s32 %rd20, %r49, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f200, [%rd23]; ld.global.nc.f32 %f199, [%rd22]; ld.global.nc.f32 %f198, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r52, %r33, -1; min.s32 %r71, %r10, %r52; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r50, %r10, %r33; add.s32 %r51, %r50, %r33; rem.s32 %r71, %r51, %r33; $L__BB0_10: add.s32 %r14, %r71, %r5; setp.ge.s32 %p12, %r10, %r33; mov.f32 %f201, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f202, %f201; mov.f32 %f203, %f201; @%p14 bra $L__BB0_12; mul.wide.s32 %rd24, %r14, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f203, [%rd27]; ld.global.nc.f32 %f202, [%rd26]; ld.global.nc.f32 %f201, [%rd25]; $L__BB0_12: mul.f32 %f110, %f202, %f202; fma.rn.f32 %f111, %f201, %f201, %f110; fma.rn.f32 %f112, %f203, %f203, %f111; setp.eq.f32 %p15, %f112, 0f00000000; mul.f32 %f113, %f199, %f199; fma.rn.f32 %f114, %f198, %f198, %f113; fma.rn.f32 %f22, %f200, %f200, %f114; @%p15 bra $L__BB0_16; bra.uni $L__BB0_13; $L__BB0_16: setp.eq.f32 %p17, %f22, 0f00000000; mov.f32 %f204, 0f00000000; mov.f32 %f205, %f204; mov.f32 %f206, %f204; @%p17 bra $L__BB0_18; sub.f32 %f206, %f3, %f200; sub.f32 %f205, %f2, %f199; sub.f32 %f204, %f1, %f198; bra.uni $L__BB0_18; $L__BB0_13: setp.eq.f32 %p16, %f22, 0f00000000; @%p16 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: sub.f32 %f206, %f203, %f3; sub.f32 %f205, %f202, %f2; sub.f32 %f204, %f201, %f1; bra.uni $L__BB0_18; $L__BB0_14: sub.f32 %f115, %f201, %f198; sub.f32 %f116, %f202, %f199; sub.f32 %f117, %f203, %f200; mul.f32 %f206, %f117, 
0f3F000000; mul.f32 %f205, %f116, 0f3F000000; mul.f32 %f204, %f115, 0f3F000000; $L__BB0_18: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p18, %rs2, 0; add.s32 %r15, %r2, -1; @%p18 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: max.s32 %r72, %r15, 0; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r53, %r15, %r34; add.s32 %r54, %r53, %r34; rem.s32 %r72, %r54, %r34; $L__BB0_21: setp.lt.s32 %p20, %r2, 1; mov.f32 %f207, 0f00000000; and.pred %p21, %p20, %p18; mov.f32 %f208, %f207; mov.f32 %f209, %f207; @%p21 bra $L__BB0_23; add.s32 %r55, %r72, %r4; mad.lo.s32 %r56, %r55, %r33, %r1; mul.wide.s32 %rd28, %r56, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f209, [%rd31]; ld.global.nc.f32 %f208, [%rd30]; ld.global.nc.f32 %f207, [%rd29]; $L__BB0_23: add.s32 %r19, %r2, 1; @%p18 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: add.s32 %r59, %r34, -1; min.s32 %r73, %r19, %r59; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r57, %r19, %r34; add.s32 %r58, %r57, %r34; rem.s32 %r73, %r58, %r34; $L__BB0_26: add.s32 %r23, %r73, %r4; setp.ge.s32 %p23, %r19, %r34; mov.f32 %f210, 0f00000000; and.pred %p25, %p23, %p18; mov.f32 %f211, %f210; mov.f32 %f212, %f210; @%p25 bra $L__BB0_28; mad.lo.s32 %r60, %r23, %r33, %r1; mul.wide.s32 %rd32, %r60, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f212, [%rd35]; ld.global.nc.f32 %f211, [%rd34]; ld.global.nc.f32 %f210, [%rd33]; $L__BB0_28: mul.f32 %f127, %f211, %f211; fma.rn.f32 %f128, %f210, %f210, %f127; fma.rn.f32 %f129, %f212, %f212, %f128; setp.eq.f32 %p26, %f129, 0f00000000; mul.f32 %f130, %f208, %f208; fma.rn.f32 %f131, %f207, %f207, %f130; fma.rn.f32 %f53, %f209, %f209, %f131; @%p26 bra $L__BB0_32; bra.uni $L__BB0_29; $L__BB0_32: setp.eq.f32 %p28, %f53, 0f00000000; mov.f32 %f213, 0f00000000; mov.f32 %f214, %f213; mov.f32 %f215, %f213; @%p28 bra $L__BB0_34; sub.f32 %f215, %f3, %f209; sub.f32 %f214, %f2, %f208; sub.f32 %f213, %f1, %f207; bra.uni 
$L__BB0_34; $L__BB0_29: setp.eq.f32 %p27, %f53, 0f00000000; @%p27 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f215, %f212, %f3; sub.f32 %f214, %f211, %f2; sub.f32 %f213, %f210, %f1; bra.uni $L__BB0_34; $L__BB0_30: sub.f32 %f132, %f210, %f207; sub.f32 %f133, %f211, %f208; sub.f32 %f134, %f212, %f209; mul.f32 %f215, %f134, 0f3F000000; mul.f32 %f214, %f133, 0f3F000000; mul.f32 %f213, %f132, 0f3F000000; $L__BB0_34: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r24, %r3, -1; @%p29 bra $L__BB0_36; bra.uni $L__BB0_35; $L__BB0_36: max.s32 %r74, %r24, 0; bra.uni $L__BB0_37; $L__BB0_35: rem.s32 %r61, %r24, %r35; add.s32 %r62, %r61, %r35; rem.s32 %r74, %r62, %r35; $L__BB0_37: setp.lt.s32 %p31, %r3, 1; mov.f32 %f216, 0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f217, %f216; mov.f32 %f218, %f216; @%p32 bra $L__BB0_39; mad.lo.s32 %r63, %r74, %r34, %r2; mad.lo.s32 %r64, %r63, %r33, %r1; mul.wide.s32 %rd36, %r64, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f218, [%rd39]; ld.global.nc.f32 %f217, [%rd38]; ld.global.nc.f32 %f216, [%rd37]; $L__BB0_39: add.s32 %r28, %r3, 1; @%p29 bra $L__BB0_41; bra.uni $L__BB0_40; $L__BB0_41: add.s32 %r67, %r35, -1; min.s32 %r75, %r28, %r67; bra.uni $L__BB0_42; $L__BB0_40: rem.s32 %r65, %r28, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r75, %r66, %r35; $L__BB0_42: mad.lo.s32 %r68, %r75, %r34, %r2; mad.lo.s32 %r32, %r68, %r33, %r1; setp.ge.s32 %p34, %r28, %r35; mov.f32 %f219, 0f00000000; and.pred %p36, %p34, %p29; mov.f32 %f220, %f219; mov.f32 %f221, %f219; @%p36 bra $L__BB0_44; mul.wide.s32 %rd40, %r32, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f221, [%rd43]; ld.global.nc.f32 %f220, [%rd42]; ld.global.nc.f32 %f219, [%rd41]; $L__BB0_44: mul.f32 %f144, %f220, %f220; fma.rn.f32 %f145, %f219, %f219, %f144; fma.rn.f32 %f146, %f221, %f221, %f145; setp.eq.f32 %p37, %f146, 0f00000000; mul.f32 %f147, %f217, 
%f217; fma.rn.f32 %f148, %f216, %f216, %f147; fma.rn.f32 %f84, %f218, %f218, %f148; @%p37 bra $L__BB0_48; bra.uni $L__BB0_45; $L__BB0_48: setp.eq.f32 %p39, %f84, 0f00000000; mov.f32 %f222, 0f00000000; mov.f32 %f223, %f222; mov.f32 %f224, %f222; @%p39 bra $L__BB0_50; sub.f32 %f224, %f3, %f218; sub.f32 %f223, %f2, %f217; sub.f32 %f222, %f1, %f216; bra.uni $L__BB0_50; $L__BB0_45: setp.eq.f32 %p38, %f84, 0f00000000; @%p38 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: sub.f32 %f224, %f221, %f3; sub.f32 %f223, %f220, %f2; sub.f32 %f222, %f219, %f1; bra.uni $L__BB0_50; $L__BB0_46: sub.f32 %f149, %f219, %f216; sub.f32 %f150, %f220, %f217; sub.f32 %f151, %f221, %f218; mul.f32 %f224, %f151, 0f3F000000; mul.f32 %f223, %f150, 0f3F000000; mul.f32 %f222, %f149, 0f3F000000; $L__BB0_50: mul.f32 %f155, %f214, %f224; mul.f32 %f156, %f215, %f223; sub.f32 %f157, %f155, %f156; mul.f32 %f158, %f215, %f222; mul.f32 %f159, %f213, %f224; sub.f32 %f160, %f158, %f159; mul.f32 %f161, %f213, %f223; mul.f32 %f162, %f214, %f222; sub.f32 %f163, %f161, %f162; mul.f32 %f164, %f206, %f223; mul.f32 %f165, %f205, %f224; sub.f32 %f166, %f164, %f165; mul.f32 %f167, %f204, %f224; mul.f32 %f168, %f206, %f222; sub.f32 %f169, %f167, %f168; mul.f32 %f170, %f205, %f222; mul.f32 %f171, %f204, %f223; sub.f32 %f172, %f170, %f171; mul.f32 %f173, %f205, %f215; mul.f32 %f174, %f206, %f214; sub.f32 %f175, %f173, %f174; mul.f32 %f176, %f206, %f213; mul.f32 %f177, %f204, %f215; sub.f32 %f178, %f176, %f177; mul.f32 %f179, %f204, %f214; mul.f32 %f180, %f205, %f213; sub.f32 %f181, %f179, %f180; add.f32 %f182, %f97, %f97; mul.f32 %f183, %f182, %f98; mul.f32 %f184, %f2, %f160; fma.rn.f32 %f185, %f1, %f157, %f184; fma.rn.f32 %f186, %f3, %f163, %f185; mul.f32 %f187, %f183, %f186; st.global.f32 [%rd4], %f187; mul.f32 %f188, %f182, %f99; mul.f32 %f189, %f2, %f169; fma.rn.f32 %f190, %f1, %f166, %f189; fma.rn.f32 %f191, %f3, %f172, %f190; mul.f32 %f192, %f188, %f191; st.global.f32 [%rd5], %f192; mul.f32 %f193, %f182, %f100; 
mul.f32 %f194, %f2, %f178; fma.rn.f32 %f195, %f1, %f175, %f194; fma.rn.f32 %f196, %f3, %f181, %f195; mul.f32 %f197, %f193, %f196; st.global.f32 [%rd6], %f197; $L__BB0_52: ret; } ` setemergentmagneticfieldtwopoint_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl setemergentmagneticfieldtwopoint .visible .entry setemergentmagneticfieldtwopoint( .param .u64 setemergentmagneticfieldtwopoint_param_0, .param .u64 setemergentmagneticfieldtwopoint_param_1, .param .u64 setemergentmagneticfieldtwopoint_param_2, .param .u64 setemergentmagneticfieldtwopoint_param_3, .param .u64 setemergentmagneticfieldtwopoint_param_4, .param .u64 setemergentmagneticfieldtwopoint_param_5, .param .f32 setemergentmagneticfieldtwopoint_param_6, .param .f32 setemergentmagneticfieldtwopoint_param_7, .param .f32 setemergentmagneticfieldtwopoint_param_8, .param .f32 setemergentmagneticfieldtwopoint_param_9, .param .u32 setemergentmagneticfieldtwopoint_param_10, .param .u32 setemergentmagneticfieldtwopoint_param_11, .param .u32 setemergentmagneticfieldtwopoint_param_12, .param .u8 setemergentmagneticfieldtwopoint_param_13 ) { .reg .pred %p<40>; .reg .b16 %rs<5>; .reg .f32 %f<225>; .reg .b32 %r<76>; .reg .b64 %rd<44>; ld.param.u8 %rs4, [setemergentmagneticfieldtwopoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldtwopoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldtwopoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldtwopoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldtwopoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldtwopoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldtwopoint_param_5]; ld.param.f32 %f97, [setemergentmagneticfieldtwopoint_param_6]; ld.param.f32 %f98, [setemergentmagneticfieldtwopoint_param_7]; ld.param.f32 %f99, [setemergentmagneticfieldtwopoint_param_8]; ld.param.f32 %f100, [setemergentmagneticfieldtwopoint_param_9]; ld.param.u32 %r33, [setemergentmagneticfieldtwopoint_param_10]; ld.param.u32 %r34, 
[setemergentmagneticfieldtwopoint_param_11]; ld.param.u32 %r35, [setemergentmagneticfieldtwopoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r36, %ntid.x; mov.u32 %r37, %ctaid.x; mov.u32 %r38, %tid.x; mad.lo.s32 %r1, %r37, %r36, %r38; mov.u32 %r39, %ntid.y; mov.u32 %r40, %ctaid.y; mov.u32 %r41, %tid.y; mad.lo.s32 %r2, %r40, %r39, %r41; mov.u32 %r42, %ntid.z; mov.u32 %r43, %ctaid.z; mov.u32 %r44, %tid.z; mad.lo.s32 %r3, %r43, %r42, %r44; setp.ge.s32 %p1, %r1, %r33; setp.ge.s32 %p2, %r2, %r34; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r35; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_52; mul.lo.s32 %r4, %r3, %r34; add.s32 %r45, %r4, %r2; mul.lo.s32 %r5, %r45, %r33; add.s32 %r46, %r5, %r1; mul.wide.s32 %rd13, %r46, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f101, %f2, %f2; fma.rn.f32 %f102, %f1, %f1, %f101; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f103, %f3, %f3, %f102; setp.eq.f32 %p6, %f103, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_51; bra.uni $L__BB0_2; $L__BB0_51: mov.u32 %r69, 0; st.global.u32 [%rd4], %r69; st.global.u32 [%rd5], %r69; st.global.u32 [%rd6], %r69; bra.uni $L__BB0_52; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r70, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r47, %r6, %r33; add.s32 %r48, %r47, %r33; rem.s32 %r70, %r48, %r33; $L__BB0_5: setp.lt.s32 %p9, %r1, 1; mov.f32 %f198, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f199, %f198; mov.f32 %f200, %f198; @%p10 bra $L__BB0_7; add.s32 %r49, %r70, %r5; mul.wide.s32 %rd20, %r49, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, 
%rd1, %rd20; ld.global.nc.f32 %f200, [%rd23]; ld.global.nc.f32 %f199, [%rd22]; ld.global.nc.f32 %f198, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r52, %r33, -1; min.s32 %r71, %r10, %r52; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r50, %r10, %r33; add.s32 %r51, %r50, %r33; rem.s32 %r71, %r51, %r33; $L__BB0_10: add.s32 %r14, %r71, %r5; setp.ge.s32 %p12, %r10, %r33; mov.f32 %f201, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f202, %f201; mov.f32 %f203, %f201; @%p14 bra $L__BB0_12; mul.wide.s32 %rd24, %r14, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f203, [%rd27]; ld.global.nc.f32 %f202, [%rd26]; ld.global.nc.f32 %f201, [%rd25]; $L__BB0_12: mul.f32 %f110, %f202, %f202; fma.rn.f32 %f111, %f201, %f201, %f110; fma.rn.f32 %f112, %f203, %f203, %f111; setp.eq.f32 %p15, %f112, 0f00000000; mul.f32 %f113, %f199, %f199; fma.rn.f32 %f114, %f198, %f198, %f113; fma.rn.f32 %f22, %f200, %f200, %f114; @%p15 bra $L__BB0_16; bra.uni $L__BB0_13; $L__BB0_16: setp.eq.f32 %p17, %f22, 0f00000000; mov.f32 %f204, 0f00000000; mov.f32 %f205, %f204; mov.f32 %f206, %f204; @%p17 bra $L__BB0_18; sub.f32 %f206, %f3, %f200; sub.f32 %f205, %f2, %f199; sub.f32 %f204, %f1, %f198; bra.uni $L__BB0_18; $L__BB0_13: setp.eq.f32 %p16, %f22, 0f00000000; @%p16 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: sub.f32 %f206, %f203, %f3; sub.f32 %f205, %f202, %f2; sub.f32 %f204, %f201, %f1; bra.uni $L__BB0_18; $L__BB0_14: sub.f32 %f115, %f201, %f198; sub.f32 %f116, %f202, %f199; sub.f32 %f117, %f203, %f200; mul.f32 %f206, %f117, 0f3F000000; mul.f32 %f205, %f116, 0f3F000000; mul.f32 %f204, %f115, 0f3F000000; $L__BB0_18: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p18, %rs2, 0; add.s32 %r15, %r2, -1; @%p18 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: max.s32 %r72, %r15, 0; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r53, %r15, %r34; add.s32 %r54, %r53, %r34; rem.s32 %r72, %r54, %r34; $L__BB0_21: setp.lt.s32 
%p20, %r2, 1; mov.f32 %f207, 0f00000000; and.pred %p21, %p20, %p18; mov.f32 %f208, %f207; mov.f32 %f209, %f207; @%p21 bra $L__BB0_23; add.s32 %r55, %r72, %r4; mad.lo.s32 %r56, %r55, %r33, %r1; mul.wide.s32 %rd28, %r56, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f209, [%rd31]; ld.global.nc.f32 %f208, [%rd30]; ld.global.nc.f32 %f207, [%rd29]; $L__BB0_23: add.s32 %r19, %r2, 1; @%p18 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: add.s32 %r59, %r34, -1; min.s32 %r73, %r19, %r59; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r57, %r19, %r34; add.s32 %r58, %r57, %r34; rem.s32 %r73, %r58, %r34; $L__BB0_26: add.s32 %r23, %r73, %r4; setp.ge.s32 %p23, %r19, %r34; mov.f32 %f210, 0f00000000; and.pred %p25, %p23, %p18; mov.f32 %f211, %f210; mov.f32 %f212, %f210; @%p25 bra $L__BB0_28; mad.lo.s32 %r60, %r23, %r33, %r1; mul.wide.s32 %rd32, %r60, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f212, [%rd35]; ld.global.nc.f32 %f211, [%rd34]; ld.global.nc.f32 %f210, [%rd33]; $L__BB0_28: mul.f32 %f127, %f211, %f211; fma.rn.f32 %f128, %f210, %f210, %f127; fma.rn.f32 %f129, %f212, %f212, %f128; setp.eq.f32 %p26, %f129, 0f00000000; mul.f32 %f130, %f208, %f208; fma.rn.f32 %f131, %f207, %f207, %f130; fma.rn.f32 %f53, %f209, %f209, %f131; @%p26 bra $L__BB0_32; bra.uni $L__BB0_29; $L__BB0_32: setp.eq.f32 %p28, %f53, 0f00000000; mov.f32 %f213, 0f00000000; mov.f32 %f214, %f213; mov.f32 %f215, %f213; @%p28 bra $L__BB0_34; sub.f32 %f215, %f3, %f209; sub.f32 %f214, %f2, %f208; sub.f32 %f213, %f1, %f207; bra.uni $L__BB0_34; $L__BB0_29: setp.eq.f32 %p27, %f53, 0f00000000; @%p27 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f215, %f212, %f3; sub.f32 %f214, %f211, %f2; sub.f32 %f213, %f210, %f1; bra.uni $L__BB0_34; $L__BB0_30: sub.f32 %f132, %f210, %f207; sub.f32 %f133, %f211, %f208; sub.f32 %f134, %f212, %f209; mul.f32 %f215, %f134, 0f3F000000; mul.f32 %f214, %f133, 
0f3F000000; mul.f32 %f213, %f132, 0f3F000000; $L__BB0_34: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r24, %r3, -1; @%p29 bra $L__BB0_36; bra.uni $L__BB0_35; $L__BB0_36: max.s32 %r74, %r24, 0; bra.uni $L__BB0_37; $L__BB0_35: rem.s32 %r61, %r24, %r35; add.s32 %r62, %r61, %r35; rem.s32 %r74, %r62, %r35; $L__BB0_37: setp.lt.s32 %p31, %r3, 1; mov.f32 %f216, 0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f217, %f216; mov.f32 %f218, %f216; @%p32 bra $L__BB0_39; mad.lo.s32 %r63, %r74, %r34, %r2; mad.lo.s32 %r64, %r63, %r33, %r1; mul.wide.s32 %rd36, %r64, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f218, [%rd39]; ld.global.nc.f32 %f217, [%rd38]; ld.global.nc.f32 %f216, [%rd37]; $L__BB0_39: add.s32 %r28, %r3, 1; @%p29 bra $L__BB0_41; bra.uni $L__BB0_40; $L__BB0_41: add.s32 %r67, %r35, -1; min.s32 %r75, %r28, %r67; bra.uni $L__BB0_42; $L__BB0_40: rem.s32 %r65, %r28, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r75, %r66, %r35; $L__BB0_42: mad.lo.s32 %r68, %r75, %r34, %r2; mad.lo.s32 %r32, %r68, %r33, %r1; setp.ge.s32 %p34, %r28, %r35; mov.f32 %f219, 0f00000000; and.pred %p36, %p34, %p29; mov.f32 %f220, %f219; mov.f32 %f221, %f219; @%p36 bra $L__BB0_44; mul.wide.s32 %rd40, %r32, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f221, [%rd43]; ld.global.nc.f32 %f220, [%rd42]; ld.global.nc.f32 %f219, [%rd41]; $L__BB0_44: mul.f32 %f144, %f220, %f220; fma.rn.f32 %f145, %f219, %f219, %f144; fma.rn.f32 %f146, %f221, %f221, %f145; setp.eq.f32 %p37, %f146, 0f00000000; mul.f32 %f147, %f217, %f217; fma.rn.f32 %f148, %f216, %f216, %f147; fma.rn.f32 %f84, %f218, %f218, %f148; @%p37 bra $L__BB0_48; bra.uni $L__BB0_45; $L__BB0_48: setp.eq.f32 %p39, %f84, 0f00000000; mov.f32 %f222, 0f00000000; mov.f32 %f223, %f222; mov.f32 %f224, %f222; @%p39 bra $L__BB0_50; sub.f32 %f224, %f3, %f218; sub.f32 %f223, %f2, %f217; sub.f32 %f222, %f1, %f216; bra.uni $L__BB0_50; 
$L__BB0_45: setp.eq.f32 %p38, %f84, 0f00000000; @%p38 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: sub.f32 %f224, %f221, %f3; sub.f32 %f223, %f220, %f2; sub.f32 %f222, %f219, %f1; bra.uni $L__BB0_50; $L__BB0_46: sub.f32 %f149, %f219, %f216; sub.f32 %f150, %f220, %f217; sub.f32 %f151, %f221, %f218; mul.f32 %f224, %f151, 0f3F000000; mul.f32 %f223, %f150, 0f3F000000; mul.f32 %f222, %f149, 0f3F000000; $L__BB0_50: mul.f32 %f155, %f214, %f224; mul.f32 %f156, %f215, %f223; sub.f32 %f157, %f155, %f156; mul.f32 %f158, %f215, %f222; mul.f32 %f159, %f213, %f224; sub.f32 %f160, %f158, %f159; mul.f32 %f161, %f213, %f223; mul.f32 %f162, %f214, %f222; sub.f32 %f163, %f161, %f162; mul.f32 %f164, %f206, %f223; mul.f32 %f165, %f205, %f224; sub.f32 %f166, %f164, %f165; mul.f32 %f167, %f204, %f224; mul.f32 %f168, %f206, %f222; sub.f32 %f169, %f167, %f168; mul.f32 %f170, %f205, %f222; mul.f32 %f171, %f204, %f223; sub.f32 %f172, %f170, %f171; mul.f32 %f173, %f205, %f215; mul.f32 %f174, %f206, %f214; sub.f32 %f175, %f173, %f174; mul.f32 %f176, %f206, %f213; mul.f32 %f177, %f204, %f215; sub.f32 %f178, %f176, %f177; mul.f32 %f179, %f204, %f214; mul.f32 %f180, %f205, %f213; sub.f32 %f181, %f179, %f180; add.f32 %f182, %f97, %f97; mul.f32 %f183, %f182, %f98; mul.f32 %f184, %f2, %f160; fma.rn.f32 %f185, %f1, %f157, %f184; fma.rn.f32 %f186, %f3, %f163, %f185; mul.f32 %f187, %f183, %f186; st.global.f32 [%rd4], %f187; mul.f32 %f188, %f182, %f99; mul.f32 %f189, %f2, %f169; fma.rn.f32 %f190, %f1, %f166, %f189; fma.rn.f32 %f191, %f3, %f172, %f190; mul.f32 %f192, %f188, %f191; st.global.f32 [%rd5], %f192; mul.f32 %f193, %f182, %f100; mul.f32 %f194, %f2, %f178; fma.rn.f32 %f195, %f1, %f175, %f194; fma.rn.f32 %f196, %f3, %f181, %f195; mul.f32 %f197, %f193, %f196; st.global.f32 [%rd6], %f197; $L__BB0_52: ret; } ` setemergentmagneticfieldtwopoint_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl setemergentmagneticfieldtwopoint .visible .entry setemergentmagneticfieldtwopoint( .param 
.u64 setemergentmagneticfieldtwopoint_param_0, .param .u64 setemergentmagneticfieldtwopoint_param_1, .param .u64 setemergentmagneticfieldtwopoint_param_2, .param .u64 setemergentmagneticfieldtwopoint_param_3, .param .u64 setemergentmagneticfieldtwopoint_param_4, .param .u64 setemergentmagneticfieldtwopoint_param_5, .param .f32 setemergentmagneticfieldtwopoint_param_6, .param .f32 setemergentmagneticfieldtwopoint_param_7, .param .f32 setemergentmagneticfieldtwopoint_param_8, .param .f32 setemergentmagneticfieldtwopoint_param_9, .param .u32 setemergentmagneticfieldtwopoint_param_10, .param .u32 setemergentmagneticfieldtwopoint_param_11, .param .u32 setemergentmagneticfieldtwopoint_param_12, .param .u8 setemergentmagneticfieldtwopoint_param_13 ) { .reg .pred %p<40>; .reg .b16 %rs<5>; .reg .f32 %f<225>; .reg .b32 %r<76>; .reg .b64 %rd<44>; ld.param.u8 %rs4, [setemergentmagneticfieldtwopoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldtwopoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldtwopoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldtwopoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldtwopoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldtwopoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldtwopoint_param_5]; ld.param.f32 %f97, [setemergentmagneticfieldtwopoint_param_6]; ld.param.f32 %f98, [setemergentmagneticfieldtwopoint_param_7]; ld.param.f32 %f99, [setemergentmagneticfieldtwopoint_param_8]; ld.param.f32 %f100, [setemergentmagneticfieldtwopoint_param_9]; ld.param.u32 %r33, [setemergentmagneticfieldtwopoint_param_10]; ld.param.u32 %r34, [setemergentmagneticfieldtwopoint_param_11]; ld.param.u32 %r35, [setemergentmagneticfieldtwopoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r36, %ntid.x; mov.u32 %r37, %ctaid.x; mov.u32 %r38, %tid.x; mad.lo.s32 %r1, %r37, %r36, %r38; mov.u32 %r39, %ntid.y; mov.u32 %r40, %ctaid.y; mov.u32 %r41, 
%tid.y; mad.lo.s32 %r2, %r40, %r39, %r41; mov.u32 %r42, %ntid.z; mov.u32 %r43, %ctaid.z; mov.u32 %r44, %tid.z; mad.lo.s32 %r3, %r43, %r42, %r44; setp.ge.s32 %p1, %r1, %r33; setp.ge.s32 %p2, %r2, %r34; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r35; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_52; mul.lo.s32 %r4, %r3, %r34; add.s32 %r45, %r4, %r2; mul.lo.s32 %r5, %r45, %r33; add.s32 %r46, %r5, %r1; mul.wide.s32 %rd13, %r46, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f101, %f2, %f2; fma.rn.f32 %f102, %f1, %f1, %f101; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f103, %f3, %f3, %f102; setp.eq.f32 %p6, %f103, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_51; bra.uni $L__BB0_2; $L__BB0_51: mov.u32 %r69, 0; st.global.u32 [%rd4], %r69; st.global.u32 [%rd5], %r69; st.global.u32 [%rd6], %r69; bra.uni $L__BB0_52; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r70, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r47, %r6, %r33; add.s32 %r48, %r47, %r33; rem.s32 %r70, %r48, %r33; $L__BB0_5: setp.lt.s32 %p9, %r1, 1; mov.f32 %f198, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f199, %f198; mov.f32 %f200, %f198; @%p10 bra $L__BB0_7; add.s32 %r49, %r70, %r5; mul.wide.s32 %rd20, %r49, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f200, [%rd23]; ld.global.nc.f32 %f199, [%rd22]; ld.global.nc.f32 %f198, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r52, %r33, -1; min.s32 %r71, %r10, %r52; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r50, %r10, %r33; add.s32 %r51, %r50, %r33; rem.s32 %r71, %r51, %r33; $L__BB0_10: 
add.s32 %r14, %r71, %r5; setp.ge.s32 %p12, %r10, %r33; mov.f32 %f201, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f202, %f201; mov.f32 %f203, %f201; @%p14 bra $L__BB0_12; mul.wide.s32 %rd24, %r14, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f203, [%rd27]; ld.global.nc.f32 %f202, [%rd26]; ld.global.nc.f32 %f201, [%rd25]; $L__BB0_12: mul.f32 %f110, %f202, %f202; fma.rn.f32 %f111, %f201, %f201, %f110; fma.rn.f32 %f112, %f203, %f203, %f111; setp.eq.f32 %p15, %f112, 0f00000000; mul.f32 %f113, %f199, %f199; fma.rn.f32 %f114, %f198, %f198, %f113; fma.rn.f32 %f22, %f200, %f200, %f114; @%p15 bra $L__BB0_16; bra.uni $L__BB0_13; $L__BB0_16: setp.eq.f32 %p17, %f22, 0f00000000; mov.f32 %f204, 0f00000000; mov.f32 %f205, %f204; mov.f32 %f206, %f204; @%p17 bra $L__BB0_18; sub.f32 %f206, %f3, %f200; sub.f32 %f205, %f2, %f199; sub.f32 %f204, %f1, %f198; bra.uni $L__BB0_18; $L__BB0_13: setp.eq.f32 %p16, %f22, 0f00000000; @%p16 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: sub.f32 %f206, %f203, %f3; sub.f32 %f205, %f202, %f2; sub.f32 %f204, %f201, %f1; bra.uni $L__BB0_18; $L__BB0_14: sub.f32 %f115, %f201, %f198; sub.f32 %f116, %f202, %f199; sub.f32 %f117, %f203, %f200; mul.f32 %f206, %f117, 0f3F000000; mul.f32 %f205, %f116, 0f3F000000; mul.f32 %f204, %f115, 0f3F000000; $L__BB0_18: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p18, %rs2, 0; add.s32 %r15, %r2, -1; @%p18 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: max.s32 %r72, %r15, 0; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r53, %r15, %r34; add.s32 %r54, %r53, %r34; rem.s32 %r72, %r54, %r34; $L__BB0_21: setp.lt.s32 %p20, %r2, 1; mov.f32 %f207, 0f00000000; and.pred %p21, %p20, %p18; mov.f32 %f208, %f207; mov.f32 %f209, %f207; @%p21 bra $L__BB0_23; add.s32 %r55, %r72, %r4; mad.lo.s32 %r56, %r55, %r33, %r1; mul.wide.s32 %rd28, %r56, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f209, [%rd31]; ld.global.nc.f32 %f208, 
[%rd30]; ld.global.nc.f32 %f207, [%rd29]; $L__BB0_23: add.s32 %r19, %r2, 1; @%p18 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: add.s32 %r59, %r34, -1; min.s32 %r73, %r19, %r59; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r57, %r19, %r34; add.s32 %r58, %r57, %r34; rem.s32 %r73, %r58, %r34; $L__BB0_26: add.s32 %r23, %r73, %r4; setp.ge.s32 %p23, %r19, %r34; mov.f32 %f210, 0f00000000; and.pred %p25, %p23, %p18; mov.f32 %f211, %f210; mov.f32 %f212, %f210; @%p25 bra $L__BB0_28; mad.lo.s32 %r60, %r23, %r33, %r1; mul.wide.s32 %rd32, %r60, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f212, [%rd35]; ld.global.nc.f32 %f211, [%rd34]; ld.global.nc.f32 %f210, [%rd33]; $L__BB0_28: mul.f32 %f127, %f211, %f211; fma.rn.f32 %f128, %f210, %f210, %f127; fma.rn.f32 %f129, %f212, %f212, %f128; setp.eq.f32 %p26, %f129, 0f00000000; mul.f32 %f130, %f208, %f208; fma.rn.f32 %f131, %f207, %f207, %f130; fma.rn.f32 %f53, %f209, %f209, %f131; @%p26 bra $L__BB0_32; bra.uni $L__BB0_29; $L__BB0_32: setp.eq.f32 %p28, %f53, 0f00000000; mov.f32 %f213, 0f00000000; mov.f32 %f214, %f213; mov.f32 %f215, %f213; @%p28 bra $L__BB0_34; sub.f32 %f215, %f3, %f209; sub.f32 %f214, %f2, %f208; sub.f32 %f213, %f1, %f207; bra.uni $L__BB0_34; $L__BB0_29: setp.eq.f32 %p27, %f53, 0f00000000; @%p27 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f215, %f212, %f3; sub.f32 %f214, %f211, %f2; sub.f32 %f213, %f210, %f1; bra.uni $L__BB0_34; $L__BB0_30: sub.f32 %f132, %f210, %f207; sub.f32 %f133, %f211, %f208; sub.f32 %f134, %f212, %f209; mul.f32 %f215, %f134, 0f3F000000; mul.f32 %f214, %f133, 0f3F000000; mul.f32 %f213, %f132, 0f3F000000; $L__BB0_34: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r24, %r3, -1; @%p29 bra $L__BB0_36; bra.uni $L__BB0_35; $L__BB0_36: max.s32 %r74, %r24, 0; bra.uni $L__BB0_37; $L__BB0_35: rem.s32 %r61, %r24, %r35; add.s32 %r62, %r61, %r35; rem.s32 %r74, %r62, %r35; $L__BB0_37: setp.lt.s32 %p31, %r3, 1; mov.f32 %f216, 
0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f217, %f216; mov.f32 %f218, %f216; @%p32 bra $L__BB0_39; mad.lo.s32 %r63, %r74, %r34, %r2; mad.lo.s32 %r64, %r63, %r33, %r1; mul.wide.s32 %rd36, %r64, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f218, [%rd39]; ld.global.nc.f32 %f217, [%rd38]; ld.global.nc.f32 %f216, [%rd37]; $L__BB0_39: add.s32 %r28, %r3, 1; @%p29 bra $L__BB0_41; bra.uni $L__BB0_40; $L__BB0_41: add.s32 %r67, %r35, -1; min.s32 %r75, %r28, %r67; bra.uni $L__BB0_42; $L__BB0_40: rem.s32 %r65, %r28, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r75, %r66, %r35; $L__BB0_42: mad.lo.s32 %r68, %r75, %r34, %r2; mad.lo.s32 %r32, %r68, %r33, %r1; setp.ge.s32 %p34, %r28, %r35; mov.f32 %f219, 0f00000000; and.pred %p36, %p34, %p29; mov.f32 %f220, %f219; mov.f32 %f221, %f219; @%p36 bra $L__BB0_44; mul.wide.s32 %rd40, %r32, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f221, [%rd43]; ld.global.nc.f32 %f220, [%rd42]; ld.global.nc.f32 %f219, [%rd41]; $L__BB0_44: mul.f32 %f144, %f220, %f220; fma.rn.f32 %f145, %f219, %f219, %f144; fma.rn.f32 %f146, %f221, %f221, %f145; setp.eq.f32 %p37, %f146, 0f00000000; mul.f32 %f147, %f217, %f217; fma.rn.f32 %f148, %f216, %f216, %f147; fma.rn.f32 %f84, %f218, %f218, %f148; @%p37 bra $L__BB0_48; bra.uni $L__BB0_45; $L__BB0_48: setp.eq.f32 %p39, %f84, 0f00000000; mov.f32 %f222, 0f00000000; mov.f32 %f223, %f222; mov.f32 %f224, %f222; @%p39 bra $L__BB0_50; sub.f32 %f224, %f3, %f218; sub.f32 %f223, %f2, %f217; sub.f32 %f222, %f1, %f216; bra.uni $L__BB0_50; $L__BB0_45: setp.eq.f32 %p38, %f84, 0f00000000; @%p38 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: sub.f32 %f224, %f221, %f3; sub.f32 %f223, %f220, %f2; sub.f32 %f222, %f219, %f1; bra.uni $L__BB0_50; $L__BB0_46: sub.f32 %f149, %f219, %f216; sub.f32 %f150, %f220, %f217; sub.f32 %f151, %f221, %f218; mul.f32 %f224, %f151, 0f3F000000; mul.f32 %f223, %f150, 0f3F000000; mul.f32 
%f222, %f149, 0f3F000000; $L__BB0_50: mul.f32 %f155, %f214, %f224; mul.f32 %f156, %f215, %f223; sub.f32 %f157, %f155, %f156; mul.f32 %f158, %f215, %f222; mul.f32 %f159, %f213, %f224; sub.f32 %f160, %f158, %f159; mul.f32 %f161, %f213, %f223; mul.f32 %f162, %f214, %f222; sub.f32 %f163, %f161, %f162; mul.f32 %f164, %f206, %f223; mul.f32 %f165, %f205, %f224; sub.f32 %f166, %f164, %f165; mul.f32 %f167, %f204, %f224; mul.f32 %f168, %f206, %f222; sub.f32 %f169, %f167, %f168; mul.f32 %f170, %f205, %f222; mul.f32 %f171, %f204, %f223; sub.f32 %f172, %f170, %f171; mul.f32 %f173, %f205, %f215; mul.f32 %f174, %f206, %f214; sub.f32 %f175, %f173, %f174; mul.f32 %f176, %f206, %f213; mul.f32 %f177, %f204, %f215; sub.f32 %f178, %f176, %f177; mul.f32 %f179, %f204, %f214; mul.f32 %f180, %f205, %f213; sub.f32 %f181, %f179, %f180; add.f32 %f182, %f97, %f97; mul.f32 %f183, %f182, %f98; mul.f32 %f184, %f2, %f160; fma.rn.f32 %f185, %f1, %f157, %f184; fma.rn.f32 %f186, %f3, %f163, %f185; mul.f32 %f187, %f183, %f186; st.global.f32 [%rd4], %f187; mul.f32 %f188, %f182, %f99; mul.f32 %f189, %f2, %f169; fma.rn.f32 %f190, %f1, %f166, %f189; fma.rn.f32 %f191, %f3, %f172, %f190; mul.f32 %f192, %f188, %f191; st.global.f32 [%rd5], %f192; mul.f32 %f193, %f182, %f100; mul.f32 %f194, %f2, %f178; fma.rn.f32 %f195, %f1, %f175, %f194; fma.rn.f32 %f196, %f3, %f181, %f195; mul.f32 %f197, %f193, %f196; st.global.f32 [%rd6], %f197; $L__BB0_52: ret; } ` setemergentmagneticfieldtwopoint_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl setemergentmagneticfieldtwopoint .visible .entry setemergentmagneticfieldtwopoint( .param .u64 setemergentmagneticfieldtwopoint_param_0, .param .u64 setemergentmagneticfieldtwopoint_param_1, .param .u64 setemergentmagneticfieldtwopoint_param_2, .param .u64 setemergentmagneticfieldtwopoint_param_3, .param .u64 setemergentmagneticfieldtwopoint_param_4, .param .u64 setemergentmagneticfieldtwopoint_param_5, .param .f32 setemergentmagneticfieldtwopoint_param_6, .param 
.f32 setemergentmagneticfieldtwopoint_param_7, .param .f32 setemergentmagneticfieldtwopoint_param_8, .param .f32 setemergentmagneticfieldtwopoint_param_9, .param .u32 setemergentmagneticfieldtwopoint_param_10, .param .u32 setemergentmagneticfieldtwopoint_param_11, .param .u32 setemergentmagneticfieldtwopoint_param_12, .param .u8 setemergentmagneticfieldtwopoint_param_13 ) { .reg .pred %p<40>; .reg .b16 %rs<5>; .reg .f32 %f<225>; .reg .b32 %r<76>; .reg .b64 %rd<44>; ld.param.u8 %rs4, [setemergentmagneticfieldtwopoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldtwopoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldtwopoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldtwopoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldtwopoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldtwopoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldtwopoint_param_5]; ld.param.f32 %f97, [setemergentmagneticfieldtwopoint_param_6]; ld.param.f32 %f98, [setemergentmagneticfieldtwopoint_param_7]; ld.param.f32 %f99, [setemergentmagneticfieldtwopoint_param_8]; ld.param.f32 %f100, [setemergentmagneticfieldtwopoint_param_9]; ld.param.u32 %r33, [setemergentmagneticfieldtwopoint_param_10]; ld.param.u32 %r34, [setemergentmagneticfieldtwopoint_param_11]; ld.param.u32 %r35, [setemergentmagneticfieldtwopoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r36, %ntid.x; mov.u32 %r37, %ctaid.x; mov.u32 %r38, %tid.x; mad.lo.s32 %r1, %r37, %r36, %r38; mov.u32 %r39, %ntid.y; mov.u32 %r40, %ctaid.y; mov.u32 %r41, %tid.y; mad.lo.s32 %r2, %r40, %r39, %r41; mov.u32 %r42, %ntid.z; mov.u32 %r43, %ctaid.z; mov.u32 %r44, %tid.z; mad.lo.s32 %r3, %r43, %r42, %r44; setp.ge.s32 %p1, %r1, %r33; setp.ge.s32 %p2, %r2, %r34; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r35; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_52; mul.lo.s32 %r4, %r3, %r34; add.s32 %r45, %r4, %r2; mul.lo.s32 %r5, %r45, %r33; 
add.s32 %r46, %r5, %r1; mul.wide.s32 %rd13, %r46, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f101, %f2, %f2; fma.rn.f32 %f102, %f1, %f1, %f101; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f103, %f3, %f3, %f102; setp.eq.f32 %p6, %f103, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_51; bra.uni $L__BB0_2; $L__BB0_51: mov.u32 %r69, 0; st.global.u32 [%rd4], %r69; st.global.u32 [%rd5], %r69; st.global.u32 [%rd6], %r69; bra.uni $L__BB0_52; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r70, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r47, %r6, %r33; add.s32 %r48, %r47, %r33; rem.s32 %r70, %r48, %r33; $L__BB0_5: setp.lt.s32 %p9, %r1, 1; mov.f32 %f198, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f199, %f198; mov.f32 %f200, %f198; @%p10 bra $L__BB0_7; add.s32 %r49, %r70, %r5; mul.wide.s32 %rd20, %r49, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f200, [%rd23]; ld.global.nc.f32 %f199, [%rd22]; ld.global.nc.f32 %f198, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r52, %r33, -1; min.s32 %r71, %r10, %r52; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r50, %r10, %r33; add.s32 %r51, %r50, %r33; rem.s32 %r71, %r51, %r33; $L__BB0_10: add.s32 %r14, %r71, %r5; setp.ge.s32 %p12, %r10, %r33; mov.f32 %f201, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f202, %f201; mov.f32 %f203, %f201; @%p14 bra $L__BB0_12; mul.wide.s32 %rd24, %r14, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f203, [%rd27]; ld.global.nc.f32 %f202, [%rd26]; ld.global.nc.f32 %f201, 
[%rd25]; $L__BB0_12: mul.f32 %f110, %f202, %f202; fma.rn.f32 %f111, %f201, %f201, %f110; fma.rn.f32 %f112, %f203, %f203, %f111; setp.eq.f32 %p15, %f112, 0f00000000; mul.f32 %f113, %f199, %f199; fma.rn.f32 %f114, %f198, %f198, %f113; fma.rn.f32 %f22, %f200, %f200, %f114; @%p15 bra $L__BB0_16; bra.uni $L__BB0_13; $L__BB0_16: setp.eq.f32 %p17, %f22, 0f00000000; mov.f32 %f204, 0f00000000; mov.f32 %f205, %f204; mov.f32 %f206, %f204; @%p17 bra $L__BB0_18; sub.f32 %f206, %f3, %f200; sub.f32 %f205, %f2, %f199; sub.f32 %f204, %f1, %f198; bra.uni $L__BB0_18; $L__BB0_13: setp.eq.f32 %p16, %f22, 0f00000000; @%p16 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: sub.f32 %f206, %f203, %f3; sub.f32 %f205, %f202, %f2; sub.f32 %f204, %f201, %f1; bra.uni $L__BB0_18; $L__BB0_14: sub.f32 %f115, %f201, %f198; sub.f32 %f116, %f202, %f199; sub.f32 %f117, %f203, %f200; mul.f32 %f206, %f117, 0f3F000000; mul.f32 %f205, %f116, 0f3F000000; mul.f32 %f204, %f115, 0f3F000000; $L__BB0_18: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p18, %rs2, 0; add.s32 %r15, %r2, -1; @%p18 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: max.s32 %r72, %r15, 0; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r53, %r15, %r34; add.s32 %r54, %r53, %r34; rem.s32 %r72, %r54, %r34; $L__BB0_21: setp.lt.s32 %p20, %r2, 1; mov.f32 %f207, 0f00000000; and.pred %p21, %p20, %p18; mov.f32 %f208, %f207; mov.f32 %f209, %f207; @%p21 bra $L__BB0_23; add.s32 %r55, %r72, %r4; mad.lo.s32 %r56, %r55, %r33, %r1; mul.wide.s32 %rd28, %r56, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f209, [%rd31]; ld.global.nc.f32 %f208, [%rd30]; ld.global.nc.f32 %f207, [%rd29]; $L__BB0_23: add.s32 %r19, %r2, 1; @%p18 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: add.s32 %r59, %r34, -1; min.s32 %r73, %r19, %r59; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r57, %r19, %r34; add.s32 %r58, %r57, %r34; rem.s32 %r73, %r58, %r34; $L__BB0_26: add.s32 %r23, %r73, %r4; setp.ge.s32 %p23, %r19, %r34; mov.f32 %f210, 
0f00000000; and.pred %p25, %p23, %p18; mov.f32 %f211, %f210; mov.f32 %f212, %f210; @%p25 bra $L__BB0_28; mad.lo.s32 %r60, %r23, %r33, %r1; mul.wide.s32 %rd32, %r60, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f212, [%rd35]; ld.global.nc.f32 %f211, [%rd34]; ld.global.nc.f32 %f210, [%rd33]; $L__BB0_28: mul.f32 %f127, %f211, %f211; fma.rn.f32 %f128, %f210, %f210, %f127; fma.rn.f32 %f129, %f212, %f212, %f128; setp.eq.f32 %p26, %f129, 0f00000000; mul.f32 %f130, %f208, %f208; fma.rn.f32 %f131, %f207, %f207, %f130; fma.rn.f32 %f53, %f209, %f209, %f131; @%p26 bra $L__BB0_32; bra.uni $L__BB0_29; $L__BB0_32: setp.eq.f32 %p28, %f53, 0f00000000; mov.f32 %f213, 0f00000000; mov.f32 %f214, %f213; mov.f32 %f215, %f213; @%p28 bra $L__BB0_34; sub.f32 %f215, %f3, %f209; sub.f32 %f214, %f2, %f208; sub.f32 %f213, %f1, %f207; bra.uni $L__BB0_34; $L__BB0_29: setp.eq.f32 %p27, %f53, 0f00000000; @%p27 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f215, %f212, %f3; sub.f32 %f214, %f211, %f2; sub.f32 %f213, %f210, %f1; bra.uni $L__BB0_34; $L__BB0_30: sub.f32 %f132, %f210, %f207; sub.f32 %f133, %f211, %f208; sub.f32 %f134, %f212, %f209; mul.f32 %f215, %f134, 0f3F000000; mul.f32 %f214, %f133, 0f3F000000; mul.f32 %f213, %f132, 0f3F000000; $L__BB0_34: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r24, %r3, -1; @%p29 bra $L__BB0_36; bra.uni $L__BB0_35; $L__BB0_36: max.s32 %r74, %r24, 0; bra.uni $L__BB0_37; $L__BB0_35: rem.s32 %r61, %r24, %r35; add.s32 %r62, %r61, %r35; rem.s32 %r74, %r62, %r35; $L__BB0_37: setp.lt.s32 %p31, %r3, 1; mov.f32 %f216, 0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f217, %f216; mov.f32 %f218, %f216; @%p32 bra $L__BB0_39; mad.lo.s32 %r63, %r74, %r34, %r2; mad.lo.s32 %r64, %r63, %r33, %r1; mul.wide.s32 %rd36, %r64, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f218, [%rd39]; ld.global.nc.f32 %f217, [%rd38]; ld.global.nc.f32 
%f216, [%rd37]; $L__BB0_39: add.s32 %r28, %r3, 1; @%p29 bra $L__BB0_41; bra.uni $L__BB0_40; $L__BB0_41: add.s32 %r67, %r35, -1; min.s32 %r75, %r28, %r67; bra.uni $L__BB0_42; $L__BB0_40: rem.s32 %r65, %r28, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r75, %r66, %r35; $L__BB0_42: mad.lo.s32 %r68, %r75, %r34, %r2; mad.lo.s32 %r32, %r68, %r33, %r1; setp.ge.s32 %p34, %r28, %r35; mov.f32 %f219, 0f00000000; and.pred %p36, %p34, %p29; mov.f32 %f220, %f219; mov.f32 %f221, %f219; @%p36 bra $L__BB0_44; mul.wide.s32 %rd40, %r32, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f221, [%rd43]; ld.global.nc.f32 %f220, [%rd42]; ld.global.nc.f32 %f219, [%rd41]; $L__BB0_44: mul.f32 %f144, %f220, %f220; fma.rn.f32 %f145, %f219, %f219, %f144; fma.rn.f32 %f146, %f221, %f221, %f145; setp.eq.f32 %p37, %f146, 0f00000000; mul.f32 %f147, %f217, %f217; fma.rn.f32 %f148, %f216, %f216, %f147; fma.rn.f32 %f84, %f218, %f218, %f148; @%p37 bra $L__BB0_48; bra.uni $L__BB0_45; $L__BB0_48: setp.eq.f32 %p39, %f84, 0f00000000; mov.f32 %f222, 0f00000000; mov.f32 %f223, %f222; mov.f32 %f224, %f222; @%p39 bra $L__BB0_50; sub.f32 %f224, %f3, %f218; sub.f32 %f223, %f2, %f217; sub.f32 %f222, %f1, %f216; bra.uni $L__BB0_50; $L__BB0_45: setp.eq.f32 %p38, %f84, 0f00000000; @%p38 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: sub.f32 %f224, %f221, %f3; sub.f32 %f223, %f220, %f2; sub.f32 %f222, %f219, %f1; bra.uni $L__BB0_50; $L__BB0_46: sub.f32 %f149, %f219, %f216; sub.f32 %f150, %f220, %f217; sub.f32 %f151, %f221, %f218; mul.f32 %f224, %f151, 0f3F000000; mul.f32 %f223, %f150, 0f3F000000; mul.f32 %f222, %f149, 0f3F000000; $L__BB0_50: mul.f32 %f155, %f214, %f224; mul.f32 %f156, %f215, %f223; sub.f32 %f157, %f155, %f156; mul.f32 %f158, %f215, %f222; mul.f32 %f159, %f213, %f224; sub.f32 %f160, %f158, %f159; mul.f32 %f161, %f213, %f223; mul.f32 %f162, %f214, %f222; sub.f32 %f163, %f161, %f162; mul.f32 %f164, %f206, %f223; mul.f32 %f165, %f205, %f224; sub.f32 
%f166, %f164, %f165; mul.f32 %f167, %f204, %f224; mul.f32 %f168, %f206, %f222; sub.f32 %f169, %f167, %f168; mul.f32 %f170, %f205, %f222; mul.f32 %f171, %f204, %f223; sub.f32 %f172, %f170, %f171; mul.f32 %f173, %f205, %f215; mul.f32 %f174, %f206, %f214; sub.f32 %f175, %f173, %f174; mul.f32 %f176, %f206, %f213; mul.f32 %f177, %f204, %f215; sub.f32 %f178, %f176, %f177; mul.f32 %f179, %f204, %f214; mul.f32 %f180, %f205, %f213; sub.f32 %f181, %f179, %f180; add.f32 %f182, %f97, %f97; mul.f32 %f183, %f182, %f98; mul.f32 %f184, %f2, %f160; fma.rn.f32 %f185, %f1, %f157, %f184; fma.rn.f32 %f186, %f3, %f163, %f185; mul.f32 %f187, %f183, %f186; st.global.f32 [%rd4], %f187; mul.f32 %f188, %f182, %f99; mul.f32 %f189, %f2, %f169; fma.rn.f32 %f190, %f1, %f166, %f189; fma.rn.f32 %f191, %f3, %f172, %f190; mul.f32 %f192, %f188, %f191; st.global.f32 [%rd5], %f192; mul.f32 %f193, %f182, %f100; mul.f32 %f194, %f2, %f178; fma.rn.f32 %f195, %f1, %f175, %f194; fma.rn.f32 %f196, %f3, %f181, %f195; mul.f32 %f197, %f193, %f196; st.global.f32 [%rd6], %f197; $L__BB0_52: ret; } ` setemergentmagneticfieldtwopoint_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl setemergentmagneticfieldtwopoint .visible .entry setemergentmagneticfieldtwopoint( .param .u64 setemergentmagneticfieldtwopoint_param_0, .param .u64 setemergentmagneticfieldtwopoint_param_1, .param .u64 setemergentmagneticfieldtwopoint_param_2, .param .u64 setemergentmagneticfieldtwopoint_param_3, .param .u64 setemergentmagneticfieldtwopoint_param_4, .param .u64 setemergentmagneticfieldtwopoint_param_5, .param .f32 setemergentmagneticfieldtwopoint_param_6, .param .f32 setemergentmagneticfieldtwopoint_param_7, .param .f32 setemergentmagneticfieldtwopoint_param_8, .param .f32 setemergentmagneticfieldtwopoint_param_9, .param .u32 setemergentmagneticfieldtwopoint_param_10, .param .u32 setemergentmagneticfieldtwopoint_param_11, .param .u32 setemergentmagneticfieldtwopoint_param_12, .param .u8 
setemergentmagneticfieldtwopoint_param_13 ) { .reg .pred %p<40>; .reg .b16 %rs<5>; .reg .f32 %f<225>; .reg .b32 %r<76>; .reg .b64 %rd<44>; ld.param.u8 %rs4, [setemergentmagneticfieldtwopoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldtwopoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldtwopoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldtwopoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldtwopoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldtwopoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldtwopoint_param_5]; ld.param.f32 %f97, [setemergentmagneticfieldtwopoint_param_6]; ld.param.f32 %f98, [setemergentmagneticfieldtwopoint_param_7]; ld.param.f32 %f99, [setemergentmagneticfieldtwopoint_param_8]; ld.param.f32 %f100, [setemergentmagneticfieldtwopoint_param_9]; ld.param.u32 %r33, [setemergentmagneticfieldtwopoint_param_10]; ld.param.u32 %r34, [setemergentmagneticfieldtwopoint_param_11]; ld.param.u32 %r35, [setemergentmagneticfieldtwopoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r36, %ntid.x; mov.u32 %r37, %ctaid.x; mov.u32 %r38, %tid.x; mad.lo.s32 %r1, %r37, %r36, %r38; mov.u32 %r39, %ntid.y; mov.u32 %r40, %ctaid.y; mov.u32 %r41, %tid.y; mad.lo.s32 %r2, %r40, %r39, %r41; mov.u32 %r42, %ntid.z; mov.u32 %r43, %ctaid.z; mov.u32 %r44, %tid.z; mad.lo.s32 %r3, %r43, %r42, %r44; setp.ge.s32 %p1, %r1, %r33; setp.ge.s32 %p2, %r2, %r34; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r35; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_52; mul.lo.s32 %r4, %r3, %r34; add.s32 %r45, %r4, %r2; mul.lo.s32 %r5, %r45, %r33; add.s32 %r46, %r5, %r1; mul.wide.s32 %rd13, %r46, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f101, %f2, %f2; fma.rn.f32 %f102, %f1, %f1, %f101; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f103, %f3, %f3, %f102; setp.eq.f32 
%p6, %f103, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_51; bra.uni $L__BB0_2; $L__BB0_51: mov.u32 %r69, 0; st.global.u32 [%rd4], %r69; st.global.u32 [%rd5], %r69; st.global.u32 [%rd6], %r69; bra.uni $L__BB0_52; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r70, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r47, %r6, %r33; add.s32 %r48, %r47, %r33; rem.s32 %r70, %r48, %r33; $L__BB0_5: setp.lt.s32 %p9, %r1, 1; mov.f32 %f198, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f199, %f198; mov.f32 %f200, %f198; @%p10 bra $L__BB0_7; add.s32 %r49, %r70, %r5; mul.wide.s32 %rd20, %r49, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f200, [%rd23]; ld.global.nc.f32 %f199, [%rd22]; ld.global.nc.f32 %f198, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r52, %r33, -1; min.s32 %r71, %r10, %r52; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r50, %r10, %r33; add.s32 %r51, %r50, %r33; rem.s32 %r71, %r51, %r33; $L__BB0_10: add.s32 %r14, %r71, %r5; setp.ge.s32 %p12, %r10, %r33; mov.f32 %f201, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f202, %f201; mov.f32 %f203, %f201; @%p14 bra $L__BB0_12; mul.wide.s32 %rd24, %r14, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f203, [%rd27]; ld.global.nc.f32 %f202, [%rd26]; ld.global.nc.f32 %f201, [%rd25]; $L__BB0_12: mul.f32 %f110, %f202, %f202; fma.rn.f32 %f111, %f201, %f201, %f110; fma.rn.f32 %f112, %f203, %f203, %f111; setp.eq.f32 %p15, %f112, 0f00000000; mul.f32 %f113, %f199, %f199; fma.rn.f32 %f114, %f198, %f198, %f113; fma.rn.f32 %f22, %f200, %f200, %f114; @%p15 bra $L__BB0_16; bra.uni $L__BB0_13; $L__BB0_16: setp.eq.f32 
%p17, %f22, 0f00000000; mov.f32 %f204, 0f00000000; mov.f32 %f205, %f204; mov.f32 %f206, %f204; @%p17 bra $L__BB0_18; sub.f32 %f206, %f3, %f200; sub.f32 %f205, %f2, %f199; sub.f32 %f204, %f1, %f198; bra.uni $L__BB0_18; $L__BB0_13: setp.eq.f32 %p16, %f22, 0f00000000; @%p16 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: sub.f32 %f206, %f203, %f3; sub.f32 %f205, %f202, %f2; sub.f32 %f204, %f201, %f1; bra.uni $L__BB0_18; $L__BB0_14: sub.f32 %f115, %f201, %f198; sub.f32 %f116, %f202, %f199; sub.f32 %f117, %f203, %f200; mul.f32 %f206, %f117, 0f3F000000; mul.f32 %f205, %f116, 0f3F000000; mul.f32 %f204, %f115, 0f3F000000; $L__BB0_18: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p18, %rs2, 0; add.s32 %r15, %r2, -1; @%p18 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: max.s32 %r72, %r15, 0; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r53, %r15, %r34; add.s32 %r54, %r53, %r34; rem.s32 %r72, %r54, %r34; $L__BB0_21: setp.lt.s32 %p20, %r2, 1; mov.f32 %f207, 0f00000000; and.pred %p21, %p20, %p18; mov.f32 %f208, %f207; mov.f32 %f209, %f207; @%p21 bra $L__BB0_23; add.s32 %r55, %r72, %r4; mad.lo.s32 %r56, %r55, %r33, %r1; mul.wide.s32 %rd28, %r56, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f209, [%rd31]; ld.global.nc.f32 %f208, [%rd30]; ld.global.nc.f32 %f207, [%rd29]; $L__BB0_23: add.s32 %r19, %r2, 1; @%p18 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: add.s32 %r59, %r34, -1; min.s32 %r73, %r19, %r59; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r57, %r19, %r34; add.s32 %r58, %r57, %r34; rem.s32 %r73, %r58, %r34; $L__BB0_26: add.s32 %r23, %r73, %r4; setp.ge.s32 %p23, %r19, %r34; mov.f32 %f210, 0f00000000; and.pred %p25, %p23, %p18; mov.f32 %f211, %f210; mov.f32 %f212, %f210; @%p25 bra $L__BB0_28; mad.lo.s32 %r60, %r23, %r33, %r1; mul.wide.s32 %rd32, %r60, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f212, [%rd35]; ld.global.nc.f32 %f211, [%rd34]; ld.global.nc.f32 %f210, 
[%rd33]; $L__BB0_28: mul.f32 %f127, %f211, %f211; fma.rn.f32 %f128, %f210, %f210, %f127; fma.rn.f32 %f129, %f212, %f212, %f128; setp.eq.f32 %p26, %f129, 0f00000000; mul.f32 %f130, %f208, %f208; fma.rn.f32 %f131, %f207, %f207, %f130; fma.rn.f32 %f53, %f209, %f209, %f131; @%p26 bra $L__BB0_32; bra.uni $L__BB0_29; $L__BB0_32: setp.eq.f32 %p28, %f53, 0f00000000; mov.f32 %f213, 0f00000000; mov.f32 %f214, %f213; mov.f32 %f215, %f213; @%p28 bra $L__BB0_34; sub.f32 %f215, %f3, %f209; sub.f32 %f214, %f2, %f208; sub.f32 %f213, %f1, %f207; bra.uni $L__BB0_34; $L__BB0_29: setp.eq.f32 %p27, %f53, 0f00000000; @%p27 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f215, %f212, %f3; sub.f32 %f214, %f211, %f2; sub.f32 %f213, %f210, %f1; bra.uni $L__BB0_34; $L__BB0_30: sub.f32 %f132, %f210, %f207; sub.f32 %f133, %f211, %f208; sub.f32 %f134, %f212, %f209; mul.f32 %f215, %f134, 0f3F000000; mul.f32 %f214, %f133, 0f3F000000; mul.f32 %f213, %f132, 0f3F000000; $L__BB0_34: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r24, %r3, -1; @%p29 bra $L__BB0_36; bra.uni $L__BB0_35; $L__BB0_36: max.s32 %r74, %r24, 0; bra.uni $L__BB0_37; $L__BB0_35: rem.s32 %r61, %r24, %r35; add.s32 %r62, %r61, %r35; rem.s32 %r74, %r62, %r35; $L__BB0_37: setp.lt.s32 %p31, %r3, 1; mov.f32 %f216, 0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f217, %f216; mov.f32 %f218, %f216; @%p32 bra $L__BB0_39; mad.lo.s32 %r63, %r74, %r34, %r2; mad.lo.s32 %r64, %r63, %r33, %r1; mul.wide.s32 %rd36, %r64, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f218, [%rd39]; ld.global.nc.f32 %f217, [%rd38]; ld.global.nc.f32 %f216, [%rd37]; $L__BB0_39: add.s32 %r28, %r3, 1; @%p29 bra $L__BB0_41; bra.uni $L__BB0_40; $L__BB0_41: add.s32 %r67, %r35, -1; min.s32 %r75, %r28, %r67; bra.uni $L__BB0_42; $L__BB0_40: rem.s32 %r65, %r28, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r75, %r66, %r35; $L__BB0_42: mad.lo.s32 %r68, %r75, %r34, %r2; mad.lo.s32 %r32, %r68, %r33, %r1; 
setp.ge.s32 %p34, %r28, %r35; mov.f32 %f219, 0f00000000; and.pred %p36, %p34, %p29; mov.f32 %f220, %f219; mov.f32 %f221, %f219; @%p36 bra $L__BB0_44; mul.wide.s32 %rd40, %r32, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f221, [%rd43]; ld.global.nc.f32 %f220, [%rd42]; ld.global.nc.f32 %f219, [%rd41]; $L__BB0_44: mul.f32 %f144, %f220, %f220; fma.rn.f32 %f145, %f219, %f219, %f144; fma.rn.f32 %f146, %f221, %f221, %f145; setp.eq.f32 %p37, %f146, 0f00000000; mul.f32 %f147, %f217, %f217; fma.rn.f32 %f148, %f216, %f216, %f147; fma.rn.f32 %f84, %f218, %f218, %f148; @%p37 bra $L__BB0_48; bra.uni $L__BB0_45; $L__BB0_48: setp.eq.f32 %p39, %f84, 0f00000000; mov.f32 %f222, 0f00000000; mov.f32 %f223, %f222; mov.f32 %f224, %f222; @%p39 bra $L__BB0_50; sub.f32 %f224, %f3, %f218; sub.f32 %f223, %f2, %f217; sub.f32 %f222, %f1, %f216; bra.uni $L__BB0_50; $L__BB0_45: setp.eq.f32 %p38, %f84, 0f00000000; @%p38 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: sub.f32 %f224, %f221, %f3; sub.f32 %f223, %f220, %f2; sub.f32 %f222, %f219, %f1; bra.uni $L__BB0_50; $L__BB0_46: sub.f32 %f149, %f219, %f216; sub.f32 %f150, %f220, %f217; sub.f32 %f151, %f221, %f218; mul.f32 %f224, %f151, 0f3F000000; mul.f32 %f223, %f150, 0f3F000000; mul.f32 %f222, %f149, 0f3F000000; $L__BB0_50: mul.f32 %f155, %f214, %f224; mul.f32 %f156, %f215, %f223; sub.f32 %f157, %f155, %f156; mul.f32 %f158, %f215, %f222; mul.f32 %f159, %f213, %f224; sub.f32 %f160, %f158, %f159; mul.f32 %f161, %f213, %f223; mul.f32 %f162, %f214, %f222; sub.f32 %f163, %f161, %f162; mul.f32 %f164, %f206, %f223; mul.f32 %f165, %f205, %f224; sub.f32 %f166, %f164, %f165; mul.f32 %f167, %f204, %f224; mul.f32 %f168, %f206, %f222; sub.f32 %f169, %f167, %f168; mul.f32 %f170, %f205, %f222; mul.f32 %f171, %f204, %f223; sub.f32 %f172, %f170, %f171; mul.f32 %f173, %f205, %f215; mul.f32 %f174, %f206, %f214; sub.f32 %f175, %f173, %f174; mul.f32 %f176, %f206, %f213; mul.f32 %f177, %f204, %f215; sub.f32 
%f178, %f176, %f177; mul.f32 %f179, %f204, %f214; mul.f32 %f180, %f205, %f213; sub.f32 %f181, %f179, %f180; add.f32 %f182, %f97, %f97; mul.f32 %f183, %f182, %f98; mul.f32 %f184, %f2, %f160; fma.rn.f32 %f185, %f1, %f157, %f184; fma.rn.f32 %f186, %f3, %f163, %f185; mul.f32 %f187, %f183, %f186; st.global.f32 [%rd4], %f187; mul.f32 %f188, %f182, %f99; mul.f32 %f189, %f2, %f169; fma.rn.f32 %f190, %f1, %f166, %f189; fma.rn.f32 %f191, %f3, %f172, %f190; mul.f32 %f192, %f188, %f191; st.global.f32 [%rd5], %f192; mul.f32 %f193, %f182, %f100; mul.f32 %f194, %f2, %f178; fma.rn.f32 %f195, %f1, %f175, %f194; fma.rn.f32 %f196, %f3, %f181, %f195; mul.f32 %f197, %f193, %f196; st.global.f32 [%rd6], %f197; $L__BB0_52: ret; } ` setemergentmagneticfieldtwopoint_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl setemergentmagneticfieldtwopoint .visible .entry setemergentmagneticfieldtwopoint( .param .u64 setemergentmagneticfieldtwopoint_param_0, .param .u64 setemergentmagneticfieldtwopoint_param_1, .param .u64 setemergentmagneticfieldtwopoint_param_2, .param .u64 setemergentmagneticfieldtwopoint_param_3, .param .u64 setemergentmagneticfieldtwopoint_param_4, .param .u64 setemergentmagneticfieldtwopoint_param_5, .param .f32 setemergentmagneticfieldtwopoint_param_6, .param .f32 setemergentmagneticfieldtwopoint_param_7, .param .f32 setemergentmagneticfieldtwopoint_param_8, .param .f32 setemergentmagneticfieldtwopoint_param_9, .param .u32 setemergentmagneticfieldtwopoint_param_10, .param .u32 setemergentmagneticfieldtwopoint_param_11, .param .u32 setemergentmagneticfieldtwopoint_param_12, .param .u8 setemergentmagneticfieldtwopoint_param_13 ) { .reg .pred %p<40>; .reg .b16 %rs<5>; .reg .f32 %f<225>; .reg .b32 %r<76>; .reg .b64 %rd<44>; ld.param.u8 %rs4, [setemergentmagneticfieldtwopoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldtwopoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldtwopoint_param_1]; ld.param.u64 %rd9, 
[setemergentmagneticfieldtwopoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldtwopoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldtwopoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldtwopoint_param_5]; ld.param.f32 %f97, [setemergentmagneticfieldtwopoint_param_6]; ld.param.f32 %f98, [setemergentmagneticfieldtwopoint_param_7]; ld.param.f32 %f99, [setemergentmagneticfieldtwopoint_param_8]; ld.param.f32 %f100, [setemergentmagneticfieldtwopoint_param_9]; ld.param.u32 %r33, [setemergentmagneticfieldtwopoint_param_10]; ld.param.u32 %r34, [setemergentmagneticfieldtwopoint_param_11]; ld.param.u32 %r35, [setemergentmagneticfieldtwopoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r36, %ntid.x; mov.u32 %r37, %ctaid.x; mov.u32 %r38, %tid.x; mad.lo.s32 %r1, %r37, %r36, %r38; mov.u32 %r39, %ntid.y; mov.u32 %r40, %ctaid.y; mov.u32 %r41, %tid.y; mad.lo.s32 %r2, %r40, %r39, %r41; mov.u32 %r42, %ntid.z; mov.u32 %r43, %ctaid.z; mov.u32 %r44, %tid.z; mad.lo.s32 %r3, %r43, %r42, %r44; setp.ge.s32 %p1, %r1, %r33; setp.ge.s32 %p2, %r2, %r34; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r35; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_52; mul.lo.s32 %r4, %r3, %r34; add.s32 %r45, %r4, %r2; mul.lo.s32 %r5, %r45, %r33; add.s32 %r46, %r5, %r1; mul.wide.s32 %rd13, %r46, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f101, %f2, %f2; fma.rn.f32 %f102, %f1, %f1, %f101; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f103, %f3, %f3, %f102; setp.eq.f32 %p6, %f103, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_51; bra.uni $L__BB0_2; $L__BB0_51: mov.u32 %r69, 0; st.global.u32 [%rd4], %r69; st.global.u32 [%rd5], %r69; st.global.u32 
[%rd6], %r69; bra.uni $L__BB0_52; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r70, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r47, %r6, %r33; add.s32 %r48, %r47, %r33; rem.s32 %r70, %r48, %r33; $L__BB0_5: setp.lt.s32 %p9, %r1, 1; mov.f32 %f198, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f199, %f198; mov.f32 %f200, %f198; @%p10 bra $L__BB0_7; add.s32 %r49, %r70, %r5; mul.wide.s32 %rd20, %r49, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f200, [%rd23]; ld.global.nc.f32 %f199, [%rd22]; ld.global.nc.f32 %f198, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r52, %r33, -1; min.s32 %r71, %r10, %r52; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r50, %r10, %r33; add.s32 %r51, %r50, %r33; rem.s32 %r71, %r51, %r33; $L__BB0_10: add.s32 %r14, %r71, %r5; setp.ge.s32 %p12, %r10, %r33; mov.f32 %f201, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f202, %f201; mov.f32 %f203, %f201; @%p14 bra $L__BB0_12; mul.wide.s32 %rd24, %r14, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f203, [%rd27]; ld.global.nc.f32 %f202, [%rd26]; ld.global.nc.f32 %f201, [%rd25]; $L__BB0_12: mul.f32 %f110, %f202, %f202; fma.rn.f32 %f111, %f201, %f201, %f110; fma.rn.f32 %f112, %f203, %f203, %f111; setp.eq.f32 %p15, %f112, 0f00000000; mul.f32 %f113, %f199, %f199; fma.rn.f32 %f114, %f198, %f198, %f113; fma.rn.f32 %f22, %f200, %f200, %f114; @%p15 bra $L__BB0_16; bra.uni $L__BB0_13; $L__BB0_16: setp.eq.f32 %p17, %f22, 0f00000000; mov.f32 %f204, 0f00000000; mov.f32 %f205, %f204; mov.f32 %f206, %f204; @%p17 bra $L__BB0_18; sub.f32 %f206, %f3, %f200; sub.f32 %f205, %f2, %f199; sub.f32 %f204, %f1, %f198; bra.uni $L__BB0_18; $L__BB0_13: setp.eq.f32 %p16, %f22, 0f00000000; @%p16 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: sub.f32 %f206, %f203, %f3; 
sub.f32 %f205, %f202, %f2; sub.f32 %f204, %f201, %f1; bra.uni $L__BB0_18; $L__BB0_14: sub.f32 %f115, %f201, %f198; sub.f32 %f116, %f202, %f199; sub.f32 %f117, %f203, %f200; mul.f32 %f206, %f117, 0f3F000000; mul.f32 %f205, %f116, 0f3F000000; mul.f32 %f204, %f115, 0f3F000000; $L__BB0_18: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p18, %rs2, 0; add.s32 %r15, %r2, -1; @%p18 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: max.s32 %r72, %r15, 0; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r53, %r15, %r34; add.s32 %r54, %r53, %r34; rem.s32 %r72, %r54, %r34; $L__BB0_21: setp.lt.s32 %p20, %r2, 1; mov.f32 %f207, 0f00000000; and.pred %p21, %p20, %p18; mov.f32 %f208, %f207; mov.f32 %f209, %f207; @%p21 bra $L__BB0_23; add.s32 %r55, %r72, %r4; mad.lo.s32 %r56, %r55, %r33, %r1; mul.wide.s32 %rd28, %r56, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f209, [%rd31]; ld.global.nc.f32 %f208, [%rd30]; ld.global.nc.f32 %f207, [%rd29]; $L__BB0_23: add.s32 %r19, %r2, 1; @%p18 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: add.s32 %r59, %r34, -1; min.s32 %r73, %r19, %r59; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r57, %r19, %r34; add.s32 %r58, %r57, %r34; rem.s32 %r73, %r58, %r34; $L__BB0_26: add.s32 %r23, %r73, %r4; setp.ge.s32 %p23, %r19, %r34; mov.f32 %f210, 0f00000000; and.pred %p25, %p23, %p18; mov.f32 %f211, %f210; mov.f32 %f212, %f210; @%p25 bra $L__BB0_28; mad.lo.s32 %r60, %r23, %r33, %r1; mul.wide.s32 %rd32, %r60, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f212, [%rd35]; ld.global.nc.f32 %f211, [%rd34]; ld.global.nc.f32 %f210, [%rd33]; $L__BB0_28: mul.f32 %f127, %f211, %f211; fma.rn.f32 %f128, %f210, %f210, %f127; fma.rn.f32 %f129, %f212, %f212, %f128; setp.eq.f32 %p26, %f129, 0f00000000; mul.f32 %f130, %f208, %f208; fma.rn.f32 %f131, %f207, %f207, %f130; fma.rn.f32 %f53, %f209, %f209, %f131; @%p26 bra $L__BB0_32; bra.uni $L__BB0_29; $L__BB0_32: setp.eq.f32 %p28, %f53, 
0f00000000; mov.f32 %f213, 0f00000000; mov.f32 %f214, %f213; mov.f32 %f215, %f213; @%p28 bra $L__BB0_34; sub.f32 %f215, %f3, %f209; sub.f32 %f214, %f2, %f208; sub.f32 %f213, %f1, %f207; bra.uni $L__BB0_34; $L__BB0_29: setp.eq.f32 %p27, %f53, 0f00000000; @%p27 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f215, %f212, %f3; sub.f32 %f214, %f211, %f2; sub.f32 %f213, %f210, %f1; bra.uni $L__BB0_34; $L__BB0_30: sub.f32 %f132, %f210, %f207; sub.f32 %f133, %f211, %f208; sub.f32 %f134, %f212, %f209; mul.f32 %f215, %f134, 0f3F000000; mul.f32 %f214, %f133, 0f3F000000; mul.f32 %f213, %f132, 0f3F000000; $L__BB0_34: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r24, %r3, -1; @%p29 bra $L__BB0_36; bra.uni $L__BB0_35; $L__BB0_36: max.s32 %r74, %r24, 0; bra.uni $L__BB0_37; $L__BB0_35: rem.s32 %r61, %r24, %r35; add.s32 %r62, %r61, %r35; rem.s32 %r74, %r62, %r35; $L__BB0_37: setp.lt.s32 %p31, %r3, 1; mov.f32 %f216, 0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f217, %f216; mov.f32 %f218, %f216; @%p32 bra $L__BB0_39; mad.lo.s32 %r63, %r74, %r34, %r2; mad.lo.s32 %r64, %r63, %r33, %r1; mul.wide.s32 %rd36, %r64, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f218, [%rd39]; ld.global.nc.f32 %f217, [%rd38]; ld.global.nc.f32 %f216, [%rd37]; $L__BB0_39: add.s32 %r28, %r3, 1; @%p29 bra $L__BB0_41; bra.uni $L__BB0_40; $L__BB0_41: add.s32 %r67, %r35, -1; min.s32 %r75, %r28, %r67; bra.uni $L__BB0_42; $L__BB0_40: rem.s32 %r65, %r28, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r75, %r66, %r35; $L__BB0_42: mad.lo.s32 %r68, %r75, %r34, %r2; mad.lo.s32 %r32, %r68, %r33, %r1; setp.ge.s32 %p34, %r28, %r35; mov.f32 %f219, 0f00000000; and.pred %p36, %p34, %p29; mov.f32 %f220, %f219; mov.f32 %f221, %f219; @%p36 bra $L__BB0_44; mul.wide.s32 %rd40, %r32, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f221, [%rd43]; ld.global.nc.f32 %f220, [%rd42]; ld.global.nc.f32 
%f219, [%rd41]; $L__BB0_44: mul.f32 %f144, %f220, %f220; fma.rn.f32 %f145, %f219, %f219, %f144; fma.rn.f32 %f146, %f221, %f221, %f145; setp.eq.f32 %p37, %f146, 0f00000000; mul.f32 %f147, %f217, %f217; fma.rn.f32 %f148, %f216, %f216, %f147; fma.rn.f32 %f84, %f218, %f218, %f148; @%p37 bra $L__BB0_48; bra.uni $L__BB0_45; $L__BB0_48: setp.eq.f32 %p39, %f84, 0f00000000; mov.f32 %f222, 0f00000000; mov.f32 %f223, %f222; mov.f32 %f224, %f222; @%p39 bra $L__BB0_50; sub.f32 %f224, %f3, %f218; sub.f32 %f223, %f2, %f217; sub.f32 %f222, %f1, %f216; bra.uni $L__BB0_50; $L__BB0_45: setp.eq.f32 %p38, %f84, 0f00000000; @%p38 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: sub.f32 %f224, %f221, %f3; sub.f32 %f223, %f220, %f2; sub.f32 %f222, %f219, %f1; bra.uni $L__BB0_50; $L__BB0_46: sub.f32 %f149, %f219, %f216; sub.f32 %f150, %f220, %f217; sub.f32 %f151, %f221, %f218; mul.f32 %f224, %f151, 0f3F000000; mul.f32 %f223, %f150, 0f3F000000; mul.f32 %f222, %f149, 0f3F000000; $L__BB0_50: mul.f32 %f155, %f214, %f224; mul.f32 %f156, %f215, %f223; sub.f32 %f157, %f155, %f156; mul.f32 %f158, %f215, %f222; mul.f32 %f159, %f213, %f224; sub.f32 %f160, %f158, %f159; mul.f32 %f161, %f213, %f223; mul.f32 %f162, %f214, %f222; sub.f32 %f163, %f161, %f162; mul.f32 %f164, %f206, %f223; mul.f32 %f165, %f205, %f224; sub.f32 %f166, %f164, %f165; mul.f32 %f167, %f204, %f224; mul.f32 %f168, %f206, %f222; sub.f32 %f169, %f167, %f168; mul.f32 %f170, %f205, %f222; mul.f32 %f171, %f204, %f223; sub.f32 %f172, %f170, %f171; mul.f32 %f173, %f205, %f215; mul.f32 %f174, %f206, %f214; sub.f32 %f175, %f173, %f174; mul.f32 %f176, %f206, %f213; mul.f32 %f177, %f204, %f215; sub.f32 %f178, %f176, %f177; mul.f32 %f179, %f204, %f214; mul.f32 %f180, %f205, %f213; sub.f32 %f181, %f179, %f180; add.f32 %f182, %f97, %f97; mul.f32 %f183, %f182, %f98; mul.f32 %f184, %f2, %f160; fma.rn.f32 %f185, %f1, %f157, %f184; fma.rn.f32 %f186, %f3, %f163, %f185; mul.f32 %f187, %f183, %f186; st.global.f32 [%rd4], %f187; mul.f32 %f188, %f182, 
%f99; mul.f32 %f189, %f2, %f169; fma.rn.f32 %f190, %f1, %f166, %f189; fma.rn.f32 %f191, %f3, %f172, %f190; mul.f32 %f192, %f188, %f191; st.global.f32 [%rd5], %f192; mul.f32 %f193, %f182, %f100; mul.f32 %f194, %f2, %f178; fma.rn.f32 %f195, %f1, %f175, %f194; fma.rn.f32 %f196, %f3, %f181, %f195; mul.f32 %f197, %f193, %f196; st.global.f32 [%rd6], %f197; $L__BB0_52: ret; } ` setemergentmagneticfieldtwopoint_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl setemergentmagneticfieldtwopoint .visible .entry setemergentmagneticfieldtwopoint( .param .u64 setemergentmagneticfieldtwopoint_param_0, .param .u64 setemergentmagneticfieldtwopoint_param_1, .param .u64 setemergentmagneticfieldtwopoint_param_2, .param .u64 setemergentmagneticfieldtwopoint_param_3, .param .u64 setemergentmagneticfieldtwopoint_param_4, .param .u64 setemergentmagneticfieldtwopoint_param_5, .param .f32 setemergentmagneticfieldtwopoint_param_6, .param .f32 setemergentmagneticfieldtwopoint_param_7, .param .f32 setemergentmagneticfieldtwopoint_param_8, .param .f32 setemergentmagneticfieldtwopoint_param_9, .param .u32 setemergentmagneticfieldtwopoint_param_10, .param .u32 setemergentmagneticfieldtwopoint_param_11, .param .u32 setemergentmagneticfieldtwopoint_param_12, .param .u8 setemergentmagneticfieldtwopoint_param_13 ) { .reg .pred %p<40>; .reg .b16 %rs<5>; .reg .f32 %f<225>; .reg .b32 %r<76>; .reg .b64 %rd<44>; ld.param.u8 %rs4, [setemergentmagneticfieldtwopoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldtwopoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldtwopoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldtwopoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldtwopoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldtwopoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldtwopoint_param_5]; ld.param.f32 %f97, [setemergentmagneticfieldtwopoint_param_6]; ld.param.f32 %f98, [setemergentmagneticfieldtwopoint_param_7]; ld.param.f32 %f99, 
[setemergentmagneticfieldtwopoint_param_8]; ld.param.f32 %f100, [setemergentmagneticfieldtwopoint_param_9]; ld.param.u32 %r33, [setemergentmagneticfieldtwopoint_param_10]; ld.param.u32 %r34, [setemergentmagneticfieldtwopoint_param_11]; ld.param.u32 %r35, [setemergentmagneticfieldtwopoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r36, %ntid.x; mov.u32 %r37, %ctaid.x; mov.u32 %r38, %tid.x; mad.lo.s32 %r1, %r37, %r36, %r38; mov.u32 %r39, %ntid.y; mov.u32 %r40, %ctaid.y; mov.u32 %r41, %tid.y; mad.lo.s32 %r2, %r40, %r39, %r41; mov.u32 %r42, %ntid.z; mov.u32 %r43, %ctaid.z; mov.u32 %r44, %tid.z; mad.lo.s32 %r3, %r43, %r42, %r44; setp.ge.s32 %p1, %r1, %r33; setp.ge.s32 %p2, %r2, %r34; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r35; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_52; mul.lo.s32 %r4, %r3, %r34; add.s32 %r45, %r4, %r2; mul.lo.s32 %r5, %r45, %r33; add.s32 %r46, %r5, %r1; mul.wide.s32 %rd13, %r46, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f101, %f2, %f2; fma.rn.f32 %f102, %f1, %f1, %f101; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f103, %f3, %f3, %f102; setp.eq.f32 %p6, %f103, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_51; bra.uni $L__BB0_2; $L__BB0_51: mov.u32 %r69, 0; st.global.u32 [%rd4], %r69; st.global.u32 [%rd5], %r69; st.global.u32 [%rd6], %r69; bra.uni $L__BB0_52; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r70, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r47, %r6, %r33; add.s32 %r48, %r47, %r33; rem.s32 %r70, %r48, %r33; $L__BB0_5: setp.lt.s32 %p9, %r1, 1; mov.f32 %f198, 0f00000000; and.pred %p10, %p9, %p7; 
mov.f32 %f199, %f198; mov.f32 %f200, %f198; @%p10 bra $L__BB0_7; add.s32 %r49, %r70, %r5; mul.wide.s32 %rd20, %r49, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f200, [%rd23]; ld.global.nc.f32 %f199, [%rd22]; ld.global.nc.f32 %f198, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r52, %r33, -1; min.s32 %r71, %r10, %r52; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r50, %r10, %r33; add.s32 %r51, %r50, %r33; rem.s32 %r71, %r51, %r33; $L__BB0_10: add.s32 %r14, %r71, %r5; setp.ge.s32 %p12, %r10, %r33; mov.f32 %f201, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f202, %f201; mov.f32 %f203, %f201; @%p14 bra $L__BB0_12; mul.wide.s32 %rd24, %r14, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f203, [%rd27]; ld.global.nc.f32 %f202, [%rd26]; ld.global.nc.f32 %f201, [%rd25]; $L__BB0_12: mul.f32 %f110, %f202, %f202; fma.rn.f32 %f111, %f201, %f201, %f110; fma.rn.f32 %f112, %f203, %f203, %f111; setp.eq.f32 %p15, %f112, 0f00000000; mul.f32 %f113, %f199, %f199; fma.rn.f32 %f114, %f198, %f198, %f113; fma.rn.f32 %f22, %f200, %f200, %f114; @%p15 bra $L__BB0_16; bra.uni $L__BB0_13; $L__BB0_16: setp.eq.f32 %p17, %f22, 0f00000000; mov.f32 %f204, 0f00000000; mov.f32 %f205, %f204; mov.f32 %f206, %f204; @%p17 bra $L__BB0_18; sub.f32 %f206, %f3, %f200; sub.f32 %f205, %f2, %f199; sub.f32 %f204, %f1, %f198; bra.uni $L__BB0_18; $L__BB0_13: setp.eq.f32 %p16, %f22, 0f00000000; @%p16 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: sub.f32 %f206, %f203, %f3; sub.f32 %f205, %f202, %f2; sub.f32 %f204, %f201, %f1; bra.uni $L__BB0_18; $L__BB0_14: sub.f32 %f115, %f201, %f198; sub.f32 %f116, %f202, %f199; sub.f32 %f117, %f203, %f200; mul.f32 %f206, %f117, 0f3F000000; mul.f32 %f205, %f116, 0f3F000000; mul.f32 %f204, %f115, 0f3F000000; $L__BB0_18: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p18, %rs2, 0; add.s32 %r15, %r2, -1; @%p18 bra $L__BB0_20; 
bra.uni $L__BB0_19; $L__BB0_20: max.s32 %r72, %r15, 0; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r53, %r15, %r34; add.s32 %r54, %r53, %r34; rem.s32 %r72, %r54, %r34; $L__BB0_21: setp.lt.s32 %p20, %r2, 1; mov.f32 %f207, 0f00000000; and.pred %p21, %p20, %p18; mov.f32 %f208, %f207; mov.f32 %f209, %f207; @%p21 bra $L__BB0_23; add.s32 %r55, %r72, %r4; mad.lo.s32 %r56, %r55, %r33, %r1; mul.wide.s32 %rd28, %r56, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f209, [%rd31]; ld.global.nc.f32 %f208, [%rd30]; ld.global.nc.f32 %f207, [%rd29]; $L__BB0_23: add.s32 %r19, %r2, 1; @%p18 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: add.s32 %r59, %r34, -1; min.s32 %r73, %r19, %r59; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r57, %r19, %r34; add.s32 %r58, %r57, %r34; rem.s32 %r73, %r58, %r34; $L__BB0_26: add.s32 %r23, %r73, %r4; setp.ge.s32 %p23, %r19, %r34; mov.f32 %f210, 0f00000000; and.pred %p25, %p23, %p18; mov.f32 %f211, %f210; mov.f32 %f212, %f210; @%p25 bra $L__BB0_28; mad.lo.s32 %r60, %r23, %r33, %r1; mul.wide.s32 %rd32, %r60, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f212, [%rd35]; ld.global.nc.f32 %f211, [%rd34]; ld.global.nc.f32 %f210, [%rd33]; $L__BB0_28: mul.f32 %f127, %f211, %f211; fma.rn.f32 %f128, %f210, %f210, %f127; fma.rn.f32 %f129, %f212, %f212, %f128; setp.eq.f32 %p26, %f129, 0f00000000; mul.f32 %f130, %f208, %f208; fma.rn.f32 %f131, %f207, %f207, %f130; fma.rn.f32 %f53, %f209, %f209, %f131; @%p26 bra $L__BB0_32; bra.uni $L__BB0_29; $L__BB0_32: setp.eq.f32 %p28, %f53, 0f00000000; mov.f32 %f213, 0f00000000; mov.f32 %f214, %f213; mov.f32 %f215, %f213; @%p28 bra $L__BB0_34; sub.f32 %f215, %f3, %f209; sub.f32 %f214, %f2, %f208; sub.f32 %f213, %f1, %f207; bra.uni $L__BB0_34; $L__BB0_29: setp.eq.f32 %p27, %f53, 0f00000000; @%p27 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f215, %f212, %f3; sub.f32 %f214, %f211, %f2; sub.f32 %f213, %f210, 
%f1; bra.uni $L__BB0_34; $L__BB0_30: sub.f32 %f132, %f210, %f207; sub.f32 %f133, %f211, %f208; sub.f32 %f134, %f212, %f209; mul.f32 %f215, %f134, 0f3F000000; mul.f32 %f214, %f133, 0f3F000000; mul.f32 %f213, %f132, 0f3F000000; $L__BB0_34: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r24, %r3, -1; @%p29 bra $L__BB0_36; bra.uni $L__BB0_35; $L__BB0_36: max.s32 %r74, %r24, 0; bra.uni $L__BB0_37; $L__BB0_35: rem.s32 %r61, %r24, %r35; add.s32 %r62, %r61, %r35; rem.s32 %r74, %r62, %r35; $L__BB0_37: setp.lt.s32 %p31, %r3, 1; mov.f32 %f216, 0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f217, %f216; mov.f32 %f218, %f216; @%p32 bra $L__BB0_39; mad.lo.s32 %r63, %r74, %r34, %r2; mad.lo.s32 %r64, %r63, %r33, %r1; mul.wide.s32 %rd36, %r64, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f218, [%rd39]; ld.global.nc.f32 %f217, [%rd38]; ld.global.nc.f32 %f216, [%rd37]; $L__BB0_39: add.s32 %r28, %r3, 1; @%p29 bra $L__BB0_41; bra.uni $L__BB0_40; $L__BB0_41: add.s32 %r67, %r35, -1; min.s32 %r75, %r28, %r67; bra.uni $L__BB0_42; $L__BB0_40: rem.s32 %r65, %r28, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r75, %r66, %r35; $L__BB0_42: mad.lo.s32 %r68, %r75, %r34, %r2; mad.lo.s32 %r32, %r68, %r33, %r1; setp.ge.s32 %p34, %r28, %r35; mov.f32 %f219, 0f00000000; and.pred %p36, %p34, %p29; mov.f32 %f220, %f219; mov.f32 %f221, %f219; @%p36 bra $L__BB0_44; mul.wide.s32 %rd40, %r32, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f221, [%rd43]; ld.global.nc.f32 %f220, [%rd42]; ld.global.nc.f32 %f219, [%rd41]; $L__BB0_44: mul.f32 %f144, %f220, %f220; fma.rn.f32 %f145, %f219, %f219, %f144; fma.rn.f32 %f146, %f221, %f221, %f145; setp.eq.f32 %p37, %f146, 0f00000000; mul.f32 %f147, %f217, %f217; fma.rn.f32 %f148, %f216, %f216, %f147; fma.rn.f32 %f84, %f218, %f218, %f148; @%p37 bra $L__BB0_48; bra.uni $L__BB0_45; $L__BB0_48: setp.eq.f32 %p39, %f84, 0f00000000; mov.f32 %f222, 
0f00000000; mov.f32 %f223, %f222; mov.f32 %f224, %f222; @%p39 bra $L__BB0_50; sub.f32 %f224, %f3, %f218; sub.f32 %f223, %f2, %f217; sub.f32 %f222, %f1, %f216; bra.uni $L__BB0_50; $L__BB0_45: setp.eq.f32 %p38, %f84, 0f00000000; @%p38 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: sub.f32 %f224, %f221, %f3; sub.f32 %f223, %f220, %f2; sub.f32 %f222, %f219, %f1; bra.uni $L__BB0_50; $L__BB0_46: sub.f32 %f149, %f219, %f216; sub.f32 %f150, %f220, %f217; sub.f32 %f151, %f221, %f218; mul.f32 %f224, %f151, 0f3F000000; mul.f32 %f223, %f150, 0f3F000000; mul.f32 %f222, %f149, 0f3F000000; $L__BB0_50: mul.f32 %f155, %f214, %f224; mul.f32 %f156, %f215, %f223; sub.f32 %f157, %f155, %f156; mul.f32 %f158, %f215, %f222; mul.f32 %f159, %f213, %f224; sub.f32 %f160, %f158, %f159; mul.f32 %f161, %f213, %f223; mul.f32 %f162, %f214, %f222; sub.f32 %f163, %f161, %f162; mul.f32 %f164, %f206, %f223; mul.f32 %f165, %f205, %f224; sub.f32 %f166, %f164, %f165; mul.f32 %f167, %f204, %f224; mul.f32 %f168, %f206, %f222; sub.f32 %f169, %f167, %f168; mul.f32 %f170, %f205, %f222; mul.f32 %f171, %f204, %f223; sub.f32 %f172, %f170, %f171; mul.f32 %f173, %f205, %f215; mul.f32 %f174, %f206, %f214; sub.f32 %f175, %f173, %f174; mul.f32 %f176, %f206, %f213; mul.f32 %f177, %f204, %f215; sub.f32 %f178, %f176, %f177; mul.f32 %f179, %f204, %f214; mul.f32 %f180, %f205, %f213; sub.f32 %f181, %f179, %f180; add.f32 %f182, %f97, %f97; mul.f32 %f183, %f182, %f98; mul.f32 %f184, %f2, %f160; fma.rn.f32 %f185, %f1, %f157, %f184; fma.rn.f32 %f186, %f3, %f163, %f185; mul.f32 %f187, %f183, %f186; st.global.f32 [%rd4], %f187; mul.f32 %f188, %f182, %f99; mul.f32 %f189, %f2, %f169; fma.rn.f32 %f190, %f1, %f166, %f189; fma.rn.f32 %f191, %f3, %f172, %f190; mul.f32 %f192, %f188, %f191; st.global.f32 [%rd5], %f192; mul.f32 %f193, %f182, %f100; mul.f32 %f194, %f2, %f178; fma.rn.f32 %f195, %f1, %f175, %f194; fma.rn.f32 %f196, %f3, %f181, %f195; mul.f32 %f197, %f193, %f196; st.global.f32 [%rd6], %f197; $L__BB0_52: ret; } ` 
setemergentmagneticfieldtwopoint_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl setemergentmagneticfieldtwopoint .visible .entry setemergentmagneticfieldtwopoint( .param .u64 setemergentmagneticfieldtwopoint_param_0, .param .u64 setemergentmagneticfieldtwopoint_param_1, .param .u64 setemergentmagneticfieldtwopoint_param_2, .param .u64 setemergentmagneticfieldtwopoint_param_3, .param .u64 setemergentmagneticfieldtwopoint_param_4, .param .u64 setemergentmagneticfieldtwopoint_param_5, .param .f32 setemergentmagneticfieldtwopoint_param_6, .param .f32 setemergentmagneticfieldtwopoint_param_7, .param .f32 setemergentmagneticfieldtwopoint_param_8, .param .f32 setemergentmagneticfieldtwopoint_param_9, .param .u32 setemergentmagneticfieldtwopoint_param_10, .param .u32 setemergentmagneticfieldtwopoint_param_11, .param .u32 setemergentmagneticfieldtwopoint_param_12, .param .u8 setemergentmagneticfieldtwopoint_param_13 ) { .reg .pred %p<40>; .reg .b16 %rs<5>; .reg .f32 %f<225>; .reg .b32 %r<76>; .reg .b64 %rd<44>; ld.param.u8 %rs4, [setemergentmagneticfieldtwopoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldtwopoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldtwopoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldtwopoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldtwopoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldtwopoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldtwopoint_param_5]; ld.param.f32 %f97, [setemergentmagneticfieldtwopoint_param_6]; ld.param.f32 %f98, [setemergentmagneticfieldtwopoint_param_7]; ld.param.f32 %f99, [setemergentmagneticfieldtwopoint_param_8]; ld.param.f32 %f100, [setemergentmagneticfieldtwopoint_param_9]; ld.param.u32 %r33, [setemergentmagneticfieldtwopoint_param_10]; ld.param.u32 %r34, [setemergentmagneticfieldtwopoint_param_11]; ld.param.u32 %r35, [setemergentmagneticfieldtwopoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; 
cvta.to.global.u64 %rd3, %rd10; mov.u32 %r36, %ntid.x; mov.u32 %r37, %ctaid.x; mov.u32 %r38, %tid.x; mad.lo.s32 %r1, %r37, %r36, %r38; mov.u32 %r39, %ntid.y; mov.u32 %r40, %ctaid.y; mov.u32 %r41, %tid.y; mad.lo.s32 %r2, %r40, %r39, %r41; mov.u32 %r42, %ntid.z; mov.u32 %r43, %ctaid.z; mov.u32 %r44, %tid.z; mad.lo.s32 %r3, %r43, %r42, %r44; setp.ge.s32 %p1, %r1, %r33; setp.ge.s32 %p2, %r2, %r34; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r35; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_52; mul.lo.s32 %r4, %r3, %r34; add.s32 %r45, %r4, %r2; mul.lo.s32 %r5, %r45, %r33; add.s32 %r46, %r5, %r1; mul.wide.s32 %rd13, %r46, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f101, %f2, %f2; fma.rn.f32 %f102, %f1, %f1, %f101; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f103, %f3, %f3, %f102; setp.eq.f32 %p6, %f103, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_51; bra.uni $L__BB0_2; $L__BB0_51: mov.u32 %r69, 0; st.global.u32 [%rd4], %r69; st.global.u32 [%rd5], %r69; st.global.u32 [%rd6], %r69; bra.uni $L__BB0_52; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r70, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r47, %r6, %r33; add.s32 %r48, %r47, %r33; rem.s32 %r70, %r48, %r33; $L__BB0_5: setp.lt.s32 %p9, %r1, 1; mov.f32 %f198, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f199, %f198; mov.f32 %f200, %f198; @%p10 bra $L__BB0_7; add.s32 %r49, %r70, %r5; mul.wide.s32 %rd20, %r49, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f200, [%rd23]; ld.global.nc.f32 %f199, [%rd22]; ld.global.nc.f32 %f198, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni 
$L__BB0_8; $L__BB0_9: add.s32 %r52, %r33, -1; min.s32 %r71, %r10, %r52; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r50, %r10, %r33; add.s32 %r51, %r50, %r33; rem.s32 %r71, %r51, %r33; $L__BB0_10: add.s32 %r14, %r71, %r5; setp.ge.s32 %p12, %r10, %r33; mov.f32 %f201, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f202, %f201; mov.f32 %f203, %f201; @%p14 bra $L__BB0_12; mul.wide.s32 %rd24, %r14, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f203, [%rd27]; ld.global.nc.f32 %f202, [%rd26]; ld.global.nc.f32 %f201, [%rd25]; $L__BB0_12: mul.f32 %f110, %f202, %f202; fma.rn.f32 %f111, %f201, %f201, %f110; fma.rn.f32 %f112, %f203, %f203, %f111; setp.eq.f32 %p15, %f112, 0f00000000; mul.f32 %f113, %f199, %f199; fma.rn.f32 %f114, %f198, %f198, %f113; fma.rn.f32 %f22, %f200, %f200, %f114; @%p15 bra $L__BB0_16; bra.uni $L__BB0_13; $L__BB0_16: setp.eq.f32 %p17, %f22, 0f00000000; mov.f32 %f204, 0f00000000; mov.f32 %f205, %f204; mov.f32 %f206, %f204; @%p17 bra $L__BB0_18; sub.f32 %f206, %f3, %f200; sub.f32 %f205, %f2, %f199; sub.f32 %f204, %f1, %f198; bra.uni $L__BB0_18; $L__BB0_13: setp.eq.f32 %p16, %f22, 0f00000000; @%p16 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: sub.f32 %f206, %f203, %f3; sub.f32 %f205, %f202, %f2; sub.f32 %f204, %f201, %f1; bra.uni $L__BB0_18; $L__BB0_14: sub.f32 %f115, %f201, %f198; sub.f32 %f116, %f202, %f199; sub.f32 %f117, %f203, %f200; mul.f32 %f206, %f117, 0f3F000000; mul.f32 %f205, %f116, 0f3F000000; mul.f32 %f204, %f115, 0f3F000000; $L__BB0_18: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p18, %rs2, 0; add.s32 %r15, %r2, -1; @%p18 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: max.s32 %r72, %r15, 0; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r53, %r15, %r34; add.s32 %r54, %r53, %r34; rem.s32 %r72, %r54, %r34; $L__BB0_21: setp.lt.s32 %p20, %r2, 1; mov.f32 %f207, 0f00000000; and.pred %p21, %p20, %p18; mov.f32 %f208, %f207; mov.f32 %f209, %f207; @%p21 bra $L__BB0_23; add.s32 %r55, %r72, %r4; mad.lo.s32 
%r56, %r55, %r33, %r1; mul.wide.s32 %rd28, %r56, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f209, [%rd31]; ld.global.nc.f32 %f208, [%rd30]; ld.global.nc.f32 %f207, [%rd29]; $L__BB0_23: add.s32 %r19, %r2, 1; @%p18 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: add.s32 %r59, %r34, -1; min.s32 %r73, %r19, %r59; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r57, %r19, %r34; add.s32 %r58, %r57, %r34; rem.s32 %r73, %r58, %r34; $L__BB0_26: add.s32 %r23, %r73, %r4; setp.ge.s32 %p23, %r19, %r34; mov.f32 %f210, 0f00000000; and.pred %p25, %p23, %p18; mov.f32 %f211, %f210; mov.f32 %f212, %f210; @%p25 bra $L__BB0_28; mad.lo.s32 %r60, %r23, %r33, %r1; mul.wide.s32 %rd32, %r60, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f212, [%rd35]; ld.global.nc.f32 %f211, [%rd34]; ld.global.nc.f32 %f210, [%rd33]; $L__BB0_28: mul.f32 %f127, %f211, %f211; fma.rn.f32 %f128, %f210, %f210, %f127; fma.rn.f32 %f129, %f212, %f212, %f128; setp.eq.f32 %p26, %f129, 0f00000000; mul.f32 %f130, %f208, %f208; fma.rn.f32 %f131, %f207, %f207, %f130; fma.rn.f32 %f53, %f209, %f209, %f131; @%p26 bra $L__BB0_32; bra.uni $L__BB0_29; $L__BB0_32: setp.eq.f32 %p28, %f53, 0f00000000; mov.f32 %f213, 0f00000000; mov.f32 %f214, %f213; mov.f32 %f215, %f213; @%p28 bra $L__BB0_34; sub.f32 %f215, %f3, %f209; sub.f32 %f214, %f2, %f208; sub.f32 %f213, %f1, %f207; bra.uni $L__BB0_34; $L__BB0_29: setp.eq.f32 %p27, %f53, 0f00000000; @%p27 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f215, %f212, %f3; sub.f32 %f214, %f211, %f2; sub.f32 %f213, %f210, %f1; bra.uni $L__BB0_34; $L__BB0_30: sub.f32 %f132, %f210, %f207; sub.f32 %f133, %f211, %f208; sub.f32 %f134, %f212, %f209; mul.f32 %f215, %f134, 0f3F000000; mul.f32 %f214, %f133, 0f3F000000; mul.f32 %f213, %f132, 0f3F000000; $L__BB0_34: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r24, %r3, -1; @%p29 bra $L__BB0_36; bra.uni $L__BB0_35; 
$L__BB0_36: max.s32 %r74, %r24, 0; bra.uni $L__BB0_37; $L__BB0_35: rem.s32 %r61, %r24, %r35; add.s32 %r62, %r61, %r35; rem.s32 %r74, %r62, %r35; $L__BB0_37: setp.lt.s32 %p31, %r3, 1; mov.f32 %f216, 0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f217, %f216; mov.f32 %f218, %f216; @%p32 bra $L__BB0_39; mad.lo.s32 %r63, %r74, %r34, %r2; mad.lo.s32 %r64, %r63, %r33, %r1; mul.wide.s32 %rd36, %r64, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f218, [%rd39]; ld.global.nc.f32 %f217, [%rd38]; ld.global.nc.f32 %f216, [%rd37]; $L__BB0_39: add.s32 %r28, %r3, 1; @%p29 bra $L__BB0_41; bra.uni $L__BB0_40; $L__BB0_41: add.s32 %r67, %r35, -1; min.s32 %r75, %r28, %r67; bra.uni $L__BB0_42; $L__BB0_40: rem.s32 %r65, %r28, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r75, %r66, %r35; $L__BB0_42: mad.lo.s32 %r68, %r75, %r34, %r2; mad.lo.s32 %r32, %r68, %r33, %r1; setp.ge.s32 %p34, %r28, %r35; mov.f32 %f219, 0f00000000; and.pred %p36, %p34, %p29; mov.f32 %f220, %f219; mov.f32 %f221, %f219; @%p36 bra $L__BB0_44; mul.wide.s32 %rd40, %r32, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f221, [%rd43]; ld.global.nc.f32 %f220, [%rd42]; ld.global.nc.f32 %f219, [%rd41]; $L__BB0_44: mul.f32 %f144, %f220, %f220; fma.rn.f32 %f145, %f219, %f219, %f144; fma.rn.f32 %f146, %f221, %f221, %f145; setp.eq.f32 %p37, %f146, 0f00000000; mul.f32 %f147, %f217, %f217; fma.rn.f32 %f148, %f216, %f216, %f147; fma.rn.f32 %f84, %f218, %f218, %f148; @%p37 bra $L__BB0_48; bra.uni $L__BB0_45; $L__BB0_48: setp.eq.f32 %p39, %f84, 0f00000000; mov.f32 %f222, 0f00000000; mov.f32 %f223, %f222; mov.f32 %f224, %f222; @%p39 bra $L__BB0_50; sub.f32 %f224, %f3, %f218; sub.f32 %f223, %f2, %f217; sub.f32 %f222, %f1, %f216; bra.uni $L__BB0_50; $L__BB0_45: setp.eq.f32 %p38, %f84, 0f00000000; @%p38 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: sub.f32 %f224, %f221, %f3; sub.f32 %f223, %f220, %f2; sub.f32 %f222, %f219, 
%f1; bra.uni $L__BB0_50; $L__BB0_46: sub.f32 %f149, %f219, %f216; sub.f32 %f150, %f220, %f217; sub.f32 %f151, %f221, %f218; mul.f32 %f224, %f151, 0f3F000000; mul.f32 %f223, %f150, 0f3F000000; mul.f32 %f222, %f149, 0f3F000000; $L__BB0_50: mul.f32 %f155, %f214, %f224; mul.f32 %f156, %f215, %f223; sub.f32 %f157, %f155, %f156; mul.f32 %f158, %f215, %f222; mul.f32 %f159, %f213, %f224; sub.f32 %f160, %f158, %f159; mul.f32 %f161, %f213, %f223; mul.f32 %f162, %f214, %f222; sub.f32 %f163, %f161, %f162; mul.f32 %f164, %f206, %f223; mul.f32 %f165, %f205, %f224; sub.f32 %f166, %f164, %f165; mul.f32 %f167, %f204, %f224; mul.f32 %f168, %f206, %f222; sub.f32 %f169, %f167, %f168; mul.f32 %f170, %f205, %f222; mul.f32 %f171, %f204, %f223; sub.f32 %f172, %f170, %f171; mul.f32 %f173, %f205, %f215; mul.f32 %f174, %f206, %f214; sub.f32 %f175, %f173, %f174; mul.f32 %f176, %f206, %f213; mul.f32 %f177, %f204, %f215; sub.f32 %f178, %f176, %f177; mul.f32 %f179, %f204, %f214; mul.f32 %f180, %f205, %f213; sub.f32 %f181, %f179, %f180; add.f32 %f182, %f97, %f97; mul.f32 %f183, %f182, %f98; mul.f32 %f184, %f2, %f160; fma.rn.f32 %f185, %f1, %f157, %f184; fma.rn.f32 %f186, %f3, %f163, %f185; mul.f32 %f187, %f183, %f186; st.global.f32 [%rd4], %f187; mul.f32 %f188, %f182, %f99; mul.f32 %f189, %f2, %f169; fma.rn.f32 %f190, %f1, %f166, %f189; fma.rn.f32 %f191, %f3, %f172, %f190; mul.f32 %f192, %f188, %f191; st.global.f32 [%rd5], %f192; mul.f32 %f193, %f182, %f100; mul.f32 %f194, %f2, %f178; fma.rn.f32 %f195, %f1, %f175, %f194; fma.rn.f32 %f196, %f3, %f181, %f195; mul.f32 %f197, %f193, %f196; st.global.f32 [%rd6], %f197; $L__BB0_52: ret; } ` setemergentmagneticfieldtwopoint_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl setemergentmagneticfieldtwopoint .visible .entry setemergentmagneticfieldtwopoint( .param .u64 setemergentmagneticfieldtwopoint_param_0, .param .u64 setemergentmagneticfieldtwopoint_param_1, .param .u64 setemergentmagneticfieldtwopoint_param_2, .param .u64 
setemergentmagneticfieldtwopoint_param_3, .param .u64 setemergentmagneticfieldtwopoint_param_4, .param .u64 setemergentmagneticfieldtwopoint_param_5, .param .f32 setemergentmagneticfieldtwopoint_param_6, .param .f32 setemergentmagneticfieldtwopoint_param_7, .param .f32 setemergentmagneticfieldtwopoint_param_8, .param .f32 setemergentmagneticfieldtwopoint_param_9, .param .u32 setemergentmagneticfieldtwopoint_param_10, .param .u32 setemergentmagneticfieldtwopoint_param_11, .param .u32 setemergentmagneticfieldtwopoint_param_12, .param .u8 setemergentmagneticfieldtwopoint_param_13 ) { .reg .pred %p<40>; .reg .b16 %rs<5>; .reg .f32 %f<225>; .reg .b32 %r<76>; .reg .b64 %rd<44>; ld.param.u8 %rs4, [setemergentmagneticfieldtwopoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldtwopoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldtwopoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldtwopoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldtwopoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldtwopoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldtwopoint_param_5]; ld.param.f32 %f97, [setemergentmagneticfieldtwopoint_param_6]; ld.param.f32 %f98, [setemergentmagneticfieldtwopoint_param_7]; ld.param.f32 %f99, [setemergentmagneticfieldtwopoint_param_8]; ld.param.f32 %f100, [setemergentmagneticfieldtwopoint_param_9]; ld.param.u32 %r33, [setemergentmagneticfieldtwopoint_param_10]; ld.param.u32 %r34, [setemergentmagneticfieldtwopoint_param_11]; ld.param.u32 %r35, [setemergentmagneticfieldtwopoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r36, %ntid.x; mov.u32 %r37, %ctaid.x; mov.u32 %r38, %tid.x; mad.lo.s32 %r1, %r37, %r36, %r38; mov.u32 %r39, %ntid.y; mov.u32 %r40, %ctaid.y; mov.u32 %r41, %tid.y; mad.lo.s32 %r2, %r40, %r39, %r41; mov.u32 %r42, %ntid.z; mov.u32 %r43, %ctaid.z; mov.u32 %r44, %tid.z; mad.lo.s32 %r3, %r43, %r42, %r44; setp.ge.s32 %p1, %r1, 
%r33; setp.ge.s32 %p2, %r2, %r34; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r35; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_52; mul.lo.s32 %r4, %r3, %r34; add.s32 %r45, %r4, %r2; mul.lo.s32 %r5, %r45, %r33; add.s32 %r46, %r5, %r1; mul.wide.s32 %rd13, %r46, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f101, %f2, %f2; fma.rn.f32 %f102, %f1, %f1, %f101; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f103, %f3, %f3, %f102; setp.eq.f32 %p6, %f103, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_51; bra.uni $L__BB0_2; $L__BB0_51: mov.u32 %r69, 0; st.global.u32 [%rd4], %r69; st.global.u32 [%rd5], %r69; st.global.u32 [%rd6], %r69; bra.uni $L__BB0_52; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r70, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r47, %r6, %r33; add.s32 %r48, %r47, %r33; rem.s32 %r70, %r48, %r33; $L__BB0_5: setp.lt.s32 %p9, %r1, 1; mov.f32 %f198, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f199, %f198; mov.f32 %f200, %f198; @%p10 bra $L__BB0_7; add.s32 %r49, %r70, %r5; mul.wide.s32 %rd20, %r49, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f200, [%rd23]; ld.global.nc.f32 %f199, [%rd22]; ld.global.nc.f32 %f198, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r52, %r33, -1; min.s32 %r71, %r10, %r52; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r50, %r10, %r33; add.s32 %r51, %r50, %r33; rem.s32 %r71, %r51, %r33; $L__BB0_10: add.s32 %r14, %r71, %r5; setp.ge.s32 %p12, %r10, %r33; mov.f32 %f201, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f202, %f201; mov.f32 %f203, %f201; @%p14 bra 
$L__BB0_12; mul.wide.s32 %rd24, %r14, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f203, [%rd27]; ld.global.nc.f32 %f202, [%rd26]; ld.global.nc.f32 %f201, [%rd25]; $L__BB0_12: mul.f32 %f110, %f202, %f202; fma.rn.f32 %f111, %f201, %f201, %f110; fma.rn.f32 %f112, %f203, %f203, %f111; setp.eq.f32 %p15, %f112, 0f00000000; mul.f32 %f113, %f199, %f199; fma.rn.f32 %f114, %f198, %f198, %f113; fma.rn.f32 %f22, %f200, %f200, %f114; @%p15 bra $L__BB0_16; bra.uni $L__BB0_13; $L__BB0_16: setp.eq.f32 %p17, %f22, 0f00000000; mov.f32 %f204, 0f00000000; mov.f32 %f205, %f204; mov.f32 %f206, %f204; @%p17 bra $L__BB0_18; sub.f32 %f206, %f3, %f200; sub.f32 %f205, %f2, %f199; sub.f32 %f204, %f1, %f198; bra.uni $L__BB0_18; $L__BB0_13: setp.eq.f32 %p16, %f22, 0f00000000; @%p16 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: sub.f32 %f206, %f203, %f3; sub.f32 %f205, %f202, %f2; sub.f32 %f204, %f201, %f1; bra.uni $L__BB0_18; $L__BB0_14: sub.f32 %f115, %f201, %f198; sub.f32 %f116, %f202, %f199; sub.f32 %f117, %f203, %f200; mul.f32 %f206, %f117, 0f3F000000; mul.f32 %f205, %f116, 0f3F000000; mul.f32 %f204, %f115, 0f3F000000; $L__BB0_18: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p18, %rs2, 0; add.s32 %r15, %r2, -1; @%p18 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: max.s32 %r72, %r15, 0; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r53, %r15, %r34; add.s32 %r54, %r53, %r34; rem.s32 %r72, %r54, %r34; $L__BB0_21: setp.lt.s32 %p20, %r2, 1; mov.f32 %f207, 0f00000000; and.pred %p21, %p20, %p18; mov.f32 %f208, %f207; mov.f32 %f209, %f207; @%p21 bra $L__BB0_23; add.s32 %r55, %r72, %r4; mad.lo.s32 %r56, %r55, %r33, %r1; mul.wide.s32 %rd28, %r56, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f209, [%rd31]; ld.global.nc.f32 %f208, [%rd30]; ld.global.nc.f32 %f207, [%rd29]; $L__BB0_23: add.s32 %r19, %r2, 1; @%p18 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: add.s32 %r59, %r34, -1; min.s32 
%r73, %r19, %r59; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r57, %r19, %r34; add.s32 %r58, %r57, %r34; rem.s32 %r73, %r58, %r34; $L__BB0_26: add.s32 %r23, %r73, %r4; setp.ge.s32 %p23, %r19, %r34; mov.f32 %f210, 0f00000000; and.pred %p25, %p23, %p18; mov.f32 %f211, %f210; mov.f32 %f212, %f210; @%p25 bra $L__BB0_28; mad.lo.s32 %r60, %r23, %r33, %r1; mul.wide.s32 %rd32, %r60, 4; add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f212, [%rd35]; ld.global.nc.f32 %f211, [%rd34]; ld.global.nc.f32 %f210, [%rd33]; $L__BB0_28: mul.f32 %f127, %f211, %f211; fma.rn.f32 %f128, %f210, %f210, %f127; fma.rn.f32 %f129, %f212, %f212, %f128; setp.eq.f32 %p26, %f129, 0f00000000; mul.f32 %f130, %f208, %f208; fma.rn.f32 %f131, %f207, %f207, %f130; fma.rn.f32 %f53, %f209, %f209, %f131; @%p26 bra $L__BB0_32; bra.uni $L__BB0_29; $L__BB0_32: setp.eq.f32 %p28, %f53, 0f00000000; mov.f32 %f213, 0f00000000; mov.f32 %f214, %f213; mov.f32 %f215, %f213; @%p28 bra $L__BB0_34; sub.f32 %f215, %f3, %f209; sub.f32 %f214, %f2, %f208; sub.f32 %f213, %f1, %f207; bra.uni $L__BB0_34; $L__BB0_29: setp.eq.f32 %p27, %f53, 0f00000000; @%p27 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f215, %f212, %f3; sub.f32 %f214, %f211, %f2; sub.f32 %f213, %f210, %f1; bra.uni $L__BB0_34; $L__BB0_30: sub.f32 %f132, %f210, %f207; sub.f32 %f133, %f211, %f208; sub.f32 %f134, %f212, %f209; mul.f32 %f215, %f134, 0f3F000000; mul.f32 %f214, %f133, 0f3F000000; mul.f32 %f213, %f132, 0f3F000000; $L__BB0_34: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r24, %r3, -1; @%p29 bra $L__BB0_36; bra.uni $L__BB0_35; $L__BB0_36: max.s32 %r74, %r24, 0; bra.uni $L__BB0_37; $L__BB0_35: rem.s32 %r61, %r24, %r35; add.s32 %r62, %r61, %r35; rem.s32 %r74, %r62, %r35; $L__BB0_37: setp.lt.s32 %p31, %r3, 1; mov.f32 %f216, 0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f217, %f216; mov.f32 %f218, %f216; @%p32 bra $L__BB0_39; mad.lo.s32 %r63, %r74, %r34, %r2; mad.lo.s32 %r64, %r63, 
%r33, %r1; mul.wide.s32 %rd36, %r64, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f218, [%rd39]; ld.global.nc.f32 %f217, [%rd38]; ld.global.nc.f32 %f216, [%rd37]; $L__BB0_39: add.s32 %r28, %r3, 1; @%p29 bra $L__BB0_41; bra.uni $L__BB0_40; $L__BB0_41: add.s32 %r67, %r35, -1; min.s32 %r75, %r28, %r67; bra.uni $L__BB0_42; $L__BB0_40: rem.s32 %r65, %r28, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r75, %r66, %r35; $L__BB0_42: mad.lo.s32 %r68, %r75, %r34, %r2; mad.lo.s32 %r32, %r68, %r33, %r1; setp.ge.s32 %p34, %r28, %r35; mov.f32 %f219, 0f00000000; and.pred %p36, %p34, %p29; mov.f32 %f220, %f219; mov.f32 %f221, %f219; @%p36 bra $L__BB0_44; mul.wide.s32 %rd40, %r32, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f221, [%rd43]; ld.global.nc.f32 %f220, [%rd42]; ld.global.nc.f32 %f219, [%rd41]; $L__BB0_44: mul.f32 %f144, %f220, %f220; fma.rn.f32 %f145, %f219, %f219, %f144; fma.rn.f32 %f146, %f221, %f221, %f145; setp.eq.f32 %p37, %f146, 0f00000000; mul.f32 %f147, %f217, %f217; fma.rn.f32 %f148, %f216, %f216, %f147; fma.rn.f32 %f84, %f218, %f218, %f148; @%p37 bra $L__BB0_48; bra.uni $L__BB0_45; $L__BB0_48: setp.eq.f32 %p39, %f84, 0f00000000; mov.f32 %f222, 0f00000000; mov.f32 %f223, %f222; mov.f32 %f224, %f222; @%p39 bra $L__BB0_50; sub.f32 %f224, %f3, %f218; sub.f32 %f223, %f2, %f217; sub.f32 %f222, %f1, %f216; bra.uni $L__BB0_50; $L__BB0_45: setp.eq.f32 %p38, %f84, 0f00000000; @%p38 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: sub.f32 %f224, %f221, %f3; sub.f32 %f223, %f220, %f2; sub.f32 %f222, %f219, %f1; bra.uni $L__BB0_50; $L__BB0_46: sub.f32 %f149, %f219, %f216; sub.f32 %f150, %f220, %f217; sub.f32 %f151, %f221, %f218; mul.f32 %f224, %f151, 0f3F000000; mul.f32 %f223, %f150, 0f3F000000; mul.f32 %f222, %f149, 0f3F000000; $L__BB0_50: mul.f32 %f155, %f214, %f224; mul.f32 %f156, %f215, %f223; sub.f32 %f157, %f155, %f156; mul.f32 %f158, %f215, %f222; mul.f32 
%f159, %f213, %f224; sub.f32 %f160, %f158, %f159; mul.f32 %f161, %f213, %f223; mul.f32 %f162, %f214, %f222; sub.f32 %f163, %f161, %f162; mul.f32 %f164, %f206, %f223; mul.f32 %f165, %f205, %f224; sub.f32 %f166, %f164, %f165; mul.f32 %f167, %f204, %f224; mul.f32 %f168, %f206, %f222; sub.f32 %f169, %f167, %f168; mul.f32 %f170, %f205, %f222; mul.f32 %f171, %f204, %f223; sub.f32 %f172, %f170, %f171; mul.f32 %f173, %f205, %f215; mul.f32 %f174, %f206, %f214; sub.f32 %f175, %f173, %f174; mul.f32 %f176, %f206, %f213; mul.f32 %f177, %f204, %f215; sub.f32 %f178, %f176, %f177; mul.f32 %f179, %f204, %f214; mul.f32 %f180, %f205, %f213; sub.f32 %f181, %f179, %f180; add.f32 %f182, %f97, %f97; mul.f32 %f183, %f182, %f98; mul.f32 %f184, %f2, %f160; fma.rn.f32 %f185, %f1, %f157, %f184; fma.rn.f32 %f186, %f3, %f163, %f185; mul.f32 %f187, %f183, %f186; st.global.f32 [%rd4], %f187; mul.f32 %f188, %f182, %f99; mul.f32 %f189, %f2, %f169; fma.rn.f32 %f190, %f1, %f166, %f189; fma.rn.f32 %f191, %f3, %f172, %f190; mul.f32 %f192, %f188, %f191; st.global.f32 [%rd5], %f192; mul.f32 %f193, %f182, %f100; mul.f32 %f194, %f2, %f178; fma.rn.f32 %f195, %f1, %f175, %f194; fma.rn.f32 %f196, %f3, %f181, %f195; mul.f32 %f197, %f193, %f196; st.global.f32 [%rd6], %f197; $L__BB0_52: ret; } ` setemergentmagneticfieldtwopoint_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl setemergentmagneticfieldtwopoint .visible .entry setemergentmagneticfieldtwopoint( .param .u64 setemergentmagneticfieldtwopoint_param_0, .param .u64 setemergentmagneticfieldtwopoint_param_1, .param .u64 setemergentmagneticfieldtwopoint_param_2, .param .u64 setemergentmagneticfieldtwopoint_param_3, .param .u64 setemergentmagneticfieldtwopoint_param_4, .param .u64 setemergentmagneticfieldtwopoint_param_5, .param .f32 setemergentmagneticfieldtwopoint_param_6, .param .f32 setemergentmagneticfieldtwopoint_param_7, .param .f32 setemergentmagneticfieldtwopoint_param_8, .param .f32 setemergentmagneticfieldtwopoint_param_9, .param 
.u32 setemergentmagneticfieldtwopoint_param_10, .param .u32 setemergentmagneticfieldtwopoint_param_11, .param .u32 setemergentmagneticfieldtwopoint_param_12, .param .u8 setemergentmagneticfieldtwopoint_param_13 ) { .reg .pred %p<40>; .reg .b16 %rs<5>; .reg .f32 %f<225>; .reg .b32 %r<76>; .reg .b64 %rd<44>; ld.param.u8 %rs4, [setemergentmagneticfieldtwopoint_param_13]; ld.param.u64 %rd7, [setemergentmagneticfieldtwopoint_param_0]; ld.param.u64 %rd8, [setemergentmagneticfieldtwopoint_param_1]; ld.param.u64 %rd9, [setemergentmagneticfieldtwopoint_param_2]; ld.param.u64 %rd10, [setemergentmagneticfieldtwopoint_param_3]; ld.param.u64 %rd11, [setemergentmagneticfieldtwopoint_param_4]; ld.param.u64 %rd12, [setemergentmagneticfieldtwopoint_param_5]; ld.param.f32 %f97, [setemergentmagneticfieldtwopoint_param_6]; ld.param.f32 %f98, [setemergentmagneticfieldtwopoint_param_7]; ld.param.f32 %f99, [setemergentmagneticfieldtwopoint_param_8]; ld.param.f32 %f100, [setemergentmagneticfieldtwopoint_param_9]; ld.param.u32 %r33, [setemergentmagneticfieldtwopoint_param_10]; ld.param.u32 %r34, [setemergentmagneticfieldtwopoint_param_11]; ld.param.u32 %r35, [setemergentmagneticfieldtwopoint_param_12]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r36, %ntid.x; mov.u32 %r37, %ctaid.x; mov.u32 %r38, %tid.x; mad.lo.s32 %r1, %r37, %r36, %r38; mov.u32 %r39, %ntid.y; mov.u32 %r40, %ctaid.y; mov.u32 %r41, %tid.y; mad.lo.s32 %r2, %r40, %r39, %r41; mov.u32 %r42, %ntid.z; mov.u32 %r43, %ctaid.z; mov.u32 %r44, %tid.z; mad.lo.s32 %r3, %r43, %r42, %r44; setp.ge.s32 %p1, %r1, %r33; setp.ge.s32 %p2, %r2, %r34; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r35; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_52; mul.lo.s32 %r4, %r3, %r34; add.s32 %r45, %r4, %r2; mul.lo.s32 %r5, %r45, %r33; add.s32 %r46, %r5, %r1; mul.wide.s32 %rd13, %r46, 4; add.s64 %rd14, %rd3, %rd13; add.s64 %rd15, %rd2, %rd13; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f1, [%rd14]; 
ld.global.nc.f32 %f2, [%rd15]; mul.f32 %f101, %f2, %f2; fma.rn.f32 %f102, %f1, %f1, %f101; ld.global.nc.f32 %f3, [%rd16]; fma.rn.f32 %f103, %f3, %f3, %f102; setp.eq.f32 %p6, %f103, 0f00000000; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd4, %rd17, %rd13; cvta.to.global.u64 %rd18, %rd8; add.s64 %rd5, %rd18, %rd13; cvta.to.global.u64 %rd19, %rd9; add.s64 %rd6, %rd19, %rd13; @%p6 bra $L__BB0_51; bra.uni $L__BB0_2; $L__BB0_51: mov.u32 %r69, 0; st.global.u32 [%rd4], %r69; st.global.u32 [%rd5], %r69; st.global.u32 [%rd6], %r69; bra.uni $L__BB0_52; $L__BB0_2: and.b16 %rs1, %rs4, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r70, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r47, %r6, %r33; add.s32 %r48, %r47, %r33; rem.s32 %r70, %r48, %r33; $L__BB0_5: setp.lt.s32 %p9, %r1, 1; mov.f32 %f198, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f199, %f198; mov.f32 %f200, %f198; @%p10 bra $L__BB0_7; add.s32 %r49, %r70, %r5; mul.wide.s32 %rd20, %r49, 4; add.s64 %rd21, %rd3, %rd20; add.s64 %rd22, %rd2, %rd20; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f200, [%rd23]; ld.global.nc.f32 %f199, [%rd22]; ld.global.nc.f32 %f198, [%rd21]; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r52, %r33, -1; min.s32 %r71, %r10, %r52; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r50, %r10, %r33; add.s32 %r51, %r50, %r33; rem.s32 %r71, %r51, %r33; $L__BB0_10: add.s32 %r14, %r71, %r5; setp.ge.s32 %p12, %r10, %r33; mov.f32 %f201, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f202, %f201; mov.f32 %f203, %f201; @%p14 bra $L__BB0_12; mul.wide.s32 %rd24, %r14, 4; add.s64 %rd25, %rd3, %rd24; add.s64 %rd26, %rd2, %rd24; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f203, [%rd27]; ld.global.nc.f32 %f202, [%rd26]; ld.global.nc.f32 %f201, [%rd25]; $L__BB0_12: mul.f32 %f110, %f202, %f202; fma.rn.f32 %f111, %f201, %f201, %f110; fma.rn.f32 %f112, %f203, %f203, %f111; setp.eq.f32 %p15, %f112, 0f00000000; 
mul.f32 %f113, %f199, %f199; fma.rn.f32 %f114, %f198, %f198, %f113; fma.rn.f32 %f22, %f200, %f200, %f114; @%p15 bra $L__BB0_16; bra.uni $L__BB0_13; $L__BB0_16: setp.eq.f32 %p17, %f22, 0f00000000; mov.f32 %f204, 0f00000000; mov.f32 %f205, %f204; mov.f32 %f206, %f204; @%p17 bra $L__BB0_18; sub.f32 %f206, %f3, %f200; sub.f32 %f205, %f2, %f199; sub.f32 %f204, %f1, %f198; bra.uni $L__BB0_18; $L__BB0_13: setp.eq.f32 %p16, %f22, 0f00000000; @%p16 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: sub.f32 %f206, %f203, %f3; sub.f32 %f205, %f202, %f2; sub.f32 %f204, %f201, %f1; bra.uni $L__BB0_18; $L__BB0_14: sub.f32 %f115, %f201, %f198; sub.f32 %f116, %f202, %f199; sub.f32 %f117, %f203, %f200; mul.f32 %f206, %f117, 0f3F000000; mul.f32 %f205, %f116, 0f3F000000; mul.f32 %f204, %f115, 0f3F000000; $L__BB0_18: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p18, %rs2, 0; add.s32 %r15, %r2, -1; @%p18 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: max.s32 %r72, %r15, 0; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r53, %r15, %r34; add.s32 %r54, %r53, %r34; rem.s32 %r72, %r54, %r34; $L__BB0_21: setp.lt.s32 %p20, %r2, 1; mov.f32 %f207, 0f00000000; and.pred %p21, %p20, %p18; mov.f32 %f208, %f207; mov.f32 %f209, %f207; @%p21 bra $L__BB0_23; add.s32 %r55, %r72, %r4; mad.lo.s32 %r56, %r55, %r33, %r1; mul.wide.s32 %rd28, %r56, 4; add.s64 %rd29, %rd3, %rd28; add.s64 %rd30, %rd2, %rd28; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f209, [%rd31]; ld.global.nc.f32 %f208, [%rd30]; ld.global.nc.f32 %f207, [%rd29]; $L__BB0_23: add.s32 %r19, %r2, 1; @%p18 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: add.s32 %r59, %r34, -1; min.s32 %r73, %r19, %r59; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r57, %r19, %r34; add.s32 %r58, %r57, %r34; rem.s32 %r73, %r58, %r34; $L__BB0_26: add.s32 %r23, %r73, %r4; setp.ge.s32 %p23, %r19, %r34; mov.f32 %f210, 0f00000000; and.pred %p25, %p23, %p18; mov.f32 %f211, %f210; mov.f32 %f212, %f210; @%p25 bra $L__BB0_28; mad.lo.s32 %r60, %r23, %r33, %r1; mul.wide.s32 %rd32, %r60, 4; 
add.s64 %rd33, %rd3, %rd32; add.s64 %rd34, %rd2, %rd32; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f212, [%rd35]; ld.global.nc.f32 %f211, [%rd34]; ld.global.nc.f32 %f210, [%rd33]; $L__BB0_28: mul.f32 %f127, %f211, %f211; fma.rn.f32 %f128, %f210, %f210, %f127; fma.rn.f32 %f129, %f212, %f212, %f128; setp.eq.f32 %p26, %f129, 0f00000000; mul.f32 %f130, %f208, %f208; fma.rn.f32 %f131, %f207, %f207, %f130; fma.rn.f32 %f53, %f209, %f209, %f131; @%p26 bra $L__BB0_32; bra.uni $L__BB0_29; $L__BB0_32: setp.eq.f32 %p28, %f53, 0f00000000; mov.f32 %f213, 0f00000000; mov.f32 %f214, %f213; mov.f32 %f215, %f213; @%p28 bra $L__BB0_34; sub.f32 %f215, %f3, %f209; sub.f32 %f214, %f2, %f208; sub.f32 %f213, %f1, %f207; bra.uni $L__BB0_34; $L__BB0_29: setp.eq.f32 %p27, %f53, 0f00000000; @%p27 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f215, %f212, %f3; sub.f32 %f214, %f211, %f2; sub.f32 %f213, %f210, %f1; bra.uni $L__BB0_34; $L__BB0_30: sub.f32 %f132, %f210, %f207; sub.f32 %f133, %f211, %f208; sub.f32 %f134, %f212, %f209; mul.f32 %f215, %f134, 0f3F000000; mul.f32 %f214, %f133, 0f3F000000; mul.f32 %f213, %f132, 0f3F000000; $L__BB0_34: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r24, %r3, -1; @%p29 bra $L__BB0_36; bra.uni $L__BB0_35; $L__BB0_36: max.s32 %r74, %r24, 0; bra.uni $L__BB0_37; $L__BB0_35: rem.s32 %r61, %r24, %r35; add.s32 %r62, %r61, %r35; rem.s32 %r74, %r62, %r35; $L__BB0_37: setp.lt.s32 %p31, %r3, 1; mov.f32 %f216, 0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f217, %f216; mov.f32 %f218, %f216; @%p32 bra $L__BB0_39; mad.lo.s32 %r63, %r74, %r34, %r2; mad.lo.s32 %r64, %r63, %r33, %r1; mul.wide.s32 %rd36, %r64, 4; add.s64 %rd37, %rd3, %rd36; add.s64 %rd38, %rd2, %rd36; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f218, [%rd39]; ld.global.nc.f32 %f217, [%rd38]; ld.global.nc.f32 %f216, [%rd37]; $L__BB0_39: add.s32 %r28, %r3, 1; @%p29 bra $L__BB0_41; bra.uni $L__BB0_40; $L__BB0_41: add.s32 %r67, %r35, -1; min.s32 %r75, %r28, %r67; bra.uni 
$L__BB0_42; $L__BB0_40: rem.s32 %r65, %r28, %r35; add.s32 %r66, %r65, %r35; rem.s32 %r75, %r66, %r35; $L__BB0_42: mad.lo.s32 %r68, %r75, %r34, %r2; mad.lo.s32 %r32, %r68, %r33, %r1; setp.ge.s32 %p34, %r28, %r35; mov.f32 %f219, 0f00000000; and.pred %p36, %p34, %p29; mov.f32 %f220, %f219; mov.f32 %f221, %f219; @%p36 bra $L__BB0_44; mul.wide.s32 %rd40, %r32, 4; add.s64 %rd41, %rd3, %rd40; add.s64 %rd42, %rd2, %rd40; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f221, [%rd43]; ld.global.nc.f32 %f220, [%rd42]; ld.global.nc.f32 %f219, [%rd41]; $L__BB0_44: mul.f32 %f144, %f220, %f220; fma.rn.f32 %f145, %f219, %f219, %f144; fma.rn.f32 %f146, %f221, %f221, %f145; setp.eq.f32 %p37, %f146, 0f00000000; mul.f32 %f147, %f217, %f217; fma.rn.f32 %f148, %f216, %f216, %f147; fma.rn.f32 %f84, %f218, %f218, %f148; @%p37 bra $L__BB0_48; bra.uni $L__BB0_45; $L__BB0_48: setp.eq.f32 %p39, %f84, 0f00000000; mov.f32 %f222, 0f00000000; mov.f32 %f223, %f222; mov.f32 %f224, %f222; @%p39 bra $L__BB0_50; sub.f32 %f224, %f3, %f218; sub.f32 %f223, %f2, %f217; sub.f32 %f222, %f1, %f216; bra.uni $L__BB0_50; $L__BB0_45: setp.eq.f32 %p38, %f84, 0f00000000; @%p38 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: sub.f32 %f224, %f221, %f3; sub.f32 %f223, %f220, %f2; sub.f32 %f222, %f219, %f1; bra.uni $L__BB0_50; $L__BB0_46: sub.f32 %f149, %f219, %f216; sub.f32 %f150, %f220, %f217; sub.f32 %f151, %f221, %f218; mul.f32 %f224, %f151, 0f3F000000; mul.f32 %f223, %f150, 0f3F000000; mul.f32 %f222, %f149, 0f3F000000; $L__BB0_50: mul.f32 %f155, %f214, %f224; mul.f32 %f156, %f215, %f223; sub.f32 %f157, %f155, %f156; mul.f32 %f158, %f215, %f222; mul.f32 %f159, %f213, %f224; sub.f32 %f160, %f158, %f159; mul.f32 %f161, %f213, %f223; mul.f32 %f162, %f214, %f222; sub.f32 %f163, %f161, %f162; mul.f32 %f164, %f206, %f223; mul.f32 %f165, %f205, %f224; sub.f32 %f166, %f164, %f165; mul.f32 %f167, %f204, %f224; mul.f32 %f168, %f206, %f222; sub.f32 %f169, %f167, %f168; mul.f32 %f170, %f205, %f222; mul.f32 %f171, %f204, %f223; 
sub.f32 %f172, %f170, %f171; mul.f32 %f173, %f205, %f215; mul.f32 %f174, %f206, %f214; sub.f32 %f175, %f173, %f174; mul.f32 %f176, %f206, %f213; mul.f32 %f177, %f204, %f215; sub.f32 %f178, %f176, %f177; mul.f32 %f179, %f204, %f214; mul.f32 %f180, %f205, %f213; sub.f32 %f181, %f179, %f180; add.f32 %f182, %f97, %f97; mul.f32 %f183, %f182, %f98; mul.f32 %f184, %f2, %f160; fma.rn.f32 %f185, %f1, %f157, %f184; fma.rn.f32 %f186, %f3, %f163, %f185; mul.f32 %f187, %f183, %f186; st.global.f32 [%rd4], %f187; mul.f32 %f188, %f182, %f99; mul.f32 %f189, %f2, %f169; fma.rn.f32 %f190, %f1, %f166, %f189; fma.rn.f32 %f191, %f3, %f172, %f190; mul.f32 %f192, %f188, %f191; st.global.f32 [%rd5], %f192; mul.f32 %f193, %f182, %f100; mul.f32 %f194, %f2, %f178; fma.rn.f32 %f195, %f1, %f175, %f194; fma.rn.f32 %f196, %f3, %f181, %f195; mul.f32 %f197, %f193, %f196; st.global.f32 [%rd6], %f197; $L__BB0_52: ret; } ` ) 3-3.11.1/cuda/hopf-vectorpotential.cu000066400000000000000000000025131503346766200173750ustar00rootroot00000000000000#include #include "float3.h" #include "stencil.h" // Calculate the vector potential in the gauge that // A_x = ∫_-∞^y F_z dy' // A_y = 0 // A_z = -∫_-∞^y F_x dy' // We approximate these integrals using cumulative sums from the bottom of the system // (i.e. minimum value of y) up to the cell at which A is to be calculated // e.g. 
A_x = ∫_-∞^y F_z dy' ≈ Σ_{iy_ = 0}^{iy_ = iy-1} F_z(ix, iy_, iz) * cy extern "C" __global__ void setvectorpotential(float* __restrict__ Ax, float* __restrict__ Ay, float* __restrict__ Az, float* __restrict__ Fx, float* __restrict__ Fy, float* __restrict__ Fz, float cy, int Nx, int Ny, int Nz, uint8_t PBC) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix >= Nx || iy >= Ny || iz >= Nz) { return; } int I = idx(ix, iy, iz); // Index of cell of interest int i_; // Index of summand float3 a = make_float3(0.0f, 0.0f, 0.0f); for (int iy_ = 0; iy_ < iy; iy_++) { // Cumulative sum along y-axis up to cell of interest within system i_ = idx(ix, iy_, iz); a.x -= Fz[i_] * cy; a.z += Fx[i_] * cy; } Ax[I] = a.x; Ay[I] = a.y; Az[I] = a.z; } 3-3.11.1/cuda/hopf-vectorpotential_wrapper.go000066400000000000000000002022531503346766200211360ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for setvectorpotential kernel var setvectorpotential_code cu.Function // Stores the arguments for setvectorpotential kernel invocation type setvectorpotential_args_t struct { arg_Ax unsafe.Pointer arg_Ay unsafe.Pointer arg_Az unsafe.Pointer arg_Fx unsafe.Pointer arg_Fy unsafe.Pointer arg_Fz unsafe.Pointer arg_cy float32 arg_Nx int arg_Ny int arg_Nz int arg_PBC byte argptr [11]unsafe.Pointer sync.Mutex } // Stores the arguments for setvectorpotential kernel invocation var setvectorpotential_args setvectorpotential_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
setvectorpotential_args.argptr[0] = unsafe.Pointer(&setvectorpotential_args.arg_Ax) setvectorpotential_args.argptr[1] = unsafe.Pointer(&setvectorpotential_args.arg_Ay) setvectorpotential_args.argptr[2] = unsafe.Pointer(&setvectorpotential_args.arg_Az) setvectorpotential_args.argptr[3] = unsafe.Pointer(&setvectorpotential_args.arg_Fx) setvectorpotential_args.argptr[4] = unsafe.Pointer(&setvectorpotential_args.arg_Fy) setvectorpotential_args.argptr[5] = unsafe.Pointer(&setvectorpotential_args.arg_Fz) setvectorpotential_args.argptr[6] = unsafe.Pointer(&setvectorpotential_args.arg_cy) setvectorpotential_args.argptr[7] = unsafe.Pointer(&setvectorpotential_args.arg_Nx) setvectorpotential_args.argptr[8] = unsafe.Pointer(&setvectorpotential_args.arg_Ny) setvectorpotential_args.argptr[9] = unsafe.Pointer(&setvectorpotential_args.arg_Nz) setvectorpotential_args.argptr[10] = unsafe.Pointer(&setvectorpotential_args.arg_PBC) } // Wrapper for setvectorpotential CUDA kernel, asynchronous. func k_setvectorpotential_async(Ax unsafe.Pointer, Ay unsafe.Pointer, Az unsafe.Pointer, Fx unsafe.Pointer, Fy unsafe.Pointer, Fz unsafe.Pointer, cy float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("setvectorpotential") } setvectorpotential_args.Lock() defer setvectorpotential_args.Unlock() if setvectorpotential_code == 0 { setvectorpotential_code = fatbinLoad(setvectorpotential_map, "setvectorpotential") } setvectorpotential_args.arg_Ax = Ax setvectorpotential_args.arg_Ay = Ay setvectorpotential_args.arg_Az = Az setvectorpotential_args.arg_Fx = Fx setvectorpotential_args.arg_Fy = Fy setvectorpotential_args.arg_Fz = Fz setvectorpotential_args.arg_cy = cy setvectorpotential_args.arg_Nx = Nx setvectorpotential_args.arg_Ny = Ny setvectorpotential_args.arg_Nz = Nz setvectorpotential_args.arg_PBC = PBC args := setvectorpotential_args.argptr[:] cu.LaunchKernel(setvectorpotential_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, 
cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("setvectorpotential") } } // maps compute capability on PTX code for setvectorpotential kernel. var setvectorpotential_map = map[int]string{0: "", 50: setvectorpotential_ptx_50, 52: setvectorpotential_ptx_52, 53: setvectorpotential_ptx_53, 60: setvectorpotential_ptx_60, 61: setvectorpotential_ptx_61, 62: setvectorpotential_ptx_62, 70: setvectorpotential_ptx_70, 72: setvectorpotential_ptx_72, 75: setvectorpotential_ptx_75, 80: setvectorpotential_ptx_80, 86: setvectorpotential_ptx_86, 87: setvectorpotential_ptx_87, 89: setvectorpotential_ptx_89, 90: setvectorpotential_ptx_90} // setvectorpotential PTX code for various compute capabilities. const ( setvectorpotential_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl setvectorpotential .visible .entry setvectorpotential( .param .u64 setvectorpotential_param_0, .param .u64 setvectorpotential_param_1, .param .u64 setvectorpotential_param_2, .param .u64 setvectorpotential_param_3, .param .u64 setvectorpotential_param_4, .param .u64 setvectorpotential_param_5, .param .f32 setvectorpotential_param_6, .param .u32 setvectorpotential_param_7, .param .u32 setvectorpotential_param_8, .param .u32 setvectorpotential_param_9, .param .u8 setvectorpotential_param_10 ) { .reg .pred %p<11>; .reg .f32 %f<54>; .reg .b32 %r<48>; .reg .b64 %rd<43>; ld.param.u64 %rd18, [setvectorpotential_param_0]; ld.param.u64 %rd19, [setvectorpotential_param_1]; ld.param.u64 %rd20, [setvectorpotential_param_2]; ld.param.u64 %rd21, [setvectorpotential_param_3]; ld.param.u64 %rd22, [setvectorpotential_param_5]; ld.param.f32 %f15, [setvectorpotential_param_6]; ld.param.u32 %r20, [setvectorpotential_param_7]; ld.param.u32 %r21, [setvectorpotential_param_8]; ld.param.u32 %r22, [setvectorpotential_param_9]; cvta.to.global.u64 %rd1, %rd21; cvta.to.global.u64 %rd2, %rd22; mov.u32 %r23, %ntid.x; mov.u32 %r24, %ctaid.x; mul.lo.s32 %r1, %r24, %r23; mov.u32 %r2, %tid.x; add.s32 
%r3, %r1, %r2; mov.u32 %r25, %ntid.y; mov.u32 %r26, %ctaid.y; mov.u32 %r27, %tid.y; mad.lo.s32 %r4, %r26, %r25, %r27; mov.u32 %r28, %ntid.z; mov.u32 %r29, %ctaid.z; mov.u32 %r30, %tid.z; mad.lo.s32 %r5, %r29, %r28, %r30; setp.ge.s32 %p1, %r3, %r20; setp.ge.s32 %p2, %r4, %r21; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r5, %r22; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_9; mul.lo.s32 %r6, %r5, %r21; setp.lt.s32 %p6, %r4, 1; mov.f32 %f52, 0f00000000; mov.f32 %f53, %f52; @%p6 bra $L__BB0_8; add.s32 %r32, %r4, -1; and.b32 %r47, %r4, 3; setp.lt.u32 %p7, %r32, 3; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; mov.f32 %f52, %f53; @%p7 bra $L__BB0_5; sub.s32 %r45, %r4, %r47; add.s32 %r34, %r6, 1; mad.lo.s32 %r43, %r20, %r34, %r3; mul.lo.s32 %r35, %r21, %r20; mad.lo.s32 %r36, %r35, %r5, %r2; add.s32 %r37, %r36, %r1; mul.wide.s32 %rd23, %r37, 4; add.s64 %rd40, %rd1, %rd23; shl.b32 %r10, %r20, 2; mul.wide.s32 %rd4, %r10, 4; add.s64 %rd39, %rd2, %rd23; mul.wide.s32 %rd6, %r20, 4; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; $L__BB0_4: ld.global.nc.f32 %f23, [%rd39]; mul.f32 %f24, %f23, %f15; sub.f32 %f25, %f53, %f24; ld.global.nc.f32 %f26, [%rd40]; fma.rn.f32 %f27, %f26, %f15, %f52; mul.wide.s32 %rd24, %r43, 4; add.s64 %rd25, %rd2, %rd24; ld.global.nc.f32 %f28, [%rd25]; mul.f32 %f29, %f28, %f15; sub.f32 %f30, %f25, %f29; add.s64 %rd26, %rd1, %rd24; ld.global.nc.f32 %f31, [%rd26]; fma.rn.f32 %f32, %f31, %f15, %f27; add.s64 %rd27, %rd25, %rd6; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f34, %f33, %f15; sub.f32 %f35, %f30, %f34; add.s64 %rd28, %rd26, %rd6; ld.global.nc.f32 %f36, [%rd28]; fma.rn.f32 %f37, %f36, %f15, %f32; add.s64 %rd29, %rd27, %rd6; ld.global.nc.f32 %f38, [%rd29]; mul.f32 %f39, %f38, %f15; sub.f32 %f53, %f35, %f39; add.s64 %rd30, %rd28, %rd6; ld.global.nc.f32 %f40, [%rd30]; fma.rn.f32 %f52, %f40, %f15, %f37; add.s32 %r46, %r46, 4; add.s32 %r43, %r43, %r10; add.s64 %rd40, %rd40, %rd4; add.s64 %rd39, %rd39, %rd4; add.s32 %r45, %r45, -4; setp.ne.s32 %p8, %r45, 0; @%p8 bra 
$L__BB0_4; $L__BB0_5: setp.eq.s32 %p9, %r47, 0; @%p9 bra $L__BB0_8; add.s32 %r38, %r46, %r6; mad.lo.s32 %r39, %r20, %r38, %r3; mul.wide.s32 %rd31, %r39, 4; add.s64 %rd42, %rd1, %rd31; mul.wide.s32 %rd12, %r20, 4; add.s64 %rd41, %rd2, %rd31; $L__BB0_7: .pragma "nounroll"; ld.global.nc.f32 %f41, [%rd41]; mul.f32 %f42, %f41, %f15; sub.f32 %f53, %f53, %f42; ld.global.nc.f32 %f43, [%rd42]; fma.rn.f32 %f52, %f43, %f15, %f52; add.s64 %rd42, %rd42, %rd12; add.s64 %rd41, %rd41, %rd12; add.s32 %r47, %r47, -1; setp.ne.s32 %p10, %r47, 0; @%p10 bra $L__BB0_7; $L__BB0_8: add.s32 %r40, %r6, %r4; mad.lo.s32 %r41, %r40, %r20, %r3; cvta.to.global.u64 %rd32, %rd18; mul.wide.s32 %rd33, %r41, 4; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f53; cvta.to.global.u64 %rd35, %rd19; add.s64 %rd36, %rd35, %rd33; mov.u32 %r42, 0; st.global.u32 [%rd36], %r42; cvta.to.global.u64 %rd37, %rd20; add.s64 %rd38, %rd37, %rd33; st.global.f32 [%rd38], %f52; $L__BB0_9: ret; } ` setvectorpotential_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl setvectorpotential .visible .entry setvectorpotential( .param .u64 setvectorpotential_param_0, .param .u64 setvectorpotential_param_1, .param .u64 setvectorpotential_param_2, .param .u64 setvectorpotential_param_3, .param .u64 setvectorpotential_param_4, .param .u64 setvectorpotential_param_5, .param .f32 setvectorpotential_param_6, .param .u32 setvectorpotential_param_7, .param .u32 setvectorpotential_param_8, .param .u32 setvectorpotential_param_9, .param .u8 setvectorpotential_param_10 ) { .reg .pred %p<11>; .reg .f32 %f<54>; .reg .b32 %r<48>; .reg .b64 %rd<43>; ld.param.u64 %rd18, [setvectorpotential_param_0]; ld.param.u64 %rd19, [setvectorpotential_param_1]; ld.param.u64 %rd20, [setvectorpotential_param_2]; ld.param.u64 %rd21, [setvectorpotential_param_3]; ld.param.u64 %rd22, [setvectorpotential_param_5]; ld.param.f32 %f15, [setvectorpotential_param_6]; ld.param.u32 %r20, [setvectorpotential_param_7]; ld.param.u32 %r21, 
[setvectorpotential_param_8]; ld.param.u32 %r22, [setvectorpotential_param_9]; cvta.to.global.u64 %rd1, %rd21; cvta.to.global.u64 %rd2, %rd22; mov.u32 %r23, %ntid.x; mov.u32 %r24, %ctaid.x; mul.lo.s32 %r1, %r24, %r23; mov.u32 %r2, %tid.x; add.s32 %r3, %r1, %r2; mov.u32 %r25, %ntid.y; mov.u32 %r26, %ctaid.y; mov.u32 %r27, %tid.y; mad.lo.s32 %r4, %r26, %r25, %r27; mov.u32 %r28, %ntid.z; mov.u32 %r29, %ctaid.z; mov.u32 %r30, %tid.z; mad.lo.s32 %r5, %r29, %r28, %r30; setp.ge.s32 %p1, %r3, %r20; setp.ge.s32 %p2, %r4, %r21; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r5, %r22; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_9; mul.lo.s32 %r6, %r5, %r21; setp.lt.s32 %p6, %r4, 1; mov.f32 %f52, 0f00000000; mov.f32 %f53, %f52; @%p6 bra $L__BB0_8; add.s32 %r32, %r4, -1; and.b32 %r47, %r4, 3; setp.lt.u32 %p7, %r32, 3; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; mov.f32 %f52, %f53; @%p7 bra $L__BB0_5; sub.s32 %r45, %r4, %r47; add.s32 %r34, %r6, 1; mad.lo.s32 %r43, %r20, %r34, %r3; mul.lo.s32 %r35, %r21, %r20; mad.lo.s32 %r36, %r35, %r5, %r2; add.s32 %r37, %r36, %r1; mul.wide.s32 %rd23, %r37, 4; add.s64 %rd40, %rd1, %rd23; shl.b32 %r10, %r20, 2; mul.wide.s32 %rd4, %r10, 4; add.s64 %rd39, %rd2, %rd23; mul.wide.s32 %rd6, %r20, 4; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; $L__BB0_4: ld.global.nc.f32 %f23, [%rd39]; mul.f32 %f24, %f23, %f15; sub.f32 %f25, %f53, %f24; ld.global.nc.f32 %f26, [%rd40]; fma.rn.f32 %f27, %f26, %f15, %f52; mul.wide.s32 %rd24, %r43, 4; add.s64 %rd25, %rd2, %rd24; ld.global.nc.f32 %f28, [%rd25]; mul.f32 %f29, %f28, %f15; sub.f32 %f30, %f25, %f29; add.s64 %rd26, %rd1, %rd24; ld.global.nc.f32 %f31, [%rd26]; fma.rn.f32 %f32, %f31, %f15, %f27; add.s64 %rd27, %rd25, %rd6; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f34, %f33, %f15; sub.f32 %f35, %f30, %f34; add.s64 %rd28, %rd26, %rd6; ld.global.nc.f32 %f36, [%rd28]; fma.rn.f32 %f37, %f36, %f15, %f32; add.s64 %rd29, %rd27, %rd6; ld.global.nc.f32 %f38, [%rd29]; mul.f32 %f39, %f38, %f15; sub.f32 %f53, %f35, %f39; add.s64 %rd30, 
%rd28, %rd6; ld.global.nc.f32 %f40, [%rd30]; fma.rn.f32 %f52, %f40, %f15, %f37; add.s32 %r46, %r46, 4; add.s32 %r43, %r43, %r10; add.s64 %rd40, %rd40, %rd4; add.s64 %rd39, %rd39, %rd4; add.s32 %r45, %r45, -4; setp.ne.s32 %p8, %r45, 0; @%p8 bra $L__BB0_4; $L__BB0_5: setp.eq.s32 %p9, %r47, 0; @%p9 bra $L__BB0_8; add.s32 %r38, %r46, %r6; mad.lo.s32 %r39, %r20, %r38, %r3; mul.wide.s32 %rd31, %r39, 4; add.s64 %rd42, %rd1, %rd31; mul.wide.s32 %rd12, %r20, 4; add.s64 %rd41, %rd2, %rd31; $L__BB0_7: .pragma "nounroll"; ld.global.nc.f32 %f41, [%rd41]; mul.f32 %f42, %f41, %f15; sub.f32 %f53, %f53, %f42; ld.global.nc.f32 %f43, [%rd42]; fma.rn.f32 %f52, %f43, %f15, %f52; add.s64 %rd42, %rd42, %rd12; add.s64 %rd41, %rd41, %rd12; add.s32 %r47, %r47, -1; setp.ne.s32 %p10, %r47, 0; @%p10 bra $L__BB0_7; $L__BB0_8: add.s32 %r40, %r6, %r4; mad.lo.s32 %r41, %r40, %r20, %r3; cvta.to.global.u64 %rd32, %rd18; mul.wide.s32 %rd33, %r41, 4; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f53; cvta.to.global.u64 %rd35, %rd19; add.s64 %rd36, %rd35, %rd33; mov.u32 %r42, 0; st.global.u32 [%rd36], %r42; cvta.to.global.u64 %rd37, %rd20; add.s64 %rd38, %rd37, %rd33; st.global.f32 [%rd38], %f52; $L__BB0_9: ret; } ` setvectorpotential_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl setvectorpotential .visible .entry setvectorpotential( .param .u64 setvectorpotential_param_0, .param .u64 setvectorpotential_param_1, .param .u64 setvectorpotential_param_2, .param .u64 setvectorpotential_param_3, .param .u64 setvectorpotential_param_4, .param .u64 setvectorpotential_param_5, .param .f32 setvectorpotential_param_6, .param .u32 setvectorpotential_param_7, .param .u32 setvectorpotential_param_8, .param .u32 setvectorpotential_param_9, .param .u8 setvectorpotential_param_10 ) { .reg .pred %p<11>; .reg .f32 %f<54>; .reg .b32 %r<48>; .reg .b64 %rd<43>; ld.param.u64 %rd18, [setvectorpotential_param_0]; ld.param.u64 %rd19, [setvectorpotential_param_1]; ld.param.u64 %rd20, 
[setvectorpotential_param_2]; ld.param.u64 %rd21, [setvectorpotential_param_3]; ld.param.u64 %rd22, [setvectorpotential_param_5]; ld.param.f32 %f15, [setvectorpotential_param_6]; ld.param.u32 %r20, [setvectorpotential_param_7]; ld.param.u32 %r21, [setvectorpotential_param_8]; ld.param.u32 %r22, [setvectorpotential_param_9]; cvta.to.global.u64 %rd1, %rd21; cvta.to.global.u64 %rd2, %rd22; mov.u32 %r23, %ntid.x; mov.u32 %r24, %ctaid.x; mul.lo.s32 %r1, %r24, %r23; mov.u32 %r2, %tid.x; add.s32 %r3, %r1, %r2; mov.u32 %r25, %ntid.y; mov.u32 %r26, %ctaid.y; mov.u32 %r27, %tid.y; mad.lo.s32 %r4, %r26, %r25, %r27; mov.u32 %r28, %ntid.z; mov.u32 %r29, %ctaid.z; mov.u32 %r30, %tid.z; mad.lo.s32 %r5, %r29, %r28, %r30; setp.ge.s32 %p1, %r3, %r20; setp.ge.s32 %p2, %r4, %r21; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r5, %r22; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_9; mul.lo.s32 %r6, %r5, %r21; setp.lt.s32 %p6, %r4, 1; mov.f32 %f52, 0f00000000; mov.f32 %f53, %f52; @%p6 bra $L__BB0_8; add.s32 %r32, %r4, -1; and.b32 %r47, %r4, 3; setp.lt.u32 %p7, %r32, 3; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; mov.f32 %f52, %f53; @%p7 bra $L__BB0_5; sub.s32 %r45, %r4, %r47; add.s32 %r34, %r6, 1; mad.lo.s32 %r43, %r20, %r34, %r3; mul.lo.s32 %r35, %r21, %r20; mad.lo.s32 %r36, %r35, %r5, %r2; add.s32 %r37, %r36, %r1; mul.wide.s32 %rd23, %r37, 4; add.s64 %rd40, %rd1, %rd23; shl.b32 %r10, %r20, 2; mul.wide.s32 %rd4, %r10, 4; add.s64 %rd39, %rd2, %rd23; mul.wide.s32 %rd6, %r20, 4; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; $L__BB0_4: ld.global.nc.f32 %f23, [%rd39]; mul.f32 %f24, %f23, %f15; sub.f32 %f25, %f53, %f24; ld.global.nc.f32 %f26, [%rd40]; fma.rn.f32 %f27, %f26, %f15, %f52; mul.wide.s32 %rd24, %r43, 4; add.s64 %rd25, %rd2, %rd24; ld.global.nc.f32 %f28, [%rd25]; mul.f32 %f29, %f28, %f15; sub.f32 %f30, %f25, %f29; add.s64 %rd26, %rd1, %rd24; ld.global.nc.f32 %f31, [%rd26]; fma.rn.f32 %f32, %f31, %f15, %f27; add.s64 %rd27, %rd25, %rd6; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f34, %f33, %f15; 
sub.f32 %f35, %f30, %f34; add.s64 %rd28, %rd26, %rd6; ld.global.nc.f32 %f36, [%rd28]; fma.rn.f32 %f37, %f36, %f15, %f32; add.s64 %rd29, %rd27, %rd6; ld.global.nc.f32 %f38, [%rd29]; mul.f32 %f39, %f38, %f15; sub.f32 %f53, %f35, %f39; add.s64 %rd30, %rd28, %rd6; ld.global.nc.f32 %f40, [%rd30]; fma.rn.f32 %f52, %f40, %f15, %f37; add.s32 %r46, %r46, 4; add.s32 %r43, %r43, %r10; add.s64 %rd40, %rd40, %rd4; add.s64 %rd39, %rd39, %rd4; add.s32 %r45, %r45, -4; setp.ne.s32 %p8, %r45, 0; @%p8 bra $L__BB0_4; $L__BB0_5: setp.eq.s32 %p9, %r47, 0; @%p9 bra $L__BB0_8; add.s32 %r38, %r46, %r6; mad.lo.s32 %r39, %r20, %r38, %r3; mul.wide.s32 %rd31, %r39, 4; add.s64 %rd42, %rd1, %rd31; mul.wide.s32 %rd12, %r20, 4; add.s64 %rd41, %rd2, %rd31; $L__BB0_7: .pragma "nounroll"; ld.global.nc.f32 %f41, [%rd41]; mul.f32 %f42, %f41, %f15; sub.f32 %f53, %f53, %f42; ld.global.nc.f32 %f43, [%rd42]; fma.rn.f32 %f52, %f43, %f15, %f52; add.s64 %rd42, %rd42, %rd12; add.s64 %rd41, %rd41, %rd12; add.s32 %r47, %r47, -1; setp.ne.s32 %p10, %r47, 0; @%p10 bra $L__BB0_7; $L__BB0_8: add.s32 %r40, %r6, %r4; mad.lo.s32 %r41, %r40, %r20, %r3; cvta.to.global.u64 %rd32, %rd18; mul.wide.s32 %rd33, %r41, 4; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f53; cvta.to.global.u64 %rd35, %rd19; add.s64 %rd36, %rd35, %rd33; mov.u32 %r42, 0; st.global.u32 [%rd36], %r42; cvta.to.global.u64 %rd37, %rd20; add.s64 %rd38, %rd37, %rd33; st.global.f32 [%rd38], %f52; $L__BB0_9: ret; } ` setvectorpotential_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl setvectorpotential .visible .entry setvectorpotential( .param .u64 setvectorpotential_param_0, .param .u64 setvectorpotential_param_1, .param .u64 setvectorpotential_param_2, .param .u64 setvectorpotential_param_3, .param .u64 setvectorpotential_param_4, .param .u64 setvectorpotential_param_5, .param .f32 setvectorpotential_param_6, .param .u32 setvectorpotential_param_7, .param .u32 setvectorpotential_param_8, .param .u32 setvectorpotential_param_9, .param 
.u8 setvectorpotential_param_10 ) { .reg .pred %p<11>; .reg .f32 %f<54>; .reg .b32 %r<48>; .reg .b64 %rd<43>; ld.param.u64 %rd18, [setvectorpotential_param_0]; ld.param.u64 %rd19, [setvectorpotential_param_1]; ld.param.u64 %rd20, [setvectorpotential_param_2]; ld.param.u64 %rd21, [setvectorpotential_param_3]; ld.param.u64 %rd22, [setvectorpotential_param_5]; ld.param.f32 %f15, [setvectorpotential_param_6]; ld.param.u32 %r20, [setvectorpotential_param_7]; ld.param.u32 %r21, [setvectorpotential_param_8]; ld.param.u32 %r22, [setvectorpotential_param_9]; cvta.to.global.u64 %rd1, %rd21; cvta.to.global.u64 %rd2, %rd22; mov.u32 %r23, %ntid.x; mov.u32 %r24, %ctaid.x; mul.lo.s32 %r1, %r24, %r23; mov.u32 %r2, %tid.x; add.s32 %r3, %r1, %r2; mov.u32 %r25, %ntid.y; mov.u32 %r26, %ctaid.y; mov.u32 %r27, %tid.y; mad.lo.s32 %r4, %r26, %r25, %r27; mov.u32 %r28, %ntid.z; mov.u32 %r29, %ctaid.z; mov.u32 %r30, %tid.z; mad.lo.s32 %r5, %r29, %r28, %r30; setp.ge.s32 %p1, %r3, %r20; setp.ge.s32 %p2, %r4, %r21; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r5, %r22; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_9; mul.lo.s32 %r6, %r5, %r21; setp.lt.s32 %p6, %r4, 1; mov.f32 %f52, 0f00000000; mov.f32 %f53, %f52; @%p6 bra $L__BB0_8; add.s32 %r32, %r4, -1; and.b32 %r47, %r4, 3; setp.lt.u32 %p7, %r32, 3; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; mov.f32 %f52, %f53; @%p7 bra $L__BB0_5; sub.s32 %r45, %r4, %r47; add.s32 %r34, %r6, 1; mad.lo.s32 %r43, %r20, %r34, %r3; mul.lo.s32 %r35, %r21, %r20; mad.lo.s32 %r36, %r35, %r5, %r2; add.s32 %r37, %r36, %r1; mul.wide.s32 %rd23, %r37, 4; add.s64 %rd40, %rd1, %rd23; shl.b32 %r10, %r20, 2; mul.wide.s32 %rd4, %r10, 4; add.s64 %rd39, %rd2, %rd23; mul.wide.s32 %rd6, %r20, 4; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; $L__BB0_4: ld.global.nc.f32 %f23, [%rd39]; mul.f32 %f24, %f23, %f15; sub.f32 %f25, %f53, %f24; ld.global.nc.f32 %f26, [%rd40]; fma.rn.f32 %f27, %f26, %f15, %f52; mul.wide.s32 %rd24, %r43, 4; add.s64 %rd25, %rd2, %rd24; ld.global.nc.f32 %f28, [%rd25]; 
mul.f32 %f29, %f28, %f15; sub.f32 %f30, %f25, %f29; add.s64 %rd26, %rd1, %rd24; ld.global.nc.f32 %f31, [%rd26]; fma.rn.f32 %f32, %f31, %f15, %f27; add.s64 %rd27, %rd25, %rd6; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f34, %f33, %f15; sub.f32 %f35, %f30, %f34; add.s64 %rd28, %rd26, %rd6; ld.global.nc.f32 %f36, [%rd28]; fma.rn.f32 %f37, %f36, %f15, %f32; add.s64 %rd29, %rd27, %rd6; ld.global.nc.f32 %f38, [%rd29]; mul.f32 %f39, %f38, %f15; sub.f32 %f53, %f35, %f39; add.s64 %rd30, %rd28, %rd6; ld.global.nc.f32 %f40, [%rd30]; fma.rn.f32 %f52, %f40, %f15, %f37; add.s32 %r46, %r46, 4; add.s32 %r43, %r43, %r10; add.s64 %rd40, %rd40, %rd4; add.s64 %rd39, %rd39, %rd4; add.s32 %r45, %r45, -4; setp.ne.s32 %p8, %r45, 0; @%p8 bra $L__BB0_4; $L__BB0_5: setp.eq.s32 %p9, %r47, 0; @%p9 bra $L__BB0_8; add.s32 %r38, %r46, %r6; mad.lo.s32 %r39, %r20, %r38, %r3; mul.wide.s32 %rd31, %r39, 4; add.s64 %rd42, %rd1, %rd31; mul.wide.s32 %rd12, %r20, 4; add.s64 %rd41, %rd2, %rd31; $L__BB0_7: .pragma "nounroll"; ld.global.nc.f32 %f41, [%rd41]; mul.f32 %f42, %f41, %f15; sub.f32 %f53, %f53, %f42; ld.global.nc.f32 %f43, [%rd42]; fma.rn.f32 %f52, %f43, %f15, %f52; add.s64 %rd42, %rd42, %rd12; add.s64 %rd41, %rd41, %rd12; add.s32 %r47, %r47, -1; setp.ne.s32 %p10, %r47, 0; @%p10 bra $L__BB0_7; $L__BB0_8: add.s32 %r40, %r6, %r4; mad.lo.s32 %r41, %r40, %r20, %r3; cvta.to.global.u64 %rd32, %rd18; mul.wide.s32 %rd33, %r41, 4; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f53; cvta.to.global.u64 %rd35, %rd19; add.s64 %rd36, %rd35, %rd33; mov.u32 %r42, 0; st.global.u32 [%rd36], %r42; cvta.to.global.u64 %rd37, %rd20; add.s64 %rd38, %rd37, %rd33; st.global.f32 [%rd38], %f52; $L__BB0_9: ret; } ` setvectorpotential_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl setvectorpotential .visible .entry setvectorpotential( .param .u64 setvectorpotential_param_0, .param .u64 setvectorpotential_param_1, .param .u64 setvectorpotential_param_2, .param .u64 setvectorpotential_param_3, .param .u64 
setvectorpotential_param_4, .param .u64 setvectorpotential_param_5, .param .f32 setvectorpotential_param_6, .param .u32 setvectorpotential_param_7, .param .u32 setvectorpotential_param_8, .param .u32 setvectorpotential_param_9, .param .u8 setvectorpotential_param_10 ) { .reg .pred %p<11>; .reg .f32 %f<54>; .reg .b32 %r<48>; .reg .b64 %rd<43>; ld.param.u64 %rd18, [setvectorpotential_param_0]; ld.param.u64 %rd19, [setvectorpotential_param_1]; ld.param.u64 %rd20, [setvectorpotential_param_2]; ld.param.u64 %rd21, [setvectorpotential_param_3]; ld.param.u64 %rd22, [setvectorpotential_param_5]; ld.param.f32 %f15, [setvectorpotential_param_6]; ld.param.u32 %r20, [setvectorpotential_param_7]; ld.param.u32 %r21, [setvectorpotential_param_8]; ld.param.u32 %r22, [setvectorpotential_param_9]; cvta.to.global.u64 %rd1, %rd21; cvta.to.global.u64 %rd2, %rd22; mov.u32 %r23, %ntid.x; mov.u32 %r24, %ctaid.x; mul.lo.s32 %r1, %r24, %r23; mov.u32 %r2, %tid.x; add.s32 %r3, %r1, %r2; mov.u32 %r25, %ntid.y; mov.u32 %r26, %ctaid.y; mov.u32 %r27, %tid.y; mad.lo.s32 %r4, %r26, %r25, %r27; mov.u32 %r28, %ntid.z; mov.u32 %r29, %ctaid.z; mov.u32 %r30, %tid.z; mad.lo.s32 %r5, %r29, %r28, %r30; setp.ge.s32 %p1, %r3, %r20; setp.ge.s32 %p2, %r4, %r21; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r5, %r22; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_9; mul.lo.s32 %r6, %r5, %r21; setp.lt.s32 %p6, %r4, 1; mov.f32 %f52, 0f00000000; mov.f32 %f53, %f52; @%p6 bra $L__BB0_8; add.s32 %r32, %r4, -1; and.b32 %r47, %r4, 3; setp.lt.u32 %p7, %r32, 3; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; mov.f32 %f52, %f53; @%p7 bra $L__BB0_5; sub.s32 %r45, %r4, %r47; add.s32 %r34, %r6, 1; mad.lo.s32 %r43, %r20, %r34, %r3; mul.lo.s32 %r35, %r21, %r20; mad.lo.s32 %r36, %r35, %r5, %r2; add.s32 %r37, %r36, %r1; mul.wide.s32 %rd23, %r37, 4; add.s64 %rd40, %rd1, %rd23; shl.b32 %r10, %r20, 2; mul.wide.s32 %rd4, %r10, 4; add.s64 %rd39, %rd2, %rd23; mul.wide.s32 %rd6, %r20, 4; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; $L__BB0_4: 
ld.global.nc.f32 %f23, [%rd39]; mul.f32 %f24, %f23, %f15; sub.f32 %f25, %f53, %f24; ld.global.nc.f32 %f26, [%rd40]; fma.rn.f32 %f27, %f26, %f15, %f52; mul.wide.s32 %rd24, %r43, 4; add.s64 %rd25, %rd2, %rd24; ld.global.nc.f32 %f28, [%rd25]; mul.f32 %f29, %f28, %f15; sub.f32 %f30, %f25, %f29; add.s64 %rd26, %rd1, %rd24; ld.global.nc.f32 %f31, [%rd26]; fma.rn.f32 %f32, %f31, %f15, %f27; add.s64 %rd27, %rd25, %rd6; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f34, %f33, %f15; sub.f32 %f35, %f30, %f34; add.s64 %rd28, %rd26, %rd6; ld.global.nc.f32 %f36, [%rd28]; fma.rn.f32 %f37, %f36, %f15, %f32; add.s64 %rd29, %rd27, %rd6; ld.global.nc.f32 %f38, [%rd29]; mul.f32 %f39, %f38, %f15; sub.f32 %f53, %f35, %f39; add.s64 %rd30, %rd28, %rd6; ld.global.nc.f32 %f40, [%rd30]; fma.rn.f32 %f52, %f40, %f15, %f37; add.s32 %r46, %r46, 4; add.s32 %r43, %r43, %r10; add.s64 %rd40, %rd40, %rd4; add.s64 %rd39, %rd39, %rd4; add.s32 %r45, %r45, -4; setp.ne.s32 %p8, %r45, 0; @%p8 bra $L__BB0_4; $L__BB0_5: setp.eq.s32 %p9, %r47, 0; @%p9 bra $L__BB0_8; add.s32 %r38, %r46, %r6; mad.lo.s32 %r39, %r20, %r38, %r3; mul.wide.s32 %rd31, %r39, 4; add.s64 %rd42, %rd1, %rd31; mul.wide.s32 %rd12, %r20, 4; add.s64 %rd41, %rd2, %rd31; $L__BB0_7: .pragma "nounroll"; ld.global.nc.f32 %f41, [%rd41]; mul.f32 %f42, %f41, %f15; sub.f32 %f53, %f53, %f42; ld.global.nc.f32 %f43, [%rd42]; fma.rn.f32 %f52, %f43, %f15, %f52; add.s64 %rd42, %rd42, %rd12; add.s64 %rd41, %rd41, %rd12; add.s32 %r47, %r47, -1; setp.ne.s32 %p10, %r47, 0; @%p10 bra $L__BB0_7; $L__BB0_8: add.s32 %r40, %r6, %r4; mad.lo.s32 %r41, %r40, %r20, %r3; cvta.to.global.u64 %rd32, %rd18; mul.wide.s32 %rd33, %r41, 4; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f53; cvta.to.global.u64 %rd35, %rd19; add.s64 %rd36, %rd35, %rd33; mov.u32 %r42, 0; st.global.u32 [%rd36], %r42; cvta.to.global.u64 %rd37, %rd20; add.s64 %rd38, %rd37, %rd33; st.global.f32 [%rd38], %f52; $L__BB0_9: ret; } ` setvectorpotential_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 
// .globl setvectorpotential .visible .entry setvectorpotential( .param .u64 setvectorpotential_param_0, .param .u64 setvectorpotential_param_1, .param .u64 setvectorpotential_param_2, .param .u64 setvectorpotential_param_3, .param .u64 setvectorpotential_param_4, .param .u64 setvectorpotential_param_5, .param .f32 setvectorpotential_param_6, .param .u32 setvectorpotential_param_7, .param .u32 setvectorpotential_param_8, .param .u32 setvectorpotential_param_9, .param .u8 setvectorpotential_param_10 ) { .reg .pred %p<11>; .reg .f32 %f<54>; .reg .b32 %r<48>; .reg .b64 %rd<43>; ld.param.u64 %rd18, [setvectorpotential_param_0]; ld.param.u64 %rd19, [setvectorpotential_param_1]; ld.param.u64 %rd20, [setvectorpotential_param_2]; ld.param.u64 %rd21, [setvectorpotential_param_3]; ld.param.u64 %rd22, [setvectorpotential_param_5]; ld.param.f32 %f15, [setvectorpotential_param_6]; ld.param.u32 %r20, [setvectorpotential_param_7]; ld.param.u32 %r21, [setvectorpotential_param_8]; ld.param.u32 %r22, [setvectorpotential_param_9]; cvta.to.global.u64 %rd1, %rd21; cvta.to.global.u64 %rd2, %rd22; mov.u32 %r23, %ntid.x; mov.u32 %r24, %ctaid.x; mul.lo.s32 %r1, %r24, %r23; mov.u32 %r2, %tid.x; add.s32 %r3, %r1, %r2; mov.u32 %r25, %ntid.y; mov.u32 %r26, %ctaid.y; mov.u32 %r27, %tid.y; mad.lo.s32 %r4, %r26, %r25, %r27; mov.u32 %r28, %ntid.z; mov.u32 %r29, %ctaid.z; mov.u32 %r30, %tid.z; mad.lo.s32 %r5, %r29, %r28, %r30; setp.ge.s32 %p1, %r3, %r20; setp.ge.s32 %p2, %r4, %r21; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r5, %r22; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_9; mul.lo.s32 %r6, %r5, %r21; setp.lt.s32 %p6, %r4, 1; mov.f32 %f52, 0f00000000; mov.f32 %f53, %f52; @%p6 bra $L__BB0_8; add.s32 %r32, %r4, -1; and.b32 %r47, %r4, 3; setp.lt.u32 %p7, %r32, 3; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; mov.f32 %f52, %f53; @%p7 bra $L__BB0_5; sub.s32 %r45, %r4, %r47; add.s32 %r34, %r6, 1; mad.lo.s32 %r43, %r20, %r34, %r3; mul.lo.s32 %r35, %r21, %r20; mad.lo.s32 %r36, %r35, %r5, %r2; add.s32 %r37, 
%r36, %r1; mul.wide.s32 %rd23, %r37, 4; add.s64 %rd40, %rd1, %rd23; shl.b32 %r10, %r20, 2; mul.wide.s32 %rd4, %r10, 4; add.s64 %rd39, %rd2, %rd23; mul.wide.s32 %rd6, %r20, 4; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; $L__BB0_4: ld.global.nc.f32 %f23, [%rd39]; mul.f32 %f24, %f23, %f15; sub.f32 %f25, %f53, %f24; ld.global.nc.f32 %f26, [%rd40]; fma.rn.f32 %f27, %f26, %f15, %f52; mul.wide.s32 %rd24, %r43, 4; add.s64 %rd25, %rd2, %rd24; ld.global.nc.f32 %f28, [%rd25]; mul.f32 %f29, %f28, %f15; sub.f32 %f30, %f25, %f29; add.s64 %rd26, %rd1, %rd24; ld.global.nc.f32 %f31, [%rd26]; fma.rn.f32 %f32, %f31, %f15, %f27; add.s64 %rd27, %rd25, %rd6; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f34, %f33, %f15; sub.f32 %f35, %f30, %f34; add.s64 %rd28, %rd26, %rd6; ld.global.nc.f32 %f36, [%rd28]; fma.rn.f32 %f37, %f36, %f15, %f32; add.s64 %rd29, %rd27, %rd6; ld.global.nc.f32 %f38, [%rd29]; mul.f32 %f39, %f38, %f15; sub.f32 %f53, %f35, %f39; add.s64 %rd30, %rd28, %rd6; ld.global.nc.f32 %f40, [%rd30]; fma.rn.f32 %f52, %f40, %f15, %f37; add.s32 %r46, %r46, 4; add.s32 %r43, %r43, %r10; add.s64 %rd40, %rd40, %rd4; add.s64 %rd39, %rd39, %rd4; add.s32 %r45, %r45, -4; setp.ne.s32 %p8, %r45, 0; @%p8 bra $L__BB0_4; $L__BB0_5: setp.eq.s32 %p9, %r47, 0; @%p9 bra $L__BB0_8; add.s32 %r38, %r46, %r6; mad.lo.s32 %r39, %r20, %r38, %r3; mul.wide.s32 %rd31, %r39, 4; add.s64 %rd42, %rd1, %rd31; mul.wide.s32 %rd12, %r20, 4; add.s64 %rd41, %rd2, %rd31; $L__BB0_7: .pragma "nounroll"; ld.global.nc.f32 %f41, [%rd41]; mul.f32 %f42, %f41, %f15; sub.f32 %f53, %f53, %f42; ld.global.nc.f32 %f43, [%rd42]; fma.rn.f32 %f52, %f43, %f15, %f52; add.s64 %rd42, %rd42, %rd12; add.s64 %rd41, %rd41, %rd12; add.s32 %r47, %r47, -1; setp.ne.s32 %p10, %r47, 0; @%p10 bra $L__BB0_7; $L__BB0_8: add.s32 %r40, %r6, %r4; mad.lo.s32 %r41, %r40, %r20, %r3; cvta.to.global.u64 %rd32, %rd18; mul.wide.s32 %rd33, %r41, 4; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f53; cvta.to.global.u64 %rd35, %rd19; add.s64 %rd36, %rd35, %rd33; 
mov.u32 %r42, 0; st.global.u32 [%rd36], %r42; cvta.to.global.u64 %rd37, %rd20; add.s64 %rd38, %rd37, %rd33; st.global.f32 [%rd38], %f52; $L__BB0_9: ret; } ` setvectorpotential_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl setvectorpotential .visible .entry setvectorpotential( .param .u64 setvectorpotential_param_0, .param .u64 setvectorpotential_param_1, .param .u64 setvectorpotential_param_2, .param .u64 setvectorpotential_param_3, .param .u64 setvectorpotential_param_4, .param .u64 setvectorpotential_param_5, .param .f32 setvectorpotential_param_6, .param .u32 setvectorpotential_param_7, .param .u32 setvectorpotential_param_8, .param .u32 setvectorpotential_param_9, .param .u8 setvectorpotential_param_10 ) { .reg .pred %p<11>; .reg .f32 %f<54>; .reg .b32 %r<48>; .reg .b64 %rd<43>; ld.param.u64 %rd18, [setvectorpotential_param_0]; ld.param.u64 %rd19, [setvectorpotential_param_1]; ld.param.u64 %rd20, [setvectorpotential_param_2]; ld.param.u64 %rd21, [setvectorpotential_param_3]; ld.param.u64 %rd22, [setvectorpotential_param_5]; ld.param.f32 %f15, [setvectorpotential_param_6]; ld.param.u32 %r20, [setvectorpotential_param_7]; ld.param.u32 %r21, [setvectorpotential_param_8]; ld.param.u32 %r22, [setvectorpotential_param_9]; cvta.to.global.u64 %rd1, %rd21; cvta.to.global.u64 %rd2, %rd22; mov.u32 %r23, %ntid.x; mov.u32 %r24, %ctaid.x; mul.lo.s32 %r1, %r24, %r23; mov.u32 %r2, %tid.x; add.s32 %r3, %r1, %r2; mov.u32 %r25, %ntid.y; mov.u32 %r26, %ctaid.y; mov.u32 %r27, %tid.y; mad.lo.s32 %r4, %r26, %r25, %r27; mov.u32 %r28, %ntid.z; mov.u32 %r29, %ctaid.z; mov.u32 %r30, %tid.z; mad.lo.s32 %r5, %r29, %r28, %r30; setp.ge.s32 %p1, %r3, %r20; setp.ge.s32 %p2, %r4, %r21; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r5, %r22; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_9; mul.lo.s32 %r6, %r5, %r21; setp.lt.s32 %p6, %r4, 1; mov.f32 %f52, 0f00000000; mov.f32 %f53, %f52; @%p6 bra $L__BB0_8; add.s32 %r32, %r4, -1; and.b32 %r47, %r4, 3; setp.lt.u32 %p7, %r32, 3; mov.f32 
%f53, 0f00000000; mov.u32 %r46, 0; mov.f32 %f52, %f53; @%p7 bra $L__BB0_5; sub.s32 %r45, %r4, %r47; add.s32 %r34, %r6, 1; mad.lo.s32 %r43, %r20, %r34, %r3; mul.lo.s32 %r35, %r21, %r20; mad.lo.s32 %r36, %r35, %r5, %r2; add.s32 %r37, %r36, %r1; mul.wide.s32 %rd23, %r37, 4; add.s64 %rd40, %rd1, %rd23; shl.b32 %r10, %r20, 2; mul.wide.s32 %rd4, %r10, 4; add.s64 %rd39, %rd2, %rd23; mul.wide.s32 %rd6, %r20, 4; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; $L__BB0_4: ld.global.nc.f32 %f23, [%rd39]; mul.f32 %f24, %f23, %f15; sub.f32 %f25, %f53, %f24; ld.global.nc.f32 %f26, [%rd40]; fma.rn.f32 %f27, %f26, %f15, %f52; mul.wide.s32 %rd24, %r43, 4; add.s64 %rd25, %rd2, %rd24; ld.global.nc.f32 %f28, [%rd25]; mul.f32 %f29, %f28, %f15; sub.f32 %f30, %f25, %f29; add.s64 %rd26, %rd1, %rd24; ld.global.nc.f32 %f31, [%rd26]; fma.rn.f32 %f32, %f31, %f15, %f27; add.s64 %rd27, %rd25, %rd6; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f34, %f33, %f15; sub.f32 %f35, %f30, %f34; add.s64 %rd28, %rd26, %rd6; ld.global.nc.f32 %f36, [%rd28]; fma.rn.f32 %f37, %f36, %f15, %f32; add.s64 %rd29, %rd27, %rd6; ld.global.nc.f32 %f38, [%rd29]; mul.f32 %f39, %f38, %f15; sub.f32 %f53, %f35, %f39; add.s64 %rd30, %rd28, %rd6; ld.global.nc.f32 %f40, [%rd30]; fma.rn.f32 %f52, %f40, %f15, %f37; add.s32 %r46, %r46, 4; add.s32 %r43, %r43, %r10; add.s64 %rd40, %rd40, %rd4; add.s64 %rd39, %rd39, %rd4; add.s32 %r45, %r45, -4; setp.ne.s32 %p8, %r45, 0; @%p8 bra $L__BB0_4; $L__BB0_5: setp.eq.s32 %p9, %r47, 0; @%p9 bra $L__BB0_8; add.s32 %r38, %r46, %r6; mad.lo.s32 %r39, %r20, %r38, %r3; mul.wide.s32 %rd31, %r39, 4; add.s64 %rd42, %rd1, %rd31; mul.wide.s32 %rd12, %r20, 4; add.s64 %rd41, %rd2, %rd31; $L__BB0_7: .pragma "nounroll"; ld.global.nc.f32 %f41, [%rd41]; mul.f32 %f42, %f41, %f15; sub.f32 %f53, %f53, %f42; ld.global.nc.f32 %f43, [%rd42]; fma.rn.f32 %f52, %f43, %f15, %f52; add.s64 %rd42, %rd42, %rd12; add.s64 %rd41, %rd41, %rd12; add.s32 %r47, %r47, -1; setp.ne.s32 %p10, %r47, 0; @%p10 bra $L__BB0_7; $L__BB0_8: add.s32 
%r40, %r6, %r4; mad.lo.s32 %r41, %r40, %r20, %r3; cvta.to.global.u64 %rd32, %rd18; mul.wide.s32 %rd33, %r41, 4; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f53; cvta.to.global.u64 %rd35, %rd19; add.s64 %rd36, %rd35, %rd33; mov.u32 %r42, 0; st.global.u32 [%rd36], %r42; cvta.to.global.u64 %rd37, %rd20; add.s64 %rd38, %rd37, %rd33; st.global.f32 [%rd38], %f52; $L__BB0_9: ret; } ` setvectorpotential_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl setvectorpotential .visible .entry setvectorpotential( .param .u64 setvectorpotential_param_0, .param .u64 setvectorpotential_param_1, .param .u64 setvectorpotential_param_2, .param .u64 setvectorpotential_param_3, .param .u64 setvectorpotential_param_4, .param .u64 setvectorpotential_param_5, .param .f32 setvectorpotential_param_6, .param .u32 setvectorpotential_param_7, .param .u32 setvectorpotential_param_8, .param .u32 setvectorpotential_param_9, .param .u8 setvectorpotential_param_10 ) { .reg .pred %p<11>; .reg .f32 %f<54>; .reg .b32 %r<48>; .reg .b64 %rd<43>; ld.param.u64 %rd18, [setvectorpotential_param_0]; ld.param.u64 %rd19, [setvectorpotential_param_1]; ld.param.u64 %rd20, [setvectorpotential_param_2]; ld.param.u64 %rd21, [setvectorpotential_param_3]; ld.param.u64 %rd22, [setvectorpotential_param_5]; ld.param.f32 %f15, [setvectorpotential_param_6]; ld.param.u32 %r20, [setvectorpotential_param_7]; ld.param.u32 %r21, [setvectorpotential_param_8]; ld.param.u32 %r22, [setvectorpotential_param_9]; cvta.to.global.u64 %rd1, %rd21; cvta.to.global.u64 %rd2, %rd22; mov.u32 %r23, %ntid.x; mov.u32 %r24, %ctaid.x; mul.lo.s32 %r1, %r24, %r23; mov.u32 %r2, %tid.x; add.s32 %r3, %r1, %r2; mov.u32 %r25, %ntid.y; mov.u32 %r26, %ctaid.y; mov.u32 %r27, %tid.y; mad.lo.s32 %r4, %r26, %r25, %r27; mov.u32 %r28, %ntid.z; mov.u32 %r29, %ctaid.z; mov.u32 %r30, %tid.z; mad.lo.s32 %r5, %r29, %r28, %r30; setp.ge.s32 %p1, %r3, %r20; setp.ge.s32 %p2, %r4, %r21; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r5, %r22; or.pred %p5, 
%p3, %p4; @%p5 bra $L__BB0_9; mul.lo.s32 %r6, %r5, %r21; setp.lt.s32 %p6, %r4, 1; mov.f32 %f52, 0f00000000; mov.f32 %f53, %f52; @%p6 bra $L__BB0_8; add.s32 %r32, %r4, -1; and.b32 %r47, %r4, 3; setp.lt.u32 %p7, %r32, 3; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; mov.f32 %f52, %f53; @%p7 bra $L__BB0_5; sub.s32 %r45, %r4, %r47; add.s32 %r34, %r6, 1; mad.lo.s32 %r43, %r20, %r34, %r3; mul.lo.s32 %r35, %r21, %r20; mad.lo.s32 %r36, %r35, %r5, %r2; add.s32 %r37, %r36, %r1; mul.wide.s32 %rd23, %r37, 4; add.s64 %rd40, %rd1, %rd23; shl.b32 %r10, %r20, 2; mul.wide.s32 %rd4, %r10, 4; add.s64 %rd39, %rd2, %rd23; mul.wide.s32 %rd6, %r20, 4; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; $L__BB0_4: ld.global.nc.f32 %f23, [%rd39]; mul.f32 %f24, %f23, %f15; sub.f32 %f25, %f53, %f24; ld.global.nc.f32 %f26, [%rd40]; fma.rn.f32 %f27, %f26, %f15, %f52; mul.wide.s32 %rd24, %r43, 4; add.s64 %rd25, %rd2, %rd24; ld.global.nc.f32 %f28, [%rd25]; mul.f32 %f29, %f28, %f15; sub.f32 %f30, %f25, %f29; add.s64 %rd26, %rd1, %rd24; ld.global.nc.f32 %f31, [%rd26]; fma.rn.f32 %f32, %f31, %f15, %f27; add.s64 %rd27, %rd25, %rd6; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f34, %f33, %f15; sub.f32 %f35, %f30, %f34; add.s64 %rd28, %rd26, %rd6; ld.global.nc.f32 %f36, [%rd28]; fma.rn.f32 %f37, %f36, %f15, %f32; add.s64 %rd29, %rd27, %rd6; ld.global.nc.f32 %f38, [%rd29]; mul.f32 %f39, %f38, %f15; sub.f32 %f53, %f35, %f39; add.s64 %rd30, %rd28, %rd6; ld.global.nc.f32 %f40, [%rd30]; fma.rn.f32 %f52, %f40, %f15, %f37; add.s32 %r46, %r46, 4; add.s32 %r43, %r43, %r10; add.s64 %rd40, %rd40, %rd4; add.s64 %rd39, %rd39, %rd4; add.s32 %r45, %r45, -4; setp.ne.s32 %p8, %r45, 0; @%p8 bra $L__BB0_4; $L__BB0_5: setp.eq.s32 %p9, %r47, 0; @%p9 bra $L__BB0_8; add.s32 %r38, %r46, %r6; mad.lo.s32 %r39, %r20, %r38, %r3; mul.wide.s32 %rd31, %r39, 4; add.s64 %rd42, %rd1, %rd31; mul.wide.s32 %rd12, %r20, 4; add.s64 %rd41, %rd2, %rd31; $L__BB0_7: .pragma "nounroll"; ld.global.nc.f32 %f41, [%rd41]; mul.f32 %f42, %f41, %f15; sub.f32 %f53, 
%f53, %f42; ld.global.nc.f32 %f43, [%rd42]; fma.rn.f32 %f52, %f43, %f15, %f52; add.s64 %rd42, %rd42, %rd12; add.s64 %rd41, %rd41, %rd12; add.s32 %r47, %r47, -1; setp.ne.s32 %p10, %r47, 0; @%p10 bra $L__BB0_7; $L__BB0_8: add.s32 %r40, %r6, %r4; mad.lo.s32 %r41, %r40, %r20, %r3; cvta.to.global.u64 %rd32, %rd18; mul.wide.s32 %rd33, %r41, 4; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f53; cvta.to.global.u64 %rd35, %rd19; add.s64 %rd36, %rd35, %rd33; mov.u32 %r42, 0; st.global.u32 [%rd36], %r42; cvta.to.global.u64 %rd37, %rd20; add.s64 %rd38, %rd37, %rd33; st.global.f32 [%rd38], %f52; $L__BB0_9: ret; } ` setvectorpotential_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl setvectorpotential .visible .entry setvectorpotential( .param .u64 setvectorpotential_param_0, .param .u64 setvectorpotential_param_1, .param .u64 setvectorpotential_param_2, .param .u64 setvectorpotential_param_3, .param .u64 setvectorpotential_param_4, .param .u64 setvectorpotential_param_5, .param .f32 setvectorpotential_param_6, .param .u32 setvectorpotential_param_7, .param .u32 setvectorpotential_param_8, .param .u32 setvectorpotential_param_9, .param .u8 setvectorpotential_param_10 ) { .reg .pred %p<11>; .reg .f32 %f<54>; .reg .b32 %r<48>; .reg .b64 %rd<43>; ld.param.u64 %rd18, [setvectorpotential_param_0]; ld.param.u64 %rd19, [setvectorpotential_param_1]; ld.param.u64 %rd20, [setvectorpotential_param_2]; ld.param.u64 %rd21, [setvectorpotential_param_3]; ld.param.u64 %rd22, [setvectorpotential_param_5]; ld.param.f32 %f15, [setvectorpotential_param_6]; ld.param.u32 %r20, [setvectorpotential_param_7]; ld.param.u32 %r21, [setvectorpotential_param_8]; ld.param.u32 %r22, [setvectorpotential_param_9]; cvta.to.global.u64 %rd1, %rd21; cvta.to.global.u64 %rd2, %rd22; mov.u32 %r23, %ntid.x; mov.u32 %r24, %ctaid.x; mul.lo.s32 %r1, %r24, %r23; mov.u32 %r2, %tid.x; add.s32 %r3, %r1, %r2; mov.u32 %r25, %ntid.y; mov.u32 %r26, %ctaid.y; mov.u32 %r27, %tid.y; mad.lo.s32 %r4, %r26, %r25, 
%r27; mov.u32 %r28, %ntid.z; mov.u32 %r29, %ctaid.z; mov.u32 %r30, %tid.z; mad.lo.s32 %r5, %r29, %r28, %r30; setp.ge.s32 %p1, %r3, %r20; setp.ge.s32 %p2, %r4, %r21; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r5, %r22; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_9; mul.lo.s32 %r6, %r5, %r21; setp.lt.s32 %p6, %r4, 1; mov.f32 %f52, 0f00000000; mov.f32 %f53, %f52; @%p6 bra $L__BB0_8; add.s32 %r32, %r4, -1; and.b32 %r47, %r4, 3; setp.lt.u32 %p7, %r32, 3; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; mov.f32 %f52, %f53; @%p7 bra $L__BB0_5; sub.s32 %r45, %r4, %r47; add.s32 %r34, %r6, 1; mad.lo.s32 %r43, %r20, %r34, %r3; mul.lo.s32 %r35, %r21, %r20; mad.lo.s32 %r36, %r35, %r5, %r2; add.s32 %r37, %r36, %r1; mul.wide.s32 %rd23, %r37, 4; add.s64 %rd40, %rd1, %rd23; shl.b32 %r10, %r20, 2; mul.wide.s32 %rd4, %r10, 4; add.s64 %rd39, %rd2, %rd23; mul.wide.s32 %rd6, %r20, 4; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; $L__BB0_4: ld.global.nc.f32 %f23, [%rd39]; mul.f32 %f24, %f23, %f15; sub.f32 %f25, %f53, %f24; ld.global.nc.f32 %f26, [%rd40]; fma.rn.f32 %f27, %f26, %f15, %f52; mul.wide.s32 %rd24, %r43, 4; add.s64 %rd25, %rd2, %rd24; ld.global.nc.f32 %f28, [%rd25]; mul.f32 %f29, %f28, %f15; sub.f32 %f30, %f25, %f29; add.s64 %rd26, %rd1, %rd24; ld.global.nc.f32 %f31, [%rd26]; fma.rn.f32 %f32, %f31, %f15, %f27; add.s64 %rd27, %rd25, %rd6; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f34, %f33, %f15; sub.f32 %f35, %f30, %f34; add.s64 %rd28, %rd26, %rd6; ld.global.nc.f32 %f36, [%rd28]; fma.rn.f32 %f37, %f36, %f15, %f32; add.s64 %rd29, %rd27, %rd6; ld.global.nc.f32 %f38, [%rd29]; mul.f32 %f39, %f38, %f15; sub.f32 %f53, %f35, %f39; add.s64 %rd30, %rd28, %rd6; ld.global.nc.f32 %f40, [%rd30]; fma.rn.f32 %f52, %f40, %f15, %f37; add.s32 %r46, %r46, 4; add.s32 %r43, %r43, %r10; add.s64 %rd40, %rd40, %rd4; add.s64 %rd39, %rd39, %rd4; add.s32 %r45, %r45, -4; setp.ne.s32 %p8, %r45, 0; @%p8 bra $L__BB0_4; $L__BB0_5: setp.eq.s32 %p9, %r47, 0; @%p9 bra $L__BB0_8; add.s32 %r38, %r46, %r6; mad.lo.s32 %r39, %r20, 
%r38, %r3; mul.wide.s32 %rd31, %r39, 4; add.s64 %rd42, %rd1, %rd31; mul.wide.s32 %rd12, %r20, 4; add.s64 %rd41, %rd2, %rd31; $L__BB0_7: .pragma "nounroll"; ld.global.nc.f32 %f41, [%rd41]; mul.f32 %f42, %f41, %f15; sub.f32 %f53, %f53, %f42; ld.global.nc.f32 %f43, [%rd42]; fma.rn.f32 %f52, %f43, %f15, %f52; add.s64 %rd42, %rd42, %rd12; add.s64 %rd41, %rd41, %rd12; add.s32 %r47, %r47, -1; setp.ne.s32 %p10, %r47, 0; @%p10 bra $L__BB0_7; $L__BB0_8: add.s32 %r40, %r6, %r4; mad.lo.s32 %r41, %r40, %r20, %r3; cvta.to.global.u64 %rd32, %rd18; mul.wide.s32 %rd33, %r41, 4; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f53; cvta.to.global.u64 %rd35, %rd19; add.s64 %rd36, %rd35, %rd33; mov.u32 %r42, 0; st.global.u32 [%rd36], %r42; cvta.to.global.u64 %rd37, %rd20; add.s64 %rd38, %rd37, %rd33; st.global.f32 [%rd38], %f52; $L__BB0_9: ret; } ` setvectorpotential_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl setvectorpotential .visible .entry setvectorpotential( .param .u64 setvectorpotential_param_0, .param .u64 setvectorpotential_param_1, .param .u64 setvectorpotential_param_2, .param .u64 setvectorpotential_param_3, .param .u64 setvectorpotential_param_4, .param .u64 setvectorpotential_param_5, .param .f32 setvectorpotential_param_6, .param .u32 setvectorpotential_param_7, .param .u32 setvectorpotential_param_8, .param .u32 setvectorpotential_param_9, .param .u8 setvectorpotential_param_10 ) { .reg .pred %p<11>; .reg .f32 %f<54>; .reg .b32 %r<48>; .reg .b64 %rd<43>; ld.param.u64 %rd18, [setvectorpotential_param_0]; ld.param.u64 %rd19, [setvectorpotential_param_1]; ld.param.u64 %rd20, [setvectorpotential_param_2]; ld.param.u64 %rd21, [setvectorpotential_param_3]; ld.param.u64 %rd22, [setvectorpotential_param_5]; ld.param.f32 %f15, [setvectorpotential_param_6]; ld.param.u32 %r20, [setvectorpotential_param_7]; ld.param.u32 %r21, [setvectorpotential_param_8]; ld.param.u32 %r22, [setvectorpotential_param_9]; cvta.to.global.u64 %rd1, %rd21; cvta.to.global.u64 
%rd2, %rd22; mov.u32 %r23, %ntid.x; mov.u32 %r24, %ctaid.x; mul.lo.s32 %r1, %r24, %r23; mov.u32 %r2, %tid.x; add.s32 %r3, %r1, %r2; mov.u32 %r25, %ntid.y; mov.u32 %r26, %ctaid.y; mov.u32 %r27, %tid.y; mad.lo.s32 %r4, %r26, %r25, %r27; mov.u32 %r28, %ntid.z; mov.u32 %r29, %ctaid.z; mov.u32 %r30, %tid.z; mad.lo.s32 %r5, %r29, %r28, %r30; setp.ge.s32 %p1, %r3, %r20; setp.ge.s32 %p2, %r4, %r21; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r5, %r22; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_9; mul.lo.s32 %r6, %r5, %r21; setp.lt.s32 %p6, %r4, 1; mov.f32 %f52, 0f00000000; mov.f32 %f53, %f52; @%p6 bra $L__BB0_8; add.s32 %r32, %r4, -1; and.b32 %r47, %r4, 3; setp.lt.u32 %p7, %r32, 3; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; mov.f32 %f52, %f53; @%p7 bra $L__BB0_5; sub.s32 %r45, %r4, %r47; add.s32 %r34, %r6, 1; mad.lo.s32 %r43, %r20, %r34, %r3; mul.lo.s32 %r35, %r21, %r20; mad.lo.s32 %r36, %r35, %r5, %r2; add.s32 %r37, %r36, %r1; mul.wide.s32 %rd23, %r37, 4; add.s64 %rd40, %rd1, %rd23; shl.b32 %r10, %r20, 2; mul.wide.s32 %rd4, %r10, 4; add.s64 %rd39, %rd2, %rd23; mul.wide.s32 %rd6, %r20, 4; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; $L__BB0_4: ld.global.nc.f32 %f23, [%rd39]; mul.f32 %f24, %f23, %f15; sub.f32 %f25, %f53, %f24; ld.global.nc.f32 %f26, [%rd40]; fma.rn.f32 %f27, %f26, %f15, %f52; mul.wide.s32 %rd24, %r43, 4; add.s64 %rd25, %rd2, %rd24; ld.global.nc.f32 %f28, [%rd25]; mul.f32 %f29, %f28, %f15; sub.f32 %f30, %f25, %f29; add.s64 %rd26, %rd1, %rd24; ld.global.nc.f32 %f31, [%rd26]; fma.rn.f32 %f32, %f31, %f15, %f27; add.s64 %rd27, %rd25, %rd6; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f34, %f33, %f15; sub.f32 %f35, %f30, %f34; add.s64 %rd28, %rd26, %rd6; ld.global.nc.f32 %f36, [%rd28]; fma.rn.f32 %f37, %f36, %f15, %f32; add.s64 %rd29, %rd27, %rd6; ld.global.nc.f32 %f38, [%rd29]; mul.f32 %f39, %f38, %f15; sub.f32 %f53, %f35, %f39; add.s64 %rd30, %rd28, %rd6; ld.global.nc.f32 %f40, [%rd30]; fma.rn.f32 %f52, %f40, %f15, %f37; add.s32 %r46, %r46, 4; add.s32 %r43, %r43, %r10; 
add.s64 %rd40, %rd40, %rd4; add.s64 %rd39, %rd39, %rd4; add.s32 %r45, %r45, -4; setp.ne.s32 %p8, %r45, 0; @%p8 bra $L__BB0_4; $L__BB0_5: setp.eq.s32 %p9, %r47, 0; @%p9 bra $L__BB0_8; add.s32 %r38, %r46, %r6; mad.lo.s32 %r39, %r20, %r38, %r3; mul.wide.s32 %rd31, %r39, 4; add.s64 %rd42, %rd1, %rd31; mul.wide.s32 %rd12, %r20, 4; add.s64 %rd41, %rd2, %rd31; $L__BB0_7: .pragma "nounroll"; ld.global.nc.f32 %f41, [%rd41]; mul.f32 %f42, %f41, %f15; sub.f32 %f53, %f53, %f42; ld.global.nc.f32 %f43, [%rd42]; fma.rn.f32 %f52, %f43, %f15, %f52; add.s64 %rd42, %rd42, %rd12; add.s64 %rd41, %rd41, %rd12; add.s32 %r47, %r47, -1; setp.ne.s32 %p10, %r47, 0; @%p10 bra $L__BB0_7; $L__BB0_8: add.s32 %r40, %r6, %r4; mad.lo.s32 %r41, %r40, %r20, %r3; cvta.to.global.u64 %rd32, %rd18; mul.wide.s32 %rd33, %r41, 4; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f53; cvta.to.global.u64 %rd35, %rd19; add.s64 %rd36, %rd35, %rd33; mov.u32 %r42, 0; st.global.u32 [%rd36], %r42; cvta.to.global.u64 %rd37, %rd20; add.s64 %rd38, %rd37, %rd33; st.global.f32 [%rd38], %f52; $L__BB0_9: ret; } ` setvectorpotential_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl setvectorpotential .visible .entry setvectorpotential( .param .u64 setvectorpotential_param_0, .param .u64 setvectorpotential_param_1, .param .u64 setvectorpotential_param_2, .param .u64 setvectorpotential_param_3, .param .u64 setvectorpotential_param_4, .param .u64 setvectorpotential_param_5, .param .f32 setvectorpotential_param_6, .param .u32 setvectorpotential_param_7, .param .u32 setvectorpotential_param_8, .param .u32 setvectorpotential_param_9, .param .u8 setvectorpotential_param_10 ) { .reg .pred %p<11>; .reg .f32 %f<54>; .reg .b32 %r<48>; .reg .b64 %rd<43>; ld.param.u64 %rd18, [setvectorpotential_param_0]; ld.param.u64 %rd19, [setvectorpotential_param_1]; ld.param.u64 %rd20, [setvectorpotential_param_2]; ld.param.u64 %rd21, [setvectorpotential_param_3]; ld.param.u64 %rd22, [setvectorpotential_param_5]; ld.param.f32 
%f15, [setvectorpotential_param_6]; ld.param.u32 %r20, [setvectorpotential_param_7]; ld.param.u32 %r21, [setvectorpotential_param_8]; ld.param.u32 %r22, [setvectorpotential_param_9]; cvta.to.global.u64 %rd1, %rd21; cvta.to.global.u64 %rd2, %rd22; mov.u32 %r23, %ntid.x; mov.u32 %r24, %ctaid.x; mul.lo.s32 %r1, %r24, %r23; mov.u32 %r2, %tid.x; add.s32 %r3, %r1, %r2; mov.u32 %r25, %ntid.y; mov.u32 %r26, %ctaid.y; mov.u32 %r27, %tid.y; mad.lo.s32 %r4, %r26, %r25, %r27; mov.u32 %r28, %ntid.z; mov.u32 %r29, %ctaid.z; mov.u32 %r30, %tid.z; mad.lo.s32 %r5, %r29, %r28, %r30; setp.ge.s32 %p1, %r3, %r20; setp.ge.s32 %p2, %r4, %r21; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r5, %r22; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_9; mul.lo.s32 %r6, %r5, %r21; setp.lt.s32 %p6, %r4, 1; mov.f32 %f52, 0f00000000; mov.f32 %f53, %f52; @%p6 bra $L__BB0_8; add.s32 %r32, %r4, -1; and.b32 %r47, %r4, 3; setp.lt.u32 %p7, %r32, 3; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; mov.f32 %f52, %f53; @%p7 bra $L__BB0_5; sub.s32 %r45, %r4, %r47; add.s32 %r34, %r6, 1; mad.lo.s32 %r43, %r20, %r34, %r3; mul.lo.s32 %r35, %r21, %r20; mad.lo.s32 %r36, %r35, %r5, %r2; add.s32 %r37, %r36, %r1; mul.wide.s32 %rd23, %r37, 4; add.s64 %rd40, %rd1, %rd23; shl.b32 %r10, %r20, 2; mul.wide.s32 %rd4, %r10, 4; add.s64 %rd39, %rd2, %rd23; mul.wide.s32 %rd6, %r20, 4; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; $L__BB0_4: ld.global.nc.f32 %f23, [%rd39]; mul.f32 %f24, %f23, %f15; sub.f32 %f25, %f53, %f24; ld.global.nc.f32 %f26, [%rd40]; fma.rn.f32 %f27, %f26, %f15, %f52; mul.wide.s32 %rd24, %r43, 4; add.s64 %rd25, %rd2, %rd24; ld.global.nc.f32 %f28, [%rd25]; mul.f32 %f29, %f28, %f15; sub.f32 %f30, %f25, %f29; add.s64 %rd26, %rd1, %rd24; ld.global.nc.f32 %f31, [%rd26]; fma.rn.f32 %f32, %f31, %f15, %f27; add.s64 %rd27, %rd25, %rd6; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f34, %f33, %f15; sub.f32 %f35, %f30, %f34; add.s64 %rd28, %rd26, %rd6; ld.global.nc.f32 %f36, [%rd28]; fma.rn.f32 %f37, %f36, %f15, %f32; add.s64 %rd29, %rd27, 
%rd6; ld.global.nc.f32 %f38, [%rd29]; mul.f32 %f39, %f38, %f15; sub.f32 %f53, %f35, %f39; add.s64 %rd30, %rd28, %rd6; ld.global.nc.f32 %f40, [%rd30]; fma.rn.f32 %f52, %f40, %f15, %f37; add.s32 %r46, %r46, 4; add.s32 %r43, %r43, %r10; add.s64 %rd40, %rd40, %rd4; add.s64 %rd39, %rd39, %rd4; add.s32 %r45, %r45, -4; setp.ne.s32 %p8, %r45, 0; @%p8 bra $L__BB0_4; $L__BB0_5: setp.eq.s32 %p9, %r47, 0; @%p9 bra $L__BB0_8; add.s32 %r38, %r46, %r6; mad.lo.s32 %r39, %r20, %r38, %r3; mul.wide.s32 %rd31, %r39, 4; add.s64 %rd42, %rd1, %rd31; mul.wide.s32 %rd12, %r20, 4; add.s64 %rd41, %rd2, %rd31; $L__BB0_7: .pragma "nounroll"; ld.global.nc.f32 %f41, [%rd41]; mul.f32 %f42, %f41, %f15; sub.f32 %f53, %f53, %f42; ld.global.nc.f32 %f43, [%rd42]; fma.rn.f32 %f52, %f43, %f15, %f52; add.s64 %rd42, %rd42, %rd12; add.s64 %rd41, %rd41, %rd12; add.s32 %r47, %r47, -1; setp.ne.s32 %p10, %r47, 0; @%p10 bra $L__BB0_7; $L__BB0_8: add.s32 %r40, %r6, %r4; mad.lo.s32 %r41, %r40, %r20, %r3; cvta.to.global.u64 %rd32, %rd18; mul.wide.s32 %rd33, %r41, 4; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f53; cvta.to.global.u64 %rd35, %rd19; add.s64 %rd36, %rd35, %rd33; mov.u32 %r42, 0; st.global.u32 [%rd36], %r42; cvta.to.global.u64 %rd37, %rd20; add.s64 %rd38, %rd37, %rd33; st.global.f32 [%rd38], %f52; $L__BB0_9: ret; } ` setvectorpotential_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl setvectorpotential .visible .entry setvectorpotential( .param .u64 setvectorpotential_param_0, .param .u64 setvectorpotential_param_1, .param .u64 setvectorpotential_param_2, .param .u64 setvectorpotential_param_3, .param .u64 setvectorpotential_param_4, .param .u64 setvectorpotential_param_5, .param .f32 setvectorpotential_param_6, .param .u32 setvectorpotential_param_7, .param .u32 setvectorpotential_param_8, .param .u32 setvectorpotential_param_9, .param .u8 setvectorpotential_param_10 ) { .reg .pred %p<11>; .reg .f32 %f<54>; .reg .b32 %r<48>; .reg .b64 %rd<43>; ld.param.u64 %rd18, 
[setvectorpotential_param_0]; ld.param.u64 %rd19, [setvectorpotential_param_1]; ld.param.u64 %rd20, [setvectorpotential_param_2]; ld.param.u64 %rd21, [setvectorpotential_param_3]; ld.param.u64 %rd22, [setvectorpotential_param_5]; ld.param.f32 %f15, [setvectorpotential_param_6]; ld.param.u32 %r20, [setvectorpotential_param_7]; ld.param.u32 %r21, [setvectorpotential_param_8]; ld.param.u32 %r22, [setvectorpotential_param_9]; cvta.to.global.u64 %rd1, %rd21; cvta.to.global.u64 %rd2, %rd22; mov.u32 %r23, %ntid.x; mov.u32 %r24, %ctaid.x; mul.lo.s32 %r1, %r24, %r23; mov.u32 %r2, %tid.x; add.s32 %r3, %r1, %r2; mov.u32 %r25, %ntid.y; mov.u32 %r26, %ctaid.y; mov.u32 %r27, %tid.y; mad.lo.s32 %r4, %r26, %r25, %r27; mov.u32 %r28, %ntid.z; mov.u32 %r29, %ctaid.z; mov.u32 %r30, %tid.z; mad.lo.s32 %r5, %r29, %r28, %r30; setp.ge.s32 %p1, %r3, %r20; setp.ge.s32 %p2, %r4, %r21; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r5, %r22; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_9; mul.lo.s32 %r6, %r5, %r21; setp.lt.s32 %p6, %r4, 1; mov.f32 %f52, 0f00000000; mov.f32 %f53, %f52; @%p6 bra $L__BB0_8; add.s32 %r32, %r4, -1; and.b32 %r47, %r4, 3; setp.lt.u32 %p7, %r32, 3; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; mov.f32 %f52, %f53; @%p7 bra $L__BB0_5; sub.s32 %r45, %r4, %r47; add.s32 %r34, %r6, 1; mad.lo.s32 %r43, %r20, %r34, %r3; mul.lo.s32 %r35, %r21, %r20; mad.lo.s32 %r36, %r35, %r5, %r2; add.s32 %r37, %r36, %r1; mul.wide.s32 %rd23, %r37, 4; add.s64 %rd40, %rd1, %rd23; shl.b32 %r10, %r20, 2; mul.wide.s32 %rd4, %r10, 4; add.s64 %rd39, %rd2, %rd23; mul.wide.s32 %rd6, %r20, 4; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; $L__BB0_4: ld.global.nc.f32 %f23, [%rd39]; mul.f32 %f24, %f23, %f15; sub.f32 %f25, %f53, %f24; ld.global.nc.f32 %f26, [%rd40]; fma.rn.f32 %f27, %f26, %f15, %f52; mul.wide.s32 %rd24, %r43, 4; add.s64 %rd25, %rd2, %rd24; ld.global.nc.f32 %f28, [%rd25]; mul.f32 %f29, %f28, %f15; sub.f32 %f30, %f25, %f29; add.s64 %rd26, %rd1, %rd24; ld.global.nc.f32 %f31, [%rd26]; fma.rn.f32 %f32, %f31, 
%f15, %f27; add.s64 %rd27, %rd25, %rd6; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f34, %f33, %f15; sub.f32 %f35, %f30, %f34; add.s64 %rd28, %rd26, %rd6; ld.global.nc.f32 %f36, [%rd28]; fma.rn.f32 %f37, %f36, %f15, %f32; add.s64 %rd29, %rd27, %rd6; ld.global.nc.f32 %f38, [%rd29]; mul.f32 %f39, %f38, %f15; sub.f32 %f53, %f35, %f39; add.s64 %rd30, %rd28, %rd6; ld.global.nc.f32 %f40, [%rd30]; fma.rn.f32 %f52, %f40, %f15, %f37; add.s32 %r46, %r46, 4; add.s32 %r43, %r43, %r10; add.s64 %rd40, %rd40, %rd4; add.s64 %rd39, %rd39, %rd4; add.s32 %r45, %r45, -4; setp.ne.s32 %p8, %r45, 0; @%p8 bra $L__BB0_4; $L__BB0_5: setp.eq.s32 %p9, %r47, 0; @%p9 bra $L__BB0_8; add.s32 %r38, %r46, %r6; mad.lo.s32 %r39, %r20, %r38, %r3; mul.wide.s32 %rd31, %r39, 4; add.s64 %rd42, %rd1, %rd31; mul.wide.s32 %rd12, %r20, 4; add.s64 %rd41, %rd2, %rd31; $L__BB0_7: .pragma "nounroll"; ld.global.nc.f32 %f41, [%rd41]; mul.f32 %f42, %f41, %f15; sub.f32 %f53, %f53, %f42; ld.global.nc.f32 %f43, [%rd42]; fma.rn.f32 %f52, %f43, %f15, %f52; add.s64 %rd42, %rd42, %rd12; add.s64 %rd41, %rd41, %rd12; add.s32 %r47, %r47, -1; setp.ne.s32 %p10, %r47, 0; @%p10 bra $L__BB0_7; $L__BB0_8: add.s32 %r40, %r6, %r4; mad.lo.s32 %r41, %r40, %r20, %r3; cvta.to.global.u64 %rd32, %rd18; mul.wide.s32 %rd33, %r41, 4; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f53; cvta.to.global.u64 %rd35, %rd19; add.s64 %rd36, %rd35, %rd33; mov.u32 %r42, 0; st.global.u32 [%rd36], %r42; cvta.to.global.u64 %rd37, %rd20; add.s64 %rd38, %rd37, %rd33; st.global.f32 [%rd38], %f52; $L__BB0_9: ret; } ` setvectorpotential_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl setvectorpotential .visible .entry setvectorpotential( .param .u64 setvectorpotential_param_0, .param .u64 setvectorpotential_param_1, .param .u64 setvectorpotential_param_2, .param .u64 setvectorpotential_param_3, .param .u64 setvectorpotential_param_4, .param .u64 setvectorpotential_param_5, .param .f32 setvectorpotential_param_6, .param .u32 
setvectorpotential_param_7, .param .u32 setvectorpotential_param_8, .param .u32 setvectorpotential_param_9, .param .u8 setvectorpotential_param_10 ) { .reg .pred %p<11>; .reg .f32 %f<54>; .reg .b32 %r<48>; .reg .b64 %rd<43>; ld.param.u64 %rd18, [setvectorpotential_param_0]; ld.param.u64 %rd19, [setvectorpotential_param_1]; ld.param.u64 %rd20, [setvectorpotential_param_2]; ld.param.u64 %rd21, [setvectorpotential_param_3]; ld.param.u64 %rd22, [setvectorpotential_param_5]; ld.param.f32 %f15, [setvectorpotential_param_6]; ld.param.u32 %r20, [setvectorpotential_param_7]; ld.param.u32 %r21, [setvectorpotential_param_8]; ld.param.u32 %r22, [setvectorpotential_param_9]; cvta.to.global.u64 %rd1, %rd21; cvta.to.global.u64 %rd2, %rd22; mov.u32 %r23, %ntid.x; mov.u32 %r24, %ctaid.x; mul.lo.s32 %r1, %r24, %r23; mov.u32 %r2, %tid.x; add.s32 %r3, %r1, %r2; mov.u32 %r25, %ntid.y; mov.u32 %r26, %ctaid.y; mov.u32 %r27, %tid.y; mad.lo.s32 %r4, %r26, %r25, %r27; mov.u32 %r28, %ntid.z; mov.u32 %r29, %ctaid.z; mov.u32 %r30, %tid.z; mad.lo.s32 %r5, %r29, %r28, %r30; setp.ge.s32 %p1, %r3, %r20; setp.ge.s32 %p2, %r4, %r21; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r5, %r22; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_9; mul.lo.s32 %r6, %r5, %r21; setp.lt.s32 %p6, %r4, 1; mov.f32 %f52, 0f00000000; mov.f32 %f53, %f52; @%p6 bra $L__BB0_8; add.s32 %r32, %r4, -1; and.b32 %r47, %r4, 3; setp.lt.u32 %p7, %r32, 3; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; mov.f32 %f52, %f53; @%p7 bra $L__BB0_5; sub.s32 %r45, %r4, %r47; add.s32 %r34, %r6, 1; mad.lo.s32 %r43, %r20, %r34, %r3; mul.lo.s32 %r35, %r21, %r20; mad.lo.s32 %r36, %r35, %r5, %r2; add.s32 %r37, %r36, %r1; mul.wide.s32 %rd23, %r37, 4; add.s64 %rd40, %rd1, %rd23; shl.b32 %r10, %r20, 2; mul.wide.s32 %rd4, %r10, 4; add.s64 %rd39, %rd2, %rd23; mul.wide.s32 %rd6, %r20, 4; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; $L__BB0_4: ld.global.nc.f32 %f23, [%rd39]; mul.f32 %f24, %f23, %f15; sub.f32 %f25, %f53, %f24; ld.global.nc.f32 %f26, [%rd40]; fma.rn.f32 
%f27, %f26, %f15, %f52; mul.wide.s32 %rd24, %r43, 4; add.s64 %rd25, %rd2, %rd24; ld.global.nc.f32 %f28, [%rd25]; mul.f32 %f29, %f28, %f15; sub.f32 %f30, %f25, %f29; add.s64 %rd26, %rd1, %rd24; ld.global.nc.f32 %f31, [%rd26]; fma.rn.f32 %f32, %f31, %f15, %f27; add.s64 %rd27, %rd25, %rd6; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f34, %f33, %f15; sub.f32 %f35, %f30, %f34; add.s64 %rd28, %rd26, %rd6; ld.global.nc.f32 %f36, [%rd28]; fma.rn.f32 %f37, %f36, %f15, %f32; add.s64 %rd29, %rd27, %rd6; ld.global.nc.f32 %f38, [%rd29]; mul.f32 %f39, %f38, %f15; sub.f32 %f53, %f35, %f39; add.s64 %rd30, %rd28, %rd6; ld.global.nc.f32 %f40, [%rd30]; fma.rn.f32 %f52, %f40, %f15, %f37; add.s32 %r46, %r46, 4; add.s32 %r43, %r43, %r10; add.s64 %rd40, %rd40, %rd4; add.s64 %rd39, %rd39, %rd4; add.s32 %r45, %r45, -4; setp.ne.s32 %p8, %r45, 0; @%p8 bra $L__BB0_4; $L__BB0_5: setp.eq.s32 %p9, %r47, 0; @%p9 bra $L__BB0_8; add.s32 %r38, %r46, %r6; mad.lo.s32 %r39, %r20, %r38, %r3; mul.wide.s32 %rd31, %r39, 4; add.s64 %rd42, %rd1, %rd31; mul.wide.s32 %rd12, %r20, 4; add.s64 %rd41, %rd2, %rd31; $L__BB0_7: .pragma "nounroll"; ld.global.nc.f32 %f41, [%rd41]; mul.f32 %f42, %f41, %f15; sub.f32 %f53, %f53, %f42; ld.global.nc.f32 %f43, [%rd42]; fma.rn.f32 %f52, %f43, %f15, %f52; add.s64 %rd42, %rd42, %rd12; add.s64 %rd41, %rd41, %rd12; add.s32 %r47, %r47, -1; setp.ne.s32 %p10, %r47, 0; @%p10 bra $L__BB0_7; $L__BB0_8: add.s32 %r40, %r6, %r4; mad.lo.s32 %r41, %r40, %r20, %r3; cvta.to.global.u64 %rd32, %rd18; mul.wide.s32 %rd33, %r41, 4; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f53; cvta.to.global.u64 %rd35, %rd19; add.s64 %rd36, %rd35, %rd33; mov.u32 %r42, 0; st.global.u32 [%rd36], %r42; cvta.to.global.u64 %rd37, %rd20; add.s64 %rd38, %rd37, %rd33; st.global.f32 [%rd38], %f52; $L__BB0_9: ret; } ` setvectorpotential_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl setvectorpotential .visible .entry setvectorpotential( .param .u64 setvectorpotential_param_0, .param .u64 
setvectorpotential_param_1, .param .u64 setvectorpotential_param_2, .param .u64 setvectorpotential_param_3, .param .u64 setvectorpotential_param_4, .param .u64 setvectorpotential_param_5, .param .f32 setvectorpotential_param_6, .param .u32 setvectorpotential_param_7, .param .u32 setvectorpotential_param_8, .param .u32 setvectorpotential_param_9, .param .u8 setvectorpotential_param_10 ) { .reg .pred %p<11>; .reg .f32 %f<54>; .reg .b32 %r<48>; .reg .b64 %rd<43>; ld.param.u64 %rd18, [setvectorpotential_param_0]; ld.param.u64 %rd19, [setvectorpotential_param_1]; ld.param.u64 %rd20, [setvectorpotential_param_2]; ld.param.u64 %rd21, [setvectorpotential_param_3]; ld.param.u64 %rd22, [setvectorpotential_param_5]; ld.param.f32 %f15, [setvectorpotential_param_6]; ld.param.u32 %r20, [setvectorpotential_param_7]; ld.param.u32 %r21, [setvectorpotential_param_8]; ld.param.u32 %r22, [setvectorpotential_param_9]; cvta.to.global.u64 %rd1, %rd21; cvta.to.global.u64 %rd2, %rd22; mov.u32 %r23, %ntid.x; mov.u32 %r24, %ctaid.x; mul.lo.s32 %r1, %r24, %r23; mov.u32 %r2, %tid.x; add.s32 %r3, %r1, %r2; mov.u32 %r25, %ntid.y; mov.u32 %r26, %ctaid.y; mov.u32 %r27, %tid.y; mad.lo.s32 %r4, %r26, %r25, %r27; mov.u32 %r28, %ntid.z; mov.u32 %r29, %ctaid.z; mov.u32 %r30, %tid.z; mad.lo.s32 %r5, %r29, %r28, %r30; setp.ge.s32 %p1, %r3, %r20; setp.ge.s32 %p2, %r4, %r21; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r5, %r22; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_9; mul.lo.s32 %r6, %r5, %r21; setp.lt.s32 %p6, %r4, 1; mov.f32 %f52, 0f00000000; mov.f32 %f53, %f52; @%p6 bra $L__BB0_8; add.s32 %r32, %r4, -1; and.b32 %r47, %r4, 3; setp.lt.u32 %p7, %r32, 3; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; mov.f32 %f52, %f53; @%p7 bra $L__BB0_5; sub.s32 %r45, %r4, %r47; add.s32 %r34, %r6, 1; mad.lo.s32 %r43, %r20, %r34, %r3; mul.lo.s32 %r35, %r21, %r20; mad.lo.s32 %r36, %r35, %r5, %r2; add.s32 %r37, %r36, %r1; mul.wide.s32 %rd23, %r37, 4; add.s64 %rd40, %rd1, %rd23; shl.b32 %r10, %r20, 2; mul.wide.s32 %rd4, %r10, 4; 
add.s64 %rd39, %rd2, %rd23; mul.wide.s32 %rd6, %r20, 4; mov.f32 %f53, 0f00000000; mov.u32 %r46, 0; $L__BB0_4: ld.global.nc.f32 %f23, [%rd39]; mul.f32 %f24, %f23, %f15; sub.f32 %f25, %f53, %f24; ld.global.nc.f32 %f26, [%rd40]; fma.rn.f32 %f27, %f26, %f15, %f52; mul.wide.s32 %rd24, %r43, 4; add.s64 %rd25, %rd2, %rd24; ld.global.nc.f32 %f28, [%rd25]; mul.f32 %f29, %f28, %f15; sub.f32 %f30, %f25, %f29; add.s64 %rd26, %rd1, %rd24; ld.global.nc.f32 %f31, [%rd26]; fma.rn.f32 %f32, %f31, %f15, %f27; add.s64 %rd27, %rd25, %rd6; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f34, %f33, %f15; sub.f32 %f35, %f30, %f34; add.s64 %rd28, %rd26, %rd6; ld.global.nc.f32 %f36, [%rd28]; fma.rn.f32 %f37, %f36, %f15, %f32; add.s64 %rd29, %rd27, %rd6; ld.global.nc.f32 %f38, [%rd29]; mul.f32 %f39, %f38, %f15; sub.f32 %f53, %f35, %f39; add.s64 %rd30, %rd28, %rd6; ld.global.nc.f32 %f40, [%rd30]; fma.rn.f32 %f52, %f40, %f15, %f37; add.s32 %r46, %r46, 4; add.s32 %r43, %r43, %r10; add.s64 %rd40, %rd40, %rd4; add.s64 %rd39, %rd39, %rd4; add.s32 %r45, %r45, -4; setp.ne.s32 %p8, %r45, 0; @%p8 bra $L__BB0_4; $L__BB0_5: setp.eq.s32 %p9, %r47, 0; @%p9 bra $L__BB0_8; add.s32 %r38, %r46, %r6; mad.lo.s32 %r39, %r20, %r38, %r3; mul.wide.s32 %rd31, %r39, 4; add.s64 %rd42, %rd1, %rd31; mul.wide.s32 %rd12, %r20, 4; add.s64 %rd41, %rd2, %rd31; $L__BB0_7: .pragma "nounroll"; ld.global.nc.f32 %f41, [%rd41]; mul.f32 %f42, %f41, %f15; sub.f32 %f53, %f53, %f42; ld.global.nc.f32 %f43, [%rd42]; fma.rn.f32 %f52, %f43, %f15, %f52; add.s64 %rd42, %rd42, %rd12; add.s64 %rd41, %rd41, %rd12; add.s32 %r47, %r47, -1; setp.ne.s32 %p10, %r47, 0; @%p10 bra $L__BB0_7; $L__BB0_8: add.s32 %r40, %r6, %r4; mad.lo.s32 %r41, %r40, %r20, %r3; cvta.to.global.u64 %rd32, %rd18; mul.wide.s32 %rd33, %r41, 4; add.s64 %rd34, %rd32, %rd33; st.global.f32 [%rd34], %f53; cvta.to.global.u64 %rd35, %rd19; add.s64 %rd36, %rd35, %rd33; mov.u32 %r42, 0; st.global.u32 [%rd36], %r42; cvta.to.global.u64 %rd37, %rd20; add.s64 %rd38, %rd37, %rd33; 
st.global.f32 [%rd38], %f52; $L__BB0_9: ret; } ` ) 3-3.11.1/cuda/hopfindex-five-point.go000066400000000000000000000032021503346766200172550ustar00rootroot00000000000000package cuda import ( "math" "github.com/mumax/3/data" "github.com/mumax/3/util" ) func SetHopfIndexDensity_FivePointStencil(h, m *data.Slice, mesh *data.Mesh) { N := m.Size() // Create buffers to store emergent field F and vector potential A F := Buffer(3, N) defer Recycle(F) A := Buffer(3, N) defer Recycle(A) // Get Hopf index density F · A SetEmergentMagneticField_FivePointStencil(F, m, mesh) SetVectorPotential(A, F, mesh) AddDotProduct(h, 1.0, F, A) } // Sets the emergent magnetic field F_i = (1/8π) ε_{ijk} m · (∂m/∂x_j × ∂m/∂x_k) // See hopf-emergentmagneticfieldfivepoint.cu func SetEmergentMagneticField_FivePointStencil(F, m *data.Slice, mesh *data.Mesh) { cellsize := mesh.CellSize() N := F.Size() util.Argument(m.Size() == N) cfg := make3DConf(N) icycz := float32(1.0 / (cellsize[Y] * cellsize[Z])) iczcx := float32(1.0 / (cellsize[Z] * cellsize[X])) icxcy := float32(1.0 / (cellsize[X] * cellsize[Y])) prefactor := float32(1.0 / (8 * math.Pi)) k_setemergentmagneticfieldfivepoint_async(F.DevPtr(X), F.DevPtr(Y), F.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), prefactor, icycz, iczcx, icxcy, N[X], N[Y], N[Z], mesh.PBC_code(), cfg) } // Sets the vector potential A corresponding to the emergent magnetic field F such that F = ∇ × A // See hopf-vectorpotential.cu func SetVectorPotential(A, F *data.Slice, mesh *data.Mesh) { cellsize := mesh.CellSize() N := A.Size() util.Argument(F.Size() == N) cfg := make3DConf(N) k_setvectorpotential_async(A.DevPtr(X), A.DevPtr(Y), A.DevPtr(Z), F.DevPtr(X), F.DevPtr(Y), F.DevPtr(Z), float32(cellsize[Y]), N[X], N[Y], N[Z], mesh.PBC_code(), cfg) } 3-3.11.1/cuda/hopfindex-solidangle-fourier-field.cu000066400000000000000000000033201503346766200220530ustar00rootroot00000000000000// Reconstructs the full Fourier transformed field array for negative wavenumbers using 
// Expands a half-width Fourier-space field into the full Nx x Ny x Nz array.
//
// The *_partial inputs hold interleaved (real, imaginary) float pairs with
// Nx/2+1 complex entries per x-row; the outputs hold Nx complex entries per
// x-row. Entries with ix <= Nx/2 are copied as they are. Entries with
// ix > Nx/2 (the negative k_x wavenumbers) are rebuilt from the mirrored
// entry using Hermitian symmetry:
//
//     F(-k_x, -k_y, -k_z) = F(k_x, k_y, k_z)^*
//
// One thread handles one complex entry of each of the three components.
// NOTE(review): the partial layout matches what a real-to-complex FFT
// produces — confirm against the caller.
extern "C" __global__ void
solidanglefourierfield(float* __restrict__ fftFx_partial, float* __restrict__ fftFy_partial, float* __restrict__ fftFz_partial,
                       float* __restrict__ fftFx, float* __restrict__ fftFy, float* __restrict__ fftFz,
                       int Nx, int Ny, int Nz)
{
    const int ix = blockIdx.x * blockDim.x + threadIdx.x;
    const int iy = blockIdx.y * blockDim.y + threadIdx.y;
    const int iz = blockIdx.z * blockDim.z + threadIdx.z;

    // Threads outside the grid have nothing to do.
    if (ix >= Nx || iy >= Ny || iz >= Nz) {
        return;
    }

    const int halfRow = Nx/2 + 1;                   // complex entries per x-row in the partial arrays
    const int re      = 2 * ((iz*Ny + iy)*Nx + ix); // float offset of the real part in the full arrays
    const int im      = re + 1;                     // float offset of the imaginary part

    if (ix <= Nx/2) {
        // Non-negative k_x: the value is stored directly, copy it over.
        const int src = 2 * ((iz*Ny + iy)*halfRow + ix);

        fftFx[re] = fftFx_partial[src  ];
        fftFx[im] = fftFx_partial[src+1];
        fftFy[re] = fftFy_partial[src  ];
        fftFy[im] = fftFy_partial[src+1];
        fftFz[re] = fftFz_partial[src  ];
        fftFz[im] = fftFz_partial[src+1];
        return;
    }

    // Negative k_x: look up the entry at (-k_x, -k_y, -k_z), wrapping index 0
    // onto itself, and store its complex conjugate.
    const int mx  = (Nx - ix) % Nx;
    const int my  = (Ny - iy) % Ny;
    const int mz  = (Nz - iz) % Nz;
    const int src = 2 * ((mz*Ny + my)*halfRow + mx);

    fftFx[re] =  fftFx_partial[src  ];
    fftFx[im] = -fftFx_partial[src+1];
    fftFy[re] =  fftFy_partial[src  ];
    fftFy[im] = -fftFy_partial[src+1];
    fftFz[re] =  fftFz_partial[src  ];
    fftFz[im] = -fftFz_partial[src+1];
}
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for solidanglefourierfield kernel var solidanglefourierfield_code cu.Function // Stores the arguments for solidanglefourierfield kernel invocation type solidanglefourierfield_args_t struct { arg_fftFx_partial unsafe.Pointer arg_fftFy_partial unsafe.Pointer arg_fftFz_partial unsafe.Pointer arg_fftFx unsafe.Pointer arg_fftFy unsafe.Pointer arg_fftFz unsafe.Pointer arg_Nx int arg_Ny int arg_Nz int argptr [9]unsafe.Pointer sync.Mutex } // Stores the arguments for solidanglefourierfield kernel invocation var solidanglefourierfield_args solidanglefourierfield_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. solidanglefourierfield_args.argptr[0] = unsafe.Pointer(&solidanglefourierfield_args.arg_fftFx_partial) solidanglefourierfield_args.argptr[1] = unsafe.Pointer(&solidanglefourierfield_args.arg_fftFy_partial) solidanglefourierfield_args.argptr[2] = unsafe.Pointer(&solidanglefourierfield_args.arg_fftFz_partial) solidanglefourierfield_args.argptr[3] = unsafe.Pointer(&solidanglefourierfield_args.arg_fftFx) solidanglefourierfield_args.argptr[4] = unsafe.Pointer(&solidanglefourierfield_args.arg_fftFy) solidanglefourierfield_args.argptr[5] = unsafe.Pointer(&solidanglefourierfield_args.arg_fftFz) solidanglefourierfield_args.argptr[6] = unsafe.Pointer(&solidanglefourierfield_args.arg_Nx) solidanglefourierfield_args.argptr[7] = unsafe.Pointer(&solidanglefourierfield_args.arg_Ny) solidanglefourierfield_args.argptr[8] = unsafe.Pointer(&solidanglefourierfield_args.arg_Nz) } // Wrapper for solidanglefourierfield CUDA kernel, asynchronous. 
func k_solidanglefourierfield_async(fftFx_partial unsafe.Pointer, fftFy_partial unsafe.Pointer, fftFz_partial unsafe.Pointer, fftFx unsafe.Pointer, fftFy unsafe.Pointer, fftFz unsafe.Pointer, Nx int, Ny int, Nz int, cfg *config) { if Synchronous { // debug Sync() timer.Start("solidanglefourierfield") } solidanglefourierfield_args.Lock() defer solidanglefourierfield_args.Unlock() if solidanglefourierfield_code == 0 { solidanglefourierfield_code = fatbinLoad(solidanglefourierfield_map, "solidanglefourierfield") } solidanglefourierfield_args.arg_fftFx_partial = fftFx_partial solidanglefourierfield_args.arg_fftFy_partial = fftFy_partial solidanglefourierfield_args.arg_fftFz_partial = fftFz_partial solidanglefourierfield_args.arg_fftFx = fftFx solidanglefourierfield_args.arg_fftFy = fftFy solidanglefourierfield_args.arg_fftFz = fftFz solidanglefourierfield_args.arg_Nx = Nx solidanglefourierfield_args.arg_Ny = Ny solidanglefourierfield_args.arg_Nz = Nz args := solidanglefourierfield_args.argptr[:] cu.LaunchKernel(solidanglefourierfield_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("solidanglefourierfield") } } // maps compute capability on PTX code for solidanglefourierfield kernel. var solidanglefourierfield_map = map[int]string{0: "", 50: solidanglefourierfield_ptx_50, 52: solidanglefourierfield_ptx_52, 53: solidanglefourierfield_ptx_53, 60: solidanglefourierfield_ptx_60, 61: solidanglefourierfield_ptx_61, 62: solidanglefourierfield_ptx_62, 70: solidanglefourierfield_ptx_70, 72: solidanglefourierfield_ptx_72, 75: solidanglefourierfield_ptx_75, 80: solidanglefourierfield_ptx_80, 86: solidanglefourierfield_ptx_86, 87: solidanglefourierfield_ptx_87, 89: solidanglefourierfield_ptx_89, 90: solidanglefourierfield_ptx_90} // solidanglefourierfield PTX code for various compute capabilities. 
const ( solidanglefourierfield_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl solidanglefourierfield .visible .entry solidanglefourierfield( .param .u64 solidanglefourierfield_param_0, .param .u64 solidanglefourierfield_param_1, .param .u64 solidanglefourierfield_param_2, .param .u64 solidanglefourierfield_param_3, .param .u64 solidanglefourierfield_param_4, .param .u64 solidanglefourierfield_param_5, .param .u32 solidanglefourierfield_param_6, .param .u32 solidanglefourierfield_param_7, .param .u32 solidanglefourierfield_param_8 ) { .reg .pred %p<7>; .reg .f32 %f<16>; .reg .b32 %r<34>; .reg .b64 %rd<25>; ld.param.u64 %rd10, [solidanglefourierfield_param_0]; ld.param.u64 %rd11, [solidanglefourierfield_param_1]; ld.param.u64 %rd12, [solidanglefourierfield_param_2]; ld.param.u64 %rd7, [solidanglefourierfield_param_3]; ld.param.u64 %rd8, [solidanglefourierfield_param_4]; ld.param.u64 %rd9, [solidanglefourierfield_param_5]; ld.param.u32 %r6, [solidanglefourierfield_param_6]; ld.param.u32 %r7, [solidanglefourierfield_param_7]; ld.param.u32 %r8, [solidanglefourierfield_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; mad.lo.s32 %r4, %r3, %r7, %r2; mad.lo.s32 %r18, %r4, %r6, %r1; shl.b32 %r19, %r18, 1; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; add.s32 %r5, %r22, 1; setp.gt.s32 %p6, %r1, %r22; cvta.to.global.u64 %rd13, %rd7; mul.wide.s32 %rd14, %r19, 4; add.s64 %rd4, %rd13, %rd14; cvta.to.global.u64 %rd15, %rd8; add.s64 %rd5, 
%rd15, %rd14; cvta.to.global.u64 %rd16, %rd9; add.s64 %rd6, %rd16, %rd14; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: sub.s32 %r25, %r6, %r1; rem.s32 %r26, %r25, %r6; sub.s32 %r27, %r7, %r2; rem.s32 %r28, %r27, %r7; sub.s32 %r29, %r8, %r3; rem.s32 %r30, %r29, %r8; mad.lo.s32 %r31, %r30, %r7, %r28; mad.lo.s32 %r32, %r31, %r5, %r26; shl.b32 %r33, %r32, 1; mul.wide.s32 %rd21, %r33, 4; add.s64 %rd22, %rd3, %rd21; ld.global.nc.f32 %f7, [%rd22]; st.global.f32 [%rd4], %f7; ld.global.nc.f32 %f8, [%rd22+4]; neg.f32 %f9, %f8; st.global.f32 [%rd4+4], %f9; add.s64 %rd23, %rd2, %rd21; ld.global.nc.f32 %f10, [%rd23]; st.global.f32 [%rd5], %f10; ld.global.nc.f32 %f11, [%rd23+4]; neg.f32 %f12, %f11; st.global.f32 [%rd5+4], %f12; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f13, [%rd24]; st.global.f32 [%rd6], %f13; ld.global.nc.f32 %f14, [%rd24+4]; neg.f32 %f15, %f14; st.global.f32 [%rd6+4], %f15; bra.uni $L__BB0_4; $L__BB0_2: mad.lo.s32 %r23, %r4, %r5, %r1; shl.b32 %r24, %r23, 1; mul.wide.s32 %rd17, %r24, 4; add.s64 %rd18, %rd3, %rd17; ld.global.nc.f32 %f1, [%rd18]; st.global.f32 [%rd4], %f1; ld.global.nc.f32 %f2, [%rd18+4]; st.global.f32 [%rd4+4], %f2; add.s64 %rd19, %rd2, %rd17; ld.global.nc.f32 %f3, [%rd19]; st.global.f32 [%rd5], %f3; ld.global.nc.f32 %f4, [%rd19+4]; st.global.f32 [%rd5+4], %f4; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f5, [%rd20]; st.global.f32 [%rd6], %f5; ld.global.nc.f32 %f6, [%rd20+4]; st.global.f32 [%rd6+4], %f6; $L__BB0_4: ret; } ` solidanglefourierfield_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl solidanglefourierfield .visible .entry solidanglefourierfield( .param .u64 solidanglefourierfield_param_0, .param .u64 solidanglefourierfield_param_1, .param .u64 solidanglefourierfield_param_2, .param .u64 solidanglefourierfield_param_3, .param .u64 solidanglefourierfield_param_4, .param .u64 solidanglefourierfield_param_5, .param .u32 solidanglefourierfield_param_6, .param .u32 solidanglefourierfield_param_7, .param .u32 
solidanglefourierfield_param_8 ) { .reg .pred %p<7>; .reg .f32 %f<16>; .reg .b32 %r<34>; .reg .b64 %rd<25>; ld.param.u64 %rd10, [solidanglefourierfield_param_0]; ld.param.u64 %rd11, [solidanglefourierfield_param_1]; ld.param.u64 %rd12, [solidanglefourierfield_param_2]; ld.param.u64 %rd7, [solidanglefourierfield_param_3]; ld.param.u64 %rd8, [solidanglefourierfield_param_4]; ld.param.u64 %rd9, [solidanglefourierfield_param_5]; ld.param.u32 %r6, [solidanglefourierfield_param_6]; ld.param.u32 %r7, [solidanglefourierfield_param_7]; ld.param.u32 %r8, [solidanglefourierfield_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; mad.lo.s32 %r4, %r3, %r7, %r2; mad.lo.s32 %r18, %r4, %r6, %r1; shl.b32 %r19, %r18, 1; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; add.s32 %r5, %r22, 1; setp.gt.s32 %p6, %r1, %r22; cvta.to.global.u64 %rd13, %rd7; mul.wide.s32 %rd14, %r19, 4; add.s64 %rd4, %rd13, %rd14; cvta.to.global.u64 %rd15, %rd8; add.s64 %rd5, %rd15, %rd14; cvta.to.global.u64 %rd16, %rd9; add.s64 %rd6, %rd16, %rd14; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: sub.s32 %r25, %r6, %r1; rem.s32 %r26, %r25, %r6; sub.s32 %r27, %r7, %r2; rem.s32 %r28, %r27, %r7; sub.s32 %r29, %r8, %r3; rem.s32 %r30, %r29, %r8; mad.lo.s32 %r31, %r30, %r7, %r28; mad.lo.s32 %r32, %r31, %r5, %r26; shl.b32 %r33, %r32, 1; mul.wide.s32 %rd21, %r33, 4; add.s64 %rd22, %rd3, %rd21; ld.global.nc.f32 %f7, [%rd22]; st.global.f32 [%rd4], %f7; ld.global.nc.f32 %f8, [%rd22+4]; neg.f32 %f9, 
%f8; st.global.f32 [%rd4+4], %f9; add.s64 %rd23, %rd2, %rd21; ld.global.nc.f32 %f10, [%rd23]; st.global.f32 [%rd5], %f10; ld.global.nc.f32 %f11, [%rd23+4]; neg.f32 %f12, %f11; st.global.f32 [%rd5+4], %f12; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f13, [%rd24]; st.global.f32 [%rd6], %f13; ld.global.nc.f32 %f14, [%rd24+4]; neg.f32 %f15, %f14; st.global.f32 [%rd6+4], %f15; bra.uni $L__BB0_4; $L__BB0_2: mad.lo.s32 %r23, %r4, %r5, %r1; shl.b32 %r24, %r23, 1; mul.wide.s32 %rd17, %r24, 4; add.s64 %rd18, %rd3, %rd17; ld.global.nc.f32 %f1, [%rd18]; st.global.f32 [%rd4], %f1; ld.global.nc.f32 %f2, [%rd18+4]; st.global.f32 [%rd4+4], %f2; add.s64 %rd19, %rd2, %rd17; ld.global.nc.f32 %f3, [%rd19]; st.global.f32 [%rd5], %f3; ld.global.nc.f32 %f4, [%rd19+4]; st.global.f32 [%rd5+4], %f4; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f5, [%rd20]; st.global.f32 [%rd6], %f5; ld.global.nc.f32 %f6, [%rd20+4]; st.global.f32 [%rd6+4], %f6; $L__BB0_4: ret; } ` solidanglefourierfield_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl solidanglefourierfield .visible .entry solidanglefourierfield( .param .u64 solidanglefourierfield_param_0, .param .u64 solidanglefourierfield_param_1, .param .u64 solidanglefourierfield_param_2, .param .u64 solidanglefourierfield_param_3, .param .u64 solidanglefourierfield_param_4, .param .u64 solidanglefourierfield_param_5, .param .u32 solidanglefourierfield_param_6, .param .u32 solidanglefourierfield_param_7, .param .u32 solidanglefourierfield_param_8 ) { .reg .pred %p<7>; .reg .f32 %f<16>; .reg .b32 %r<34>; .reg .b64 %rd<25>; ld.param.u64 %rd10, [solidanglefourierfield_param_0]; ld.param.u64 %rd11, [solidanglefourierfield_param_1]; ld.param.u64 %rd12, [solidanglefourierfield_param_2]; ld.param.u64 %rd7, [solidanglefourierfield_param_3]; ld.param.u64 %rd8, [solidanglefourierfield_param_4]; ld.param.u64 %rd9, [solidanglefourierfield_param_5]; ld.param.u32 %r6, [solidanglefourierfield_param_6]; ld.param.u32 %r7, 
[solidanglefourierfield_param_7]; ld.param.u32 %r8, [solidanglefourierfield_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; mad.lo.s32 %r4, %r3, %r7, %r2; mad.lo.s32 %r18, %r4, %r6, %r1; shl.b32 %r19, %r18, 1; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; add.s32 %r5, %r22, 1; setp.gt.s32 %p6, %r1, %r22; cvta.to.global.u64 %rd13, %rd7; mul.wide.s32 %rd14, %r19, 4; add.s64 %rd4, %rd13, %rd14; cvta.to.global.u64 %rd15, %rd8; add.s64 %rd5, %rd15, %rd14; cvta.to.global.u64 %rd16, %rd9; add.s64 %rd6, %rd16, %rd14; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: sub.s32 %r25, %r6, %r1; rem.s32 %r26, %r25, %r6; sub.s32 %r27, %r7, %r2; rem.s32 %r28, %r27, %r7; sub.s32 %r29, %r8, %r3; rem.s32 %r30, %r29, %r8; mad.lo.s32 %r31, %r30, %r7, %r28; mad.lo.s32 %r32, %r31, %r5, %r26; shl.b32 %r33, %r32, 1; mul.wide.s32 %rd21, %r33, 4; add.s64 %rd22, %rd3, %rd21; ld.global.nc.f32 %f7, [%rd22]; st.global.f32 [%rd4], %f7; ld.global.nc.f32 %f8, [%rd22+4]; neg.f32 %f9, %f8; st.global.f32 [%rd4+4], %f9; add.s64 %rd23, %rd2, %rd21; ld.global.nc.f32 %f10, [%rd23]; st.global.f32 [%rd5], %f10; ld.global.nc.f32 %f11, [%rd23+4]; neg.f32 %f12, %f11; st.global.f32 [%rd5+4], %f12; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f13, [%rd24]; st.global.f32 [%rd6], %f13; ld.global.nc.f32 %f14, [%rd24+4]; neg.f32 %f15, %f14; st.global.f32 [%rd6+4], %f15; bra.uni $L__BB0_4; $L__BB0_2: mad.lo.s32 %r23, %r4, %r5, %r1; shl.b32 %r24, %r23, 1; mul.wide.s32 %rd17, %r24, 4; 
add.s64 %rd18, %rd3, %rd17; ld.global.nc.f32 %f1, [%rd18]; st.global.f32 [%rd4], %f1; ld.global.nc.f32 %f2, [%rd18+4]; st.global.f32 [%rd4+4], %f2; add.s64 %rd19, %rd2, %rd17; ld.global.nc.f32 %f3, [%rd19]; st.global.f32 [%rd5], %f3; ld.global.nc.f32 %f4, [%rd19+4]; st.global.f32 [%rd5+4], %f4; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f5, [%rd20]; st.global.f32 [%rd6], %f5; ld.global.nc.f32 %f6, [%rd20+4]; st.global.f32 [%rd6+4], %f6; $L__BB0_4: ret; } ` solidanglefourierfield_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl solidanglefourierfield .visible .entry solidanglefourierfield( .param .u64 solidanglefourierfield_param_0, .param .u64 solidanglefourierfield_param_1, .param .u64 solidanglefourierfield_param_2, .param .u64 solidanglefourierfield_param_3, .param .u64 solidanglefourierfield_param_4, .param .u64 solidanglefourierfield_param_5, .param .u32 solidanglefourierfield_param_6, .param .u32 solidanglefourierfield_param_7, .param .u32 solidanglefourierfield_param_8 ) { .reg .pred %p<7>; .reg .f32 %f<16>; .reg .b32 %r<34>; .reg .b64 %rd<25>; ld.param.u64 %rd10, [solidanglefourierfield_param_0]; ld.param.u64 %rd11, [solidanglefourierfield_param_1]; ld.param.u64 %rd12, [solidanglefourierfield_param_2]; ld.param.u64 %rd7, [solidanglefourierfield_param_3]; ld.param.u64 %rd8, [solidanglefourierfield_param_4]; ld.param.u64 %rd9, [solidanglefourierfield_param_5]; ld.param.u32 %r6, [solidanglefourierfield_param_6]; ld.param.u32 %r7, [solidanglefourierfield_param_7]; ld.param.u32 %r8, [solidanglefourierfield_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r6; 
setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; mad.lo.s32 %r4, %r3, %r7, %r2; mad.lo.s32 %r18, %r4, %r6, %r1; shl.b32 %r19, %r18, 1; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; add.s32 %r5, %r22, 1; setp.gt.s32 %p6, %r1, %r22; cvta.to.global.u64 %rd13, %rd7; mul.wide.s32 %rd14, %r19, 4; add.s64 %rd4, %rd13, %rd14; cvta.to.global.u64 %rd15, %rd8; add.s64 %rd5, %rd15, %rd14; cvta.to.global.u64 %rd16, %rd9; add.s64 %rd6, %rd16, %rd14; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: sub.s32 %r25, %r6, %r1; rem.s32 %r26, %r25, %r6; sub.s32 %r27, %r7, %r2; rem.s32 %r28, %r27, %r7; sub.s32 %r29, %r8, %r3; rem.s32 %r30, %r29, %r8; mad.lo.s32 %r31, %r30, %r7, %r28; mad.lo.s32 %r32, %r31, %r5, %r26; shl.b32 %r33, %r32, 1; mul.wide.s32 %rd21, %r33, 4; add.s64 %rd22, %rd3, %rd21; ld.global.nc.f32 %f7, [%rd22]; st.global.f32 [%rd4], %f7; ld.global.nc.f32 %f8, [%rd22+4]; neg.f32 %f9, %f8; st.global.f32 [%rd4+4], %f9; add.s64 %rd23, %rd2, %rd21; ld.global.nc.f32 %f10, [%rd23]; st.global.f32 [%rd5], %f10; ld.global.nc.f32 %f11, [%rd23+4]; neg.f32 %f12, %f11; st.global.f32 [%rd5+4], %f12; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f13, [%rd24]; st.global.f32 [%rd6], %f13; ld.global.nc.f32 %f14, [%rd24+4]; neg.f32 %f15, %f14; st.global.f32 [%rd6+4], %f15; bra.uni $L__BB0_4; $L__BB0_2: mad.lo.s32 %r23, %r4, %r5, %r1; shl.b32 %r24, %r23, 1; mul.wide.s32 %rd17, %r24, 4; add.s64 %rd18, %rd3, %rd17; ld.global.nc.f32 %f1, [%rd18]; st.global.f32 [%rd4], %f1; ld.global.nc.f32 %f2, [%rd18+4]; st.global.f32 [%rd4+4], %f2; add.s64 %rd19, %rd2, %rd17; ld.global.nc.f32 %f3, [%rd19]; st.global.f32 [%rd5], %f3; ld.global.nc.f32 %f4, [%rd19+4]; st.global.f32 [%rd5+4], %f4; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f5, [%rd20]; st.global.f32 [%rd6], %f5; ld.global.nc.f32 %f6, [%rd20+4]; st.global.f32 [%rd6+4], %f6; $L__BB0_4: ret; } ` solidanglefourierfield_ptx_61 = ` .version 8.5 .target 
sm_61 .address_size 64 // .globl solidanglefourierfield .visible .entry solidanglefourierfield( .param .u64 solidanglefourierfield_param_0, .param .u64 solidanglefourierfield_param_1, .param .u64 solidanglefourierfield_param_2, .param .u64 solidanglefourierfield_param_3, .param .u64 solidanglefourierfield_param_4, .param .u64 solidanglefourierfield_param_5, .param .u32 solidanglefourierfield_param_6, .param .u32 solidanglefourierfield_param_7, .param .u32 solidanglefourierfield_param_8 ) { .reg .pred %p<7>; .reg .f32 %f<16>; .reg .b32 %r<34>; .reg .b64 %rd<25>; ld.param.u64 %rd10, [solidanglefourierfield_param_0]; ld.param.u64 %rd11, [solidanglefourierfield_param_1]; ld.param.u64 %rd12, [solidanglefourierfield_param_2]; ld.param.u64 %rd7, [solidanglefourierfield_param_3]; ld.param.u64 %rd8, [solidanglefourierfield_param_4]; ld.param.u64 %rd9, [solidanglefourierfield_param_5]; ld.param.u32 %r6, [solidanglefourierfield_param_6]; ld.param.u32 %r7, [solidanglefourierfield_param_7]; ld.param.u32 %r8, [solidanglefourierfield_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; mad.lo.s32 %r4, %r3, %r7, %r2; mad.lo.s32 %r18, %r4, %r6, %r1; shl.b32 %r19, %r18, 1; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; add.s32 %r5, %r22, 1; setp.gt.s32 %p6, %r1, %r22; cvta.to.global.u64 %rd13, %rd7; mul.wide.s32 %rd14, %r19, 4; add.s64 %rd4, %rd13, %rd14; cvta.to.global.u64 %rd15, %rd8; add.s64 %rd5, %rd15, %rd14; cvta.to.global.u64 %rd16, %rd9; add.s64 %rd6, 
%rd16, %rd14; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: sub.s32 %r25, %r6, %r1; rem.s32 %r26, %r25, %r6; sub.s32 %r27, %r7, %r2; rem.s32 %r28, %r27, %r7; sub.s32 %r29, %r8, %r3; rem.s32 %r30, %r29, %r8; mad.lo.s32 %r31, %r30, %r7, %r28; mad.lo.s32 %r32, %r31, %r5, %r26; shl.b32 %r33, %r32, 1; mul.wide.s32 %rd21, %r33, 4; add.s64 %rd22, %rd3, %rd21; ld.global.nc.f32 %f7, [%rd22]; st.global.f32 [%rd4], %f7; ld.global.nc.f32 %f8, [%rd22+4]; neg.f32 %f9, %f8; st.global.f32 [%rd4+4], %f9; add.s64 %rd23, %rd2, %rd21; ld.global.nc.f32 %f10, [%rd23]; st.global.f32 [%rd5], %f10; ld.global.nc.f32 %f11, [%rd23+4]; neg.f32 %f12, %f11; st.global.f32 [%rd5+4], %f12; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f13, [%rd24]; st.global.f32 [%rd6], %f13; ld.global.nc.f32 %f14, [%rd24+4]; neg.f32 %f15, %f14; st.global.f32 [%rd6+4], %f15; bra.uni $L__BB0_4; $L__BB0_2: mad.lo.s32 %r23, %r4, %r5, %r1; shl.b32 %r24, %r23, 1; mul.wide.s32 %rd17, %r24, 4; add.s64 %rd18, %rd3, %rd17; ld.global.nc.f32 %f1, [%rd18]; st.global.f32 [%rd4], %f1; ld.global.nc.f32 %f2, [%rd18+4]; st.global.f32 [%rd4+4], %f2; add.s64 %rd19, %rd2, %rd17; ld.global.nc.f32 %f3, [%rd19]; st.global.f32 [%rd5], %f3; ld.global.nc.f32 %f4, [%rd19+4]; st.global.f32 [%rd5+4], %f4; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f5, [%rd20]; st.global.f32 [%rd6], %f5; ld.global.nc.f32 %f6, [%rd20+4]; st.global.f32 [%rd6+4], %f6; $L__BB0_4: ret; } ` solidanglefourierfield_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl solidanglefourierfield .visible .entry solidanglefourierfield( .param .u64 solidanglefourierfield_param_0, .param .u64 solidanglefourierfield_param_1, .param .u64 solidanglefourierfield_param_2, .param .u64 solidanglefourierfield_param_3, .param .u64 solidanglefourierfield_param_4, .param .u64 solidanglefourierfield_param_5, .param .u32 solidanglefourierfield_param_6, .param .u32 solidanglefourierfield_param_7, .param .u32 solidanglefourierfield_param_8 ) { .reg .pred %p<7>; .reg .f32 
%f<16>; .reg .b32 %r<34>; .reg .b64 %rd<25>; ld.param.u64 %rd10, [solidanglefourierfield_param_0]; ld.param.u64 %rd11, [solidanglefourierfield_param_1]; ld.param.u64 %rd12, [solidanglefourierfield_param_2]; ld.param.u64 %rd7, [solidanglefourierfield_param_3]; ld.param.u64 %rd8, [solidanglefourierfield_param_4]; ld.param.u64 %rd9, [solidanglefourierfield_param_5]; ld.param.u32 %r6, [solidanglefourierfield_param_6]; ld.param.u32 %r7, [solidanglefourierfield_param_7]; ld.param.u32 %r8, [solidanglefourierfield_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; mad.lo.s32 %r4, %r3, %r7, %r2; mad.lo.s32 %r18, %r4, %r6, %r1; shl.b32 %r19, %r18, 1; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; add.s32 %r5, %r22, 1; setp.gt.s32 %p6, %r1, %r22; cvta.to.global.u64 %rd13, %rd7; mul.wide.s32 %rd14, %r19, 4; add.s64 %rd4, %rd13, %rd14; cvta.to.global.u64 %rd15, %rd8; add.s64 %rd5, %rd15, %rd14; cvta.to.global.u64 %rd16, %rd9; add.s64 %rd6, %rd16, %rd14; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: sub.s32 %r25, %r6, %r1; rem.s32 %r26, %r25, %r6; sub.s32 %r27, %r7, %r2; rem.s32 %r28, %r27, %r7; sub.s32 %r29, %r8, %r3; rem.s32 %r30, %r29, %r8; mad.lo.s32 %r31, %r30, %r7, %r28; mad.lo.s32 %r32, %r31, %r5, %r26; shl.b32 %r33, %r32, 1; mul.wide.s32 %rd21, %r33, 4; add.s64 %rd22, %rd3, %rd21; ld.global.nc.f32 %f7, [%rd22]; st.global.f32 [%rd4], %f7; ld.global.nc.f32 %f8, [%rd22+4]; neg.f32 %f9, %f8; st.global.f32 [%rd4+4], %f9; add.s64 %rd23, %rd2, %rd21; 
ld.global.nc.f32 %f10, [%rd23]; st.global.f32 [%rd5], %f10; ld.global.nc.f32 %f11, [%rd23+4]; neg.f32 %f12, %f11; st.global.f32 [%rd5+4], %f12; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f13, [%rd24]; st.global.f32 [%rd6], %f13; ld.global.nc.f32 %f14, [%rd24+4]; neg.f32 %f15, %f14; st.global.f32 [%rd6+4], %f15; bra.uni $L__BB0_4; $L__BB0_2: mad.lo.s32 %r23, %r4, %r5, %r1; shl.b32 %r24, %r23, 1; mul.wide.s32 %rd17, %r24, 4; add.s64 %rd18, %rd3, %rd17; ld.global.nc.f32 %f1, [%rd18]; st.global.f32 [%rd4], %f1; ld.global.nc.f32 %f2, [%rd18+4]; st.global.f32 [%rd4+4], %f2; add.s64 %rd19, %rd2, %rd17; ld.global.nc.f32 %f3, [%rd19]; st.global.f32 [%rd5], %f3; ld.global.nc.f32 %f4, [%rd19+4]; st.global.f32 [%rd5+4], %f4; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f5, [%rd20]; st.global.f32 [%rd6], %f5; ld.global.nc.f32 %f6, [%rd20+4]; st.global.f32 [%rd6+4], %f6; $L__BB0_4: ret; } ` solidanglefourierfield_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl solidanglefourierfield .visible .entry solidanglefourierfield( .param .u64 solidanglefourierfield_param_0, .param .u64 solidanglefourierfield_param_1, .param .u64 solidanglefourierfield_param_2, .param .u64 solidanglefourierfield_param_3, .param .u64 solidanglefourierfield_param_4, .param .u64 solidanglefourierfield_param_5, .param .u32 solidanglefourierfield_param_6, .param .u32 solidanglefourierfield_param_7, .param .u32 solidanglefourierfield_param_8 ) { .reg .pred %p<7>; .reg .f32 %f<16>; .reg .b32 %r<34>; .reg .b64 %rd<25>; ld.param.u64 %rd10, [solidanglefourierfield_param_0]; ld.param.u64 %rd11, [solidanglefourierfield_param_1]; ld.param.u64 %rd12, [solidanglefourierfield_param_2]; ld.param.u64 %rd7, [solidanglefourierfield_param_3]; ld.param.u64 %rd8, [solidanglefourierfield_param_4]; ld.param.u64 %rd9, [solidanglefourierfield_param_5]; ld.param.u32 %r6, [solidanglefourierfield_param_6]; ld.param.u32 %r7, [solidanglefourierfield_param_7]; ld.param.u32 %r8, [solidanglefourierfield_param_8]; 
cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; mad.lo.s32 %r4, %r3, %r7, %r2; mad.lo.s32 %r18, %r4, %r6, %r1; shl.b32 %r19, %r18, 1; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; add.s32 %r5, %r22, 1; setp.gt.s32 %p6, %r1, %r22; cvta.to.global.u64 %rd13, %rd7; mul.wide.s32 %rd14, %r19, 4; add.s64 %rd4, %rd13, %rd14; cvta.to.global.u64 %rd15, %rd8; add.s64 %rd5, %rd15, %rd14; cvta.to.global.u64 %rd16, %rd9; add.s64 %rd6, %rd16, %rd14; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: sub.s32 %r25, %r6, %r1; rem.s32 %r26, %r25, %r6; sub.s32 %r27, %r7, %r2; rem.s32 %r28, %r27, %r7; sub.s32 %r29, %r8, %r3; rem.s32 %r30, %r29, %r8; mad.lo.s32 %r31, %r30, %r7, %r28; mad.lo.s32 %r32, %r31, %r5, %r26; shl.b32 %r33, %r32, 1; mul.wide.s32 %rd21, %r33, 4; add.s64 %rd22, %rd3, %rd21; ld.global.nc.f32 %f7, [%rd22]; st.global.f32 [%rd4], %f7; ld.global.nc.f32 %f8, [%rd22+4]; neg.f32 %f9, %f8; st.global.f32 [%rd4+4], %f9; add.s64 %rd23, %rd2, %rd21; ld.global.nc.f32 %f10, [%rd23]; st.global.f32 [%rd5], %f10; ld.global.nc.f32 %f11, [%rd23+4]; neg.f32 %f12, %f11; st.global.f32 [%rd5+4], %f12; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f13, [%rd24]; st.global.f32 [%rd6], %f13; ld.global.nc.f32 %f14, [%rd24+4]; neg.f32 %f15, %f14; st.global.f32 [%rd6+4], %f15; bra.uni $L__BB0_4; $L__BB0_2: mad.lo.s32 %r23, %r4, %r5, %r1; shl.b32 %r24, %r23, 1; mul.wide.s32 %rd17, %r24, 4; add.s64 %rd18, %rd3, %rd17; ld.global.nc.f32 %f1, [%rd18]; st.global.f32 [%rd4], %f1; 
ld.global.nc.f32 %f2, [%rd18+4]; st.global.f32 [%rd4+4], %f2; add.s64 %rd19, %rd2, %rd17; ld.global.nc.f32 %f3, [%rd19]; st.global.f32 [%rd5], %f3; ld.global.nc.f32 %f4, [%rd19+4]; st.global.f32 [%rd5+4], %f4; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f5, [%rd20]; st.global.f32 [%rd6], %f5; ld.global.nc.f32 %f6, [%rd20+4]; st.global.f32 [%rd6+4], %f6; $L__BB0_4: ret; } ` solidanglefourierfield_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl solidanglefourierfield .visible .entry solidanglefourierfield( .param .u64 solidanglefourierfield_param_0, .param .u64 solidanglefourierfield_param_1, .param .u64 solidanglefourierfield_param_2, .param .u64 solidanglefourierfield_param_3, .param .u64 solidanglefourierfield_param_4, .param .u64 solidanglefourierfield_param_5, .param .u32 solidanglefourierfield_param_6, .param .u32 solidanglefourierfield_param_7, .param .u32 solidanglefourierfield_param_8 ) { .reg .pred %p<7>; .reg .f32 %f<16>; .reg .b32 %r<34>; .reg .b64 %rd<25>; ld.param.u64 %rd10, [solidanglefourierfield_param_0]; ld.param.u64 %rd11, [solidanglefourierfield_param_1]; ld.param.u64 %rd12, [solidanglefourierfield_param_2]; ld.param.u64 %rd7, [solidanglefourierfield_param_3]; ld.param.u64 %rd8, [solidanglefourierfield_param_4]; ld.param.u64 %rd9, [solidanglefourierfield_param_5]; ld.param.u32 %r6, [solidanglefourierfield_param_6]; ld.param.u32 %r7, [solidanglefourierfield_param_7]; ld.param.u32 %r8, [solidanglefourierfield_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, 
%p3, %p4; @%p5 bra $L__BB0_4; mad.lo.s32 %r4, %r3, %r7, %r2; mad.lo.s32 %r18, %r4, %r6, %r1; shl.b32 %r19, %r18, 1; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; add.s32 %r5, %r22, 1; setp.gt.s32 %p6, %r1, %r22; cvta.to.global.u64 %rd13, %rd7; mul.wide.s32 %rd14, %r19, 4; add.s64 %rd4, %rd13, %rd14; cvta.to.global.u64 %rd15, %rd8; add.s64 %rd5, %rd15, %rd14; cvta.to.global.u64 %rd16, %rd9; add.s64 %rd6, %rd16, %rd14; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: sub.s32 %r25, %r6, %r1; rem.s32 %r26, %r25, %r6; sub.s32 %r27, %r7, %r2; rem.s32 %r28, %r27, %r7; sub.s32 %r29, %r8, %r3; rem.s32 %r30, %r29, %r8; mad.lo.s32 %r31, %r30, %r7, %r28; mad.lo.s32 %r32, %r31, %r5, %r26; shl.b32 %r33, %r32, 1; mul.wide.s32 %rd21, %r33, 4; add.s64 %rd22, %rd3, %rd21; ld.global.nc.f32 %f7, [%rd22]; st.global.f32 [%rd4], %f7; ld.global.nc.f32 %f8, [%rd22+4]; neg.f32 %f9, %f8; st.global.f32 [%rd4+4], %f9; add.s64 %rd23, %rd2, %rd21; ld.global.nc.f32 %f10, [%rd23]; st.global.f32 [%rd5], %f10; ld.global.nc.f32 %f11, [%rd23+4]; neg.f32 %f12, %f11; st.global.f32 [%rd5+4], %f12; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f13, [%rd24]; st.global.f32 [%rd6], %f13; ld.global.nc.f32 %f14, [%rd24+4]; neg.f32 %f15, %f14; st.global.f32 [%rd6+4], %f15; bra.uni $L__BB0_4; $L__BB0_2: mad.lo.s32 %r23, %r4, %r5, %r1; shl.b32 %r24, %r23, 1; mul.wide.s32 %rd17, %r24, 4; add.s64 %rd18, %rd3, %rd17; ld.global.nc.f32 %f1, [%rd18]; st.global.f32 [%rd4], %f1; ld.global.nc.f32 %f2, [%rd18+4]; st.global.f32 [%rd4+4], %f2; add.s64 %rd19, %rd2, %rd17; ld.global.nc.f32 %f3, [%rd19]; st.global.f32 [%rd5], %f3; ld.global.nc.f32 %f4, [%rd19+4]; st.global.f32 [%rd5+4], %f4; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f5, [%rd20]; st.global.f32 [%rd6], %f5; ld.global.nc.f32 %f6, [%rd20+4]; st.global.f32 [%rd6+4], %f6; $L__BB0_4: ret; } ` solidanglefourierfield_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl solidanglefourierfield .visible .entry 
solidanglefourierfield( .param .u64 solidanglefourierfield_param_0, .param .u64 solidanglefourierfield_param_1, .param .u64 solidanglefourierfield_param_2, .param .u64 solidanglefourierfield_param_3, .param .u64 solidanglefourierfield_param_4, .param .u64 solidanglefourierfield_param_5, .param .u32 solidanglefourierfield_param_6, .param .u32 solidanglefourierfield_param_7, .param .u32 solidanglefourierfield_param_8 ) { .reg .pred %p<7>; .reg .f32 %f<16>; .reg .b32 %r<34>; .reg .b64 %rd<25>; ld.param.u64 %rd10, [solidanglefourierfield_param_0]; ld.param.u64 %rd11, [solidanglefourierfield_param_1]; ld.param.u64 %rd12, [solidanglefourierfield_param_2]; ld.param.u64 %rd7, [solidanglefourierfield_param_3]; ld.param.u64 %rd8, [solidanglefourierfield_param_4]; ld.param.u64 %rd9, [solidanglefourierfield_param_5]; ld.param.u32 %r6, [solidanglefourierfield_param_6]; ld.param.u32 %r7, [solidanglefourierfield_param_7]; ld.param.u32 %r8, [solidanglefourierfield_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; mad.lo.s32 %r4, %r3, %r7, %r2; mad.lo.s32 %r18, %r4, %r6, %r1; shl.b32 %r19, %r18, 1; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; add.s32 %r5, %r22, 1; setp.gt.s32 %p6, %r1, %r22; cvta.to.global.u64 %rd13, %rd7; mul.wide.s32 %rd14, %r19, 4; add.s64 %rd4, %rd13, %rd14; cvta.to.global.u64 %rd15, %rd8; add.s64 %rd5, %rd15, %rd14; cvta.to.global.u64 %rd16, %rd9; add.s64 %rd6, %rd16, %rd14; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: sub.s32 
%r25, %r6, %r1; rem.s32 %r26, %r25, %r6; sub.s32 %r27, %r7, %r2; rem.s32 %r28, %r27, %r7; sub.s32 %r29, %r8, %r3; rem.s32 %r30, %r29, %r8; mad.lo.s32 %r31, %r30, %r7, %r28; mad.lo.s32 %r32, %r31, %r5, %r26; shl.b32 %r33, %r32, 1; mul.wide.s32 %rd21, %r33, 4; add.s64 %rd22, %rd3, %rd21; ld.global.nc.f32 %f7, [%rd22]; st.global.f32 [%rd4], %f7; ld.global.nc.f32 %f8, [%rd22+4]; neg.f32 %f9, %f8; st.global.f32 [%rd4+4], %f9; add.s64 %rd23, %rd2, %rd21; ld.global.nc.f32 %f10, [%rd23]; st.global.f32 [%rd5], %f10; ld.global.nc.f32 %f11, [%rd23+4]; neg.f32 %f12, %f11; st.global.f32 [%rd5+4], %f12; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f13, [%rd24]; st.global.f32 [%rd6], %f13; ld.global.nc.f32 %f14, [%rd24+4]; neg.f32 %f15, %f14; st.global.f32 [%rd6+4], %f15; bra.uni $L__BB0_4; $L__BB0_2: mad.lo.s32 %r23, %r4, %r5, %r1; shl.b32 %r24, %r23, 1; mul.wide.s32 %rd17, %r24, 4; add.s64 %rd18, %rd3, %rd17; ld.global.nc.f32 %f1, [%rd18]; st.global.f32 [%rd4], %f1; ld.global.nc.f32 %f2, [%rd18+4]; st.global.f32 [%rd4+4], %f2; add.s64 %rd19, %rd2, %rd17; ld.global.nc.f32 %f3, [%rd19]; st.global.f32 [%rd5], %f3; ld.global.nc.f32 %f4, [%rd19+4]; st.global.f32 [%rd5+4], %f4; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f5, [%rd20]; st.global.f32 [%rd6], %f5; ld.global.nc.f32 %f6, [%rd20+4]; st.global.f32 [%rd6+4], %f6; $L__BB0_4: ret; } ` solidanglefourierfield_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl solidanglefourierfield .visible .entry solidanglefourierfield( .param .u64 solidanglefourierfield_param_0, .param .u64 solidanglefourierfield_param_1, .param .u64 solidanglefourierfield_param_2, .param .u64 solidanglefourierfield_param_3, .param .u64 solidanglefourierfield_param_4, .param .u64 solidanglefourierfield_param_5, .param .u32 solidanglefourierfield_param_6, .param .u32 solidanglefourierfield_param_7, .param .u32 solidanglefourierfield_param_8 ) { .reg .pred %p<7>; .reg .f32 %f<16>; .reg .b32 %r<34>; .reg .b64 %rd<25>; ld.param.u64 %rd10, 
[solidanglefourierfield_param_0]; ld.param.u64 %rd11, [solidanglefourierfield_param_1]; ld.param.u64 %rd12, [solidanglefourierfield_param_2]; ld.param.u64 %rd7, [solidanglefourierfield_param_3]; ld.param.u64 %rd8, [solidanglefourierfield_param_4]; ld.param.u64 %rd9, [solidanglefourierfield_param_5]; ld.param.u32 %r6, [solidanglefourierfield_param_6]; ld.param.u32 %r7, [solidanglefourierfield_param_7]; ld.param.u32 %r8, [solidanglefourierfield_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; mad.lo.s32 %r4, %r3, %r7, %r2; mad.lo.s32 %r18, %r4, %r6, %r1; shl.b32 %r19, %r18, 1; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; add.s32 %r5, %r22, 1; setp.gt.s32 %p6, %r1, %r22; cvta.to.global.u64 %rd13, %rd7; mul.wide.s32 %rd14, %r19, 4; add.s64 %rd4, %rd13, %rd14; cvta.to.global.u64 %rd15, %rd8; add.s64 %rd5, %rd15, %rd14; cvta.to.global.u64 %rd16, %rd9; add.s64 %rd6, %rd16, %rd14; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: sub.s32 %r25, %r6, %r1; rem.s32 %r26, %r25, %r6; sub.s32 %r27, %r7, %r2; rem.s32 %r28, %r27, %r7; sub.s32 %r29, %r8, %r3; rem.s32 %r30, %r29, %r8; mad.lo.s32 %r31, %r30, %r7, %r28; mad.lo.s32 %r32, %r31, %r5, %r26; shl.b32 %r33, %r32, 1; mul.wide.s32 %rd21, %r33, 4; add.s64 %rd22, %rd3, %rd21; ld.global.nc.f32 %f7, [%rd22]; st.global.f32 [%rd4], %f7; ld.global.nc.f32 %f8, [%rd22+4]; neg.f32 %f9, %f8; st.global.f32 [%rd4+4], %f9; add.s64 %rd23, %rd2, %rd21; ld.global.nc.f32 %f10, [%rd23]; st.global.f32 [%rd5], %f10; 
ld.global.nc.f32 %f11, [%rd23+4]; neg.f32 %f12, %f11; st.global.f32 [%rd5+4], %f12; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f13, [%rd24]; st.global.f32 [%rd6], %f13; ld.global.nc.f32 %f14, [%rd24+4]; neg.f32 %f15, %f14; st.global.f32 [%rd6+4], %f15; bra.uni $L__BB0_4; $L__BB0_2: mad.lo.s32 %r23, %r4, %r5, %r1; shl.b32 %r24, %r23, 1; mul.wide.s32 %rd17, %r24, 4; add.s64 %rd18, %rd3, %rd17; ld.global.nc.f32 %f1, [%rd18]; st.global.f32 [%rd4], %f1; ld.global.nc.f32 %f2, [%rd18+4]; st.global.f32 [%rd4+4], %f2; add.s64 %rd19, %rd2, %rd17; ld.global.nc.f32 %f3, [%rd19]; st.global.f32 [%rd5], %f3; ld.global.nc.f32 %f4, [%rd19+4]; st.global.f32 [%rd5+4], %f4; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f5, [%rd20]; st.global.f32 [%rd6], %f5; ld.global.nc.f32 %f6, [%rd20+4]; st.global.f32 [%rd6+4], %f6; $L__BB0_4: ret; } ` solidanglefourierfield_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl solidanglefourierfield .visible .entry solidanglefourierfield( .param .u64 solidanglefourierfield_param_0, .param .u64 solidanglefourierfield_param_1, .param .u64 solidanglefourierfield_param_2, .param .u64 solidanglefourierfield_param_3, .param .u64 solidanglefourierfield_param_4, .param .u64 solidanglefourierfield_param_5, .param .u32 solidanglefourierfield_param_6, .param .u32 solidanglefourierfield_param_7, .param .u32 solidanglefourierfield_param_8 ) { .reg .pred %p<7>; .reg .f32 %f<16>; .reg .b32 %r<34>; .reg .b64 %rd<25>; ld.param.u64 %rd10, [solidanglefourierfield_param_0]; ld.param.u64 %rd11, [solidanglefourierfield_param_1]; ld.param.u64 %rd12, [solidanglefourierfield_param_2]; ld.param.u64 %rd7, [solidanglefourierfield_param_3]; ld.param.u64 %rd8, [solidanglefourierfield_param_4]; ld.param.u64 %rd9, [solidanglefourierfield_param_5]; ld.param.u32 %r6, [solidanglefourierfield_param_6]; ld.param.u32 %r7, [solidanglefourierfield_param_7]; ld.param.u32 %r8, [solidanglefourierfield_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; 
cvta.to.global.u64 %rd3, %rd10; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; mad.lo.s32 %r4, %r3, %r7, %r2; mad.lo.s32 %r18, %r4, %r6, %r1; shl.b32 %r19, %r18, 1; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; add.s32 %r5, %r22, 1; setp.gt.s32 %p6, %r1, %r22; cvta.to.global.u64 %rd13, %rd7; mul.wide.s32 %rd14, %r19, 4; add.s64 %rd4, %rd13, %rd14; cvta.to.global.u64 %rd15, %rd8; add.s64 %rd5, %rd15, %rd14; cvta.to.global.u64 %rd16, %rd9; add.s64 %rd6, %rd16, %rd14; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: sub.s32 %r25, %r6, %r1; rem.s32 %r26, %r25, %r6; sub.s32 %r27, %r7, %r2; rem.s32 %r28, %r27, %r7; sub.s32 %r29, %r8, %r3; rem.s32 %r30, %r29, %r8; mad.lo.s32 %r31, %r30, %r7, %r28; mad.lo.s32 %r32, %r31, %r5, %r26; shl.b32 %r33, %r32, 1; mul.wide.s32 %rd21, %r33, 4; add.s64 %rd22, %rd3, %rd21; ld.global.nc.f32 %f7, [%rd22]; st.global.f32 [%rd4], %f7; ld.global.nc.f32 %f8, [%rd22+4]; neg.f32 %f9, %f8; st.global.f32 [%rd4+4], %f9; add.s64 %rd23, %rd2, %rd21; ld.global.nc.f32 %f10, [%rd23]; st.global.f32 [%rd5], %f10; ld.global.nc.f32 %f11, [%rd23+4]; neg.f32 %f12, %f11; st.global.f32 [%rd5+4], %f12; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f13, [%rd24]; st.global.f32 [%rd6], %f13; ld.global.nc.f32 %f14, [%rd24+4]; neg.f32 %f15, %f14; st.global.f32 [%rd6+4], %f15; bra.uni $L__BB0_4; $L__BB0_2: mad.lo.s32 %r23, %r4, %r5, %r1; shl.b32 %r24, %r23, 1; mul.wide.s32 %rd17, %r24, 4; add.s64 %rd18, %rd3, %rd17; ld.global.nc.f32 %f1, [%rd18]; st.global.f32 [%rd4], %f1; ld.global.nc.f32 %f2, [%rd18+4]; st.global.f32 [%rd4+4], %f2; add.s64 
%rd19, %rd2, %rd17; ld.global.nc.f32 %f3, [%rd19]; st.global.f32 [%rd5], %f3; ld.global.nc.f32 %f4, [%rd19+4]; st.global.f32 [%rd5+4], %f4; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f5, [%rd20]; st.global.f32 [%rd6], %f5; ld.global.nc.f32 %f6, [%rd20+4]; st.global.f32 [%rd6+4], %f6; $L__BB0_4: ret; } ` solidanglefourierfield_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl solidanglefourierfield .visible .entry solidanglefourierfield( .param .u64 solidanglefourierfield_param_0, .param .u64 solidanglefourierfield_param_1, .param .u64 solidanglefourierfield_param_2, .param .u64 solidanglefourierfield_param_3, .param .u64 solidanglefourierfield_param_4, .param .u64 solidanglefourierfield_param_5, .param .u32 solidanglefourierfield_param_6, .param .u32 solidanglefourierfield_param_7, .param .u32 solidanglefourierfield_param_8 ) { .reg .pred %p<7>; .reg .f32 %f<16>; .reg .b32 %r<34>; .reg .b64 %rd<25>; ld.param.u64 %rd10, [solidanglefourierfield_param_0]; ld.param.u64 %rd11, [solidanglefourierfield_param_1]; ld.param.u64 %rd12, [solidanglefourierfield_param_2]; ld.param.u64 %rd7, [solidanglefourierfield_param_3]; ld.param.u64 %rd8, [solidanglefourierfield_param_4]; ld.param.u64 %rd9, [solidanglefourierfield_param_5]; ld.param.u32 %r6, [solidanglefourierfield_param_6]; ld.param.u32 %r7, [solidanglefourierfield_param_7]; ld.param.u32 %r8, [solidanglefourierfield_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; mad.lo.s32 %r4, %r3, %r7, %r2; mad.lo.s32 
%r18, %r4, %r6, %r1; shl.b32 %r19, %r18, 1; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; add.s32 %r5, %r22, 1; setp.gt.s32 %p6, %r1, %r22; cvta.to.global.u64 %rd13, %rd7; mul.wide.s32 %rd14, %r19, 4; add.s64 %rd4, %rd13, %rd14; cvta.to.global.u64 %rd15, %rd8; add.s64 %rd5, %rd15, %rd14; cvta.to.global.u64 %rd16, %rd9; add.s64 %rd6, %rd16, %rd14; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: sub.s32 %r25, %r6, %r1; rem.s32 %r26, %r25, %r6; sub.s32 %r27, %r7, %r2; rem.s32 %r28, %r27, %r7; sub.s32 %r29, %r8, %r3; rem.s32 %r30, %r29, %r8; mad.lo.s32 %r31, %r30, %r7, %r28; mad.lo.s32 %r32, %r31, %r5, %r26; shl.b32 %r33, %r32, 1; mul.wide.s32 %rd21, %r33, 4; add.s64 %rd22, %rd3, %rd21; ld.global.nc.f32 %f7, [%rd22]; st.global.f32 [%rd4], %f7; ld.global.nc.f32 %f8, [%rd22+4]; neg.f32 %f9, %f8; st.global.f32 [%rd4+4], %f9; add.s64 %rd23, %rd2, %rd21; ld.global.nc.f32 %f10, [%rd23]; st.global.f32 [%rd5], %f10; ld.global.nc.f32 %f11, [%rd23+4]; neg.f32 %f12, %f11; st.global.f32 [%rd5+4], %f12; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f13, [%rd24]; st.global.f32 [%rd6], %f13; ld.global.nc.f32 %f14, [%rd24+4]; neg.f32 %f15, %f14; st.global.f32 [%rd6+4], %f15; bra.uni $L__BB0_4; $L__BB0_2: mad.lo.s32 %r23, %r4, %r5, %r1; shl.b32 %r24, %r23, 1; mul.wide.s32 %rd17, %r24, 4; add.s64 %rd18, %rd3, %rd17; ld.global.nc.f32 %f1, [%rd18]; st.global.f32 [%rd4], %f1; ld.global.nc.f32 %f2, [%rd18+4]; st.global.f32 [%rd4+4], %f2; add.s64 %rd19, %rd2, %rd17; ld.global.nc.f32 %f3, [%rd19]; st.global.f32 [%rd5], %f3; ld.global.nc.f32 %f4, [%rd19+4]; st.global.f32 [%rd5+4], %f4; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f5, [%rd20]; st.global.f32 [%rd6], %f5; ld.global.nc.f32 %f6, [%rd20+4]; st.global.f32 [%rd6+4], %f6; $L__BB0_4: ret; } ` solidanglefourierfield_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl solidanglefourierfield .visible .entry solidanglefourierfield( .param .u64 solidanglefourierfield_param_0, .param .u64 
solidanglefourierfield_param_1, .param .u64 solidanglefourierfield_param_2, .param .u64 solidanglefourierfield_param_3, .param .u64 solidanglefourierfield_param_4, .param .u64 solidanglefourierfield_param_5, .param .u32 solidanglefourierfield_param_6, .param .u32 solidanglefourierfield_param_7, .param .u32 solidanglefourierfield_param_8 ) { .reg .pred %p<7>; .reg .f32 %f<16>; .reg .b32 %r<34>; .reg .b64 %rd<25>; ld.param.u64 %rd10, [solidanglefourierfield_param_0]; ld.param.u64 %rd11, [solidanglefourierfield_param_1]; ld.param.u64 %rd12, [solidanglefourierfield_param_2]; ld.param.u64 %rd7, [solidanglefourierfield_param_3]; ld.param.u64 %rd8, [solidanglefourierfield_param_4]; ld.param.u64 %rd9, [solidanglefourierfield_param_5]; ld.param.u32 %r6, [solidanglefourierfield_param_6]; ld.param.u32 %r7, [solidanglefourierfield_param_7]; ld.param.u32 %r8, [solidanglefourierfield_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; mad.lo.s32 %r4, %r3, %r7, %r2; mad.lo.s32 %r18, %r4, %r6, %r1; shl.b32 %r19, %r18, 1; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; add.s32 %r5, %r22, 1; setp.gt.s32 %p6, %r1, %r22; cvta.to.global.u64 %rd13, %rd7; mul.wide.s32 %rd14, %r19, 4; add.s64 %rd4, %rd13, %rd14; cvta.to.global.u64 %rd15, %rd8; add.s64 %rd5, %rd15, %rd14; cvta.to.global.u64 %rd16, %rd9; add.s64 %rd6, %rd16, %rd14; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: sub.s32 %r25, %r6, %r1; rem.s32 %r26, %r25, %r6; sub.s32 %r27, %r7, %r2; rem.s32 %r28, 
%r27, %r7; sub.s32 %r29, %r8, %r3; rem.s32 %r30, %r29, %r8; mad.lo.s32 %r31, %r30, %r7, %r28; mad.lo.s32 %r32, %r31, %r5, %r26; shl.b32 %r33, %r32, 1; mul.wide.s32 %rd21, %r33, 4; add.s64 %rd22, %rd3, %rd21; ld.global.nc.f32 %f7, [%rd22]; st.global.f32 [%rd4], %f7; ld.global.nc.f32 %f8, [%rd22+4]; neg.f32 %f9, %f8; st.global.f32 [%rd4+4], %f9; add.s64 %rd23, %rd2, %rd21; ld.global.nc.f32 %f10, [%rd23]; st.global.f32 [%rd5], %f10; ld.global.nc.f32 %f11, [%rd23+4]; neg.f32 %f12, %f11; st.global.f32 [%rd5+4], %f12; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f13, [%rd24]; st.global.f32 [%rd6], %f13; ld.global.nc.f32 %f14, [%rd24+4]; neg.f32 %f15, %f14; st.global.f32 [%rd6+4], %f15; bra.uni $L__BB0_4; $L__BB0_2: mad.lo.s32 %r23, %r4, %r5, %r1; shl.b32 %r24, %r23, 1; mul.wide.s32 %rd17, %r24, 4; add.s64 %rd18, %rd3, %rd17; ld.global.nc.f32 %f1, [%rd18]; st.global.f32 [%rd4], %f1; ld.global.nc.f32 %f2, [%rd18+4]; st.global.f32 [%rd4+4], %f2; add.s64 %rd19, %rd2, %rd17; ld.global.nc.f32 %f3, [%rd19]; st.global.f32 [%rd5], %f3; ld.global.nc.f32 %f4, [%rd19+4]; st.global.f32 [%rd5+4], %f4; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f5, [%rd20]; st.global.f32 [%rd6], %f5; ld.global.nc.f32 %f6, [%rd20+4]; st.global.f32 [%rd6+4], %f6; $L__BB0_4: ret; } ` solidanglefourierfield_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl solidanglefourierfield .visible .entry solidanglefourierfield( .param .u64 solidanglefourierfield_param_0, .param .u64 solidanglefourierfield_param_1, .param .u64 solidanglefourierfield_param_2, .param .u64 solidanglefourierfield_param_3, .param .u64 solidanglefourierfield_param_4, .param .u64 solidanglefourierfield_param_5, .param .u32 solidanglefourierfield_param_6, .param .u32 solidanglefourierfield_param_7, .param .u32 solidanglefourierfield_param_8 ) { .reg .pred %p<7>; .reg .f32 %f<16>; .reg .b32 %r<34>; .reg .b64 %rd<25>; ld.param.u64 %rd10, [solidanglefourierfield_param_0]; ld.param.u64 %rd11, [solidanglefourierfield_param_1]; 
ld.param.u64 %rd12, [solidanglefourierfield_param_2]; ld.param.u64 %rd7, [solidanglefourierfield_param_3]; ld.param.u64 %rd8, [solidanglefourierfield_param_4]; ld.param.u64 %rd9, [solidanglefourierfield_param_5]; ld.param.u32 %r6, [solidanglefourierfield_param_6]; ld.param.u32 %r7, [solidanglefourierfield_param_7]; ld.param.u32 %r8, [solidanglefourierfield_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; mad.lo.s32 %r4, %r3, %r7, %r2; mad.lo.s32 %r18, %r4, %r6, %r1; shl.b32 %r19, %r18, 1; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; add.s32 %r5, %r22, 1; setp.gt.s32 %p6, %r1, %r22; cvta.to.global.u64 %rd13, %rd7; mul.wide.s32 %rd14, %r19, 4; add.s64 %rd4, %rd13, %rd14; cvta.to.global.u64 %rd15, %rd8; add.s64 %rd5, %rd15, %rd14; cvta.to.global.u64 %rd16, %rd9; add.s64 %rd6, %rd16, %rd14; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: sub.s32 %r25, %r6, %r1; rem.s32 %r26, %r25, %r6; sub.s32 %r27, %r7, %r2; rem.s32 %r28, %r27, %r7; sub.s32 %r29, %r8, %r3; rem.s32 %r30, %r29, %r8; mad.lo.s32 %r31, %r30, %r7, %r28; mad.lo.s32 %r32, %r31, %r5, %r26; shl.b32 %r33, %r32, 1; mul.wide.s32 %rd21, %r33, 4; add.s64 %rd22, %rd3, %rd21; ld.global.nc.f32 %f7, [%rd22]; st.global.f32 [%rd4], %f7; ld.global.nc.f32 %f8, [%rd22+4]; neg.f32 %f9, %f8; st.global.f32 [%rd4+4], %f9; add.s64 %rd23, %rd2, %rd21; ld.global.nc.f32 %f10, [%rd23]; st.global.f32 [%rd5], %f10; ld.global.nc.f32 %f11, [%rd23+4]; neg.f32 %f12, %f11; st.global.f32 [%rd5+4], %f12; add.s64 
%rd24, %rd1, %rd21; ld.global.nc.f32 %f13, [%rd24]; st.global.f32 [%rd6], %f13; ld.global.nc.f32 %f14, [%rd24+4]; neg.f32 %f15, %f14; st.global.f32 [%rd6+4], %f15; bra.uni $L__BB0_4; $L__BB0_2: mad.lo.s32 %r23, %r4, %r5, %r1; shl.b32 %r24, %r23, 1; mul.wide.s32 %rd17, %r24, 4; add.s64 %rd18, %rd3, %rd17; ld.global.nc.f32 %f1, [%rd18]; st.global.f32 [%rd4], %f1; ld.global.nc.f32 %f2, [%rd18+4]; st.global.f32 [%rd4+4], %f2; add.s64 %rd19, %rd2, %rd17; ld.global.nc.f32 %f3, [%rd19]; st.global.f32 [%rd5], %f3; ld.global.nc.f32 %f4, [%rd19+4]; st.global.f32 [%rd5+4], %f4; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f5, [%rd20]; st.global.f32 [%rd6], %f5; ld.global.nc.f32 %f6, [%rd20+4]; st.global.f32 [%rd6+4], %f6; $L__BB0_4: ret; } ` ) 3-3.11.1/cuda/hopfindex-solidangle-fourier-scale.cu000066400000000000000000000013471503346766200220660ustar00rootroot00000000000000// Rescale the effective field to a unit system where the cell spacing = 1 extern "C" __global__ void scaleemergentfield(float* __restrict__ Fx_scale, float* __restrict__ Fy_scale, float* __restrict__ Fz_scale, float* __restrict__ Fx, float* __restrict__ Fy, float* __restrict__ Fz, float cx, float cy, float cz, int Nx, int Ny, int Nz) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if(ix>= Nx || iy>= Ny || iz>=Nz) { return; } int I = (iz*Ny + iy)*Nx + ix; Fx_scale[I] = Fx[I] * cy * cz; Fy_scale[I] = Fy[I] * cx * cz; Fz_scale[I] = Fz[I] * cx * cy; } 3-3.11.1/cuda/hopfindex-solidangle-fourier-scale_wrapper.go000066400000000000000000001204561503346766200236270ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for scaleemergentfield kernel var scaleemergentfield_code cu.Function // Stores the arguments for scaleemergentfield kernel invocation type scaleemergentfield_args_t struct { arg_Fx_scale unsafe.Pointer arg_Fy_scale unsafe.Pointer arg_Fz_scale unsafe.Pointer arg_Fx unsafe.Pointer arg_Fy unsafe.Pointer arg_Fz unsafe.Pointer arg_cx float32 arg_cy float32 arg_cz float32 arg_Nx int arg_Ny int arg_Nz int argptr [12]unsafe.Pointer sync.Mutex } // Stores the arguments for scaleemergentfield kernel invocation var scaleemergentfield_args scaleemergentfield_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. scaleemergentfield_args.argptr[0] = unsafe.Pointer(&scaleemergentfield_args.arg_Fx_scale) scaleemergentfield_args.argptr[1] = unsafe.Pointer(&scaleemergentfield_args.arg_Fy_scale) scaleemergentfield_args.argptr[2] = unsafe.Pointer(&scaleemergentfield_args.arg_Fz_scale) scaleemergentfield_args.argptr[3] = unsafe.Pointer(&scaleemergentfield_args.arg_Fx) scaleemergentfield_args.argptr[4] = unsafe.Pointer(&scaleemergentfield_args.arg_Fy) scaleemergentfield_args.argptr[5] = unsafe.Pointer(&scaleemergentfield_args.arg_Fz) scaleemergentfield_args.argptr[6] = unsafe.Pointer(&scaleemergentfield_args.arg_cx) scaleemergentfield_args.argptr[7] = unsafe.Pointer(&scaleemergentfield_args.arg_cy) scaleemergentfield_args.argptr[8] = unsafe.Pointer(&scaleemergentfield_args.arg_cz) scaleemergentfield_args.argptr[9] = unsafe.Pointer(&scaleemergentfield_args.arg_Nx) scaleemergentfield_args.argptr[10] = unsafe.Pointer(&scaleemergentfield_args.arg_Ny) scaleemergentfield_args.argptr[11] = unsafe.Pointer(&scaleemergentfield_args.arg_Nz) } // Wrapper for scaleemergentfield CUDA kernel, asynchronous. 
func k_scaleemergentfield_async(Fx_scale unsafe.Pointer, Fy_scale unsafe.Pointer, Fz_scale unsafe.Pointer, Fx unsafe.Pointer, Fy unsafe.Pointer, Fz unsafe.Pointer, cx float32, cy float32, cz float32, Nx int, Ny int, Nz int, cfg *config) { if Synchronous { // debug Sync() timer.Start("scaleemergentfield") } scaleemergentfield_args.Lock() defer scaleemergentfield_args.Unlock() if scaleemergentfield_code == 0 { scaleemergentfield_code = fatbinLoad(scaleemergentfield_map, "scaleemergentfield") } scaleemergentfield_args.arg_Fx_scale = Fx_scale scaleemergentfield_args.arg_Fy_scale = Fy_scale scaleemergentfield_args.arg_Fz_scale = Fz_scale scaleemergentfield_args.arg_Fx = Fx scaleemergentfield_args.arg_Fy = Fy scaleemergentfield_args.arg_Fz = Fz scaleemergentfield_args.arg_cx = cx scaleemergentfield_args.arg_cy = cy scaleemergentfield_args.arg_cz = cz scaleemergentfield_args.arg_Nx = Nx scaleemergentfield_args.arg_Ny = Ny scaleemergentfield_args.arg_Nz = Nz args := scaleemergentfield_args.argptr[:] cu.LaunchKernel(scaleemergentfield_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("scaleemergentfield") } } // maps compute capability on PTX code for scaleemergentfield kernel. var scaleemergentfield_map = map[int]string{0: "", 50: scaleemergentfield_ptx_50, 52: scaleemergentfield_ptx_52, 53: scaleemergentfield_ptx_53, 60: scaleemergentfield_ptx_60, 61: scaleemergentfield_ptx_61, 62: scaleemergentfield_ptx_62, 70: scaleemergentfield_ptx_70, 72: scaleemergentfield_ptx_72, 75: scaleemergentfield_ptx_75, 80: scaleemergentfield_ptx_80, 86: scaleemergentfield_ptx_86, 87: scaleemergentfield_ptx_87, 89: scaleemergentfield_ptx_89, 90: scaleemergentfield_ptx_90} // scaleemergentfield PTX code for various compute capabilities. 
const ( scaleemergentfield_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl scaleemergentfield .visible .entry scaleemergentfield( .param .u64 scaleemergentfield_param_0, .param .u64 scaleemergentfield_param_1, .param .u64 scaleemergentfield_param_2, .param .u64 scaleemergentfield_param_3, .param .u64 scaleemergentfield_param_4, .param .u64 scaleemergentfield_param_5, .param .f32 scaleemergentfield_param_6, .param .f32 scaleemergentfield_param_7, .param .f32 scaleemergentfield_param_8, .param .u32 scaleemergentfield_param_9, .param .u32 scaleemergentfield_param_10, .param .u32 scaleemergentfield_param_11 ) { .reg .pred %p<6>; .reg .f32 %f<13>; .reg .b32 %r<18>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [scaleemergentfield_param_0]; ld.param.u64 %rd2, [scaleemergentfield_param_1]; ld.param.u64 %rd3, [scaleemergentfield_param_2]; ld.param.u64 %rd4, [scaleemergentfield_param_3]; ld.param.u64 %rd5, [scaleemergentfield_param_4]; ld.param.u64 %rd6, [scaleemergentfield_param_5]; ld.param.f32 %f1, [scaleemergentfield_param_6]; ld.param.f32 %f2, [scaleemergentfield_param_7]; ld.param.f32 %f3, [scaleemergentfield_param_8]; ld.param.u32 %r4, [scaleemergentfield_param_9]; ld.param.u32 %r5, [scaleemergentfield_param_10]; ld.param.u32 %r6, [scaleemergentfield_param_11]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd8, %r17, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; mul.f32 %f5, %f4, %f2; mul.f32 %f6, %f5, %f3; cvta.to.global.u64 %rd10, %rd1; 
add.s64 %rd11, %rd10, %rd8; st.global.f32 [%rd11], %f6; cvta.to.global.u64 %rd12, %rd5; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f7, [%rd13]; mul.f32 %f8, %f7, %f1; mul.f32 %f9, %f8, %f3; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd8; st.global.f32 [%rd15], %f9; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f10, [%rd17]; mul.f32 %f11, %f10, %f1; mul.f32 %f12, %f11, %f2; cvta.to.global.u64 %rd18, %rd3; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f12; $L__BB0_2: ret; } ` scaleemergentfield_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl scaleemergentfield .visible .entry scaleemergentfield( .param .u64 scaleemergentfield_param_0, .param .u64 scaleemergentfield_param_1, .param .u64 scaleemergentfield_param_2, .param .u64 scaleemergentfield_param_3, .param .u64 scaleemergentfield_param_4, .param .u64 scaleemergentfield_param_5, .param .f32 scaleemergentfield_param_6, .param .f32 scaleemergentfield_param_7, .param .f32 scaleemergentfield_param_8, .param .u32 scaleemergentfield_param_9, .param .u32 scaleemergentfield_param_10, .param .u32 scaleemergentfield_param_11 ) { .reg .pred %p<6>; .reg .f32 %f<13>; .reg .b32 %r<18>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [scaleemergentfield_param_0]; ld.param.u64 %rd2, [scaleemergentfield_param_1]; ld.param.u64 %rd3, [scaleemergentfield_param_2]; ld.param.u64 %rd4, [scaleemergentfield_param_3]; ld.param.u64 %rd5, [scaleemergentfield_param_4]; ld.param.u64 %rd6, [scaleemergentfield_param_5]; ld.param.f32 %f1, [scaleemergentfield_param_6]; ld.param.f32 %f2, [scaleemergentfield_param_7]; ld.param.f32 %f3, [scaleemergentfield_param_8]; ld.param.u32 %r4, [scaleemergentfield_param_9]; ld.param.u32 %r5, [scaleemergentfield_param_10]; ld.param.u32 %r6, [scaleemergentfield_param_11]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, 
%r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd8, %r17, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; mul.f32 %f5, %f4, %f2; mul.f32 %f6, %f5, %f3; cvta.to.global.u64 %rd10, %rd1; add.s64 %rd11, %rd10, %rd8; st.global.f32 [%rd11], %f6; cvta.to.global.u64 %rd12, %rd5; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f7, [%rd13]; mul.f32 %f8, %f7, %f1; mul.f32 %f9, %f8, %f3; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd8; st.global.f32 [%rd15], %f9; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f10, [%rd17]; mul.f32 %f11, %f10, %f1; mul.f32 %f12, %f11, %f2; cvta.to.global.u64 %rd18, %rd3; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f12; $L__BB0_2: ret; } ` scaleemergentfield_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl scaleemergentfield .visible .entry scaleemergentfield( .param .u64 scaleemergentfield_param_0, .param .u64 scaleemergentfield_param_1, .param .u64 scaleemergentfield_param_2, .param .u64 scaleemergentfield_param_3, .param .u64 scaleemergentfield_param_4, .param .u64 scaleemergentfield_param_5, .param .f32 scaleemergentfield_param_6, .param .f32 scaleemergentfield_param_7, .param .f32 scaleemergentfield_param_8, .param .u32 scaleemergentfield_param_9, .param .u32 scaleemergentfield_param_10, .param .u32 scaleemergentfield_param_11 ) { .reg .pred %p<6>; .reg .f32 %f<13>; .reg .b32 %r<18>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [scaleemergentfield_param_0]; ld.param.u64 %rd2, [scaleemergentfield_param_1]; ld.param.u64 %rd3, [scaleemergentfield_param_2]; ld.param.u64 %rd4, [scaleemergentfield_param_3]; ld.param.u64 %rd5, [scaleemergentfield_param_4]; 
ld.param.u64 %rd6, [scaleemergentfield_param_5]; ld.param.f32 %f1, [scaleemergentfield_param_6]; ld.param.f32 %f2, [scaleemergentfield_param_7]; ld.param.f32 %f3, [scaleemergentfield_param_8]; ld.param.u32 %r4, [scaleemergentfield_param_9]; ld.param.u32 %r5, [scaleemergentfield_param_10]; ld.param.u32 %r6, [scaleemergentfield_param_11]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd8, %r17, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; mul.f32 %f5, %f4, %f2; mul.f32 %f6, %f5, %f3; cvta.to.global.u64 %rd10, %rd1; add.s64 %rd11, %rd10, %rd8; st.global.f32 [%rd11], %f6; cvta.to.global.u64 %rd12, %rd5; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f7, [%rd13]; mul.f32 %f8, %f7, %f1; mul.f32 %f9, %f8, %f3; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd8; st.global.f32 [%rd15], %f9; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f10, [%rd17]; mul.f32 %f11, %f10, %f1; mul.f32 %f12, %f11, %f2; cvta.to.global.u64 %rd18, %rd3; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f12; $L__BB0_2: ret; } ` scaleemergentfield_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl scaleemergentfield .visible .entry scaleemergentfield( .param .u64 scaleemergentfield_param_0, .param .u64 scaleemergentfield_param_1, .param .u64 scaleemergentfield_param_2, .param .u64 scaleemergentfield_param_3, .param .u64 scaleemergentfield_param_4, .param .u64 scaleemergentfield_param_5, .param .f32 scaleemergentfield_param_6, 
.param .f32 scaleemergentfield_param_7, .param .f32 scaleemergentfield_param_8, .param .u32 scaleemergentfield_param_9, .param .u32 scaleemergentfield_param_10, .param .u32 scaleemergentfield_param_11 ) { .reg .pred %p<6>; .reg .f32 %f<13>; .reg .b32 %r<18>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [scaleemergentfield_param_0]; ld.param.u64 %rd2, [scaleemergentfield_param_1]; ld.param.u64 %rd3, [scaleemergentfield_param_2]; ld.param.u64 %rd4, [scaleemergentfield_param_3]; ld.param.u64 %rd5, [scaleemergentfield_param_4]; ld.param.u64 %rd6, [scaleemergentfield_param_5]; ld.param.f32 %f1, [scaleemergentfield_param_6]; ld.param.f32 %f2, [scaleemergentfield_param_7]; ld.param.f32 %f3, [scaleemergentfield_param_8]; ld.param.u32 %r4, [scaleemergentfield_param_9]; ld.param.u32 %r5, [scaleemergentfield_param_10]; ld.param.u32 %r6, [scaleemergentfield_param_11]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd8, %r17, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; mul.f32 %f5, %f4, %f2; mul.f32 %f6, %f5, %f3; cvta.to.global.u64 %rd10, %rd1; add.s64 %rd11, %rd10, %rd8; st.global.f32 [%rd11], %f6; cvta.to.global.u64 %rd12, %rd5; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f7, [%rd13]; mul.f32 %f8, %f7, %f1; mul.f32 %f9, %f8, %f3; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd8; st.global.f32 [%rd15], %f9; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f10, [%rd17]; mul.f32 %f11, %f10, %f1; mul.f32 %f12, %f11, %f2; 
cvta.to.global.u64 %rd18, %rd3; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f12; $L__BB0_2: ret; } ` scaleemergentfield_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl scaleemergentfield .visible .entry scaleemergentfield( .param .u64 scaleemergentfield_param_0, .param .u64 scaleemergentfield_param_1, .param .u64 scaleemergentfield_param_2, .param .u64 scaleemergentfield_param_3, .param .u64 scaleemergentfield_param_4, .param .u64 scaleemergentfield_param_5, .param .f32 scaleemergentfield_param_6, .param .f32 scaleemergentfield_param_7, .param .f32 scaleemergentfield_param_8, .param .u32 scaleemergentfield_param_9, .param .u32 scaleemergentfield_param_10, .param .u32 scaleemergentfield_param_11 ) { .reg .pred %p<6>; .reg .f32 %f<13>; .reg .b32 %r<18>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [scaleemergentfield_param_0]; ld.param.u64 %rd2, [scaleemergentfield_param_1]; ld.param.u64 %rd3, [scaleemergentfield_param_2]; ld.param.u64 %rd4, [scaleemergentfield_param_3]; ld.param.u64 %rd5, [scaleemergentfield_param_4]; ld.param.u64 %rd6, [scaleemergentfield_param_5]; ld.param.f32 %f1, [scaleemergentfield_param_6]; ld.param.f32 %f2, [scaleemergentfield_param_7]; ld.param.f32 %f3, [scaleemergentfield_param_8]; ld.param.u32 %r4, [scaleemergentfield_param_9]; ld.param.u32 %r5, [scaleemergentfield_param_10]; ld.param.u32 %r6, [scaleemergentfield_param_11]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd8, %r17, 4; add.s64 %rd9, %rd7, %rd8; 
ld.global.nc.f32 %f4, [%rd9]; mul.f32 %f5, %f4, %f2; mul.f32 %f6, %f5, %f3; cvta.to.global.u64 %rd10, %rd1; add.s64 %rd11, %rd10, %rd8; st.global.f32 [%rd11], %f6; cvta.to.global.u64 %rd12, %rd5; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f7, [%rd13]; mul.f32 %f8, %f7, %f1; mul.f32 %f9, %f8, %f3; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd8; st.global.f32 [%rd15], %f9; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f10, [%rd17]; mul.f32 %f11, %f10, %f1; mul.f32 %f12, %f11, %f2; cvta.to.global.u64 %rd18, %rd3; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f12; $L__BB0_2: ret; } ` scaleemergentfield_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl scaleemergentfield .visible .entry scaleemergentfield( .param .u64 scaleemergentfield_param_0, .param .u64 scaleemergentfield_param_1, .param .u64 scaleemergentfield_param_2, .param .u64 scaleemergentfield_param_3, .param .u64 scaleemergentfield_param_4, .param .u64 scaleemergentfield_param_5, .param .f32 scaleemergentfield_param_6, .param .f32 scaleemergentfield_param_7, .param .f32 scaleemergentfield_param_8, .param .u32 scaleemergentfield_param_9, .param .u32 scaleemergentfield_param_10, .param .u32 scaleemergentfield_param_11 ) { .reg .pred %p<6>; .reg .f32 %f<13>; .reg .b32 %r<18>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [scaleemergentfield_param_0]; ld.param.u64 %rd2, [scaleemergentfield_param_1]; ld.param.u64 %rd3, [scaleemergentfield_param_2]; ld.param.u64 %rd4, [scaleemergentfield_param_3]; ld.param.u64 %rd5, [scaleemergentfield_param_4]; ld.param.u64 %rd6, [scaleemergentfield_param_5]; ld.param.f32 %f1, [scaleemergentfield_param_6]; ld.param.f32 %f2, [scaleemergentfield_param_7]; ld.param.f32 %f3, [scaleemergentfield_param_8]; ld.param.u32 %r4, [scaleemergentfield_param_9]; ld.param.u32 %r5, [scaleemergentfield_param_10]; ld.param.u32 %r6, [scaleemergentfield_param_11]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 
%r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd8, %r17, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; mul.f32 %f5, %f4, %f2; mul.f32 %f6, %f5, %f3; cvta.to.global.u64 %rd10, %rd1; add.s64 %rd11, %rd10, %rd8; st.global.f32 [%rd11], %f6; cvta.to.global.u64 %rd12, %rd5; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f7, [%rd13]; mul.f32 %f8, %f7, %f1; mul.f32 %f9, %f8, %f3; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd8; st.global.f32 [%rd15], %f9; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f10, [%rd17]; mul.f32 %f11, %f10, %f1; mul.f32 %f12, %f11, %f2; cvta.to.global.u64 %rd18, %rd3; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f12; $L__BB0_2: ret; } ` scaleemergentfield_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl scaleemergentfield .visible .entry scaleemergentfield( .param .u64 scaleemergentfield_param_0, .param .u64 scaleemergentfield_param_1, .param .u64 scaleemergentfield_param_2, .param .u64 scaleemergentfield_param_3, .param .u64 scaleemergentfield_param_4, .param .u64 scaleemergentfield_param_5, .param .f32 scaleemergentfield_param_6, .param .f32 scaleemergentfield_param_7, .param .f32 scaleemergentfield_param_8, .param .u32 scaleemergentfield_param_9, .param .u32 scaleemergentfield_param_10, .param .u32 scaleemergentfield_param_11 ) { .reg .pred %p<6>; .reg .f32 %f<13>; .reg .b32 %r<18>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [scaleemergentfield_param_0]; ld.param.u64 %rd2, [scaleemergentfield_param_1]; ld.param.u64 %rd3, 
[scaleemergentfield_param_2]; ld.param.u64 %rd4, [scaleemergentfield_param_3]; ld.param.u64 %rd5, [scaleemergentfield_param_4]; ld.param.u64 %rd6, [scaleemergentfield_param_5]; ld.param.f32 %f1, [scaleemergentfield_param_6]; ld.param.f32 %f2, [scaleemergentfield_param_7]; ld.param.f32 %f3, [scaleemergentfield_param_8]; ld.param.u32 %r4, [scaleemergentfield_param_9]; ld.param.u32 %r5, [scaleemergentfield_param_10]; ld.param.u32 %r6, [scaleemergentfield_param_11]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd8, %r17, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; mul.f32 %f5, %f4, %f2; mul.f32 %f6, %f5, %f3; cvta.to.global.u64 %rd10, %rd1; add.s64 %rd11, %rd10, %rd8; st.global.f32 [%rd11], %f6; cvta.to.global.u64 %rd12, %rd5; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f7, [%rd13]; mul.f32 %f8, %f7, %f1; mul.f32 %f9, %f8, %f3; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd8; st.global.f32 [%rd15], %f9; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f10, [%rd17]; mul.f32 %f11, %f10, %f1; mul.f32 %f12, %f11, %f2; cvta.to.global.u64 %rd18, %rd3; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f12; $L__BB0_2: ret; } ` scaleemergentfield_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl scaleemergentfield .visible .entry scaleemergentfield( .param .u64 scaleemergentfield_param_0, .param .u64 scaleemergentfield_param_1, .param .u64 scaleemergentfield_param_2, .param .u64 
scaleemergentfield_param_3, .param .u64 scaleemergentfield_param_4, .param .u64 scaleemergentfield_param_5, .param .f32 scaleemergentfield_param_6, .param .f32 scaleemergentfield_param_7, .param .f32 scaleemergentfield_param_8, .param .u32 scaleemergentfield_param_9, .param .u32 scaleemergentfield_param_10, .param .u32 scaleemergentfield_param_11 ) { .reg .pred %p<6>; .reg .f32 %f<13>; .reg .b32 %r<18>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [scaleemergentfield_param_0]; ld.param.u64 %rd2, [scaleemergentfield_param_1]; ld.param.u64 %rd3, [scaleemergentfield_param_2]; ld.param.u64 %rd4, [scaleemergentfield_param_3]; ld.param.u64 %rd5, [scaleemergentfield_param_4]; ld.param.u64 %rd6, [scaleemergentfield_param_5]; ld.param.f32 %f1, [scaleemergentfield_param_6]; ld.param.f32 %f2, [scaleemergentfield_param_7]; ld.param.f32 %f3, [scaleemergentfield_param_8]; ld.param.u32 %r4, [scaleemergentfield_param_9]; ld.param.u32 %r5, [scaleemergentfield_param_10]; ld.param.u32 %r6, [scaleemergentfield_param_11]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd8, %r17, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; mul.f32 %f5, %f4, %f2; mul.f32 %f6, %f5, %f3; cvta.to.global.u64 %rd10, %rd1; add.s64 %rd11, %rd10, %rd8; st.global.f32 [%rd11], %f6; cvta.to.global.u64 %rd12, %rd5; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f7, [%rd13]; mul.f32 %f8, %f7, %f1; mul.f32 %f9, %f8, %f3; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd8; st.global.f32 [%rd15], %f9; 
cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f10, [%rd17]; mul.f32 %f11, %f10, %f1; mul.f32 %f12, %f11, %f2; cvta.to.global.u64 %rd18, %rd3; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f12; $L__BB0_2: ret; } ` scaleemergentfield_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl scaleemergentfield .visible .entry scaleemergentfield( .param .u64 scaleemergentfield_param_0, .param .u64 scaleemergentfield_param_1, .param .u64 scaleemergentfield_param_2, .param .u64 scaleemergentfield_param_3, .param .u64 scaleemergentfield_param_4, .param .u64 scaleemergentfield_param_5, .param .f32 scaleemergentfield_param_6, .param .f32 scaleemergentfield_param_7, .param .f32 scaleemergentfield_param_8, .param .u32 scaleemergentfield_param_9, .param .u32 scaleemergentfield_param_10, .param .u32 scaleemergentfield_param_11 ) { .reg .pred %p<6>; .reg .f32 %f<13>; .reg .b32 %r<18>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [scaleemergentfield_param_0]; ld.param.u64 %rd2, [scaleemergentfield_param_1]; ld.param.u64 %rd3, [scaleemergentfield_param_2]; ld.param.u64 %rd4, [scaleemergentfield_param_3]; ld.param.u64 %rd5, [scaleemergentfield_param_4]; ld.param.u64 %rd6, [scaleemergentfield_param_5]; ld.param.f32 %f1, [scaleemergentfield_param_6]; ld.param.f32 %f2, [scaleemergentfield_param_7]; ld.param.f32 %f3, [scaleemergentfield_param_8]; ld.param.u32 %r4, [scaleemergentfield_param_9]; ld.param.u32 %r5, [scaleemergentfield_param_10]; ld.param.u32 %r6, [scaleemergentfield_param_11]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; 
cvta.to.global.u64 %rd7, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd8, %r17, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; mul.f32 %f5, %f4, %f2; mul.f32 %f6, %f5, %f3; cvta.to.global.u64 %rd10, %rd1; add.s64 %rd11, %rd10, %rd8; st.global.f32 [%rd11], %f6; cvta.to.global.u64 %rd12, %rd5; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f7, [%rd13]; mul.f32 %f8, %f7, %f1; mul.f32 %f9, %f8, %f3; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd8; st.global.f32 [%rd15], %f9; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f10, [%rd17]; mul.f32 %f11, %f10, %f1; mul.f32 %f12, %f11, %f2; cvta.to.global.u64 %rd18, %rd3; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f12; $L__BB0_2: ret; } ` scaleemergentfield_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl scaleemergentfield .visible .entry scaleemergentfield( .param .u64 scaleemergentfield_param_0, .param .u64 scaleemergentfield_param_1, .param .u64 scaleemergentfield_param_2, .param .u64 scaleemergentfield_param_3, .param .u64 scaleemergentfield_param_4, .param .u64 scaleemergentfield_param_5, .param .f32 scaleemergentfield_param_6, .param .f32 scaleemergentfield_param_7, .param .f32 scaleemergentfield_param_8, .param .u32 scaleemergentfield_param_9, .param .u32 scaleemergentfield_param_10, .param .u32 scaleemergentfield_param_11 ) { .reg .pred %p<6>; .reg .f32 %f<13>; .reg .b32 %r<18>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [scaleemergentfield_param_0]; ld.param.u64 %rd2, [scaleemergentfield_param_1]; ld.param.u64 %rd3, [scaleemergentfield_param_2]; ld.param.u64 %rd4, [scaleemergentfield_param_3]; ld.param.u64 %rd5, [scaleemergentfield_param_4]; ld.param.u64 %rd6, [scaleemergentfield_param_5]; ld.param.f32 %f1, [scaleemergentfield_param_6]; ld.param.f32 %f2, [scaleemergentfield_param_7]; ld.param.f32 %f3, [scaleemergentfield_param_8]; ld.param.u32 %r4, [scaleemergentfield_param_9]; ld.param.u32 %r5, 
[scaleemergentfield_param_10]; ld.param.u32 %r6, [scaleemergentfield_param_11]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd8, %r17, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; mul.f32 %f5, %f4, %f2; mul.f32 %f6, %f5, %f3; cvta.to.global.u64 %rd10, %rd1; add.s64 %rd11, %rd10, %rd8; st.global.f32 [%rd11], %f6; cvta.to.global.u64 %rd12, %rd5; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f7, [%rd13]; mul.f32 %f8, %f7, %f1; mul.f32 %f9, %f8, %f3; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd8; st.global.f32 [%rd15], %f9; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f10, [%rd17]; mul.f32 %f11, %f10, %f1; mul.f32 %f12, %f11, %f2; cvta.to.global.u64 %rd18, %rd3; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f12; $L__BB0_2: ret; } ` scaleemergentfield_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl scaleemergentfield .visible .entry scaleemergentfield( .param .u64 scaleemergentfield_param_0, .param .u64 scaleemergentfield_param_1, .param .u64 scaleemergentfield_param_2, .param .u64 scaleemergentfield_param_3, .param .u64 scaleemergentfield_param_4, .param .u64 scaleemergentfield_param_5, .param .f32 scaleemergentfield_param_6, .param .f32 scaleemergentfield_param_7, .param .f32 scaleemergentfield_param_8, .param .u32 scaleemergentfield_param_9, .param .u32 scaleemergentfield_param_10, .param .u32 scaleemergentfield_param_11 ) { .reg .pred %p<6>; .reg .f32 %f<13>; .reg .b32 %r<18>; 
.reg .b64 %rd<20>; ld.param.u64 %rd1, [scaleemergentfield_param_0]; ld.param.u64 %rd2, [scaleemergentfield_param_1]; ld.param.u64 %rd3, [scaleemergentfield_param_2]; ld.param.u64 %rd4, [scaleemergentfield_param_3]; ld.param.u64 %rd5, [scaleemergentfield_param_4]; ld.param.u64 %rd6, [scaleemergentfield_param_5]; ld.param.f32 %f1, [scaleemergentfield_param_6]; ld.param.f32 %f2, [scaleemergentfield_param_7]; ld.param.f32 %f3, [scaleemergentfield_param_8]; ld.param.u32 %r4, [scaleemergentfield_param_9]; ld.param.u32 %r5, [scaleemergentfield_param_10]; ld.param.u32 %r6, [scaleemergentfield_param_11]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd8, %r17, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; mul.f32 %f5, %f4, %f2; mul.f32 %f6, %f5, %f3; cvta.to.global.u64 %rd10, %rd1; add.s64 %rd11, %rd10, %rd8; st.global.f32 [%rd11], %f6; cvta.to.global.u64 %rd12, %rd5; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f7, [%rd13]; mul.f32 %f8, %f7, %f1; mul.f32 %f9, %f8, %f3; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd8; st.global.f32 [%rd15], %f9; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f10, [%rd17]; mul.f32 %f11, %f10, %f1; mul.f32 %f12, %f11, %f2; cvta.to.global.u64 %rd18, %rd3; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f12; $L__BB0_2: ret; } ` scaleemergentfield_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl scaleemergentfield .visible .entry scaleemergentfield( .param .u64 
scaleemergentfield_param_0, .param .u64 scaleemergentfield_param_1, .param .u64 scaleemergentfield_param_2, .param .u64 scaleemergentfield_param_3, .param .u64 scaleemergentfield_param_4, .param .u64 scaleemergentfield_param_5, .param .f32 scaleemergentfield_param_6, .param .f32 scaleemergentfield_param_7, .param .f32 scaleemergentfield_param_8, .param .u32 scaleemergentfield_param_9, .param .u32 scaleemergentfield_param_10, .param .u32 scaleemergentfield_param_11 ) { .reg .pred %p<6>; .reg .f32 %f<13>; .reg .b32 %r<18>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [scaleemergentfield_param_0]; ld.param.u64 %rd2, [scaleemergentfield_param_1]; ld.param.u64 %rd3, [scaleemergentfield_param_2]; ld.param.u64 %rd4, [scaleemergentfield_param_3]; ld.param.u64 %rd5, [scaleemergentfield_param_4]; ld.param.u64 %rd6, [scaleemergentfield_param_5]; ld.param.f32 %f1, [scaleemergentfield_param_6]; ld.param.f32 %f2, [scaleemergentfield_param_7]; ld.param.f32 %f3, [scaleemergentfield_param_8]; ld.param.u32 %r4, [scaleemergentfield_param_9]; ld.param.u32 %r5, [scaleemergentfield_param_10]; ld.param.u32 %r6, [scaleemergentfield_param_11]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd8, %r17, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; mul.f32 %f5, %f4, %f2; mul.f32 %f6, %f5, %f3; cvta.to.global.u64 %rd10, %rd1; add.s64 %rd11, %rd10, %rd8; st.global.f32 [%rd11], %f6; cvta.to.global.u64 %rd12, %rd5; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f7, [%rd13]; mul.f32 %f8, 
%f7, %f1; mul.f32 %f9, %f8, %f3; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd8; st.global.f32 [%rd15], %f9; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f10, [%rd17]; mul.f32 %f11, %f10, %f1; mul.f32 %f12, %f11, %f2; cvta.to.global.u64 %rd18, %rd3; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f12; $L__BB0_2: ret; } ` scaleemergentfield_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl scaleemergentfield .visible .entry scaleemergentfield( .param .u64 scaleemergentfield_param_0, .param .u64 scaleemergentfield_param_1, .param .u64 scaleemergentfield_param_2, .param .u64 scaleemergentfield_param_3, .param .u64 scaleemergentfield_param_4, .param .u64 scaleemergentfield_param_5, .param .f32 scaleemergentfield_param_6, .param .f32 scaleemergentfield_param_7, .param .f32 scaleemergentfield_param_8, .param .u32 scaleemergentfield_param_9, .param .u32 scaleemergentfield_param_10, .param .u32 scaleemergentfield_param_11 ) { .reg .pred %p<6>; .reg .f32 %f<13>; .reg .b32 %r<18>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [scaleemergentfield_param_0]; ld.param.u64 %rd2, [scaleemergentfield_param_1]; ld.param.u64 %rd3, [scaleemergentfield_param_2]; ld.param.u64 %rd4, [scaleemergentfield_param_3]; ld.param.u64 %rd5, [scaleemergentfield_param_4]; ld.param.u64 %rd6, [scaleemergentfield_param_5]; ld.param.f32 %f1, [scaleemergentfield_param_6]; ld.param.f32 %f2, [scaleemergentfield_param_7]; ld.param.f32 %f3, [scaleemergentfield_param_8]; ld.param.u32 %r4, [scaleemergentfield_param_9]; ld.param.u32 %r5, [scaleemergentfield_param_10]; ld.param.u32 %r6, [scaleemergentfield_param_11]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, 
%r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd8, %r17, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; mul.f32 %f5, %f4, %f2; mul.f32 %f6, %f5, %f3; cvta.to.global.u64 %rd10, %rd1; add.s64 %rd11, %rd10, %rd8; st.global.f32 [%rd11], %f6; cvta.to.global.u64 %rd12, %rd5; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f7, [%rd13]; mul.f32 %f8, %f7, %f1; mul.f32 %f9, %f8, %f3; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd8; st.global.f32 [%rd15], %f9; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f10, [%rd17]; mul.f32 %f11, %f10, %f1; mul.f32 %f12, %f11, %f2; cvta.to.global.u64 %rd18, %rd3; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f12; $L__BB0_2: ret; } ` scaleemergentfield_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl scaleemergentfield .visible .entry scaleemergentfield( .param .u64 scaleemergentfield_param_0, .param .u64 scaleemergentfield_param_1, .param .u64 scaleemergentfield_param_2, .param .u64 scaleemergentfield_param_3, .param .u64 scaleemergentfield_param_4, .param .u64 scaleemergentfield_param_5, .param .f32 scaleemergentfield_param_6, .param .f32 scaleemergentfield_param_7, .param .f32 scaleemergentfield_param_8, .param .u32 scaleemergentfield_param_9, .param .u32 scaleemergentfield_param_10, .param .u32 scaleemergentfield_param_11 ) { .reg .pred %p<6>; .reg .f32 %f<13>; .reg .b32 %r<18>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [scaleemergentfield_param_0]; ld.param.u64 %rd2, [scaleemergentfield_param_1]; ld.param.u64 %rd3, [scaleemergentfield_param_2]; ld.param.u64 %rd4, [scaleemergentfield_param_3]; ld.param.u64 %rd5, [scaleemergentfield_param_4]; ld.param.u64 %rd6, [scaleemergentfield_param_5]; ld.param.f32 %f1, [scaleemergentfield_param_6]; ld.param.f32 %f2, [scaleemergentfield_param_7]; ld.param.f32 %f3, 
[scaleemergentfield_param_8]; ld.param.u32 %r4, [scaleemergentfield_param_9]; ld.param.u32 %r5, [scaleemergentfield_param_10]; ld.param.u32 %r6, [scaleemergentfield_param_11]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd8, %r17, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; mul.f32 %f5, %f4, %f2; mul.f32 %f6, %f5, %f3; cvta.to.global.u64 %rd10, %rd1; add.s64 %rd11, %rd10, %rd8; st.global.f32 [%rd11], %f6; cvta.to.global.u64 %rd12, %rd5; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f7, [%rd13]; mul.f32 %f8, %f7, %f1; mul.f32 %f9, %f8, %f3; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd8; st.global.f32 [%rd15], %f9; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f10, [%rd17]; mul.f32 %f11, %f10, %f1; mul.f32 %f12, %f11, %f2; cvta.to.global.u64 %rd18, %rd3; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f12; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/hopfindex-solidangle-fourier-summand.cu000066400000000000000000000052141503346766200224400ustar00rootroot00000000000000
#include <cuComplex.h>

// Calculates the summand F(-k) · [k × F(k)] / k^2 for every point of an
// Nx x Ny x Nz k-space grid.
//
//   summand_array  output, one float per grid point (index I below)
//   FkX_array,
//   FkY_array,
//   FkZ_array      inputs, read as interleaved (re, im) float pairs, i.e. two
//                  floats per grid point (indices 2*I and 2*I + 1)
//   Nx, Ny, Nz     grid size; threads outside the grid do nothing
//
// F(-k) is taken to be the complex conjugate of F(k) (presumably because F is
// the transform of a real-valued field -- confirm against the caller).
//
// NOTE: the PTX embedded in the cuda2go-generated wrapper must be regenerated
// after changing this file.
extern "C" __global__ void
solidanglefouriersummand(float* __restrict__ summand_array,
                         float* __restrict__ FkX_array,
                         float* __restrict__ FkY_array,
                         float* __restrict__ FkZ_array,
                         int Nx, int Ny, int Nz) {

    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    if (ix >= Nx || iy >= Ny || iz >= Nz) {
        return;
    }

    float kx = static_cast<float>(ix) / Nx;
    float ky = static_cast<float>(iy) / Ny;
    float kz = static_cast<float>(iz) / Nz;

    // Account for positive and negative frequencies: indices in the upper part
    // of each axis are shifted down by one period so that every component ends
    // up within [-1/2, 1/2]. The threshold is (N + 1) / 2 rather than N / 2:
    // both agree for even N, but for odd N (including N == 1) N / 2 would also
    // shift the highest positive frequency and yield values below -1/2
    // (e.g. k = -1 for a single-layer axis).
    if (ix >= (Nx + 1) / 2) kx -= 1.0f;
    if (iy >= (Ny + 1) / 2) ky -= 1.0f;
    if (iz >= (Nz + 1) / 2) kz -= 1.0f;

    float k2 = kx*kx + ky*ky + kz*kz;

    int I = (iz*Ny + iy)*Nx + ix;   // index into summand_array
    int e = 2 * I;                  // index of the real part in the Fk arrays

    // Avoid division by zero at kx = ky = kz = 0
    if (k2 == 0.0f) {
        summand_array[I] = 0.0f;
    } else {
        float reFkX = FkX_array[e  ];
        float reFkY = FkY_array[e  ];
        float reFkZ = FkZ_array[e  ];

        float imFkX = FkX_array[e+1];
        float imFkY = FkY_array[e+1];
        float imFkZ = FkZ_array[e+1];

        // F(-k) = conj(F(k)): same real part, negated imaginary part.
        float imFmkX = -imFkX;
        float imFmkY = -imFkY;
        float imFmkZ = -imFkZ;

        cuDoubleComplex FkX = make_cuDoubleComplex(reFkX, imFkX);
        cuDoubleComplex FkY = make_cuDoubleComplex(reFkY, imFkY);
        cuDoubleComplex FkZ = make_cuDoubleComplex(reFkZ, imFkZ);

        cuDoubleComplex FmkX = make_cuDoubleComplex(reFkX, imFmkX);
        cuDoubleComplex FmkY = make_cuDoubleComplex(reFkY, imFmkY);
        cuDoubleComplex FmkZ = make_cuDoubleComplex(reFkZ, imFmkZ);

        cuDoubleComplex kx_comp = make_cuDoubleComplex(kx, 0.0);
        cuDoubleComplex ky_comp = make_cuDoubleComplex(ky, 0.0);
        cuDoubleComplex kz_comp = make_cuDoubleComplex(kz, 0.0);

        // Imaginary part of the triple product F(-k) · [k × F(k)], evaluated
        // in double precision; the division by k^2 follows below.
        float summand = cuCimag(
            cuCadd(
                cuCadd(
                    cuCmul(FmkX, cuCsub(cuCmul(ky_comp, FkZ), cuCmul(kz_comp, FkY))),
                    cuCmul(FmkY, cuCsub(cuCmul(kz_comp, FkX), cuCmul(kx_comp, FkZ)))
                ),
                cuCmul(FmkZ, cuCsub(cuCmul(kx_comp, FkY), cuCmul(ky_comp, FkX)))
            )
        );

        summand /= k2;
        summand_array[I] = summand;
    }
}
3-3.11.1/cuda/hopfindex-solidangle-fourier-summand_wrapper.go000066400000000000000000002172471503346766200242110ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE.
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for solidanglefouriersummand kernel var solidanglefouriersummand_code cu.Function // Stores the arguments for solidanglefouriersummand kernel invocation type solidanglefouriersummand_args_t struct { arg_summand_array unsafe.Pointer arg_FkX_array unsafe.Pointer arg_FkY_array unsafe.Pointer arg_FkZ_array unsafe.Pointer arg_Nx int arg_Ny int arg_Nz int argptr [7]unsafe.Pointer sync.Mutex } // Stores the arguments for solidanglefouriersummand kernel invocation var solidanglefouriersummand_args solidanglefouriersummand_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. solidanglefouriersummand_args.argptr[0] = unsafe.Pointer(&solidanglefouriersummand_args.arg_summand_array) solidanglefouriersummand_args.argptr[1] = unsafe.Pointer(&solidanglefouriersummand_args.arg_FkX_array) solidanglefouriersummand_args.argptr[2] = unsafe.Pointer(&solidanglefouriersummand_args.arg_FkY_array) solidanglefouriersummand_args.argptr[3] = unsafe.Pointer(&solidanglefouriersummand_args.arg_FkZ_array) solidanglefouriersummand_args.argptr[4] = unsafe.Pointer(&solidanglefouriersummand_args.arg_Nx) solidanglefouriersummand_args.argptr[5] = unsafe.Pointer(&solidanglefouriersummand_args.arg_Ny) solidanglefouriersummand_args.argptr[6] = unsafe.Pointer(&solidanglefouriersummand_args.arg_Nz) } // Wrapper for solidanglefouriersummand CUDA kernel, asynchronous. 
func k_solidanglefouriersummand_async(summand_array unsafe.Pointer, FkX_array unsafe.Pointer, FkY_array unsafe.Pointer, FkZ_array unsafe.Pointer, Nx int, Ny int, Nz int, cfg *config) { if Synchronous { // debug Sync() timer.Start("solidanglefouriersummand") } solidanglefouriersummand_args.Lock() defer solidanglefouriersummand_args.Unlock() if solidanglefouriersummand_code == 0 { solidanglefouriersummand_code = fatbinLoad(solidanglefouriersummand_map, "solidanglefouriersummand") } solidanglefouriersummand_args.arg_summand_array = summand_array solidanglefouriersummand_args.arg_FkX_array = FkX_array solidanglefouriersummand_args.arg_FkY_array = FkY_array solidanglefouriersummand_args.arg_FkZ_array = FkZ_array solidanglefouriersummand_args.arg_Nx = Nx solidanglefouriersummand_args.arg_Ny = Ny solidanglefouriersummand_args.arg_Nz = Nz args := solidanglefouriersummand_args.argptr[:] cu.LaunchKernel(solidanglefouriersummand_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("solidanglefouriersummand") } } // maps compute capability on PTX code for solidanglefouriersummand kernel. var solidanglefouriersummand_map = map[int]string{0: "", 50: solidanglefouriersummand_ptx_50, 52: solidanglefouriersummand_ptx_52, 53: solidanglefouriersummand_ptx_53, 60: solidanglefouriersummand_ptx_60, 61: solidanglefouriersummand_ptx_61, 62: solidanglefouriersummand_ptx_62, 70: solidanglefouriersummand_ptx_70, 72: solidanglefouriersummand_ptx_72, 75: solidanglefouriersummand_ptx_75, 80: solidanglefouriersummand_ptx_80, 86: solidanglefouriersummand_ptx_86, 87: solidanglefouriersummand_ptx_87, 89: solidanglefouriersummand_ptx_89, 90: solidanglefouriersummand_ptx_90} // solidanglefouriersummand PTX code for various compute capabilities. 
const ( solidanglefouriersummand_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl solidanglefouriersummand .visible .entry solidanglefouriersummand( .param .u64 solidanglefouriersummand_param_0, .param .u64 solidanglefouriersummand_param_1, .param .u64 solidanglefouriersummand_param_2, .param .u64 solidanglefouriersummand_param_3, .param .u32 solidanglefouriersummand_param_4, .param .u32 solidanglefouriersummand_param_5, .param .u32 solidanglefouriersummand_param_6 ) { .reg .pred %p<10>; .reg .f32 %f<27>; .reg .b32 %r<29>; .reg .f64 %fd<54>; .reg .b64 %rd<15>; ld.param.u64 %rd2, [solidanglefouriersummand_param_0]; ld.param.u64 %rd3, [solidanglefouriersummand_param_1]; ld.param.u64 %rd4, [solidanglefouriersummand_param_2]; ld.param.u64 %rd5, [solidanglefouriersummand_param_3]; ld.param.u32 %r5, [solidanglefouriersummand_param_4]; ld.param.u32 %r6, [solidanglefouriersummand_param_5]; ld.param.u32 %r7, [solidanglefouriersummand_param_6]; mov.u32 %r8, %ntid.x; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %tid.x; mad.lo.s32 %r1, %r9, %r8, %r10; mov.u32 %r11, %ntid.y; mov.u32 %r12, %ctaid.y; mov.u32 %r13, %tid.y; mad.lo.s32 %r2, %r12, %r11, %r13; mov.u32 %r14, %ntid.z; mov.u32 %r15, %ctaid.z; mov.u32 %r16, %tid.z; mad.lo.s32 %r3, %r15, %r14, %r16; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; cvt.rn.f32.s32 %f5, %r1; cvt.rn.f32.s32 %f6, %r5; div.rn.f32 %f7, %f5, %f6; cvt.rn.f32.s32 %f8, %r6; cvt.rn.f32.s32 %f9, %r2; div.rn.f32 %f10, %f9, %f8; cvt.rn.f32.s32 %f11, %r7; cvt.rn.f32.s32 %f12, %r3; div.rn.f32 %f13, %f12, %f11; shr.u32 %r17, %r5, 31; add.s32 %r18, %r5, %r17; shr.s32 %r19, %r18, 1; setp.lt.s32 %p6, %r1, %r19; add.f32 %f14, %f7, 0fBF800000; selp.f32 %f1, %f7, %f14, %p6; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; setp.lt.s32 %p7, %r2, %r22; add.f32 %f15, %f10, 0fBF800000; selp.f32 %f2, %f10, %f15, %p7; shr.u32 %r23, %r7, 31; 
add.s32 %r24, %r7, %r23; shr.s32 %r25, %r24, 1; setp.lt.s32 %p8, %r3, %r25; add.f32 %f16, %f13, 0fBF800000; selp.f32 %f3, %f13, %f16, %p8; mul.f32 %f17, %f2, %f2; fma.rn.f32 %f18, %f1, %f1, %f17; fma.rn.f32 %f4, %f3, %f3, %f18; mad.lo.s32 %r26, %r3, %r6, %r2; mad.lo.s32 %r4, %r26, %r5, %r1; setp.eq.f32 %p9, %f4, 0f00000000; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd6, %rd7; @%p9 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: mov.u32 %r28, 0; st.global.u32 [%rd1], %r28; bra.uni $L__BB0_4; $L__BB0_2: shl.b32 %r27, %r4, 1; cvta.to.global.u64 %rd8, %rd3; mul.wide.s32 %rd9, %r27, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f19, [%rd10+4]; ld.global.nc.f32 %f20, [%rd12+4]; ld.global.nc.f32 %f21, [%rd14+4]; ld.global.nc.f32 %f22, [%rd10]; cvt.f64.f32 %fd1, %f22; cvt.f64.f32 %fd2, %f19; ld.global.nc.f32 %f23, [%rd12]; cvt.f64.f32 %fd3, %f23; cvt.f64.f32 %fd4, %f20; ld.global.nc.f32 %f24, [%rd14]; cvt.f64.f32 %fd5, %f24; cvt.f64.f32 %fd6, %f21; cvt.f64.f32 %fd7, %f2; mul.f64 %fd8, %fd7, %fd5; mul.f64 %fd9, %fd6, 0d0000000000000000; sub.f64 %fd10, %fd8, %fd9; mul.f64 %fd11, %fd7, %fd6; fma.rn.f64 %fd12, %fd5, 0d0000000000000000, %fd11; cvt.f64.f32 %fd13, %f3; mul.f64 %fd14, %fd13, %fd3; mul.f64 %fd15, %fd4, 0d0000000000000000; sub.f64 %fd16, %fd14, %fd15; mul.f64 %fd17, %fd13, %fd4; fma.rn.f64 %fd18, %fd3, 0d0000000000000000, %fd17; sub.f64 %fd19, %fd10, %fd16; sub.f64 %fd20, %fd12, %fd18; mul.f64 %fd21, %fd20, %fd1; mul.f64 %fd22, %fd19, %fd2; sub.f64 %fd23, %fd21, %fd22; mul.f64 %fd24, %fd13, %fd1; mul.f64 %fd25, %fd2, 0d0000000000000000; sub.f64 %fd26, %fd24, %fd25; mul.f64 %fd27, %fd13, %fd2; fma.rn.f64 %fd28, %fd1, 0d0000000000000000, %fd27; cvt.f64.f32 %fd29, %f1; mul.f64 %fd30, %fd29, %fd5; sub.f64 %fd31, %fd30, %fd9; mul.f64 %fd32, %fd29, %fd6; fma.rn.f64 %fd33, %fd5, 0d0000000000000000, %fd32; sub.f64 %fd34, 
%fd26, %fd31; sub.f64 %fd35, %fd28, %fd33; mul.f64 %fd36, %fd35, %fd3; mul.f64 %fd37, %fd34, %fd4; sub.f64 %fd38, %fd36, %fd37; add.f64 %fd39, %fd23, %fd38; mul.f64 %fd40, %fd29, %fd3; sub.f64 %fd41, %fd40, %fd15; mul.f64 %fd42, %fd29, %fd4; fma.rn.f64 %fd43, %fd3, 0d0000000000000000, %fd42; mul.f64 %fd44, %fd7, %fd1; sub.f64 %fd45, %fd44, %fd25; mul.f64 %fd46, %fd7, %fd2; fma.rn.f64 %fd47, %fd1, 0d0000000000000000, %fd46; sub.f64 %fd48, %fd41, %fd45; sub.f64 %fd49, %fd43, %fd47; mul.f64 %fd50, %fd49, %fd5; mul.f64 %fd51, %fd48, %fd6; sub.f64 %fd52, %fd50, %fd51; add.f64 %fd53, %fd52, %fd39; cvt.rn.f32.f64 %f25, %fd53; div.rn.f32 %f26, %f25, %f4; st.global.f32 [%rd1], %f26; $L__BB0_4: ret; } ` solidanglefouriersummand_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl solidanglefouriersummand .visible .entry solidanglefouriersummand( .param .u64 solidanglefouriersummand_param_0, .param .u64 solidanglefouriersummand_param_1, .param .u64 solidanglefouriersummand_param_2, .param .u64 solidanglefouriersummand_param_3, .param .u32 solidanglefouriersummand_param_4, .param .u32 solidanglefouriersummand_param_5, .param .u32 solidanglefouriersummand_param_6 ) { .reg .pred %p<10>; .reg .f32 %f<27>; .reg .b32 %r<29>; .reg .f64 %fd<54>; .reg .b64 %rd<15>; ld.param.u64 %rd2, [solidanglefouriersummand_param_0]; ld.param.u64 %rd3, [solidanglefouriersummand_param_1]; ld.param.u64 %rd4, [solidanglefouriersummand_param_2]; ld.param.u64 %rd5, [solidanglefouriersummand_param_3]; ld.param.u32 %r5, [solidanglefouriersummand_param_4]; ld.param.u32 %r6, [solidanglefouriersummand_param_5]; ld.param.u32 %r7, [solidanglefouriersummand_param_6]; mov.u32 %r8, %ntid.x; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %tid.x; mad.lo.s32 %r1, %r9, %r8, %r10; mov.u32 %r11, %ntid.y; mov.u32 %r12, %ctaid.y; mov.u32 %r13, %tid.y; mad.lo.s32 %r2, %r12, %r11, %r13; mov.u32 %r14, %ntid.z; mov.u32 %r15, %ctaid.z; mov.u32 %r16, %tid.z; mad.lo.s32 %r3, %r15, %r14, %r16; setp.ge.s32 %p1, %r1, %r5; 
setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; cvt.rn.f32.s32 %f5, %r1; cvt.rn.f32.s32 %f6, %r5; div.rn.f32 %f7, %f5, %f6; cvt.rn.f32.s32 %f8, %r6; cvt.rn.f32.s32 %f9, %r2; div.rn.f32 %f10, %f9, %f8; cvt.rn.f32.s32 %f11, %r7; cvt.rn.f32.s32 %f12, %r3; div.rn.f32 %f13, %f12, %f11; shr.u32 %r17, %r5, 31; add.s32 %r18, %r5, %r17; shr.s32 %r19, %r18, 1; setp.lt.s32 %p6, %r1, %r19; add.f32 %f14, %f7, 0fBF800000; selp.f32 %f1, %f7, %f14, %p6; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; setp.lt.s32 %p7, %r2, %r22; add.f32 %f15, %f10, 0fBF800000; selp.f32 %f2, %f10, %f15, %p7; shr.u32 %r23, %r7, 31; add.s32 %r24, %r7, %r23; shr.s32 %r25, %r24, 1; setp.lt.s32 %p8, %r3, %r25; add.f32 %f16, %f13, 0fBF800000; selp.f32 %f3, %f13, %f16, %p8; mul.f32 %f17, %f2, %f2; fma.rn.f32 %f18, %f1, %f1, %f17; fma.rn.f32 %f4, %f3, %f3, %f18; mad.lo.s32 %r26, %r3, %r6, %r2; mad.lo.s32 %r4, %r26, %r5, %r1; setp.eq.f32 %p9, %f4, 0f00000000; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd6, %rd7; @%p9 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: mov.u32 %r28, 0; st.global.u32 [%rd1], %r28; bra.uni $L__BB0_4; $L__BB0_2: shl.b32 %r27, %r4, 1; cvta.to.global.u64 %rd8, %rd3; mul.wide.s32 %rd9, %r27, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f19, [%rd10+4]; ld.global.nc.f32 %f20, [%rd12+4]; ld.global.nc.f32 %f21, [%rd14+4]; ld.global.nc.f32 %f22, [%rd10]; cvt.f64.f32 %fd1, %f22; cvt.f64.f32 %fd2, %f19; ld.global.nc.f32 %f23, [%rd12]; cvt.f64.f32 %fd3, %f23; cvt.f64.f32 %fd4, %f20; ld.global.nc.f32 %f24, [%rd14]; cvt.f64.f32 %fd5, %f24; cvt.f64.f32 %fd6, %f21; cvt.f64.f32 %fd7, %f2; mul.f64 %fd8, %fd7, %fd5; mul.f64 %fd9, %fd6, 0d0000000000000000; sub.f64 %fd10, %fd8, %fd9; mul.f64 %fd11, %fd7, %fd6; fma.rn.f64 %fd12, %fd5, 0d0000000000000000, %fd11; 
cvt.f64.f32 %fd13, %f3; mul.f64 %fd14, %fd13, %fd3; mul.f64 %fd15, %fd4, 0d0000000000000000; sub.f64 %fd16, %fd14, %fd15; mul.f64 %fd17, %fd13, %fd4; fma.rn.f64 %fd18, %fd3, 0d0000000000000000, %fd17; sub.f64 %fd19, %fd10, %fd16; sub.f64 %fd20, %fd12, %fd18; mul.f64 %fd21, %fd20, %fd1; mul.f64 %fd22, %fd19, %fd2; sub.f64 %fd23, %fd21, %fd22; mul.f64 %fd24, %fd13, %fd1; mul.f64 %fd25, %fd2, 0d0000000000000000; sub.f64 %fd26, %fd24, %fd25; mul.f64 %fd27, %fd13, %fd2; fma.rn.f64 %fd28, %fd1, 0d0000000000000000, %fd27; cvt.f64.f32 %fd29, %f1; mul.f64 %fd30, %fd29, %fd5; sub.f64 %fd31, %fd30, %fd9; mul.f64 %fd32, %fd29, %fd6; fma.rn.f64 %fd33, %fd5, 0d0000000000000000, %fd32; sub.f64 %fd34, %fd26, %fd31; sub.f64 %fd35, %fd28, %fd33; mul.f64 %fd36, %fd35, %fd3; mul.f64 %fd37, %fd34, %fd4; sub.f64 %fd38, %fd36, %fd37; add.f64 %fd39, %fd23, %fd38; mul.f64 %fd40, %fd29, %fd3; sub.f64 %fd41, %fd40, %fd15; mul.f64 %fd42, %fd29, %fd4; fma.rn.f64 %fd43, %fd3, 0d0000000000000000, %fd42; mul.f64 %fd44, %fd7, %fd1; sub.f64 %fd45, %fd44, %fd25; mul.f64 %fd46, %fd7, %fd2; fma.rn.f64 %fd47, %fd1, 0d0000000000000000, %fd46; sub.f64 %fd48, %fd41, %fd45; sub.f64 %fd49, %fd43, %fd47; mul.f64 %fd50, %fd49, %fd5; mul.f64 %fd51, %fd48, %fd6; sub.f64 %fd52, %fd50, %fd51; add.f64 %fd53, %fd52, %fd39; cvt.rn.f32.f64 %f25, %fd53; div.rn.f32 %f26, %f25, %f4; st.global.f32 [%rd1], %f26; $L__BB0_4: ret; } ` solidanglefouriersummand_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl solidanglefouriersummand .visible .entry solidanglefouriersummand( .param .u64 solidanglefouriersummand_param_0, .param .u64 solidanglefouriersummand_param_1, .param .u64 solidanglefouriersummand_param_2, .param .u64 solidanglefouriersummand_param_3, .param .u32 solidanglefouriersummand_param_4, .param .u32 solidanglefouriersummand_param_5, .param .u32 solidanglefouriersummand_param_6 ) { .reg .pred %p<10>; .reg .f32 %f<27>; .reg .b32 %r<29>; .reg .f64 %fd<54>; .reg .b64 %rd<15>; ld.param.u64 %rd2, 
[solidanglefouriersummand_param_0]; ld.param.u64 %rd3, [solidanglefouriersummand_param_1]; ld.param.u64 %rd4, [solidanglefouriersummand_param_2]; ld.param.u64 %rd5, [solidanglefouriersummand_param_3]; ld.param.u32 %r5, [solidanglefouriersummand_param_4]; ld.param.u32 %r6, [solidanglefouriersummand_param_5]; ld.param.u32 %r7, [solidanglefouriersummand_param_6]; mov.u32 %r8, %ntid.x; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %tid.x; mad.lo.s32 %r1, %r9, %r8, %r10; mov.u32 %r11, %ntid.y; mov.u32 %r12, %ctaid.y; mov.u32 %r13, %tid.y; mad.lo.s32 %r2, %r12, %r11, %r13; mov.u32 %r14, %ntid.z; mov.u32 %r15, %ctaid.z; mov.u32 %r16, %tid.z; mad.lo.s32 %r3, %r15, %r14, %r16; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; cvt.rn.f32.s32 %f5, %r1; cvt.rn.f32.s32 %f6, %r5; div.rn.f32 %f7, %f5, %f6; cvt.rn.f32.s32 %f8, %r6; cvt.rn.f32.s32 %f9, %r2; div.rn.f32 %f10, %f9, %f8; cvt.rn.f32.s32 %f11, %r7; cvt.rn.f32.s32 %f12, %r3; div.rn.f32 %f13, %f12, %f11; shr.u32 %r17, %r5, 31; add.s32 %r18, %r5, %r17; shr.s32 %r19, %r18, 1; setp.lt.s32 %p6, %r1, %r19; add.f32 %f14, %f7, 0fBF800000; selp.f32 %f1, %f7, %f14, %p6; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; setp.lt.s32 %p7, %r2, %r22; add.f32 %f15, %f10, 0fBF800000; selp.f32 %f2, %f10, %f15, %p7; shr.u32 %r23, %r7, 31; add.s32 %r24, %r7, %r23; shr.s32 %r25, %r24, 1; setp.lt.s32 %p8, %r3, %r25; add.f32 %f16, %f13, 0fBF800000; selp.f32 %f3, %f13, %f16, %p8; mul.f32 %f17, %f2, %f2; fma.rn.f32 %f18, %f1, %f1, %f17; fma.rn.f32 %f4, %f3, %f3, %f18; mad.lo.s32 %r26, %r3, %r6, %r2; mad.lo.s32 %r4, %r26, %r5, %r1; setp.eq.f32 %p9, %f4, 0f00000000; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd6, %rd7; @%p9 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: mov.u32 %r28, 0; st.global.u32 [%rd1], %r28; bra.uni $L__BB0_4; $L__BB0_2: shl.b32 %r27, %r4, 1; cvta.to.global.u64 %rd8, %rd3; mul.wide.s32 
%rd9, %r27, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f19, [%rd10+4]; ld.global.nc.f32 %f20, [%rd12+4]; ld.global.nc.f32 %f21, [%rd14+4]; ld.global.nc.f32 %f22, [%rd10]; cvt.f64.f32 %fd1, %f22; cvt.f64.f32 %fd2, %f19; ld.global.nc.f32 %f23, [%rd12]; cvt.f64.f32 %fd3, %f23; cvt.f64.f32 %fd4, %f20; ld.global.nc.f32 %f24, [%rd14]; cvt.f64.f32 %fd5, %f24; cvt.f64.f32 %fd6, %f21; cvt.f64.f32 %fd7, %f2; mul.f64 %fd8, %fd7, %fd5; mul.f64 %fd9, %fd6, 0d0000000000000000; sub.f64 %fd10, %fd8, %fd9; mul.f64 %fd11, %fd7, %fd6; fma.rn.f64 %fd12, %fd5, 0d0000000000000000, %fd11; cvt.f64.f32 %fd13, %f3; mul.f64 %fd14, %fd13, %fd3; mul.f64 %fd15, %fd4, 0d0000000000000000; sub.f64 %fd16, %fd14, %fd15; mul.f64 %fd17, %fd13, %fd4; fma.rn.f64 %fd18, %fd3, 0d0000000000000000, %fd17; sub.f64 %fd19, %fd10, %fd16; sub.f64 %fd20, %fd12, %fd18; mul.f64 %fd21, %fd20, %fd1; mul.f64 %fd22, %fd19, %fd2; sub.f64 %fd23, %fd21, %fd22; mul.f64 %fd24, %fd13, %fd1; mul.f64 %fd25, %fd2, 0d0000000000000000; sub.f64 %fd26, %fd24, %fd25; mul.f64 %fd27, %fd13, %fd2; fma.rn.f64 %fd28, %fd1, 0d0000000000000000, %fd27; cvt.f64.f32 %fd29, %f1; mul.f64 %fd30, %fd29, %fd5; sub.f64 %fd31, %fd30, %fd9; mul.f64 %fd32, %fd29, %fd6; fma.rn.f64 %fd33, %fd5, 0d0000000000000000, %fd32; sub.f64 %fd34, %fd26, %fd31; sub.f64 %fd35, %fd28, %fd33; mul.f64 %fd36, %fd35, %fd3; mul.f64 %fd37, %fd34, %fd4; sub.f64 %fd38, %fd36, %fd37; add.f64 %fd39, %fd23, %fd38; mul.f64 %fd40, %fd29, %fd3; sub.f64 %fd41, %fd40, %fd15; mul.f64 %fd42, %fd29, %fd4; fma.rn.f64 %fd43, %fd3, 0d0000000000000000, %fd42; mul.f64 %fd44, %fd7, %fd1; sub.f64 %fd45, %fd44, %fd25; mul.f64 %fd46, %fd7, %fd2; fma.rn.f64 %fd47, %fd1, 0d0000000000000000, %fd46; sub.f64 %fd48, %fd41, %fd45; sub.f64 %fd49, %fd43, %fd47; mul.f64 %fd50, %fd49, %fd5; mul.f64 %fd51, %fd48, %fd6; sub.f64 %fd52, %fd50, %fd51; add.f64 %fd53, %fd52, %fd39; 
cvt.rn.f32.f64 %f25, %fd53; div.rn.f32 %f26, %f25, %f4; st.global.f32 [%rd1], %f26; $L__BB0_4: ret; } ` solidanglefouriersummand_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl solidanglefouriersummand .visible .entry solidanglefouriersummand( .param .u64 solidanglefouriersummand_param_0, .param .u64 solidanglefouriersummand_param_1, .param .u64 solidanglefouriersummand_param_2, .param .u64 solidanglefouriersummand_param_3, .param .u32 solidanglefouriersummand_param_4, .param .u32 solidanglefouriersummand_param_5, .param .u32 solidanglefouriersummand_param_6 ) { .reg .pred %p<10>; .reg .f32 %f<27>; .reg .b32 %r<29>; .reg .f64 %fd<54>; .reg .b64 %rd<15>; ld.param.u64 %rd2, [solidanglefouriersummand_param_0]; ld.param.u64 %rd3, [solidanglefouriersummand_param_1]; ld.param.u64 %rd4, [solidanglefouriersummand_param_2]; ld.param.u64 %rd5, [solidanglefouriersummand_param_3]; ld.param.u32 %r5, [solidanglefouriersummand_param_4]; ld.param.u32 %r6, [solidanglefouriersummand_param_5]; ld.param.u32 %r7, [solidanglefouriersummand_param_6]; mov.u32 %r8, %ntid.x; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %tid.x; mad.lo.s32 %r1, %r9, %r8, %r10; mov.u32 %r11, %ntid.y; mov.u32 %r12, %ctaid.y; mov.u32 %r13, %tid.y; mad.lo.s32 %r2, %r12, %r11, %r13; mov.u32 %r14, %ntid.z; mov.u32 %r15, %ctaid.z; mov.u32 %r16, %tid.z; mad.lo.s32 %r3, %r15, %r14, %r16; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; cvt.rn.f32.s32 %f5, %r1; cvt.rn.f32.s32 %f6, %r5; div.rn.f32 %f7, %f5, %f6; cvt.rn.f32.s32 %f8, %r6; cvt.rn.f32.s32 %f9, %r2; div.rn.f32 %f10, %f9, %f8; cvt.rn.f32.s32 %f11, %r7; cvt.rn.f32.s32 %f12, %r3; div.rn.f32 %f13, %f12, %f11; shr.u32 %r17, %r5, 31; add.s32 %r18, %r5, %r17; shr.s32 %r19, %r18, 1; setp.lt.s32 %p6, %r1, %r19; add.f32 %f14, %f7, 0fBF800000; selp.f32 %f1, %f7, %f14, %p6; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; setp.lt.s32 %p7, %r2, 
%r22; add.f32 %f15, %f10, 0fBF800000; selp.f32 %f2, %f10, %f15, %p7; shr.u32 %r23, %r7, 31; add.s32 %r24, %r7, %r23; shr.s32 %r25, %r24, 1; setp.lt.s32 %p8, %r3, %r25; add.f32 %f16, %f13, 0fBF800000; selp.f32 %f3, %f13, %f16, %p8; mul.f32 %f17, %f2, %f2; fma.rn.f32 %f18, %f1, %f1, %f17; fma.rn.f32 %f4, %f3, %f3, %f18; mad.lo.s32 %r26, %r3, %r6, %r2; mad.lo.s32 %r4, %r26, %r5, %r1; setp.eq.f32 %p9, %f4, 0f00000000; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd6, %rd7; @%p9 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: mov.u32 %r28, 0; st.global.u32 [%rd1], %r28; bra.uni $L__BB0_4; $L__BB0_2: shl.b32 %r27, %r4, 1; cvta.to.global.u64 %rd8, %rd3; mul.wide.s32 %rd9, %r27, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f19, [%rd10+4]; ld.global.nc.f32 %f20, [%rd12+4]; ld.global.nc.f32 %f21, [%rd14+4]; ld.global.nc.f32 %f22, [%rd10]; cvt.f64.f32 %fd1, %f22; cvt.f64.f32 %fd2, %f19; ld.global.nc.f32 %f23, [%rd12]; cvt.f64.f32 %fd3, %f23; cvt.f64.f32 %fd4, %f20; ld.global.nc.f32 %f24, [%rd14]; cvt.f64.f32 %fd5, %f24; cvt.f64.f32 %fd6, %f21; cvt.f64.f32 %fd7, %f2; mul.f64 %fd8, %fd7, %fd5; mul.f64 %fd9, %fd6, 0d0000000000000000; sub.f64 %fd10, %fd8, %fd9; mul.f64 %fd11, %fd7, %fd6; fma.rn.f64 %fd12, %fd5, 0d0000000000000000, %fd11; cvt.f64.f32 %fd13, %f3; mul.f64 %fd14, %fd13, %fd3; mul.f64 %fd15, %fd4, 0d0000000000000000; sub.f64 %fd16, %fd14, %fd15; mul.f64 %fd17, %fd13, %fd4; fma.rn.f64 %fd18, %fd3, 0d0000000000000000, %fd17; sub.f64 %fd19, %fd10, %fd16; sub.f64 %fd20, %fd12, %fd18; mul.f64 %fd21, %fd20, %fd1; mul.f64 %fd22, %fd19, %fd2; sub.f64 %fd23, %fd21, %fd22; mul.f64 %fd24, %fd13, %fd1; mul.f64 %fd25, %fd2, 0d0000000000000000; sub.f64 %fd26, %fd24, %fd25; mul.f64 %fd27, %fd13, %fd2; fma.rn.f64 %fd28, %fd1, 0d0000000000000000, %fd27; cvt.f64.f32 %fd29, %f1; mul.f64 %fd30, %fd29, %fd5; sub.f64 %fd31, %fd30, %fd9; mul.f64 
%fd32, %fd29, %fd6; fma.rn.f64 %fd33, %fd5, 0d0000000000000000, %fd32; sub.f64 %fd34, %fd26, %fd31; sub.f64 %fd35, %fd28, %fd33; mul.f64 %fd36, %fd35, %fd3; mul.f64 %fd37, %fd34, %fd4; sub.f64 %fd38, %fd36, %fd37; add.f64 %fd39, %fd23, %fd38; mul.f64 %fd40, %fd29, %fd3; sub.f64 %fd41, %fd40, %fd15; mul.f64 %fd42, %fd29, %fd4; fma.rn.f64 %fd43, %fd3, 0d0000000000000000, %fd42; mul.f64 %fd44, %fd7, %fd1; sub.f64 %fd45, %fd44, %fd25; mul.f64 %fd46, %fd7, %fd2; fma.rn.f64 %fd47, %fd1, 0d0000000000000000, %fd46; sub.f64 %fd48, %fd41, %fd45; sub.f64 %fd49, %fd43, %fd47; mul.f64 %fd50, %fd49, %fd5; mul.f64 %fd51, %fd48, %fd6; sub.f64 %fd52, %fd50, %fd51; add.f64 %fd53, %fd52, %fd39; cvt.rn.f32.f64 %f25, %fd53; div.rn.f32 %f26, %f25, %f4; st.global.f32 [%rd1], %f26; $L__BB0_4: ret; } ` solidanglefouriersummand_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl solidanglefouriersummand .visible .entry solidanglefouriersummand( .param .u64 solidanglefouriersummand_param_0, .param .u64 solidanglefouriersummand_param_1, .param .u64 solidanglefouriersummand_param_2, .param .u64 solidanglefouriersummand_param_3, .param .u32 solidanglefouriersummand_param_4, .param .u32 solidanglefouriersummand_param_5, .param .u32 solidanglefouriersummand_param_6 ) { .reg .pred %p<10>; .reg .f32 %f<27>; .reg .b32 %r<29>; .reg .f64 %fd<54>; .reg .b64 %rd<15>; ld.param.u64 %rd2, [solidanglefouriersummand_param_0]; ld.param.u64 %rd3, [solidanglefouriersummand_param_1]; ld.param.u64 %rd4, [solidanglefouriersummand_param_2]; ld.param.u64 %rd5, [solidanglefouriersummand_param_3]; ld.param.u32 %r5, [solidanglefouriersummand_param_4]; ld.param.u32 %r6, [solidanglefouriersummand_param_5]; ld.param.u32 %r7, [solidanglefouriersummand_param_6]; mov.u32 %r8, %ntid.x; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %tid.x; mad.lo.s32 %r1, %r9, %r8, %r10; mov.u32 %r11, %ntid.y; mov.u32 %r12, %ctaid.y; mov.u32 %r13, %tid.y; mad.lo.s32 %r2, %r12, %r11, %r13; mov.u32 %r14, %ntid.z; mov.u32 %r15, %ctaid.z; 
mov.u32 %r16, %tid.z; mad.lo.s32 %r3, %r15, %r14, %r16; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; cvt.rn.f32.s32 %f5, %r1; cvt.rn.f32.s32 %f6, %r5; div.rn.f32 %f7, %f5, %f6; cvt.rn.f32.s32 %f8, %r6; cvt.rn.f32.s32 %f9, %r2; div.rn.f32 %f10, %f9, %f8; cvt.rn.f32.s32 %f11, %r7; cvt.rn.f32.s32 %f12, %r3; div.rn.f32 %f13, %f12, %f11; shr.u32 %r17, %r5, 31; add.s32 %r18, %r5, %r17; shr.s32 %r19, %r18, 1; setp.lt.s32 %p6, %r1, %r19; add.f32 %f14, %f7, 0fBF800000; selp.f32 %f1, %f7, %f14, %p6; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; setp.lt.s32 %p7, %r2, %r22; add.f32 %f15, %f10, 0fBF800000; selp.f32 %f2, %f10, %f15, %p7; shr.u32 %r23, %r7, 31; add.s32 %r24, %r7, %r23; shr.s32 %r25, %r24, 1; setp.lt.s32 %p8, %r3, %r25; add.f32 %f16, %f13, 0fBF800000; selp.f32 %f3, %f13, %f16, %p8; mul.f32 %f17, %f2, %f2; fma.rn.f32 %f18, %f1, %f1, %f17; fma.rn.f32 %f4, %f3, %f3, %f18; mad.lo.s32 %r26, %r3, %r6, %r2; mad.lo.s32 %r4, %r26, %r5, %r1; setp.eq.f32 %p9, %f4, 0f00000000; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd6, %rd7; @%p9 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: mov.u32 %r28, 0; st.global.u32 [%rd1], %r28; bra.uni $L__BB0_4; $L__BB0_2: shl.b32 %r27, %r4, 1; cvta.to.global.u64 %rd8, %rd3; mul.wide.s32 %rd9, %r27, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f19, [%rd10+4]; ld.global.nc.f32 %f20, [%rd12+4]; ld.global.nc.f32 %f21, [%rd14+4]; ld.global.nc.f32 %f22, [%rd10]; cvt.f64.f32 %fd1, %f22; cvt.f64.f32 %fd2, %f19; ld.global.nc.f32 %f23, [%rd12]; cvt.f64.f32 %fd3, %f23; cvt.f64.f32 %fd4, %f20; ld.global.nc.f32 %f24, [%rd14]; cvt.f64.f32 %fd5, %f24; cvt.f64.f32 %fd6, %f21; cvt.f64.f32 %fd7, %f2; mul.f64 %fd8, %fd7, %fd5; mul.f64 %fd9, %fd6, 0d0000000000000000; sub.f64 %fd10, %fd8, 
%fd9; mul.f64 %fd11, %fd7, %fd6; fma.rn.f64 %fd12, %fd5, 0d0000000000000000, %fd11; cvt.f64.f32 %fd13, %f3; mul.f64 %fd14, %fd13, %fd3; mul.f64 %fd15, %fd4, 0d0000000000000000; sub.f64 %fd16, %fd14, %fd15; mul.f64 %fd17, %fd13, %fd4; fma.rn.f64 %fd18, %fd3, 0d0000000000000000, %fd17; sub.f64 %fd19, %fd10, %fd16; sub.f64 %fd20, %fd12, %fd18; mul.f64 %fd21, %fd20, %fd1; mul.f64 %fd22, %fd19, %fd2; sub.f64 %fd23, %fd21, %fd22; mul.f64 %fd24, %fd13, %fd1; mul.f64 %fd25, %fd2, 0d0000000000000000; sub.f64 %fd26, %fd24, %fd25; mul.f64 %fd27, %fd13, %fd2; fma.rn.f64 %fd28, %fd1, 0d0000000000000000, %fd27; cvt.f64.f32 %fd29, %f1; mul.f64 %fd30, %fd29, %fd5; sub.f64 %fd31, %fd30, %fd9; mul.f64 %fd32, %fd29, %fd6; fma.rn.f64 %fd33, %fd5, 0d0000000000000000, %fd32; sub.f64 %fd34, %fd26, %fd31; sub.f64 %fd35, %fd28, %fd33; mul.f64 %fd36, %fd35, %fd3; mul.f64 %fd37, %fd34, %fd4; sub.f64 %fd38, %fd36, %fd37; add.f64 %fd39, %fd23, %fd38; mul.f64 %fd40, %fd29, %fd3; sub.f64 %fd41, %fd40, %fd15; mul.f64 %fd42, %fd29, %fd4; fma.rn.f64 %fd43, %fd3, 0d0000000000000000, %fd42; mul.f64 %fd44, %fd7, %fd1; sub.f64 %fd45, %fd44, %fd25; mul.f64 %fd46, %fd7, %fd2; fma.rn.f64 %fd47, %fd1, 0d0000000000000000, %fd46; sub.f64 %fd48, %fd41, %fd45; sub.f64 %fd49, %fd43, %fd47; mul.f64 %fd50, %fd49, %fd5; mul.f64 %fd51, %fd48, %fd6; sub.f64 %fd52, %fd50, %fd51; add.f64 %fd53, %fd52, %fd39; cvt.rn.f32.f64 %f25, %fd53; div.rn.f32 %f26, %f25, %f4; st.global.f32 [%rd1], %f26; $L__BB0_4: ret; } ` solidanglefouriersummand_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl solidanglefouriersummand .visible .entry solidanglefouriersummand( .param .u64 solidanglefouriersummand_param_0, .param .u64 solidanglefouriersummand_param_1, .param .u64 solidanglefouriersummand_param_2, .param .u64 solidanglefouriersummand_param_3, .param .u32 solidanglefouriersummand_param_4, .param .u32 solidanglefouriersummand_param_5, .param .u32 solidanglefouriersummand_param_6 ) { .reg .pred %p<10>; .reg .f32 
%f<27>; .reg .b32 %r<29>; .reg .f64 %fd<54>; .reg .b64 %rd<15>; ld.param.u64 %rd2, [solidanglefouriersummand_param_0]; ld.param.u64 %rd3, [solidanglefouriersummand_param_1]; ld.param.u64 %rd4, [solidanglefouriersummand_param_2]; ld.param.u64 %rd5, [solidanglefouriersummand_param_3]; ld.param.u32 %r5, [solidanglefouriersummand_param_4]; ld.param.u32 %r6, [solidanglefouriersummand_param_5]; ld.param.u32 %r7, [solidanglefouriersummand_param_6]; mov.u32 %r8, %ntid.x; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %tid.x; mad.lo.s32 %r1, %r9, %r8, %r10; mov.u32 %r11, %ntid.y; mov.u32 %r12, %ctaid.y; mov.u32 %r13, %tid.y; mad.lo.s32 %r2, %r12, %r11, %r13; mov.u32 %r14, %ntid.z; mov.u32 %r15, %ctaid.z; mov.u32 %r16, %tid.z; mad.lo.s32 %r3, %r15, %r14, %r16; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; cvt.rn.f32.s32 %f5, %r1; cvt.rn.f32.s32 %f6, %r5; div.rn.f32 %f7, %f5, %f6; cvt.rn.f32.s32 %f8, %r6; cvt.rn.f32.s32 %f9, %r2; div.rn.f32 %f10, %f9, %f8; cvt.rn.f32.s32 %f11, %r7; cvt.rn.f32.s32 %f12, %r3; div.rn.f32 %f13, %f12, %f11; shr.u32 %r17, %r5, 31; add.s32 %r18, %r5, %r17; shr.s32 %r19, %r18, 1; setp.lt.s32 %p6, %r1, %r19; add.f32 %f14, %f7, 0fBF800000; selp.f32 %f1, %f7, %f14, %p6; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; setp.lt.s32 %p7, %r2, %r22; add.f32 %f15, %f10, 0fBF800000; selp.f32 %f2, %f10, %f15, %p7; shr.u32 %r23, %r7, 31; add.s32 %r24, %r7, %r23; shr.s32 %r25, %r24, 1; setp.lt.s32 %p8, %r3, %r25; add.f32 %f16, %f13, 0fBF800000; selp.f32 %f3, %f13, %f16, %p8; mul.f32 %f17, %f2, %f2; fma.rn.f32 %f18, %f1, %f1, %f17; fma.rn.f32 %f4, %f3, %f3, %f18; mad.lo.s32 %r26, %r3, %r6, %r2; mad.lo.s32 %r4, %r26, %r5, %r1; setp.eq.f32 %p9, %f4, 0f00000000; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd6, %rd7; @%p9 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: mov.u32 %r28, 0; st.global.u32 [%rd1], %r28; bra.uni 
$L__BB0_4; $L__BB0_2: shl.b32 %r27, %r4, 1; cvta.to.global.u64 %rd8, %rd3; mul.wide.s32 %rd9, %r27, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f19, [%rd10+4]; ld.global.nc.f32 %f20, [%rd12+4]; ld.global.nc.f32 %f21, [%rd14+4]; ld.global.nc.f32 %f22, [%rd10]; cvt.f64.f32 %fd1, %f22; cvt.f64.f32 %fd2, %f19; ld.global.nc.f32 %f23, [%rd12]; cvt.f64.f32 %fd3, %f23; cvt.f64.f32 %fd4, %f20; ld.global.nc.f32 %f24, [%rd14]; cvt.f64.f32 %fd5, %f24; cvt.f64.f32 %fd6, %f21; cvt.f64.f32 %fd7, %f2; mul.f64 %fd8, %fd7, %fd5; mul.f64 %fd9, %fd6, 0d0000000000000000; sub.f64 %fd10, %fd8, %fd9; mul.f64 %fd11, %fd7, %fd6; fma.rn.f64 %fd12, %fd5, 0d0000000000000000, %fd11; cvt.f64.f32 %fd13, %f3; mul.f64 %fd14, %fd13, %fd3; mul.f64 %fd15, %fd4, 0d0000000000000000; sub.f64 %fd16, %fd14, %fd15; mul.f64 %fd17, %fd13, %fd4; fma.rn.f64 %fd18, %fd3, 0d0000000000000000, %fd17; sub.f64 %fd19, %fd10, %fd16; sub.f64 %fd20, %fd12, %fd18; mul.f64 %fd21, %fd20, %fd1; mul.f64 %fd22, %fd19, %fd2; sub.f64 %fd23, %fd21, %fd22; mul.f64 %fd24, %fd13, %fd1; mul.f64 %fd25, %fd2, 0d0000000000000000; sub.f64 %fd26, %fd24, %fd25; mul.f64 %fd27, %fd13, %fd2; fma.rn.f64 %fd28, %fd1, 0d0000000000000000, %fd27; cvt.f64.f32 %fd29, %f1; mul.f64 %fd30, %fd29, %fd5; sub.f64 %fd31, %fd30, %fd9; mul.f64 %fd32, %fd29, %fd6; fma.rn.f64 %fd33, %fd5, 0d0000000000000000, %fd32; sub.f64 %fd34, %fd26, %fd31; sub.f64 %fd35, %fd28, %fd33; mul.f64 %fd36, %fd35, %fd3; mul.f64 %fd37, %fd34, %fd4; sub.f64 %fd38, %fd36, %fd37; add.f64 %fd39, %fd23, %fd38; mul.f64 %fd40, %fd29, %fd3; sub.f64 %fd41, %fd40, %fd15; mul.f64 %fd42, %fd29, %fd4; fma.rn.f64 %fd43, %fd3, 0d0000000000000000, %fd42; mul.f64 %fd44, %fd7, %fd1; sub.f64 %fd45, %fd44, %fd25; mul.f64 %fd46, %fd7, %fd2; fma.rn.f64 %fd47, %fd1, 0d0000000000000000, %fd46; sub.f64 %fd48, %fd41, %fd45; sub.f64 %fd49, %fd43, %fd47; mul.f64 %fd50, %fd49, %fd5; 
mul.f64 %fd51, %fd48, %fd6; sub.f64 %fd52, %fd50, %fd51; add.f64 %fd53, %fd52, %fd39; cvt.rn.f32.f64 %f25, %fd53; div.rn.f32 %f26, %f25, %f4; st.global.f32 [%rd1], %f26; $L__BB0_4: ret; } ` solidanglefouriersummand_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl solidanglefouriersummand .visible .entry solidanglefouriersummand( .param .u64 solidanglefouriersummand_param_0, .param .u64 solidanglefouriersummand_param_1, .param .u64 solidanglefouriersummand_param_2, .param .u64 solidanglefouriersummand_param_3, .param .u32 solidanglefouriersummand_param_4, .param .u32 solidanglefouriersummand_param_5, .param .u32 solidanglefouriersummand_param_6 ) { .reg .pred %p<10>; .reg .f32 %f<27>; .reg .b32 %r<29>; .reg .f64 %fd<54>; .reg .b64 %rd<15>; ld.param.u64 %rd2, [solidanglefouriersummand_param_0]; ld.param.u64 %rd3, [solidanglefouriersummand_param_1]; ld.param.u64 %rd4, [solidanglefouriersummand_param_2]; ld.param.u64 %rd5, [solidanglefouriersummand_param_3]; ld.param.u32 %r5, [solidanglefouriersummand_param_4]; ld.param.u32 %r6, [solidanglefouriersummand_param_5]; ld.param.u32 %r7, [solidanglefouriersummand_param_6]; mov.u32 %r8, %ntid.x; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %tid.x; mad.lo.s32 %r1, %r9, %r8, %r10; mov.u32 %r11, %ntid.y; mov.u32 %r12, %ctaid.y; mov.u32 %r13, %tid.y; mad.lo.s32 %r2, %r12, %r11, %r13; mov.u32 %r14, %ntid.z; mov.u32 %r15, %ctaid.z; mov.u32 %r16, %tid.z; mad.lo.s32 %r3, %r15, %r14, %r16; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; cvt.rn.f32.s32 %f5, %r1; cvt.rn.f32.s32 %f6, %r5; div.rn.f32 %f7, %f5, %f6; cvt.rn.f32.s32 %f8, %r6; cvt.rn.f32.s32 %f9, %r2; div.rn.f32 %f10, %f9, %f8; cvt.rn.f32.s32 %f11, %r7; cvt.rn.f32.s32 %f12, %r3; div.rn.f32 %f13, %f12, %f11; shr.u32 %r17, %r5, 31; add.s32 %r18, %r5, %r17; shr.s32 %r19, %r18, 1; setp.lt.s32 %p6, %r1, %r19; add.f32 %f14, %f7, 0fBF800000; selp.f32 %f1, %f7, %f14, %p6; 
shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; setp.lt.s32 %p7, %r2, %r22; add.f32 %f15, %f10, 0fBF800000; selp.f32 %f2, %f10, %f15, %p7; shr.u32 %r23, %r7, 31; add.s32 %r24, %r7, %r23; shr.s32 %r25, %r24, 1; setp.lt.s32 %p8, %r3, %r25; add.f32 %f16, %f13, 0fBF800000; selp.f32 %f3, %f13, %f16, %p8; mul.f32 %f17, %f2, %f2; fma.rn.f32 %f18, %f1, %f1, %f17; fma.rn.f32 %f4, %f3, %f3, %f18; mad.lo.s32 %r26, %r3, %r6, %r2; mad.lo.s32 %r4, %r26, %r5, %r1; setp.eq.f32 %p9, %f4, 0f00000000; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd6, %rd7; @%p9 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: mov.u32 %r28, 0; st.global.u32 [%rd1], %r28; bra.uni $L__BB0_4; $L__BB0_2: shl.b32 %r27, %r4, 1; cvta.to.global.u64 %rd8, %rd3; mul.wide.s32 %rd9, %r27, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f19, [%rd10+4]; ld.global.nc.f32 %f20, [%rd12+4]; ld.global.nc.f32 %f21, [%rd14+4]; ld.global.nc.f32 %f22, [%rd10]; cvt.f64.f32 %fd1, %f22; cvt.f64.f32 %fd2, %f19; ld.global.nc.f32 %f23, [%rd12]; cvt.f64.f32 %fd3, %f23; cvt.f64.f32 %fd4, %f20; ld.global.nc.f32 %f24, [%rd14]; cvt.f64.f32 %fd5, %f24; cvt.f64.f32 %fd6, %f21; cvt.f64.f32 %fd7, %f2; mul.f64 %fd8, %fd7, %fd5; mul.f64 %fd9, %fd6, 0d0000000000000000; sub.f64 %fd10, %fd8, %fd9; mul.f64 %fd11, %fd7, %fd6; fma.rn.f64 %fd12, %fd5, 0d0000000000000000, %fd11; cvt.f64.f32 %fd13, %f3; mul.f64 %fd14, %fd13, %fd3; mul.f64 %fd15, %fd4, 0d0000000000000000; sub.f64 %fd16, %fd14, %fd15; mul.f64 %fd17, %fd13, %fd4; fma.rn.f64 %fd18, %fd3, 0d0000000000000000, %fd17; sub.f64 %fd19, %fd10, %fd16; sub.f64 %fd20, %fd12, %fd18; mul.f64 %fd21, %fd20, %fd1; mul.f64 %fd22, %fd19, %fd2; sub.f64 %fd23, %fd21, %fd22; mul.f64 %fd24, %fd13, %fd1; mul.f64 %fd25, %fd2, 0d0000000000000000; sub.f64 %fd26, %fd24, %fd25; mul.f64 %fd27, %fd13, %fd2; fma.rn.f64 %fd28, %fd1, 0d0000000000000000, 
%fd27; cvt.f64.f32 %fd29, %f1; mul.f64 %fd30, %fd29, %fd5; sub.f64 %fd31, %fd30, %fd9; mul.f64 %fd32, %fd29, %fd6; fma.rn.f64 %fd33, %fd5, 0d0000000000000000, %fd32; sub.f64 %fd34, %fd26, %fd31; sub.f64 %fd35, %fd28, %fd33; mul.f64 %fd36, %fd35, %fd3; mul.f64 %fd37, %fd34, %fd4; sub.f64 %fd38, %fd36, %fd37; add.f64 %fd39, %fd23, %fd38; mul.f64 %fd40, %fd29, %fd3; sub.f64 %fd41, %fd40, %fd15; mul.f64 %fd42, %fd29, %fd4; fma.rn.f64 %fd43, %fd3, 0d0000000000000000, %fd42; mul.f64 %fd44, %fd7, %fd1; sub.f64 %fd45, %fd44, %fd25; mul.f64 %fd46, %fd7, %fd2; fma.rn.f64 %fd47, %fd1, 0d0000000000000000, %fd46; sub.f64 %fd48, %fd41, %fd45; sub.f64 %fd49, %fd43, %fd47; mul.f64 %fd50, %fd49, %fd5; mul.f64 %fd51, %fd48, %fd6; sub.f64 %fd52, %fd50, %fd51; add.f64 %fd53, %fd52, %fd39; cvt.rn.f32.f64 %f25, %fd53; div.rn.f32 %f26, %f25, %f4; st.global.f32 [%rd1], %f26; $L__BB0_4: ret; } ` solidanglefouriersummand_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl solidanglefouriersummand .visible .entry solidanglefouriersummand( .param .u64 solidanglefouriersummand_param_0, .param .u64 solidanglefouriersummand_param_1, .param .u64 solidanglefouriersummand_param_2, .param .u64 solidanglefouriersummand_param_3, .param .u32 solidanglefouriersummand_param_4, .param .u32 solidanglefouriersummand_param_5, .param .u32 solidanglefouriersummand_param_6 ) { .reg .pred %p<10>; .reg .f32 %f<27>; .reg .b32 %r<29>; .reg .f64 %fd<54>; .reg .b64 %rd<15>; ld.param.u64 %rd2, [solidanglefouriersummand_param_0]; ld.param.u64 %rd3, [solidanglefouriersummand_param_1]; ld.param.u64 %rd4, [solidanglefouriersummand_param_2]; ld.param.u64 %rd5, [solidanglefouriersummand_param_3]; ld.param.u32 %r5, [solidanglefouriersummand_param_4]; ld.param.u32 %r6, [solidanglefouriersummand_param_5]; ld.param.u32 %r7, [solidanglefouriersummand_param_6]; mov.u32 %r8, %ntid.x; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %tid.x; mad.lo.s32 %r1, %r9, %r8, %r10; mov.u32 %r11, %ntid.y; mov.u32 %r12, %ctaid.y; mov.u32 
%r13, %tid.y; mad.lo.s32 %r2, %r12, %r11, %r13; mov.u32 %r14, %ntid.z; mov.u32 %r15, %ctaid.z; mov.u32 %r16, %tid.z; mad.lo.s32 %r3, %r15, %r14, %r16; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; cvt.rn.f32.s32 %f5, %r1; cvt.rn.f32.s32 %f6, %r5; div.rn.f32 %f7, %f5, %f6; cvt.rn.f32.s32 %f8, %r6; cvt.rn.f32.s32 %f9, %r2; div.rn.f32 %f10, %f9, %f8; cvt.rn.f32.s32 %f11, %r7; cvt.rn.f32.s32 %f12, %r3; div.rn.f32 %f13, %f12, %f11; shr.u32 %r17, %r5, 31; add.s32 %r18, %r5, %r17; shr.s32 %r19, %r18, 1; setp.lt.s32 %p6, %r1, %r19; add.f32 %f14, %f7, 0fBF800000; selp.f32 %f1, %f7, %f14, %p6; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; setp.lt.s32 %p7, %r2, %r22; add.f32 %f15, %f10, 0fBF800000; selp.f32 %f2, %f10, %f15, %p7; shr.u32 %r23, %r7, 31; add.s32 %r24, %r7, %r23; shr.s32 %r25, %r24, 1; setp.lt.s32 %p8, %r3, %r25; add.f32 %f16, %f13, 0fBF800000; selp.f32 %f3, %f13, %f16, %p8; mul.f32 %f17, %f2, %f2; fma.rn.f32 %f18, %f1, %f1, %f17; fma.rn.f32 %f4, %f3, %f3, %f18; mad.lo.s32 %r26, %r3, %r6, %r2; mad.lo.s32 %r4, %r26, %r5, %r1; setp.eq.f32 %p9, %f4, 0f00000000; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd6, %rd7; @%p9 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: mov.u32 %r28, 0; st.global.u32 [%rd1], %r28; bra.uni $L__BB0_4; $L__BB0_2: shl.b32 %r27, %r4, 1; cvta.to.global.u64 %rd8, %rd3; mul.wide.s32 %rd9, %r27, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f19, [%rd10+4]; ld.global.nc.f32 %f20, [%rd12+4]; ld.global.nc.f32 %f21, [%rd14+4]; ld.global.nc.f32 %f22, [%rd10]; cvt.f64.f32 %fd1, %f22; cvt.f64.f32 %fd2, %f19; ld.global.nc.f32 %f23, [%rd12]; cvt.f64.f32 %fd3, %f23; cvt.f64.f32 %fd4, %f20; ld.global.nc.f32 %f24, [%rd14]; cvt.f64.f32 %fd5, %f24; cvt.f64.f32 %fd6, %f21; cvt.f64.f32 %fd7, 
%f2; mul.f64 %fd8, %fd7, %fd5; mul.f64 %fd9, %fd6, 0d0000000000000000; sub.f64 %fd10, %fd8, %fd9; mul.f64 %fd11, %fd7, %fd6; fma.rn.f64 %fd12, %fd5, 0d0000000000000000, %fd11; cvt.f64.f32 %fd13, %f3; mul.f64 %fd14, %fd13, %fd3; mul.f64 %fd15, %fd4, 0d0000000000000000; sub.f64 %fd16, %fd14, %fd15; mul.f64 %fd17, %fd13, %fd4; fma.rn.f64 %fd18, %fd3, 0d0000000000000000, %fd17; sub.f64 %fd19, %fd10, %fd16; sub.f64 %fd20, %fd12, %fd18; mul.f64 %fd21, %fd20, %fd1; mul.f64 %fd22, %fd19, %fd2; sub.f64 %fd23, %fd21, %fd22; mul.f64 %fd24, %fd13, %fd1; mul.f64 %fd25, %fd2, 0d0000000000000000; sub.f64 %fd26, %fd24, %fd25; mul.f64 %fd27, %fd13, %fd2; fma.rn.f64 %fd28, %fd1, 0d0000000000000000, %fd27; cvt.f64.f32 %fd29, %f1; mul.f64 %fd30, %fd29, %fd5; sub.f64 %fd31, %fd30, %fd9; mul.f64 %fd32, %fd29, %fd6; fma.rn.f64 %fd33, %fd5, 0d0000000000000000, %fd32; sub.f64 %fd34, %fd26, %fd31; sub.f64 %fd35, %fd28, %fd33; mul.f64 %fd36, %fd35, %fd3; mul.f64 %fd37, %fd34, %fd4; sub.f64 %fd38, %fd36, %fd37; add.f64 %fd39, %fd23, %fd38; mul.f64 %fd40, %fd29, %fd3; sub.f64 %fd41, %fd40, %fd15; mul.f64 %fd42, %fd29, %fd4; fma.rn.f64 %fd43, %fd3, 0d0000000000000000, %fd42; mul.f64 %fd44, %fd7, %fd1; sub.f64 %fd45, %fd44, %fd25; mul.f64 %fd46, %fd7, %fd2; fma.rn.f64 %fd47, %fd1, 0d0000000000000000, %fd46; sub.f64 %fd48, %fd41, %fd45; sub.f64 %fd49, %fd43, %fd47; mul.f64 %fd50, %fd49, %fd5; mul.f64 %fd51, %fd48, %fd6; sub.f64 %fd52, %fd50, %fd51; add.f64 %fd53, %fd52, %fd39; cvt.rn.f32.f64 %f25, %fd53; div.rn.f32 %f26, %f25, %f4; st.global.f32 [%rd1], %f26; $L__BB0_4: ret; } ` solidanglefouriersummand_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl solidanglefouriersummand .visible .entry solidanglefouriersummand( .param .u64 solidanglefouriersummand_param_0, .param .u64 solidanglefouriersummand_param_1, .param .u64 solidanglefouriersummand_param_2, .param .u64 solidanglefouriersummand_param_3, .param .u32 solidanglefouriersummand_param_4, .param .u32 
solidanglefouriersummand_param_5, .param .u32 solidanglefouriersummand_param_6 ) { .reg .pred %p<10>; .reg .f32 %f<27>; .reg .b32 %r<29>; .reg .f64 %fd<54>; .reg .b64 %rd<15>; ld.param.u64 %rd2, [solidanglefouriersummand_param_0]; ld.param.u64 %rd3, [solidanglefouriersummand_param_1]; ld.param.u64 %rd4, [solidanglefouriersummand_param_2]; ld.param.u64 %rd5, [solidanglefouriersummand_param_3]; ld.param.u32 %r5, [solidanglefouriersummand_param_4]; ld.param.u32 %r6, [solidanglefouriersummand_param_5]; ld.param.u32 %r7, [solidanglefouriersummand_param_6]; mov.u32 %r8, %ntid.x; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %tid.x; mad.lo.s32 %r1, %r9, %r8, %r10; mov.u32 %r11, %ntid.y; mov.u32 %r12, %ctaid.y; mov.u32 %r13, %tid.y; mad.lo.s32 %r2, %r12, %r11, %r13; mov.u32 %r14, %ntid.z; mov.u32 %r15, %ctaid.z; mov.u32 %r16, %tid.z; mad.lo.s32 %r3, %r15, %r14, %r16; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; cvt.rn.f32.s32 %f5, %r1; cvt.rn.f32.s32 %f6, %r5; div.rn.f32 %f7, %f5, %f6; cvt.rn.f32.s32 %f8, %r6; cvt.rn.f32.s32 %f9, %r2; div.rn.f32 %f10, %f9, %f8; cvt.rn.f32.s32 %f11, %r7; cvt.rn.f32.s32 %f12, %r3; div.rn.f32 %f13, %f12, %f11; shr.u32 %r17, %r5, 31; add.s32 %r18, %r5, %r17; shr.s32 %r19, %r18, 1; setp.lt.s32 %p6, %r1, %r19; add.f32 %f14, %f7, 0fBF800000; selp.f32 %f1, %f7, %f14, %p6; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; setp.lt.s32 %p7, %r2, %r22; add.f32 %f15, %f10, 0fBF800000; selp.f32 %f2, %f10, %f15, %p7; shr.u32 %r23, %r7, 31; add.s32 %r24, %r7, %r23; shr.s32 %r25, %r24, 1; setp.lt.s32 %p8, %r3, %r25; add.f32 %f16, %f13, 0fBF800000; selp.f32 %f3, %f13, %f16, %p8; mul.f32 %f17, %f2, %f2; fma.rn.f32 %f18, %f1, %f1, %f17; fma.rn.f32 %f4, %f3, %f3, %f18; mad.lo.s32 %r26, %r3, %r6, %r2; mad.lo.s32 %r4, %r26, %r5, %r1; setp.eq.f32 %p9, %f4, 0f00000000; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd6, %rd7; 
@%p9 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: mov.u32 %r28, 0; st.global.u32 [%rd1], %r28; bra.uni $L__BB0_4; $L__BB0_2: shl.b32 %r27, %r4, 1; cvta.to.global.u64 %rd8, %rd3; mul.wide.s32 %rd9, %r27, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f19, [%rd10+4]; ld.global.nc.f32 %f20, [%rd12+4]; ld.global.nc.f32 %f21, [%rd14+4]; ld.global.nc.f32 %f22, [%rd10]; cvt.f64.f32 %fd1, %f22; cvt.f64.f32 %fd2, %f19; ld.global.nc.f32 %f23, [%rd12]; cvt.f64.f32 %fd3, %f23; cvt.f64.f32 %fd4, %f20; ld.global.nc.f32 %f24, [%rd14]; cvt.f64.f32 %fd5, %f24; cvt.f64.f32 %fd6, %f21; cvt.f64.f32 %fd7, %f2; mul.f64 %fd8, %fd7, %fd5; mul.f64 %fd9, %fd6, 0d0000000000000000; sub.f64 %fd10, %fd8, %fd9; mul.f64 %fd11, %fd7, %fd6; fma.rn.f64 %fd12, %fd5, 0d0000000000000000, %fd11; cvt.f64.f32 %fd13, %f3; mul.f64 %fd14, %fd13, %fd3; mul.f64 %fd15, %fd4, 0d0000000000000000; sub.f64 %fd16, %fd14, %fd15; mul.f64 %fd17, %fd13, %fd4; fma.rn.f64 %fd18, %fd3, 0d0000000000000000, %fd17; sub.f64 %fd19, %fd10, %fd16; sub.f64 %fd20, %fd12, %fd18; mul.f64 %fd21, %fd20, %fd1; mul.f64 %fd22, %fd19, %fd2; sub.f64 %fd23, %fd21, %fd22; mul.f64 %fd24, %fd13, %fd1; mul.f64 %fd25, %fd2, 0d0000000000000000; sub.f64 %fd26, %fd24, %fd25; mul.f64 %fd27, %fd13, %fd2; fma.rn.f64 %fd28, %fd1, 0d0000000000000000, %fd27; cvt.f64.f32 %fd29, %f1; mul.f64 %fd30, %fd29, %fd5; sub.f64 %fd31, %fd30, %fd9; mul.f64 %fd32, %fd29, %fd6; fma.rn.f64 %fd33, %fd5, 0d0000000000000000, %fd32; sub.f64 %fd34, %fd26, %fd31; sub.f64 %fd35, %fd28, %fd33; mul.f64 %fd36, %fd35, %fd3; mul.f64 %fd37, %fd34, %fd4; sub.f64 %fd38, %fd36, %fd37; add.f64 %fd39, %fd23, %fd38; mul.f64 %fd40, %fd29, %fd3; sub.f64 %fd41, %fd40, %fd15; mul.f64 %fd42, %fd29, %fd4; fma.rn.f64 %fd43, %fd3, 0d0000000000000000, %fd42; mul.f64 %fd44, %fd7, %fd1; sub.f64 %fd45, %fd44, %fd25; mul.f64 %fd46, %fd7, %fd2; fma.rn.f64 %fd47, %fd1, 
0d0000000000000000, %fd46; sub.f64 %fd48, %fd41, %fd45; sub.f64 %fd49, %fd43, %fd47; mul.f64 %fd50, %fd49, %fd5; mul.f64 %fd51, %fd48, %fd6; sub.f64 %fd52, %fd50, %fd51; add.f64 %fd53, %fd52, %fd39; cvt.rn.f32.f64 %f25, %fd53; div.rn.f32 %f26, %f25, %f4; st.global.f32 [%rd1], %f26; $L__BB0_4: ret; } ` solidanglefouriersummand_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl solidanglefouriersummand .visible .entry solidanglefouriersummand( .param .u64 solidanglefouriersummand_param_0, .param .u64 solidanglefouriersummand_param_1, .param .u64 solidanglefouriersummand_param_2, .param .u64 solidanglefouriersummand_param_3, .param .u32 solidanglefouriersummand_param_4, .param .u32 solidanglefouriersummand_param_5, .param .u32 solidanglefouriersummand_param_6 ) { .reg .pred %p<10>; .reg .f32 %f<27>; .reg .b32 %r<29>; .reg .f64 %fd<54>; .reg .b64 %rd<15>; ld.param.u64 %rd2, [solidanglefouriersummand_param_0]; ld.param.u64 %rd3, [solidanglefouriersummand_param_1]; ld.param.u64 %rd4, [solidanglefouriersummand_param_2]; ld.param.u64 %rd5, [solidanglefouriersummand_param_3]; ld.param.u32 %r5, [solidanglefouriersummand_param_4]; ld.param.u32 %r6, [solidanglefouriersummand_param_5]; ld.param.u32 %r7, [solidanglefouriersummand_param_6]; mov.u32 %r8, %ntid.x; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %tid.x; mad.lo.s32 %r1, %r9, %r8, %r10; mov.u32 %r11, %ntid.y; mov.u32 %r12, %ctaid.y; mov.u32 %r13, %tid.y; mad.lo.s32 %r2, %r12, %r11, %r13; mov.u32 %r14, %ntid.z; mov.u32 %r15, %ctaid.z; mov.u32 %r16, %tid.z; mad.lo.s32 %r3, %r15, %r14, %r16; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; cvt.rn.f32.s32 %f5, %r1; cvt.rn.f32.s32 %f6, %r5; div.rn.f32 %f7, %f5, %f6; cvt.rn.f32.s32 %f8, %r6; cvt.rn.f32.s32 %f9, %r2; div.rn.f32 %f10, %f9, %f8; cvt.rn.f32.s32 %f11, %r7; cvt.rn.f32.s32 %f12, %r3; div.rn.f32 %f13, %f12, %f11; shr.u32 %r17, %r5, 31; add.s32 %r18, %r5, %r17; 
shr.s32 %r19, %r18, 1; setp.lt.s32 %p6, %r1, %r19; add.f32 %f14, %f7, 0fBF800000; selp.f32 %f1, %f7, %f14, %p6; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; setp.lt.s32 %p7, %r2, %r22; add.f32 %f15, %f10, 0fBF800000; selp.f32 %f2, %f10, %f15, %p7; shr.u32 %r23, %r7, 31; add.s32 %r24, %r7, %r23; shr.s32 %r25, %r24, 1; setp.lt.s32 %p8, %r3, %r25; add.f32 %f16, %f13, 0fBF800000; selp.f32 %f3, %f13, %f16, %p8; mul.f32 %f17, %f2, %f2; fma.rn.f32 %f18, %f1, %f1, %f17; fma.rn.f32 %f4, %f3, %f3, %f18; mad.lo.s32 %r26, %r3, %r6, %r2; mad.lo.s32 %r4, %r26, %r5, %r1; setp.eq.f32 %p9, %f4, 0f00000000; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd6, %rd7; @%p9 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: mov.u32 %r28, 0; st.global.u32 [%rd1], %r28; bra.uni $L__BB0_4; $L__BB0_2: shl.b32 %r27, %r4, 1; cvta.to.global.u64 %rd8, %rd3; mul.wide.s32 %rd9, %r27, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f19, [%rd10+4]; ld.global.nc.f32 %f20, [%rd12+4]; ld.global.nc.f32 %f21, [%rd14+4]; ld.global.nc.f32 %f22, [%rd10]; cvt.f64.f32 %fd1, %f22; cvt.f64.f32 %fd2, %f19; ld.global.nc.f32 %f23, [%rd12]; cvt.f64.f32 %fd3, %f23; cvt.f64.f32 %fd4, %f20; ld.global.nc.f32 %f24, [%rd14]; cvt.f64.f32 %fd5, %f24; cvt.f64.f32 %fd6, %f21; cvt.f64.f32 %fd7, %f2; mul.f64 %fd8, %fd7, %fd5; mul.f64 %fd9, %fd6, 0d0000000000000000; sub.f64 %fd10, %fd8, %fd9; mul.f64 %fd11, %fd7, %fd6; fma.rn.f64 %fd12, %fd5, 0d0000000000000000, %fd11; cvt.f64.f32 %fd13, %f3; mul.f64 %fd14, %fd13, %fd3; mul.f64 %fd15, %fd4, 0d0000000000000000; sub.f64 %fd16, %fd14, %fd15; mul.f64 %fd17, %fd13, %fd4; fma.rn.f64 %fd18, %fd3, 0d0000000000000000, %fd17; sub.f64 %fd19, %fd10, %fd16; sub.f64 %fd20, %fd12, %fd18; mul.f64 %fd21, %fd20, %fd1; mul.f64 %fd22, %fd19, %fd2; sub.f64 %fd23, %fd21, %fd22; mul.f64 %fd24, %fd13, %fd1; mul.f64 %fd25, %fd2, 
0d0000000000000000; sub.f64 %fd26, %fd24, %fd25; mul.f64 %fd27, %fd13, %fd2; fma.rn.f64 %fd28, %fd1, 0d0000000000000000, %fd27; cvt.f64.f32 %fd29, %f1; mul.f64 %fd30, %fd29, %fd5; sub.f64 %fd31, %fd30, %fd9; mul.f64 %fd32, %fd29, %fd6; fma.rn.f64 %fd33, %fd5, 0d0000000000000000, %fd32; sub.f64 %fd34, %fd26, %fd31; sub.f64 %fd35, %fd28, %fd33; mul.f64 %fd36, %fd35, %fd3; mul.f64 %fd37, %fd34, %fd4; sub.f64 %fd38, %fd36, %fd37; add.f64 %fd39, %fd23, %fd38; mul.f64 %fd40, %fd29, %fd3; sub.f64 %fd41, %fd40, %fd15; mul.f64 %fd42, %fd29, %fd4; fma.rn.f64 %fd43, %fd3, 0d0000000000000000, %fd42; mul.f64 %fd44, %fd7, %fd1; sub.f64 %fd45, %fd44, %fd25; mul.f64 %fd46, %fd7, %fd2; fma.rn.f64 %fd47, %fd1, 0d0000000000000000, %fd46; sub.f64 %fd48, %fd41, %fd45; sub.f64 %fd49, %fd43, %fd47; mul.f64 %fd50, %fd49, %fd5; mul.f64 %fd51, %fd48, %fd6; sub.f64 %fd52, %fd50, %fd51; add.f64 %fd53, %fd52, %fd39; cvt.rn.f32.f64 %f25, %fd53; div.rn.f32 %f26, %f25, %f4; st.global.f32 [%rd1], %f26; $L__BB0_4: ret; } ` solidanglefouriersummand_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl solidanglefouriersummand .visible .entry solidanglefouriersummand( .param .u64 solidanglefouriersummand_param_0, .param .u64 solidanglefouriersummand_param_1, .param .u64 solidanglefouriersummand_param_2, .param .u64 solidanglefouriersummand_param_3, .param .u32 solidanglefouriersummand_param_4, .param .u32 solidanglefouriersummand_param_5, .param .u32 solidanglefouriersummand_param_6 ) { .reg .pred %p<10>; .reg .f32 %f<27>; .reg .b32 %r<29>; .reg .f64 %fd<54>; .reg .b64 %rd<15>; ld.param.u64 %rd2, [solidanglefouriersummand_param_0]; ld.param.u64 %rd3, [solidanglefouriersummand_param_1]; ld.param.u64 %rd4, [solidanglefouriersummand_param_2]; ld.param.u64 %rd5, [solidanglefouriersummand_param_3]; ld.param.u32 %r5, [solidanglefouriersummand_param_4]; ld.param.u32 %r6, [solidanglefouriersummand_param_5]; ld.param.u32 %r7, [solidanglefouriersummand_param_6]; mov.u32 %r8, %ntid.x; mov.u32 %r9, 
%ctaid.x; mov.u32 %r10, %tid.x; mad.lo.s32 %r1, %r9, %r8, %r10; mov.u32 %r11, %ntid.y; mov.u32 %r12, %ctaid.y; mov.u32 %r13, %tid.y; mad.lo.s32 %r2, %r12, %r11, %r13; mov.u32 %r14, %ntid.z; mov.u32 %r15, %ctaid.z; mov.u32 %r16, %tid.z; mad.lo.s32 %r3, %r15, %r14, %r16; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; cvt.rn.f32.s32 %f5, %r1; cvt.rn.f32.s32 %f6, %r5; div.rn.f32 %f7, %f5, %f6; cvt.rn.f32.s32 %f8, %r6; cvt.rn.f32.s32 %f9, %r2; div.rn.f32 %f10, %f9, %f8; cvt.rn.f32.s32 %f11, %r7; cvt.rn.f32.s32 %f12, %r3; div.rn.f32 %f13, %f12, %f11; shr.u32 %r17, %r5, 31; add.s32 %r18, %r5, %r17; shr.s32 %r19, %r18, 1; setp.lt.s32 %p6, %r1, %r19; add.f32 %f14, %f7, 0fBF800000; selp.f32 %f1, %f7, %f14, %p6; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; setp.lt.s32 %p7, %r2, %r22; add.f32 %f15, %f10, 0fBF800000; selp.f32 %f2, %f10, %f15, %p7; shr.u32 %r23, %r7, 31; add.s32 %r24, %r7, %r23; shr.s32 %r25, %r24, 1; setp.lt.s32 %p8, %r3, %r25; add.f32 %f16, %f13, 0fBF800000; selp.f32 %f3, %f13, %f16, %p8; mul.f32 %f17, %f2, %f2; fma.rn.f32 %f18, %f1, %f1, %f17; fma.rn.f32 %f4, %f3, %f3, %f18; mad.lo.s32 %r26, %r3, %r6, %r2; mad.lo.s32 %r4, %r26, %r5, %r1; setp.eq.f32 %p9, %f4, 0f00000000; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd6, %rd7; @%p9 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: mov.u32 %r28, 0; st.global.u32 [%rd1], %r28; bra.uni $L__BB0_4; $L__BB0_2: shl.b32 %r27, %r4, 1; cvta.to.global.u64 %rd8, %rd3; mul.wide.s32 %rd9, %r27, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f19, [%rd10+4]; ld.global.nc.f32 %f20, [%rd12+4]; ld.global.nc.f32 %f21, [%rd14+4]; ld.global.nc.f32 %f22, [%rd10]; cvt.f64.f32 %fd1, %f22; cvt.f64.f32 %fd2, %f19; ld.global.nc.f32 %f23, [%rd12]; cvt.f64.f32 %fd3, %f23; 
cvt.f64.f32 %fd4, %f20; ld.global.nc.f32 %f24, [%rd14]; cvt.f64.f32 %fd5, %f24; cvt.f64.f32 %fd6, %f21; cvt.f64.f32 %fd7, %f2; mul.f64 %fd8, %fd7, %fd5; mul.f64 %fd9, %fd6, 0d0000000000000000; sub.f64 %fd10, %fd8, %fd9; mul.f64 %fd11, %fd7, %fd6; fma.rn.f64 %fd12, %fd5, 0d0000000000000000, %fd11; cvt.f64.f32 %fd13, %f3; mul.f64 %fd14, %fd13, %fd3; mul.f64 %fd15, %fd4, 0d0000000000000000; sub.f64 %fd16, %fd14, %fd15; mul.f64 %fd17, %fd13, %fd4; fma.rn.f64 %fd18, %fd3, 0d0000000000000000, %fd17; sub.f64 %fd19, %fd10, %fd16; sub.f64 %fd20, %fd12, %fd18; mul.f64 %fd21, %fd20, %fd1; mul.f64 %fd22, %fd19, %fd2; sub.f64 %fd23, %fd21, %fd22; mul.f64 %fd24, %fd13, %fd1; mul.f64 %fd25, %fd2, 0d0000000000000000; sub.f64 %fd26, %fd24, %fd25; mul.f64 %fd27, %fd13, %fd2; fma.rn.f64 %fd28, %fd1, 0d0000000000000000, %fd27; cvt.f64.f32 %fd29, %f1; mul.f64 %fd30, %fd29, %fd5; sub.f64 %fd31, %fd30, %fd9; mul.f64 %fd32, %fd29, %fd6; fma.rn.f64 %fd33, %fd5, 0d0000000000000000, %fd32; sub.f64 %fd34, %fd26, %fd31; sub.f64 %fd35, %fd28, %fd33; mul.f64 %fd36, %fd35, %fd3; mul.f64 %fd37, %fd34, %fd4; sub.f64 %fd38, %fd36, %fd37; add.f64 %fd39, %fd23, %fd38; mul.f64 %fd40, %fd29, %fd3; sub.f64 %fd41, %fd40, %fd15; mul.f64 %fd42, %fd29, %fd4; fma.rn.f64 %fd43, %fd3, 0d0000000000000000, %fd42; mul.f64 %fd44, %fd7, %fd1; sub.f64 %fd45, %fd44, %fd25; mul.f64 %fd46, %fd7, %fd2; fma.rn.f64 %fd47, %fd1, 0d0000000000000000, %fd46; sub.f64 %fd48, %fd41, %fd45; sub.f64 %fd49, %fd43, %fd47; mul.f64 %fd50, %fd49, %fd5; mul.f64 %fd51, %fd48, %fd6; sub.f64 %fd52, %fd50, %fd51; add.f64 %fd53, %fd52, %fd39; cvt.rn.f32.f64 %f25, %fd53; div.rn.f32 %f26, %f25, %f4; st.global.f32 [%rd1], %f26; $L__BB0_4: ret; } ` solidanglefouriersummand_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl solidanglefouriersummand .visible .entry solidanglefouriersummand( .param .u64 solidanglefouriersummand_param_0, .param .u64 solidanglefouriersummand_param_1, .param .u64 solidanglefouriersummand_param_2, .param 
.u64 solidanglefouriersummand_param_3, .param .u32 solidanglefouriersummand_param_4, .param .u32 solidanglefouriersummand_param_5, .param .u32 solidanglefouriersummand_param_6 ) { .reg .pred %p<10>; .reg .f32 %f<27>; .reg .b32 %r<29>; .reg .f64 %fd<54>; .reg .b64 %rd<15>; ld.param.u64 %rd2, [solidanglefouriersummand_param_0]; ld.param.u64 %rd3, [solidanglefouriersummand_param_1]; ld.param.u64 %rd4, [solidanglefouriersummand_param_2]; ld.param.u64 %rd5, [solidanglefouriersummand_param_3]; ld.param.u32 %r5, [solidanglefouriersummand_param_4]; ld.param.u32 %r6, [solidanglefouriersummand_param_5]; ld.param.u32 %r7, [solidanglefouriersummand_param_6]; mov.u32 %r8, %ntid.x; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %tid.x; mad.lo.s32 %r1, %r9, %r8, %r10; mov.u32 %r11, %ntid.y; mov.u32 %r12, %ctaid.y; mov.u32 %r13, %tid.y; mad.lo.s32 %r2, %r12, %r11, %r13; mov.u32 %r14, %ntid.z; mov.u32 %r15, %ctaid.z; mov.u32 %r16, %tid.z; mad.lo.s32 %r3, %r15, %r14, %r16; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; cvt.rn.f32.s32 %f5, %r1; cvt.rn.f32.s32 %f6, %r5; div.rn.f32 %f7, %f5, %f6; cvt.rn.f32.s32 %f8, %r6; cvt.rn.f32.s32 %f9, %r2; div.rn.f32 %f10, %f9, %f8; cvt.rn.f32.s32 %f11, %r7; cvt.rn.f32.s32 %f12, %r3; div.rn.f32 %f13, %f12, %f11; shr.u32 %r17, %r5, 31; add.s32 %r18, %r5, %r17; shr.s32 %r19, %r18, 1; setp.lt.s32 %p6, %r1, %r19; add.f32 %f14, %f7, 0fBF800000; selp.f32 %f1, %f7, %f14, %p6; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; setp.lt.s32 %p7, %r2, %r22; add.f32 %f15, %f10, 0fBF800000; selp.f32 %f2, %f10, %f15, %p7; shr.u32 %r23, %r7, 31; add.s32 %r24, %r7, %r23; shr.s32 %r25, %r24, 1; setp.lt.s32 %p8, %r3, %r25; add.f32 %f16, %f13, 0fBF800000; selp.f32 %f3, %f13, %f16, %p8; mul.f32 %f17, %f2, %f2; fma.rn.f32 %f18, %f1, %f1, %f17; fma.rn.f32 %f4, %f3, %f3, %f18; mad.lo.s32 %r26, %r3, %r6, %r2; mad.lo.s32 %r4, %r26, %r5, %r1; setp.eq.f32 %p9, %f4, 
0f00000000; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd6, %rd7; @%p9 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: mov.u32 %r28, 0; st.global.u32 [%rd1], %r28; bra.uni $L__BB0_4; $L__BB0_2: shl.b32 %r27, %r4, 1; cvta.to.global.u64 %rd8, %rd3; mul.wide.s32 %rd9, %r27, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f19, [%rd10+4]; ld.global.nc.f32 %f20, [%rd12+4]; ld.global.nc.f32 %f21, [%rd14+4]; ld.global.nc.f32 %f22, [%rd10]; cvt.f64.f32 %fd1, %f22; cvt.f64.f32 %fd2, %f19; ld.global.nc.f32 %f23, [%rd12]; cvt.f64.f32 %fd3, %f23; cvt.f64.f32 %fd4, %f20; ld.global.nc.f32 %f24, [%rd14]; cvt.f64.f32 %fd5, %f24; cvt.f64.f32 %fd6, %f21; cvt.f64.f32 %fd7, %f2; mul.f64 %fd8, %fd7, %fd5; mul.f64 %fd9, %fd6, 0d0000000000000000; sub.f64 %fd10, %fd8, %fd9; mul.f64 %fd11, %fd7, %fd6; fma.rn.f64 %fd12, %fd5, 0d0000000000000000, %fd11; cvt.f64.f32 %fd13, %f3; mul.f64 %fd14, %fd13, %fd3; mul.f64 %fd15, %fd4, 0d0000000000000000; sub.f64 %fd16, %fd14, %fd15; mul.f64 %fd17, %fd13, %fd4; fma.rn.f64 %fd18, %fd3, 0d0000000000000000, %fd17; sub.f64 %fd19, %fd10, %fd16; sub.f64 %fd20, %fd12, %fd18; mul.f64 %fd21, %fd20, %fd1; mul.f64 %fd22, %fd19, %fd2; sub.f64 %fd23, %fd21, %fd22; mul.f64 %fd24, %fd13, %fd1; mul.f64 %fd25, %fd2, 0d0000000000000000; sub.f64 %fd26, %fd24, %fd25; mul.f64 %fd27, %fd13, %fd2; fma.rn.f64 %fd28, %fd1, 0d0000000000000000, %fd27; cvt.f64.f32 %fd29, %f1; mul.f64 %fd30, %fd29, %fd5; sub.f64 %fd31, %fd30, %fd9; mul.f64 %fd32, %fd29, %fd6; fma.rn.f64 %fd33, %fd5, 0d0000000000000000, %fd32; sub.f64 %fd34, %fd26, %fd31; sub.f64 %fd35, %fd28, %fd33; mul.f64 %fd36, %fd35, %fd3; mul.f64 %fd37, %fd34, %fd4; sub.f64 %fd38, %fd36, %fd37; add.f64 %fd39, %fd23, %fd38; mul.f64 %fd40, %fd29, %fd3; sub.f64 %fd41, %fd40, %fd15; mul.f64 %fd42, %fd29, %fd4; fma.rn.f64 %fd43, %fd3, 0d0000000000000000, %fd42; mul.f64 %fd44, %fd7, %fd1; 
sub.f64 %fd45, %fd44, %fd25; mul.f64 %fd46, %fd7, %fd2; fma.rn.f64 %fd47, %fd1, 0d0000000000000000, %fd46; sub.f64 %fd48, %fd41, %fd45; sub.f64 %fd49, %fd43, %fd47; mul.f64 %fd50, %fd49, %fd5; mul.f64 %fd51, %fd48, %fd6; sub.f64 %fd52, %fd50, %fd51; add.f64 %fd53, %fd52, %fd39; cvt.rn.f32.f64 %f25, %fd53; div.rn.f32 %f26, %f25, %f4; st.global.f32 [%rd1], %f26; $L__BB0_4: ret; } ` solidanglefouriersummand_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl solidanglefouriersummand .visible .entry solidanglefouriersummand( .param .u64 solidanglefouriersummand_param_0, .param .u64 solidanglefouriersummand_param_1, .param .u64 solidanglefouriersummand_param_2, .param .u64 solidanglefouriersummand_param_3, .param .u32 solidanglefouriersummand_param_4, .param .u32 solidanglefouriersummand_param_5, .param .u32 solidanglefouriersummand_param_6 ) { .reg .pred %p<10>; .reg .f32 %f<27>; .reg .b32 %r<29>; .reg .f64 %fd<54>; .reg .b64 %rd<15>; ld.param.u64 %rd2, [solidanglefouriersummand_param_0]; ld.param.u64 %rd3, [solidanglefouriersummand_param_1]; ld.param.u64 %rd4, [solidanglefouriersummand_param_2]; ld.param.u64 %rd5, [solidanglefouriersummand_param_3]; ld.param.u32 %r5, [solidanglefouriersummand_param_4]; ld.param.u32 %r6, [solidanglefouriersummand_param_5]; ld.param.u32 %r7, [solidanglefouriersummand_param_6]; mov.u32 %r8, %ntid.x; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %tid.x; mad.lo.s32 %r1, %r9, %r8, %r10; mov.u32 %r11, %ntid.y; mov.u32 %r12, %ctaid.y; mov.u32 %r13, %tid.y; mad.lo.s32 %r2, %r12, %r11, %r13; mov.u32 %r14, %ntid.z; mov.u32 %r15, %ctaid.z; mov.u32 %r16, %tid.z; mad.lo.s32 %r3, %r15, %r14, %r16; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; cvt.rn.f32.s32 %f5, %r1; cvt.rn.f32.s32 %f6, %r5; div.rn.f32 %f7, %f5, %f6; cvt.rn.f32.s32 %f8, %r6; cvt.rn.f32.s32 %f9, %r2; div.rn.f32 %f10, %f9, %f8; cvt.rn.f32.s32 %f11, %r7; cvt.rn.f32.s32 %f12, %r3; 
div.rn.f32 %f13, %f12, %f11; shr.u32 %r17, %r5, 31; add.s32 %r18, %r5, %r17; shr.s32 %r19, %r18, 1; setp.lt.s32 %p6, %r1, %r19; add.f32 %f14, %f7, 0fBF800000; selp.f32 %f1, %f7, %f14, %p6; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; setp.lt.s32 %p7, %r2, %r22; add.f32 %f15, %f10, 0fBF800000; selp.f32 %f2, %f10, %f15, %p7; shr.u32 %r23, %r7, 31; add.s32 %r24, %r7, %r23; shr.s32 %r25, %r24, 1; setp.lt.s32 %p8, %r3, %r25; add.f32 %f16, %f13, 0fBF800000; selp.f32 %f3, %f13, %f16, %p8; mul.f32 %f17, %f2, %f2; fma.rn.f32 %f18, %f1, %f1, %f17; fma.rn.f32 %f4, %f3, %f3, %f18; mad.lo.s32 %r26, %r3, %r6, %r2; mad.lo.s32 %r4, %r26, %r5, %r1; setp.eq.f32 %p9, %f4, 0f00000000; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd6, %rd7; @%p9 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: mov.u32 %r28, 0; st.global.u32 [%rd1], %r28; bra.uni $L__BB0_4; $L__BB0_2: shl.b32 %r27, %r4, 1; cvta.to.global.u64 %rd8, %rd3; mul.wide.s32 %rd9, %r27, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f19, [%rd10+4]; ld.global.nc.f32 %f20, [%rd12+4]; ld.global.nc.f32 %f21, [%rd14+4]; ld.global.nc.f32 %f22, [%rd10]; cvt.f64.f32 %fd1, %f22; cvt.f64.f32 %fd2, %f19; ld.global.nc.f32 %f23, [%rd12]; cvt.f64.f32 %fd3, %f23; cvt.f64.f32 %fd4, %f20; ld.global.nc.f32 %f24, [%rd14]; cvt.f64.f32 %fd5, %f24; cvt.f64.f32 %fd6, %f21; cvt.f64.f32 %fd7, %f2; mul.f64 %fd8, %fd7, %fd5; mul.f64 %fd9, %fd6, 0d0000000000000000; sub.f64 %fd10, %fd8, %fd9; mul.f64 %fd11, %fd7, %fd6; fma.rn.f64 %fd12, %fd5, 0d0000000000000000, %fd11; cvt.f64.f32 %fd13, %f3; mul.f64 %fd14, %fd13, %fd3; mul.f64 %fd15, %fd4, 0d0000000000000000; sub.f64 %fd16, %fd14, %fd15; mul.f64 %fd17, %fd13, %fd4; fma.rn.f64 %fd18, %fd3, 0d0000000000000000, %fd17; sub.f64 %fd19, %fd10, %fd16; sub.f64 %fd20, %fd12, %fd18; mul.f64 %fd21, %fd20, %fd1; mul.f64 %fd22, %fd19, %fd2; sub.f64 
%fd23, %fd21, %fd22; mul.f64 %fd24, %fd13, %fd1; mul.f64 %fd25, %fd2, 0d0000000000000000; sub.f64 %fd26, %fd24, %fd25; mul.f64 %fd27, %fd13, %fd2; fma.rn.f64 %fd28, %fd1, 0d0000000000000000, %fd27; cvt.f64.f32 %fd29, %f1; mul.f64 %fd30, %fd29, %fd5; sub.f64 %fd31, %fd30, %fd9; mul.f64 %fd32, %fd29, %fd6; fma.rn.f64 %fd33, %fd5, 0d0000000000000000, %fd32; sub.f64 %fd34, %fd26, %fd31; sub.f64 %fd35, %fd28, %fd33; mul.f64 %fd36, %fd35, %fd3; mul.f64 %fd37, %fd34, %fd4; sub.f64 %fd38, %fd36, %fd37; add.f64 %fd39, %fd23, %fd38; mul.f64 %fd40, %fd29, %fd3; sub.f64 %fd41, %fd40, %fd15; mul.f64 %fd42, %fd29, %fd4; fma.rn.f64 %fd43, %fd3, 0d0000000000000000, %fd42; mul.f64 %fd44, %fd7, %fd1; sub.f64 %fd45, %fd44, %fd25; mul.f64 %fd46, %fd7, %fd2; fma.rn.f64 %fd47, %fd1, 0d0000000000000000, %fd46; sub.f64 %fd48, %fd41, %fd45; sub.f64 %fd49, %fd43, %fd47; mul.f64 %fd50, %fd49, %fd5; mul.f64 %fd51, %fd48, %fd6; sub.f64 %fd52, %fd50, %fd51; add.f64 %fd53, %fd52, %fd39; cvt.rn.f32.f64 %f25, %fd53; div.rn.f32 %f26, %f25, %f4; st.global.f32 [%rd1], %f26; $L__BB0_4: ret; } ` solidanglefouriersummand_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl solidanglefouriersummand .visible .entry solidanglefouriersummand( .param .u64 solidanglefouriersummand_param_0, .param .u64 solidanglefouriersummand_param_1, .param .u64 solidanglefouriersummand_param_2, .param .u64 solidanglefouriersummand_param_3, .param .u32 solidanglefouriersummand_param_4, .param .u32 solidanglefouriersummand_param_5, .param .u32 solidanglefouriersummand_param_6 ) { .reg .pred %p<10>; .reg .f32 %f<27>; .reg .b32 %r<29>; .reg .f64 %fd<54>; .reg .b64 %rd<15>; ld.param.u64 %rd2, [solidanglefouriersummand_param_0]; ld.param.u64 %rd3, [solidanglefouriersummand_param_1]; ld.param.u64 %rd4, [solidanglefouriersummand_param_2]; ld.param.u64 %rd5, [solidanglefouriersummand_param_3]; ld.param.u32 %r5, [solidanglefouriersummand_param_4]; ld.param.u32 %r6, [solidanglefouriersummand_param_5]; ld.param.u32 %r7, 
[solidanglefouriersummand_param_6]; mov.u32 %r8, %ntid.x; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %tid.x; mad.lo.s32 %r1, %r9, %r8, %r10; mov.u32 %r11, %ntid.y; mov.u32 %r12, %ctaid.y; mov.u32 %r13, %tid.y; mad.lo.s32 %r2, %r12, %r11, %r13; mov.u32 %r14, %ntid.z; mov.u32 %r15, %ctaid.z; mov.u32 %r16, %tid.z; mad.lo.s32 %r3, %r15, %r14, %r16; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; cvt.rn.f32.s32 %f5, %r1; cvt.rn.f32.s32 %f6, %r5; div.rn.f32 %f7, %f5, %f6; cvt.rn.f32.s32 %f8, %r6; cvt.rn.f32.s32 %f9, %r2; div.rn.f32 %f10, %f9, %f8; cvt.rn.f32.s32 %f11, %r7; cvt.rn.f32.s32 %f12, %r3; div.rn.f32 %f13, %f12, %f11; shr.u32 %r17, %r5, 31; add.s32 %r18, %r5, %r17; shr.s32 %r19, %r18, 1; setp.lt.s32 %p6, %r1, %r19; add.f32 %f14, %f7, 0fBF800000; selp.f32 %f1, %f7, %f14, %p6; shr.u32 %r20, %r6, 31; add.s32 %r21, %r6, %r20; shr.s32 %r22, %r21, 1; setp.lt.s32 %p7, %r2, %r22; add.f32 %f15, %f10, 0fBF800000; selp.f32 %f2, %f10, %f15, %p7; shr.u32 %r23, %r7, 31; add.s32 %r24, %r7, %r23; shr.s32 %r25, %r24, 1; setp.lt.s32 %p8, %r3, %r25; add.f32 %f16, %f13, 0fBF800000; selp.f32 %f3, %f13, %f16, %p8; mul.f32 %f17, %f2, %f2; fma.rn.f32 %f18, %f1, %f1, %f17; fma.rn.f32 %f4, %f3, %f3, %f18; mad.lo.s32 %r26, %r3, %r6, %r2; mad.lo.s32 %r4, %r26, %r5, %r1; setp.eq.f32 %p9, %f4, 0f00000000; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd6, %rd7; @%p9 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: mov.u32 %r28, 0; st.global.u32 [%rd1], %r28; bra.uni $L__BB0_4; $L__BB0_2: shl.b32 %r27, %r4, 1; cvta.to.global.u64 %rd8, %rd3; mul.wide.s32 %rd9, %r27, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f19, [%rd10+4]; ld.global.nc.f32 %f20, [%rd12+4]; ld.global.nc.f32 %f21, [%rd14+4]; ld.global.nc.f32 %f22, [%rd10]; cvt.f64.f32 %fd1, %f22; cvt.f64.f32 
%fd2, %f19; ld.global.nc.f32 %f23, [%rd12]; cvt.f64.f32 %fd3, %f23; cvt.f64.f32 %fd4, %f20; ld.global.nc.f32 %f24, [%rd14]; cvt.f64.f32 %fd5, %f24; cvt.f64.f32 %fd6, %f21; cvt.f64.f32 %fd7, %f2; mul.f64 %fd8, %fd7, %fd5; mul.f64 %fd9, %fd6, 0d0000000000000000; sub.f64 %fd10, %fd8, %fd9; mul.f64 %fd11, %fd7, %fd6; fma.rn.f64 %fd12, %fd5, 0d0000000000000000, %fd11; cvt.f64.f32 %fd13, %f3; mul.f64 %fd14, %fd13, %fd3; mul.f64 %fd15, %fd4, 0d0000000000000000; sub.f64 %fd16, %fd14, %fd15; mul.f64 %fd17, %fd13, %fd4; fma.rn.f64 %fd18, %fd3, 0d0000000000000000, %fd17; sub.f64 %fd19, %fd10, %fd16; sub.f64 %fd20, %fd12, %fd18; mul.f64 %fd21, %fd20, %fd1; mul.f64 %fd22, %fd19, %fd2; sub.f64 %fd23, %fd21, %fd22; mul.f64 %fd24, %fd13, %fd1; mul.f64 %fd25, %fd2, 0d0000000000000000; sub.f64 %fd26, %fd24, %fd25; mul.f64 %fd27, %fd13, %fd2; fma.rn.f64 %fd28, %fd1, 0d0000000000000000, %fd27; cvt.f64.f32 %fd29, %f1; mul.f64 %fd30, %fd29, %fd5; sub.f64 %fd31, %fd30, %fd9; mul.f64 %fd32, %fd29, %fd6; fma.rn.f64 %fd33, %fd5, 0d0000000000000000, %fd32; sub.f64 %fd34, %fd26, %fd31; sub.f64 %fd35, %fd28, %fd33; mul.f64 %fd36, %fd35, %fd3; mul.f64 %fd37, %fd34, %fd4; sub.f64 %fd38, %fd36, %fd37; add.f64 %fd39, %fd23, %fd38; mul.f64 %fd40, %fd29, %fd3; sub.f64 %fd41, %fd40, %fd15; mul.f64 %fd42, %fd29, %fd4; fma.rn.f64 %fd43, %fd3, 0d0000000000000000, %fd42; mul.f64 %fd44, %fd7, %fd1; sub.f64 %fd45, %fd44, %fd25; mul.f64 %fd46, %fd7, %fd2; fma.rn.f64 %fd47, %fd1, 0d0000000000000000, %fd46; sub.f64 %fd48, %fd41, %fd45; sub.f64 %fd49, %fd43, %fd47; mul.f64 %fd50, %fd49, %fd5; mul.f64 %fd51, %fd48, %fd6; sub.f64 %fd52, %fd50, %fd51; add.f64 %fd53, %fd52, %fd39; cvt.rn.f32.f64 %f25, %fd53; div.rn.f32 %f26, %f25, %f4; st.global.f32 [%rd1], %f26; $L__BB0_4: ret; } ` ) 3-3.11.1/cuda/hopfindex-solidangle-fourier.go000066400000000000000000000040321503346766200207710ustar00rootroot00000000000000package cuda import ( "math" "github.com/mumax/3/data" "github.com/mumax/3/util" ) func 
GetHopfIndex_SolidAngleFourier(m *data.Slice, mesh *data.Mesh) float64 { // Get emergent magnetic field in real space using Berg-Lüscher lattice method N := m.Size() util.Argument(m.Size() == N) F := NewSlice(3, N) defer F.Free() SetEmergentMagneticField_SolidAngle(F, m, mesh) // Rescale field to dimensionless length units by multiplying by cell dimensions cellsize := mesh.CellSize() cfg := make3DConf(N) k_scaleemergentfield_async(F.DevPtr(X), F.DevPtr(Y), F.DevPtr(Z), F.DevPtr(X), F.DevPtr(Y), F.DevPtr(Z), float32(cellsize[X]), float32(cellsize[Y]), float32(cellsize[Z]), N[X], N[Y], N[X], cfg) // Initialise FFT plan fftPlan := newFFT3DR2C(N[X], N[Y], N[Z]) // Declare buffers to store FFT Nc := fftR2COutputSizeFloats(N) fftRBufX := NewSlice(1, N) fftCBufX := NewSlice(1, Nc) fftRBufY := NewSlice(1, N) fftCBufY := NewSlice(1, Nc) fftRBufZ := NewSlice(1, N) fftCBufZ := NewSlice(1, Nc) defer fftRBufX.Free() defer fftCBufX.Free() defer fftRBufY.Free() defer fftCBufY.Free() defer fftRBufZ.Free() defer fftCBufZ.Free() fftRBufX = F.Comp(X) fftRBufY = F.Comp(Y) fftRBufZ = F.Comp(Z) // Perform FFT on each component fftPlan.ExecAsync(fftRBufX, fftCBufX) fftPlan.ExecAsync(fftRBufY, fftCBufY) fftPlan.ExecAsync(fftRBufZ, fftCBufZ) // Reconstruct full array using Hermitian symmetry F(-k_x, -k_y, -k_z) = F(k_x, k_y, k_z)^* full_array_N := [3]int{2 * N[X], N[Y], N[Z]} // 2 as real + complex part Fx_k := NewSlice(1, full_array_N) Fy_k := NewSlice(1, full_array_N) Fz_k := NewSlice(1, full_array_N) k_solidanglefourierfield_async(fftCBufX.DevPtr(0), fftCBufY.DevPtr(0), fftCBufZ.DevPtr(0), Fx_k.DevPtr(0), Fy_k.DevPtr(0), Fz_k.DevPtr(0), N[X], N[Y], N[Z], cfg) summand := NewSlice(1, N) k_solidanglefouriersummand_async(summand.DevPtr(0), Fx_k.DevPtr(0), Fy_k.DevPtr(0), Fz_k.DevPtr(0), N[X], N[Y], N[Z], cfg) return (1. 
/ (2 * math.Pi * float64(N[X]) * float64(N[Y]) * float64(N[Z]))) * float64(Sum(summand)) } 3-3.11.1/cuda/hopfindex-solidangle.go000066400000000000000000000023111503346766200173160ustar00rootroot00000000000000package cuda import ( "math" "github.com/mumax/3/data" "github.com/mumax/3/util" ) func SetHopfIndexDensity_SolidAngle(h, m *data.Slice, mesh *data.Mesh) { N := m.Size() // Create buffers to store emergent field F and vector potential A F := Buffer(3, N) defer Recycle(F) A := Buffer(3, N) defer Recycle(A) // Get Hopf index density F · A SetEmergentMagneticField_SolidAngle(F, m, mesh) SetVectorPotential(A, F, mesh) AddDotProduct(h, 1.0, F, A) } // Sets the emergent magnetic field F_i = (1/8π) ε_{ijk} m · (∂m/∂x_j × ∂m/∂x_k) using the Berg-Lüscher lattice method // See hopf-emergentmagneticfield-solidangle.cu func SetEmergentMagneticField_SolidAngle(F, m *data.Slice, mesh *data.Mesh) { cellsize := mesh.CellSize() N := F.Size() util.Argument(m.Size() == N) cfg := make3DConf(N) icycz := float32(1.0 / (cellsize[Y] * cellsize[Z])) iczcx := float32(1.0 / (cellsize[Z] * cellsize[X])) icxcy := float32(1.0 / (cellsize[X] * cellsize[Y])) prefactor := float32(1.0 / (8 * math.Pi)) k_setemergentmagneticfieldsolidangle_async(F.DevPtr(X), F.DevPtr(Y), F.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), prefactor, icycz, iczcx, icxcy, N[X], N[Y], N[Z], mesh.PBC_code(), cfg) } 3-3.11.1/cuda/hopfindex-two-point.go000066400000000000000000000022441503346766200171420ustar00rootroot00000000000000package cuda import ( "math" "github.com/mumax/3/data" "github.com/mumax/3/util" ) func SetHopfIndexDensity_TwoPointStencil(h, m *data.Slice, mesh *data.Mesh) { N := m.Size() // Create buffers to store emergent field F and vector potential A F := Buffer(3, N) defer Recycle(F) A := Buffer(3, N) defer Recycle(A) // Get Hopf index density F · A SetEmergentMagneticField_TwoPointStencil(F, m, mesh) SetVectorPotential(A, F, mesh) AddDotProduct(h, 1.0, F, A) } // Sets the emergent magnetic field F_i 
= (1/8π) ε_{ijk} m · (∂m/∂x_j × ∂m/∂x_k) // See hopf-emergentmagneticfield.cu func SetEmergentMagneticField_TwoPointStencil(F, m *data.Slice, mesh *data.Mesh) { cellsize := mesh.CellSize() N := F.Size() util.Argument(m.Size() == N) cfg := make3DConf(N) icycz := float32(1.0 / (cellsize[Y] * cellsize[Z])) iczcx := float32(1.0 / (cellsize[Z] * cellsize[X])) icxcy := float32(1.0 / (cellsize[X] * cellsize[Y])) prefactor := float32(1.0 / (8 * math.Pi)) k_setemergentmagneticfieldtwopoint_async(F.DevPtr(X), F.DevPtr(Y), F.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), prefactor, icycz, iczcx, icxcy, N[X], N[Y], N[Z], mesh.PBC_code(), cfg) } 3-3.11.1/cuda/init.go000066400000000000000000000034621503346766200141660ustar00rootroot00000000000000// Package cuda provides GPU interaction package cuda import ( "fmt" "log" "runtime" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/util" ) var ( DriverVersion int // cuda driver version DevName string // GPU name TotalMem int64 // total GPU memory GPUInfo string // Human-readable GPU description Synchronous bool // for debug: synchronize stream0 at every kernel launch cudaCtx cu.Context // global CUDA context cudaCC int // compute capablity (used for fatbin) ) // Locks to an OS thread and initializes CUDA for that thread. 
func Init(gpu int) { if cudaCtx != 0 { return // needed for tests } runtime.LockOSThread() tryCuInit() dev := cu.Device(gpu) cudaCtx = cu.CtxCreate(cu.CTX_SCHED_YIELD, dev) cudaCtx.SetCurrent() M, m := dev.ComputeCapability() cudaCC = 10*M + m DriverVersion = cu.Version() DevName = dev.Name() TotalMem = dev.TotalMem() GPUInfo = fmt.Sprintf("%s(%dMB), CUDA Driver %d.%d, cc=%d.%d", DevName, (TotalMem)/(1024*1024), DriverVersion/1000, (DriverVersion%1000)/10, M, m) if M < 2 { log.Fatalln("GPU has insufficient compute capability, need 2.0 or higher.") } if Synchronous { log.Println("DEBUG: synchronized CUDA calls") } // test PTX load so that we can catch CUDA_ERROR_NO_BINARY_FOR_GPU early fatbinLoad(madd2_map, "madd2") } // cu.Init(), but error is fatal and does not dump stack. func tryCuInit() { defer func() { err := recover() if err == cu.ERROR_UNKNOWN { log.Println("\n Try running: sudo nvidia-modprobe -u \n") } util.FatalErr(err) }() cu.Init(0) } // Global stream used for everything const stream0 = cu.Stream(0) // Synchronize the global stream // This is called before and after all memcopy operations between host and device. func Sync() { stream0.Synchronize() } 3-3.11.1/cuda/init_test.go000066400000000000000000000002651503346766200152230ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/cuda/cu" ) // needed for all other tests. 
// In-place complex multiplication of two 2D arrays of interleaved
// (real, imaginary) floats:
//
// 	fftM[i] = fftM[i] * fftK[i]
//
// One thread handles one complex element; Nx and Ny are the array
// dimensions in complex elements.
extern "C" __global__ void
kernmulC(float* __restrict__ fftM, float* __restrict__ fftK, int Nx, int Ny)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the array have nothing to do.
    if (col >= Nx || row >= Ny) {
        return;
    }

    // Offsets of the real and imaginary parts of this element.
    const int re = 2 * (row * Nx + col);
    const int im = re + 1;

    // Read both operands before overwriting fftM.
    const float reM = fftM[re];
    const float imM = fftM[im];
    const float reK = fftK[re];
    const float imK = fftK[im];

    fftM[re] = reM * reK - imM * imK;
    fftM[im] = reM * imK + imM * reK;
}
func k_kernmulC_async(fftM unsafe.Pointer, fftK unsafe.Pointer, Nx int, Ny int, cfg *config) { if Synchronous { // debug Sync() timer.Start("kernmulC") } kernmulC_args.Lock() defer kernmulC_args.Unlock() if kernmulC_code == 0 { kernmulC_code = fatbinLoad(kernmulC_map, "kernmulC") } kernmulC_args.arg_fftM = fftM kernmulC_args.arg_fftK = fftK kernmulC_args.arg_Nx = Nx kernmulC_args.arg_Ny = Ny args := kernmulC_args.argptr[:] cu.LaunchKernel(kernmulC_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("kernmulC") } } // maps compute capability on PTX code for kernmulC kernel. var kernmulC_map = map[int]string{0: "", 50: kernmulC_ptx_50, 52: kernmulC_ptx_52, 53: kernmulC_ptx_53, 60: kernmulC_ptx_60, 61: kernmulC_ptx_61, 62: kernmulC_ptx_62, 70: kernmulC_ptx_70, 72: kernmulC_ptx_72, 75: kernmulC_ptx_75, 80: kernmulC_ptx_80, 86: kernmulC_ptx_86, 87: kernmulC_ptx_87, 89: kernmulC_ptx_89, 90: kernmulC_ptx_90} // kernmulC PTX code for various compute capabilities. 
const ( kernmulC_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r12, 4; add.s64 %rd5, %rd3, %rd4; cvta.to.global.u64 %rd6, %rd2; add.s64 %rd7, %rd6, %rd4; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd5]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd5+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd5], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd5+4], %f9; $L__BB0_2: ret; } ` kernmulC_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, 
%p2; @%p3 bra $L__BB0_2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r12, 4; add.s64 %rd5, %rd3, %rd4; cvta.to.global.u64 %rd6, %rd2; add.s64 %rd7, %rd6, %rd4; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd5]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd5+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd5], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd5+4], %f9; $L__BB0_2: ret; } ` kernmulC_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r12, 4; add.s64 %rd5, %rd3, %rd4; cvta.to.global.u64 %rd6, %rd2; add.s64 %rd7, %rd6, %rd4; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd5]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd5+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd5], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd5+4], %f9; $L__BB0_2: ret; } ` kernmulC_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 
kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r12, 4; add.s64 %rd5, %rd3, %rd4; cvta.to.global.u64 %rd6, %rd2; add.s64 %rd7, %rd6, %rd4; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd5]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd5+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd5], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd5+4], %f9; $L__BB0_2: ret; } ` kernmulC_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r12, 4; add.s64 %rd5, %rd3, %rd4; cvta.to.global.u64 %rd6, 
%rd2; add.s64 %rd7, %rd6, %rd4; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd5]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd5+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd5], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd5+4], %f9; $L__BB0_2: ret; } ` kernmulC_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r12, 4; add.s64 %rd5, %rd3, %rd4; cvta.to.global.u64 %rd6, %rd2; add.s64 %rd7, %rd6, %rd4; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd5]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd5+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd5], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd5+4], %f9; $L__BB0_2: ret; } ` kernmulC_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; 
ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r12, 4; add.s64 %rd5, %rd3, %rd4; cvta.to.global.u64 %rd6, %rd2; add.s64 %rd7, %rd6, %rd4; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd5]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd5+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd5], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd5+4], %f9; $L__BB0_2: ret; } ` kernmulC_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r12, 4; add.s64 %rd5, %rd3, %rd4; cvta.to.global.u64 %rd6, %rd2; add.s64 %rd7, %rd6, %rd4; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd5]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd5+4]; mul.f32 %f6, %f5, %f4; 
sub.f32 %f7, %f3, %f6; st.global.f32 [%rd5], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd5+4], %f9; $L__BB0_2: ret; } ` kernmulC_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r12, 4; add.s64 %rd5, %rd3, %rd4; cvta.to.global.u64 %rd6, %rd2; add.s64 %rd7, %rd6, %rd4; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd5]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd5+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd5], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd5+4], %f9; $L__BB0_2: ret; } ` kernmulC_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; 
mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r12, 4; add.s64 %rd5, %rd3, %rd4; cvta.to.global.u64 %rd6, %rd2; add.s64 %rd7, %rd6, %rd4; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd5]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd5+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd5], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd5+4], %f9; $L__BB0_2: ret; } ` kernmulC_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r12, 4; add.s64 %rd5, %rd3, %rd4; cvta.to.global.u64 %rd6, %rd2; add.s64 %rd7, %rd6, %rd4; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd5]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd5+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd5], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd5+4], %f9; $L__BB0_2: ret; } ` kernmulC_ptx_87 = ` .version 8.5 .target sm_87 
.address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r12, 4; add.s64 %rd5, %rd3, %rd4; cvta.to.global.u64 %rd6, %rd2; add.s64 %rd7, %rd6, %rd4; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd5]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd5+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd5], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd5+4], %f9; $L__BB0_2: ret; } ` kernmulC_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; mad.lo.s32 %r11, %r2, %r3, 
%r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r12, 4; add.s64 %rd5, %rd3, %rd4; cvta.to.global.u64 %rd6, %rd2; add.s64 %rd7, %rd6, %rd4; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd5]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd5+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd5], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd5+4], %f9; $L__BB0_2: ret; } ` kernmulC_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r12, 4; add.s64 %rd5, %rd3, %rd4; cvta.to.global.u64 %rd6, %rd2; add.s64 %rd7, %rd6, %rd4; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd5]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd5+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd5], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd5+4], %f9; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/kernmulrsymm2dxy.cu000066400000000000000000000021271503346766200165760ustar00rootroot00000000000000// 2D XY (in-plane) micromagnetic kernel multiplication: // |Mx| = |Kxx Kxy| * |Mx| // |My| |Kyx Kyy| |My| // Using the same 
symmetries as kernmulrsymm3d.cu
//
// Each thread handles one element (ix, iy), taken from the x and y components
// of the block/thread indices. Threads with ix >= Nx or iy >= Ny exit at once.
//
// fftMx and fftMy are read and overwritten in place. They are indexed with
// e = 2*(iy*Nx + ix) and e+1, i.e. as interleaved (real, imaginary) pairs.
// fftKxx, fftKyy and fftKxy are only read, with one float per element at
// index iy*Nx + ix.
//
// For rows iy > Ny/2 the kernel values are fetched from the mirrored row
// Ny-iy, and the sign of Kxy (but not of Kxx or Kyy) is flipped. As a result
// only rows 0..Ny/2 of the K arrays are ever read.
// NOTE(review): presumably the K arrays are stored with just those rows --
// confirm against the code that allocates them.
//
// The same Kxy value is used for both off-diagonal entries of the 2x2 product.
extern "C" __global__ void
kernmulRSymm2Dxy(float* __restrict__  fftMx,  float* __restrict__  fftMy,
                 float* __restrict__  fftKxx, float* __restrict__  fftKyy, float* __restrict__  fftKxy,
                 int Nx, int Ny)
{
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against threads that fall outside the Nx x Ny range.
    if(ix>= Nx || iy>=Ny) {
        return;
    }

    int I = iy*Nx + ix;  // linear element index
    int e = 2 * I;       // offset of the real part; the imaginary part is at e+1

    // Load the input before anything is overwritten: every output component
    // needs both Mx and My.
    float reMx = fftMx[e  ];
    float imMx = fftMx[e+1];
    float reMy = fftMy[e  ];
    float imMy = fftMy[e+1];

    // symmetry factor
    float fxy = 1.0f;

    // Upper half of the rows: read the kernel from the mirrored row and
    // negate the off-diagonal element.
    if (iy > Ny/2) {
        iy = Ny-iy;
        fxy = -fxy;
    }
    I = iy*Nx + ix;  // kernel index, possibly on the mirrored row

    float Kxx = fftKxx[I];
    float Kyy = fftKyy[I];
    float Kxy = fxy * fftKxy[I];

    // The K values are single floats, so real and imaginary parts are
    // transformed independently with the same 2x2 matrix.
    fftMx[e  ] = reMx * Kxx + reMy * Kxy;
    fftMx[e+1] = imMx * Kxx + imMy * Kxy;
    fftMy[e  ] = reMx * Kxy + reMy * Kyy;
    fftMy[e+1] = imMx * Kxy + imMy * Kyy;
}

3-3.11.1/cuda/kernmulrsymm2dxy_wrapper.go000066400000000000000000001115171503346766200203400ustar00rootroot00000000000000package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for kernmulRSymm2Dxy kernel.
// It stays zero until the first call of k_kernmulRSymm2Dxy_async, which
// fills it in through fatbinLoad.
var kernmulRSymm2Dxy_code cu.Function

// Stores the arguments for kernmulRSymm2Dxy kernel invocation.
// The arg_* fields hold the values of one launch; argptr holds one pointer
// per field, in kernel parameter order. The embedded mutex guards the struct.
//
// NOTE(review): arg_Nx and arg_Ny are Go ints, while the PTX declares the
// matching kernel parameters as .u32. Presumably this relies on the launch
// reading only the low 32 bits on a little-endian host -- confirm.
type kernmulRSymm2Dxy_args_t struct {
	arg_fftMx  unsafe.Pointer
	arg_fftMy  unsafe.Pointer
	arg_fftKxx unsafe.Pointer
	arg_fftKyy unsafe.Pointer
	arg_fftKxy unsafe.Pointer
	arg_Nx     int
	arg_Ny     int
	argptr     [7]unsafe.Pointer
	sync.Mutex
}

// Stores the arguments for kernmulRSymm2Dxy kernel invocation.
// This is the single package-level instance used by every launch.
var kernmulRSymm2Dxy_args kernmulRSymm2Dxy_args_t

func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	// The pointers refer to fields of the package-level struct, so later
	// launches only need to overwrite the field values.
	kernmulRSymm2Dxy_args.argptr[0] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftMx)
	kernmulRSymm2Dxy_args.argptr[1] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftMy)
	kernmulRSymm2Dxy_args.argptr[2] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftKxx)
	kernmulRSymm2Dxy_args.argptr[3] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftKyy)
	kernmulRSymm2Dxy_args.argptr[4] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftKxy)
	kernmulRSymm2Dxy_args.argptr[5] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_Nx)
	kernmulRSymm2Dxy_args.argptr[6] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_Ny)
}

// Wrapper for kernmulRSymm2Dxy CUDA kernel, asynchronous.
//
// Copies the seven arguments into kernmulRSymm2Dxy_args and launches the
// kernel on stream0 with the grid and block dimensions taken from cfg.
// The kernel handle is obtained from fatbinLoad on the first call.
// When the package-level Synchronous flag is set, the launch is bracketed by
// Sync() calls and a timer.Start/timer.Stop pair named "kernmulRSymm2Dxy".
func k_kernmulRSymm2Dxy_async(fftMx unsafe.Pointer, fftMy unsafe.Pointer, fftKxx unsafe.Pointer, fftKyy unsafe.Pointer, fftKxy unsafe.Pointer, Nx int, Ny int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("kernmulRSymm2Dxy")
	}

	// The argument struct is shared package state, so it stays locked until
	// this function returns, which includes the launch call below.
	kernmulRSymm2Dxy_args.Lock()
	defer kernmulRSymm2Dxy_args.Unlock()

	// A zero handle means the kernel has not been loaded yet.
	if kernmulRSymm2Dxy_code == 0 {
		kernmulRSymm2Dxy_code = fatbinLoad(kernmulRSymm2Dxy_map, "kernmulRSymm2Dxy")
	}

	kernmulRSymm2Dxy_args.arg_fftMx = fftMx
	kernmulRSymm2Dxy_args.arg_fftMy = fftMy
	kernmulRSymm2Dxy_args.arg_fftKxx = fftKxx
	kernmulRSymm2Dxy_args.arg_fftKyy = fftKyy
	kernmulRSymm2Dxy_args.arg_fftKxy = fftKxy
	kernmulRSymm2Dxy_args.arg_Nx = Nx
	kernmulRSymm2Dxy_args.arg_Ny = Ny

	// argptr was wired to the fields above in init().
	args := kernmulRSymm2Dxy_args.argptr[:]
	cu.LaunchKernel(kernmulRSymm2Dxy_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("kernmulRSymm2Dxy")
	}
}

// Maps compute capability to PTX code for the kernmulRSymm2Dxy kernel.
var kernmulRSymm2Dxy_map = map[int]string{0: "", 50: kernmulRSymm2Dxy_ptx_50, 52: kernmulRSymm2Dxy_ptx_52, 53: kernmulRSymm2Dxy_ptx_53, 60: kernmulRSymm2Dxy_ptx_60, 61: kernmulRSymm2Dxy_ptx_61, 62: kernmulRSymm2Dxy_ptx_62, 70: kernmulRSymm2Dxy_ptx_70, 72: kernmulRSymm2Dxy_ptx_72, 75: kernmulRSymm2Dxy_ptx_75, 80: kernmulRSymm2Dxy_ptx_80, 86: kernmulRSymm2Dxy_ptx_86, 87: kernmulRSymm2Dxy_ptx_87, 89: kernmulRSymm2Dxy_ptx_89, 90: kernmulRSymm2Dxy_ptx_90} // kernmulRSymm2Dxy PTX code for various compute capabilities. const ( kernmulRSymm2Dxy_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd7, %rd1; mul.wide.s32 %rd8, %r12, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9+4]; cvta.to.global.u64 %rd10, %rd2; add.s64 %rd11, %rd10, %rd8; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, 
%r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd6, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd9]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd9], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd9+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; $L__BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd3; mad.lo.s32 
%r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd7, %rd1; mul.wide.s32 %rd8, %r12, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9+4]; cvta.to.global.u64 %rd10, %rd2; add.s64 %rd11, %rd10, %rd8; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd6, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd9]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd9], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd9+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; $L__BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; 
mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd7, %rd1; mul.wide.s32 %rd8, %r12, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9+4]; cvta.to.global.u64 %rd10, %rd2; add.s64 %rd11, %rd10, %rd8; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd6, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd9]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd9], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd9+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; $L__BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 
%rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd7, %rd1; mul.wide.s32 %rd8, %r12, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9+4]; cvta.to.global.u64 %rd10, %rd2; add.s64 %rd11, %rd10, %rd8; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd6, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd9]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd9], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd9+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; $L__BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( 
.param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd7, %rd1; mul.wide.s32 %rd8, %r12, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9+4]; cvta.to.global.u64 %rd10, %rd2; add.s64 %rd11, %rd10, %rd8; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd6, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd9]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd9], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd9+4], 
%f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; $L__BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd7, %rd1; mul.wide.s32 %rd8, %r12, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9+4]; cvta.to.global.u64 %rd10, %rd2; add.s64 %rd11, %rd10, %rd8; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd6, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; 
add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd9]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd9], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd9+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; $L__BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd7, %rd1; mul.wide.s32 %rd8, %r12, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9+4]; cvta.to.global.u64 %rd10, %rd2; add.s64 %rd11, %rd10, %rd8; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; 
setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd6, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd9]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd9], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd9+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; $L__BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd6, 
%rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd7, %rd1; mul.wide.s32 %rd8, %r12, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9+4]; cvta.to.global.u64 %rd10, %rd2; add.s64 %rd11, %rd10, %rd8; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd6, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd9]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd9], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd9+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; $L__BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, 
[kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd7, %rd1; mul.wide.s32 %rd8, %r12, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9+4]; cvta.to.global.u64 %rd10, %rd2; add.s64 %rd11, %rd10, %rd8; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd6, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd9]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd9], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd9+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; $L__BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg 
.b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd7, %rd1; mul.wide.s32 %rd8, %r12, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9+4]; cvta.to.global.u64 %rd10, %rd2; add.s64 %rd11, %rd10, %rd8; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd6, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd9]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd9], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd9+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; $L__BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl kernmulRSymm2Dxy .visible 
.entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd7, %rd1; mul.wide.s32 %rd8, %r12, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9+4]; cvta.to.global.u64 %rd10, %rd2; add.s64 %rd11, %rd10, %rd8; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd6, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd9]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd9], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; 
st.global.f32 [%rd9+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; $L__BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd7, %rd1; mul.wide.s32 %rd8, %r12, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9+4]; cvta.to.global.u64 %rd10, %rd2; add.s64 %rd11, %rd10, %rd8; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd6, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; 
cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd9]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd9], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd9+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; $L__BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd7, %rd1; mul.wide.s32 %rd8, %r12, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9+4]; cvta.to.global.u64 %rd10, %rd2; add.s64 %rd11, %rd10, %rd8; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; 
shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd6, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd9]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd9], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd9+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; $L__BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; 
cvta.to.global.u64 %rd6, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd7, %rd1; mul.wide.s32 %rd8, %r12, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9+4]; cvta.to.global.u64 %rd10, %rd2; add.s64 %rd11, %rd10, %rd8; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd6, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd9]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd9], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd9+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; $L__BB0_2: ret; }
`
)
3-3.11.1/cuda/kernmulrsymm2dz.cu000066400000000000000000000012331503346766200164040ustar00rootroot00000000000000// 2D micromagnetic kernel multiplication, out-of-plane (Z) component only:
//   Mz = Kzz * Mz
// Rows with iy > Ny/2 read the kernel from the mirrored row Ny-iy,
// i.e. the same Y symmetry that kernmulrsymm3d.cu relies on.
extern "C" __global__ void
kernmulRSymm2Dz(float* __restrict__  fftMz,  float* __restrict__  fftKzz, int Nx, int Ny)
{
    // One thread per (ix, iy) element; threads outside the Nx x Ny range do nothing.
    const int ix = blockIdx.x * blockDim.x + threadIdx.x;
    const int iy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ix >= Nx || iy >= Ny) {
        return;
    }

    // fftMz is read as interleaved pairs: element e is the real part, e+1 the imaginary part.
    const int   e    = 2 * (iy*Nx + ix);
    const float reMz = fftMz[e  ];
    const float imMz = fftMz[e+1];

    // fftKzz holds one float per element; fold the upper rows back onto the lower ones.
    const int   ky  = (iy > Ny/2) ? (Ny - iy) : iy;
    const float Kzz = fftKzz[ky*Nx + ix];

    // Scale the complex value by the real kernel element, in place.
    fftMz[e  ] = reMz * Kzz;
    fftMz[e+1] = imMz * Kzz;
}
3-3.11.1/cuda/kernmulrsymm2dz_wrapper.go000066400000000000000000000570071503346766200201540ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for kernmulRSymm2Dz kernel var kernmulRSymm2Dz_code cu.Function // Stores the arguments for kernmulRSymm2Dz kernel invocation type kernmulRSymm2Dz_args_t struct { arg_fftMz unsafe.Pointer arg_fftKzz unsafe.Pointer arg_Nx int arg_Ny int argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for kernmulRSymm2Dz kernel invocation var kernmulRSymm2Dz_args kernmulRSymm2Dz_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. kernmulRSymm2Dz_args.argptr[0] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_fftMz) kernmulRSymm2Dz_args.argptr[1] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_fftKzz) kernmulRSymm2Dz_args.argptr[2] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_Nx) kernmulRSymm2Dz_args.argptr[3] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_Ny) } // Wrapper for kernmulRSymm2Dz CUDA kernel, asynchronous. func k_kernmulRSymm2Dz_async(fftMz unsafe.Pointer, fftKzz unsafe.Pointer, Nx int, Ny int, cfg *config) { if Synchronous { // debug Sync() timer.Start("kernmulRSymm2Dz") } kernmulRSymm2Dz_args.Lock() defer kernmulRSymm2Dz_args.Unlock() if kernmulRSymm2Dz_code == 0 { kernmulRSymm2Dz_code = fatbinLoad(kernmulRSymm2Dz_map, "kernmulRSymm2Dz") } kernmulRSymm2Dz_args.arg_fftMz = fftMz kernmulRSymm2Dz_args.arg_fftKzz = fftKzz kernmulRSymm2Dz_args.arg_Nx = Nx kernmulRSymm2Dz_args.arg_Ny = Ny args := kernmulRSymm2Dz_args.argptr[:] cu.LaunchKernel(kernmulRSymm2Dz_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("kernmulRSymm2Dz") } } // maps compute capability on PTX code for kernmulRSymm2Dz kernel. 
var kernmulRSymm2Dz_map = map[int]string{0: "", 50: kernmulRSymm2Dz_ptx_50, 52: kernmulRSymm2Dz_ptx_52, 53: kernmulRSymm2Dz_ptx_53, 60: kernmulRSymm2Dz_ptx_60, 61: kernmulRSymm2Dz_ptx_61, 62: kernmulRSymm2Dz_ptx_62, 70: kernmulRSymm2Dz_ptx_70, 72: kernmulRSymm2Dz_ptx_72, 75: kernmulRSymm2Dz_ptx_75, 80: kernmulRSymm2Dz_ptx_80, 86: kernmulRSymm2Dz_ptx_86, 87: kernmulRSymm2Dz_ptx_87, 89: kernmulRSymm2Dz_ptx_89, 90: kernmulRSymm2Dz_ptx_90} // kernmulRSymm2Dz PTX code for various compute capabilities. const ( kernmulRSymm2Dz_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd3, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; $L__BB0_2: ret; } ` kernmulRSymm2Dz_ptx_52 = ` .version 8.5 
.target sm_52 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd3, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; $L__BB0_2: ret; } ` kernmulRSymm2Dz_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, 
%tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd3, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; $L__BB0_2: ret; } ` kernmulRSymm2Dz_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, 
%r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd3, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; $L__BB0_2: ret; } ` kernmulRSymm2Dz_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd3, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; $L__BB0_2: ret; } ` kernmulRSymm2Dz_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param 
.u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd3, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; $L__BB0_2: ret; } ` kernmulRSymm2Dz_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, 
%p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd3, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; $L__BB0_2: ret; } ` kernmulRSymm2Dz_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd3, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; 
st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; $L__BB0_2: ret; } ` kernmulRSymm2Dz_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd3, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; $L__BB0_2: ret; } ` kernmulRSymm2Dz_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; 
ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd3, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; $L__BB0_2: ret; } ` kernmulRSymm2Dz_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, 
%rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd3, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; $L__BB0_2: ret; } ` kernmulRSymm2Dz_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd3, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; $L__BB0_2: ret; } ` kernmulRSymm2Dz_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl kernmulRSymm2Dz 
.visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd3, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; $L__BB0_2: ret; } ` kernmulRSymm2Dz_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; mov.u32 %r8, 
%ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r9, %r8, %r10; setp.ge.s32 %p1, %r1, %r3; setp.ge.s32 %p2, %r2, %r4; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd3, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/kernmulrsymm3d.cu000066400000000000000000000047021503346766200162170ustar00rootroot00000000000000// 3D micromagnetic kernel multiplication: // // |Mx| |Kxx Kxy Kxz| |Mx| // |My| = |Kxy Kyy Kyz| * |My| // |Mz| |Kxz Kyz Kzz| |Mz| // // ~kernel has mirror symmetry along Y and Z-axis, // apart from first row, // and is only stored (roughly) half: // // K11, K22, K02: // xxxxx // aaaaa // bbbbb // .... // bbbbb // aaaaa // // K12: // xxxxx // aaaaa // bbbbb // ... 
// -bbbb
// -aaaa
extern "C" __global__ void
kernmulRSymm3D(float* __restrict__  fftMx,  float* __restrict__  fftMy,  float* __restrict__  fftMz,
               float* __restrict__  fftKxx, float* __restrict__  fftKyy, float* __restrict__  fftKzz,
               float* __restrict__  fftKyz, float* __restrict__  fftKxz, float* __restrict__  fftKxy,
               int Nx, int Ny, int Nz)
{
    // One thread per (ix, iy, iz) element; threads outside the volume do nothing.
    const int ix = blockIdx.x * blockDim.x + threadIdx.x;
    const int iy = blockIdx.y * blockDim.y + threadIdx.y;
    const int iz = blockIdx.z * blockDim.z + threadIdx.z;
    if (ix >= Nx || iy >= Ny || iz >= Nz) {
        return;
    }

    // FFT'ed magnetization, read as interleaved pairs:
    // element e is the real part, e+1 the imaginary part.
    const int   e    = 2 * ((iz*Ny + iy)*Nx + ix);
    const float reMx = fftMx[e  ];
    const float imMx = fftMx[e+1];
    const float reMy = fftMy[e  ];
    const float imMy = fftMy[e+1];
    const float reMz = fftMz[e  ];
    const float imMz = fftMz[e+1];

    // Indices past the middle of Y or Z are folded back onto the stored part
    // of the kernel.
    const bool mirrorY = iy > Ny/2;
    const bool mirrorZ = iz > Nz/2;
    const int  ky      = mirrorY ? (Ny - iy) : iy;
    const int  kz      = mirrorZ ? (Nz - iz) : iz;

    // Off-diagonal elements change sign once for every mirrored axis they
    // involve: XY flips with Y, XZ flips with Z, and YZ flips with either one
    // (two flips cancel).
    const float signXY = mirrorY ? -1.0f : 1.0f;
    const float signXZ = mirrorZ ? -1.0f : 1.0f;
    const float signYZ = (mirrorY != mirrorZ) ? -1.0f : 1.0f;

    // Kernel rows have pitch Ny/2+1 because only half is stored.
    const int   k   = (kz*(Ny/2+1) + ky)*Nx + ix;
    const float Kxx = fftKxx[k];
    const float Kyy = fftKyy[k];
    const float Kzz = fftKzz[k];
    const float Kyz = fftKyz[k] * signYZ;
    const float Kxz = fftKxz[k] * signXZ;
    const float Kxy = fftKxy[k] * signXY;

    // Symmetric 3x3 kernel times the complex vector m; the result overwrites m.
    fftMx[e  ] = reMx * Kxx + reMy * Kxy + reMz * Kxz;
    fftMx[e+1] = imMx * Kxx + imMy * Kxy + imMz * Kxz;
    fftMy[e  ] = reMx * Kxy + reMy * Kyy + reMz * Kyz;
    fftMy[e+1] = imMx * Kxy + imMy * Kyy + imMz * Kyz;
    fftMz[e  ] = reMx * Kxz + reMy * Kyz + reMz * Kzz;
    fftMz[e+1] = imMx * Kxz + imMy * Kyz + imMz * Kzz;
}
3-3.11.1/cuda/kernmulrsymm3d_wrapper.go000066400000000000000000001751201503346766200177600ustar00rootroot00000000000000package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// kernmulRSymm3D_code is the CUDA function handle of the kernmulRSymm3D kernel.
var kernmulRSymm3D_code cu.Function

// kernmulRSymm3D_args_t is the argument block for one kernmulRSymm3D launch:
// one field per kernel parameter, plus argptr, which holds a pointer to each of
// those fields, and an embedded mutex.
type kernmulRSymm3D_args_t struct {
	arg_fftMx, arg_fftMy, arg_fftMz    unsafe.Pointer
	arg_fftKxx, arg_fftKyy, arg_fftKzz unsafe.Pointer
	arg_fftKyz, arg_fftKxz, arg_fftKxy unsafe.Pointer
	arg_Nx, arg_Ny, arg_Nz             int
	argptr                             [12]unsafe.Pointer
	sync.Mutex
}

// kernmulRSymm3D_args is the package-level argument block for kernmulRSymm3D.
var kernmulRSymm3D_args kernmulRSymm3D_args_t

func init() {
	// The launch call takes pointers to the arguments. The argument fields never
	// move, so the pointer table is built once here, in kernel parameter order.
	a := &kernmulRSymm3D_args
	a.argptr = [12]unsafe.Pointer{
		unsafe.Pointer(&a.arg_fftMx),
		unsafe.Pointer(&a.arg_fftMy),
		unsafe.Pointer(&a.arg_fftMz),
		unsafe.Pointer(&a.arg_fftKxx),
		unsafe.Pointer(&a.arg_fftKyy),
		unsafe.Pointer(&a.arg_fftKzz),
		unsafe.Pointer(&a.arg_fftKyz),
		unsafe.Pointer(&a.arg_fftKxz),
		unsafe.Pointer(&a.arg_fftKxy),
		unsafe.Pointer(&a.arg_Nx),
		unsafe.Pointer(&a.arg_Ny),
		unsafe.Pointer(&a.arg_Nz),
	}
}

// Wrapper for kernmulRSymm3D CUDA kernel, asynchronous.
func k_kernmulRSymm3D_async(fftMx unsafe.Pointer, fftMy unsafe.Pointer, fftMz unsafe.Pointer, fftKxx unsafe.Pointer, fftKyy unsafe.Pointer, fftKzz unsafe.Pointer, fftKyz unsafe.Pointer, fftKxz unsafe.Pointer, fftKxy unsafe.Pointer, Nx int, Ny int, Nz int, cfg *config) { if Synchronous { // debug Sync() timer.Start("kernmulRSymm3D") } kernmulRSymm3D_args.Lock() defer kernmulRSymm3D_args.Unlock() if kernmulRSymm3D_code == 0 { kernmulRSymm3D_code = fatbinLoad(kernmulRSymm3D_map, "kernmulRSymm3D") } kernmulRSymm3D_args.arg_fftMx = fftMx kernmulRSymm3D_args.arg_fftMy = fftMy kernmulRSymm3D_args.arg_fftMz = fftMz kernmulRSymm3D_args.arg_fftKxx = fftKxx kernmulRSymm3D_args.arg_fftKyy = fftKyy kernmulRSymm3D_args.arg_fftKzz = fftKzz kernmulRSymm3D_args.arg_fftKyz = fftKyz kernmulRSymm3D_args.arg_fftKxz = fftKxz kernmulRSymm3D_args.arg_fftKxy = fftKxy kernmulRSymm3D_args.arg_Nx = Nx kernmulRSymm3D_args.arg_Ny = Ny kernmulRSymm3D_args.arg_Nz = Nz args := kernmulRSymm3D_args.argptr[:] cu.LaunchKernel(kernmulRSymm3D_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("kernmulRSymm3D") } } // maps compute capability on PTX code for kernmulRSymm3D kernel. var kernmulRSymm3D_map = map[int]string{0: "", 50: kernmulRSymm3D_ptx_50, 52: kernmulRSymm3D_ptx_52, 53: kernmulRSymm3D_ptx_53, 60: kernmulRSymm3D_ptx_60, 61: kernmulRSymm3D_ptx_61, 62: kernmulRSymm3D_ptx_62, 70: kernmulRSymm3D_ptx_70, 72: kernmulRSymm3D_ptx_72, 75: kernmulRSymm3D_ptx_75, 80: kernmulRSymm3D_ptx_80, 86: kernmulRSymm3D_ptx_86, 87: kernmulRSymm3D_ptx_87, 89: kernmulRSymm3D_ptx_89, 90: kernmulRSymm3D_ptx_90} // kernmulRSymm3D PTX code for various compute capabilities. 
const ( kernmulRSymm3D_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; cvta.to.global.u64 %rd11, %rd1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd11, %rd12; ld.global.f32 %f1, [%rd13+4]; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd12; ld.global.f32 %f2, [%rd15+4]; cvta.to.global.u64 %rd16, %rd3; 
add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd10, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd13]; ld.global.f32 %f18, [%rd15]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd13], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd13+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd15], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd15+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 
%f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; $L__BB0_2: ret; } ` kernmulRSymm3D_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; cvta.to.global.u64 %rd11, %rd1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd11, %rd12; ld.global.f32 %f1, [%rd13+4]; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd12; 
ld.global.f32 %f2, [%rd15+4]; cvta.to.global.u64 %rd16, %rd3; add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd10, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd13]; ld.global.f32 %f18, [%rd15]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd13], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd13+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd15], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd15+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 
%f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; $L__BB0_2: ret; } ` kernmulRSymm3D_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; cvta.to.global.u64 %rd11, %rd1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd11, %rd12; ld.global.f32 %f1, 
[%rd13+4]; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd12; ld.global.f32 %f2, [%rd15+4]; cvta.to.global.u64 %rd16, %rd3; add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd10, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd13]; ld.global.f32 %f18, [%rd15]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd13], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd13+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd15], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd15+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; 
fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; $L__BB0_2: ret; } ` kernmulRSymm3D_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; cvta.to.global.u64 %rd11, %rd1; mul.wide.s32 
%rd12, %r18, 4; add.s64 %rd13, %rd11, %rd12; ld.global.f32 %f1, [%rd13+4]; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd12; ld.global.f32 %f2, [%rd15+4]; cvta.to.global.u64 %rd16, %rd3; add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd10, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd13]; ld.global.f32 %f18, [%rd15]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd13], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd13+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd15], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd15+4], %f31; 
mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; $L__BB0_2: ret; } ` kernmulRSymm3D_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 
%r18, %r17, 1; cvta.to.global.u64 %rd11, %rd1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd11, %rd12; ld.global.f32 %f1, [%rd13+4]; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd12; ld.global.f32 %f2, [%rd15+4]; cvta.to.global.u64 %rd16, %rd3; add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd10, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd13]; ld.global.f32 %f18, [%rd15]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd13], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd13+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd15], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; 
fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd15+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; $L__BB0_2: ret; } ` kernmulRSymm3D_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; 
mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; cvta.to.global.u64 %rd11, %rd1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd11, %rd12; ld.global.f32 %f1, [%rd13+4]; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd12; ld.global.f32 %f2, [%rd15+4]; cvta.to.global.u64 %rd16, %rd3; add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd10, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd13]; ld.global.f32 %f18, [%rd15]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd13], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd13+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 
[%rd15], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd15+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; $L__BB0_2: ret; } ` kernmulRSymm3D_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred 
%p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; cvta.to.global.u64 %rd11, %rd1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd11, %rd12; ld.global.f32 %f1, [%rd13+4]; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd12; ld.global.f32 %f2, [%rd15+4]; cvta.to.global.u64 %rd16, %rd3; add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd10, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd13]; ld.global.f32 %f18, [%rd15]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd13], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd13+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, 
%f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd15], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd15+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; $L__BB0_2: ret; } ` kernmulRSymm3D_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, 
%r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; cvta.to.global.u64 %rd11, %rd1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd11, %rd12; ld.global.f32 %f1, [%rd13+4]; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd12; ld.global.f32 %f2, [%rd15+4]; cvta.to.global.u64 %rd16, %rd3; add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd10, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd13]; ld.global.f32 %f18, [%rd15]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd13], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 
[%rd13+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd15], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd15+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; $L__BB0_2: ret; } ` kernmulRSymm3D_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, 
%r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; cvta.to.global.u64 %rd11, %rd1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd11, %rd12; ld.global.f32 %f1, [%rd13+4]; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd12; ld.global.f32 %f2, [%rd15+4]; cvta.to.global.u64 %rd16, %rd3; add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd10, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd13]; ld.global.f32 %f18, [%rd15]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd13], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, 
%f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd13+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd15], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd15+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; $L__BB0_2: ret; } ` kernmulRSymm3D_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 
%r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; cvta.to.global.u64 %rd11, %rd1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd11, %rd12; ld.global.f32 %f1, [%rd13+4]; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd12; ld.global.f32 %f2, [%rd15+4]; cvta.to.global.u64 %rd16, %rd3; add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd10, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd13]; ld.global.f32 %f18, [%rd15]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 
[%rd13], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd13+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd15], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd15+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; $L__BB0_2: ret; } ` kernmulRSymm3D_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; 
mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; cvta.to.global.u64 %rd11, %rd1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd11, %rd12; ld.global.f32 %f1, [%rd13+4]; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd12; ld.global.f32 %f2, [%rd15+4]; cvta.to.global.u64 %rd16, %rd3; add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd10, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd13]; ld.global.f32 %f18, [%rd15]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 
%f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd13], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd13+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd15], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd15+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; $L__BB0_2: ret; } ` kernmulRSymm3D_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 
%r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; cvta.to.global.u64 %rd11, %rd1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd11, %rd12; ld.global.f32 %f1, [%rd13+4]; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd12; ld.global.f32 %f2, [%rd15+4]; cvta.to.global.u64 %rd16, %rd3; add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd10, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd13]; ld.global.f32 %f18, [%rd15]; mul.f32 %f19, 
%f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd13], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd13+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd15], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd15+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; $L__BB0_2: ret; } ` kernmulRSymm3D_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; 
mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; cvta.to.global.u64 %rd11, %rd1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd11, %rd12; ld.global.f32 %f1, [%rd13+4]; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd12; ld.global.f32 %f2, [%rd15+4]; cvta.to.global.u64 %rd16, %rd3; add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd10, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; 
ld.global.f32 %f17, [%rd13]; ld.global.f32 %f18, [%rd15]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd13], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd13+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd15], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd15+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; $L__BB0_2: ret; } ` kernmulRSymm3D_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, 
[kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; cvta.to.global.u64 %rd11, %rd1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd11, %rd12; ld.global.f32 %f1, [%rd13+4]; cvta.to.global.u64 %rd14, %rd2; add.s64 %rd15, %rd14, %rd12; ld.global.f32 %f2, [%rd15+4]; cvta.to.global.u64 %rd16, %rd3; add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd10, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, 
[%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd13]; ld.global.f32 %f18, [%rd15]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd13], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd13+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd15], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd15+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/llnoprecess.cu000066400000000000000000000013301503346766200155460ustar00rootroot00000000000000#include #include "float3.h" // Landau-Lifshitz torque without precession extern "C" __global__ void llnoprecess(float* __restrict__ tx, float* __restrict__ ty, float* __restrict__ tz, float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float* __restrict__ hx, float* __restrict__ hy, float* __restrict__ hz, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { float3 m = {mx[i], my[i], mz[i]}; float3 H = {hx[i], hy[i], hz[i]}; float3 mxH = cross(m, H); float3 torque = -cross(m, mxH); tx[i] = torque.x; ty[i] = torque.y; tz[i] = torque.z; } } 3-3.11.1/cuda/llnoprecess_wrapper.go000066400000000000000000001177301503346766200173200ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for llnoprecess kernel var llnoprecess_code cu.Function // Stores the arguments for llnoprecess kernel invocation type llnoprecess_args_t struct { arg_tx unsafe.Pointer arg_ty unsafe.Pointer arg_tz unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_hx unsafe.Pointer arg_hy unsafe.Pointer arg_hz unsafe.Pointer arg_N int argptr [10]unsafe.Pointer sync.Mutex } // Stores the arguments for llnoprecess kernel invocation var llnoprecess_args llnoprecess_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. llnoprecess_args.argptr[0] = unsafe.Pointer(&llnoprecess_args.arg_tx) llnoprecess_args.argptr[1] = unsafe.Pointer(&llnoprecess_args.arg_ty) llnoprecess_args.argptr[2] = unsafe.Pointer(&llnoprecess_args.arg_tz) llnoprecess_args.argptr[3] = unsafe.Pointer(&llnoprecess_args.arg_mx) llnoprecess_args.argptr[4] = unsafe.Pointer(&llnoprecess_args.arg_my) llnoprecess_args.argptr[5] = unsafe.Pointer(&llnoprecess_args.arg_mz) llnoprecess_args.argptr[6] = unsafe.Pointer(&llnoprecess_args.arg_hx) llnoprecess_args.argptr[7] = unsafe.Pointer(&llnoprecess_args.arg_hy) llnoprecess_args.argptr[8] = unsafe.Pointer(&llnoprecess_args.arg_hz) llnoprecess_args.argptr[9] = unsafe.Pointer(&llnoprecess_args.arg_N) } // Wrapper for llnoprecess CUDA kernel, asynchronous. 
func k_llnoprecess_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, hx unsafe.Pointer, hy unsafe.Pointer, hz unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("llnoprecess") } llnoprecess_args.Lock() defer llnoprecess_args.Unlock() if llnoprecess_code == 0 { llnoprecess_code = fatbinLoad(llnoprecess_map, "llnoprecess") } llnoprecess_args.arg_tx = tx llnoprecess_args.arg_ty = ty llnoprecess_args.arg_tz = tz llnoprecess_args.arg_mx = mx llnoprecess_args.arg_my = my llnoprecess_args.arg_mz = mz llnoprecess_args.arg_hx = hx llnoprecess_args.arg_hy = hy llnoprecess_args.arg_hz = hz llnoprecess_args.arg_N = N args := llnoprecess_args.argptr[:] cu.LaunchKernel(llnoprecess_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("llnoprecess") } } // maps compute capability on PTX code for llnoprecess kernel. var llnoprecess_map = map[int]string{0: "", 50: llnoprecess_ptx_50, 52: llnoprecess_ptx_52, 53: llnoprecess_ptx_53, 60: llnoprecess_ptx_60, 61: llnoprecess_ptx_61, 62: llnoprecess_ptx_62, 70: llnoprecess_ptx_70, 72: llnoprecess_ptx_72, 75: llnoprecess_ptx_75, 80: llnoprecess_ptx_80, 86: llnoprecess_ptx_86, 87: llnoprecess_ptx_87, 89: llnoprecess_ptx_89, 90: llnoprecess_ptx_90} // llnoprecess PTX code for various compute capabilities. 
const ( llnoprecess_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; 
mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; $L__BB0_2: ret; } ` llnoprecess_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; 
cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; $L__BB0_2: ret; } ` llnoprecess_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, 
[llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; $L__BB0_2: ret; } ` llnoprecess_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 
llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 
[%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; $L__BB0_2: ret; } ` llnoprecess_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; 
mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; $L__BB0_2: ret; } ` llnoprecess_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, 
%rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; $L__BB0_2: ret; } ` llnoprecess_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; 
ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; $L__BB0_2: ret; } ` llnoprecess_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, 
.param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 
%f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; $L__BB0_2: ret; } ` llnoprecess_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 
%f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; $L__BB0_2: ret; } ` llnoprecess_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; 
mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; $L__BB0_2: ret; } ` llnoprecess_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; 
ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; 
$L__BB0_2: ret; } ` llnoprecess_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, 
%f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; $L__BB0_2: ret; } ` llnoprecess_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; 
cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; $L__BB0_2: ret; } ` llnoprecess_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, 
[llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/lltorque.go000066400000000000000000000015311503346766200150650ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" ) // Landau-Lifshitz torque divided by gamma0: // - 1/(1+α²) [ m x B + α m x (m x B) ] // torque in Tesla // m normalized // B in Tesla // // see lltorque.cu func LLTorque(torque, m, 
B *data.Slice, alpha MSlice) { N := torque.Len() cfg := make1DConf(N) k_lltorque2_async(torque.DevPtr(X), torque.DevPtr(Y), torque.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), B.DevPtr(X), B.DevPtr(Y), B.DevPtr(Z), alpha.DevPtr(0), alpha.Mul(0), N, cfg) } // Landau-Lifshitz torque with precession disabled. // Used by engine.Relax(). func LLNoPrecess(torque, m, B *data.Slice) { N := torque.Len() cfg := make1DConf(N) k_llnoprecess_async(torque.DevPtr(X), torque.DevPtr(Y), torque.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), B.DevPtr(X), B.DevPtr(Y), B.DevPtr(Z), N, cfg) } 3-3.11.1/cuda/lltorque2.cu000066400000000000000000000016071503346766200151550ustar00rootroot00000000000000#include "amul.h" #include "float3.h" #include // Landau-Lifshitz torque. extern "C" __global__ void lltorque2(float* __restrict__ tx, float* __restrict__ ty, float* __restrict__ tz, float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float* __restrict__ hx, float* __restrict__ hy, float* __restrict__ hz, float* __restrict__ alpha_, float alpha_mul, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { float3 m = {mx[i], my[i], mz[i]}; float3 H = {hx[i], hy[i], hz[i]}; float alpha = amul(alpha_, alpha_mul, i); float3 mxH = cross(m, H); float gilb = -1.0f / (1.0f + alpha * alpha); float3 torque = gilb * (mxH + alpha * cross(m, mxH)); tx[i] = torque.x; ty[i] = torque.y; tz[i] = torque.z; } } 3-3.11.1/cuda/lltorque2_wrapper.go000066400000000000000000001411471503346766200167170ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for lltorque2 kernel var lltorque2_code cu.Function // Stores the arguments for lltorque2 kernel invocation type lltorque2_args_t struct { arg_tx unsafe.Pointer arg_ty unsafe.Pointer arg_tz unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_hx unsafe.Pointer arg_hy unsafe.Pointer arg_hz unsafe.Pointer arg_alpha_ unsafe.Pointer arg_alpha_mul float32 arg_N int argptr [12]unsafe.Pointer sync.Mutex } // Stores the arguments for lltorque2 kernel invocation var lltorque2_args lltorque2_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. lltorque2_args.argptr[0] = unsafe.Pointer(&lltorque2_args.arg_tx) lltorque2_args.argptr[1] = unsafe.Pointer(&lltorque2_args.arg_ty) lltorque2_args.argptr[2] = unsafe.Pointer(&lltorque2_args.arg_tz) lltorque2_args.argptr[3] = unsafe.Pointer(&lltorque2_args.arg_mx) lltorque2_args.argptr[4] = unsafe.Pointer(&lltorque2_args.arg_my) lltorque2_args.argptr[5] = unsafe.Pointer(&lltorque2_args.arg_mz) lltorque2_args.argptr[6] = unsafe.Pointer(&lltorque2_args.arg_hx) lltorque2_args.argptr[7] = unsafe.Pointer(&lltorque2_args.arg_hy) lltorque2_args.argptr[8] = unsafe.Pointer(&lltorque2_args.arg_hz) lltorque2_args.argptr[9] = unsafe.Pointer(&lltorque2_args.arg_alpha_) lltorque2_args.argptr[10] = unsafe.Pointer(&lltorque2_args.arg_alpha_mul) lltorque2_args.argptr[11] = unsafe.Pointer(&lltorque2_args.arg_N) } // Wrapper for lltorque2 CUDA kernel, asynchronous. 
func k_lltorque2_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, hx unsafe.Pointer, hy unsafe.Pointer, hz unsafe.Pointer, alpha_ unsafe.Pointer, alpha_mul float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("lltorque2") } lltorque2_args.Lock() defer lltorque2_args.Unlock() if lltorque2_code == 0 { lltorque2_code = fatbinLoad(lltorque2_map, "lltorque2") } lltorque2_args.arg_tx = tx lltorque2_args.arg_ty = ty lltorque2_args.arg_tz = tz lltorque2_args.arg_mx = mx lltorque2_args.arg_my = my lltorque2_args.arg_mz = mz lltorque2_args.arg_hx = hx lltorque2_args.arg_hy = hy lltorque2_args.arg_hz = hz lltorque2_args.arg_alpha_ = alpha_ lltorque2_args.arg_alpha_mul = alpha_mul lltorque2_args.arg_N = N args := lltorque2_args.argptr[:] cu.LaunchKernel(lltorque2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("lltorque2") } } // maps compute capability on PTX code for lltorque2 kernel. var lltorque2_map = map[int]string{0: "", 50: lltorque2_ptx_50, 52: lltorque2_ptx_52, 53: lltorque2_ptx_53, 60: lltorque2_ptx_60, 61: lltorque2_ptx_61, 62: lltorque2_ptx_62, 70: lltorque2_ptx_70, 72: lltorque2_ptx_72, 75: lltorque2_ptx_75, 80: lltorque2_ptx_80, 86: lltorque2_ptx_86, 87: lltorque2_ptx_87, 89: lltorque2_ptx_89, 90: lltorque2_ptx_90} // lltorque2 PTX code for various compute capabilities. 
const ( lltorque2_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<35>; ld.param.u64 %rd2, [lltorque2_param_0]; ld.param.u64 %rd3, [lltorque2_param_1]; ld.param.u64 %rd4, [lltorque2_param_2]; ld.param.u64 %rd5, [lltorque2_param_3]; ld.param.u64 %rd6, [lltorque2_param_4]; ld.param.u64 %rd7, [lltorque2_param_5]; ld.param.u64 %rd8, [lltorque2_param_6]; ld.param.u64 %rd9, [lltorque2_param_7]; ld.param.u64 %rd10, [lltorque2_param_8]; ld.param.u64 %rd11, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd12, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f1, [%rd14]; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f2, [%rd16]; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd13; ld.global.nc.f32 %f3, [%rd18]; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd13; ld.global.nc.f32 %f4, [%rd20]; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd13; ld.global.nc.f32 %f5, [%rd22]; cvta.to.global.u64 %rd23, %rd10; add.s64 %rd24, %rd23, %rd13; ld.global.nc.f32 %f6, [%rd24]; setp.eq.s64 %p2, %rd11, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd11; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, 
%rd25, %rd26; ld.global.nc.f32 %f10, [%rd27]; mul.f32 %f38, %f10, %f38; $L__BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f32, %f22; mul.f32 %f36, %f33, %f22; mul.f32 %f37, %f34, %f22; cvta.to.global.u64 %rd28, %rd2; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; st.global.f32 [%rd30], %f35; cvta.to.global.u64 %rd31, %rd3; add.s64 %rd32, %rd31, %rd29; st.global.f32 [%rd32], %f36; cvta.to.global.u64 %rd33, %rd4; add.s64 %rd34, %rd33, %rd29; st.global.f32 [%rd34], %f37; $L__BB0_4: ret; } ` lltorque2_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<35>; ld.param.u64 %rd2, [lltorque2_param_0]; ld.param.u64 %rd3, [lltorque2_param_1]; ld.param.u64 %rd4, [lltorque2_param_2]; ld.param.u64 %rd5, [lltorque2_param_3]; ld.param.u64 %rd6, [lltorque2_param_4]; ld.param.u64 %rd7, [lltorque2_param_5]; ld.param.u64 %rd8, [lltorque2_param_6]; ld.param.u64 %rd9, 
[lltorque2_param_7]; ld.param.u64 %rd10, [lltorque2_param_8]; ld.param.u64 %rd11, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd12, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f1, [%rd14]; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f2, [%rd16]; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd13; ld.global.nc.f32 %f3, [%rd18]; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd13; ld.global.nc.f32 %f4, [%rd20]; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd13; ld.global.nc.f32 %f5, [%rd22]; cvta.to.global.u64 %rd23, %rd10; add.s64 %rd24, %rd23, %rd13; ld.global.nc.f32 %f6, [%rd24]; setp.eq.s64 %p2, %rd11, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd11; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f10, [%rd27]; mul.f32 %f38, %f10, %f38; $L__BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f32, %f22; mul.f32 %f36, %f33, %f22; mul.f32 %f37, %f34, %f22; cvta.to.global.u64 %rd28, %rd2; shl.b64 %rd29, %rd1, 2; 
add.s64 %rd30, %rd28, %rd29; st.global.f32 [%rd30], %f35; cvta.to.global.u64 %rd31, %rd3; add.s64 %rd32, %rd31, %rd29; st.global.f32 [%rd32], %f36; cvta.to.global.u64 %rd33, %rd4; add.s64 %rd34, %rd33, %rd29; st.global.f32 [%rd34], %f37; $L__BB0_4: ret; } ` lltorque2_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<35>; ld.param.u64 %rd2, [lltorque2_param_0]; ld.param.u64 %rd3, [lltorque2_param_1]; ld.param.u64 %rd4, [lltorque2_param_2]; ld.param.u64 %rd5, [lltorque2_param_3]; ld.param.u64 %rd6, [lltorque2_param_4]; ld.param.u64 %rd7, [lltorque2_param_5]; ld.param.u64 %rd8, [lltorque2_param_6]; ld.param.u64 %rd9, [lltorque2_param_7]; ld.param.u64 %rd10, [lltorque2_param_8]; ld.param.u64 %rd11, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd12, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f1, [%rd14]; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f2, [%rd16]; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd13; ld.global.nc.f32 %f3, [%rd18]; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd13; ld.global.nc.f32 %f4, [%rd20]; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, 
%rd13; ld.global.nc.f32 %f5, [%rd22]; cvta.to.global.u64 %rd23, %rd10; add.s64 %rd24, %rd23, %rd13; ld.global.nc.f32 %f6, [%rd24]; setp.eq.s64 %p2, %rd11, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd11; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f10, [%rd27]; mul.f32 %f38, %f10, %f38; $L__BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f32, %f22; mul.f32 %f36, %f33, %f22; mul.f32 %f37, %f34, %f22; cvta.to.global.u64 %rd28, %rd2; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; st.global.f32 [%rd30], %f35; cvta.to.global.u64 %rd31, %rd3; add.s64 %rd32, %rd31, %rd29; st.global.f32 [%rd32], %f36; cvta.to.global.u64 %rd33, %rd4; add.s64 %rd34, %rd33, %rd29; st.global.f32 [%rd34], %f37; $L__BB0_4: ret; } ` lltorque2_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<35>; ld.param.u64 %rd2, [lltorque2_param_0]; ld.param.u64 %rd3, 
[lltorque2_param_1]; ld.param.u64 %rd4, [lltorque2_param_2]; ld.param.u64 %rd5, [lltorque2_param_3]; ld.param.u64 %rd6, [lltorque2_param_4]; ld.param.u64 %rd7, [lltorque2_param_5]; ld.param.u64 %rd8, [lltorque2_param_6]; ld.param.u64 %rd9, [lltorque2_param_7]; ld.param.u64 %rd10, [lltorque2_param_8]; ld.param.u64 %rd11, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd12, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f1, [%rd14]; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f2, [%rd16]; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd13; ld.global.nc.f32 %f3, [%rd18]; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd13; ld.global.nc.f32 %f4, [%rd20]; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd13; ld.global.nc.f32 %f5, [%rd22]; cvta.to.global.u64 %rd23, %rd10; add.s64 %rd24, %rd23, %rd13; ld.global.nc.f32 %f6, [%rd24]; setp.eq.s64 %p2, %rd11, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd11; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f10, [%rd27]; mul.f32 %f38, %f10, %f38; $L__BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; 
fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f32, %f22; mul.f32 %f36, %f33, %f22; mul.f32 %f37, %f34, %f22; cvta.to.global.u64 %rd28, %rd2; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; st.global.f32 [%rd30], %f35; cvta.to.global.u64 %rd31, %rd3; add.s64 %rd32, %rd31, %rd29; st.global.f32 [%rd32], %f36; cvta.to.global.u64 %rd33, %rd4; add.s64 %rd34, %rd33, %rd29; st.global.f32 [%rd34], %f37; $L__BB0_4: ret; } ` lltorque2_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<35>; ld.param.u64 %rd2, [lltorque2_param_0]; ld.param.u64 %rd3, [lltorque2_param_1]; ld.param.u64 %rd4, [lltorque2_param_2]; ld.param.u64 %rd5, [lltorque2_param_3]; ld.param.u64 %rd6, [lltorque2_param_4]; ld.param.u64 %rd7, [lltorque2_param_5]; ld.param.u64 %rd8, [lltorque2_param_6]; ld.param.u64 %rd9, [lltorque2_param_7]; ld.param.u64 %rd10, [lltorque2_param_8]; ld.param.u64 %rd11, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd12, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f1, [%rd14]; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f2, [%rd16]; 
cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd13; ld.global.nc.f32 %f3, [%rd18]; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd13; ld.global.nc.f32 %f4, [%rd20]; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd13; ld.global.nc.f32 %f5, [%rd22]; cvta.to.global.u64 %rd23, %rd10; add.s64 %rd24, %rd23, %rd13; ld.global.nc.f32 %f6, [%rd24]; setp.eq.s64 %p2, %rd11, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd11; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f10, [%rd27]; mul.f32 %f38, %f10, %f38; $L__BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f32, %f22; mul.f32 %f36, %f33, %f22; mul.f32 %f37, %f34, %f22; cvta.to.global.u64 %rd28, %rd2; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; st.global.f32 [%rd30], %f35; cvta.to.global.u64 %rd31, %rd3; add.s64 %rd32, %rd31, %rd29; st.global.f32 [%rd32], %f36; cvta.to.global.u64 %rd33, %rd4; add.s64 %rd34, %rd33, %rd29; st.global.f32 [%rd34], %f37; $L__BB0_4: ret; } ` lltorque2_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 
lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<35>; ld.param.u64 %rd2, [lltorque2_param_0]; ld.param.u64 %rd3, [lltorque2_param_1]; ld.param.u64 %rd4, [lltorque2_param_2]; ld.param.u64 %rd5, [lltorque2_param_3]; ld.param.u64 %rd6, [lltorque2_param_4]; ld.param.u64 %rd7, [lltorque2_param_5]; ld.param.u64 %rd8, [lltorque2_param_6]; ld.param.u64 %rd9, [lltorque2_param_7]; ld.param.u64 %rd10, [lltorque2_param_8]; ld.param.u64 %rd11, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd12, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f1, [%rd14]; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f2, [%rd16]; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd13; ld.global.nc.f32 %f3, [%rd18]; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd13; ld.global.nc.f32 %f4, [%rd20]; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd13; ld.global.nc.f32 %f5, [%rd22]; cvta.to.global.u64 %rd23, %rd10; add.s64 %rd24, %rd23, %rd13; ld.global.nc.f32 %f6, [%rd24]; setp.eq.s64 %p2, %rd11, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd11; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f10, [%rd27]; mul.f32 %f38, %f10, %f38; $L__BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 
%f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f32, %f22; mul.f32 %f36, %f33, %f22; mul.f32 %f37, %f34, %f22; cvta.to.global.u64 %rd28, %rd2; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; st.global.f32 [%rd30], %f35; cvta.to.global.u64 %rd31, %rd3; add.s64 %rd32, %rd31, %rd29; st.global.f32 [%rd32], %f36; cvta.to.global.u64 %rd33, %rd4; add.s64 %rd34, %rd33, %rd29; st.global.f32 [%rd34], %f37; $L__BB0_4: ret; } ` lltorque2_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<35>; ld.param.u64 %rd2, [lltorque2_param_0]; ld.param.u64 %rd3, [lltorque2_param_1]; ld.param.u64 %rd4, [lltorque2_param_2]; ld.param.u64 %rd5, [lltorque2_param_3]; ld.param.u64 %rd6, [lltorque2_param_4]; ld.param.u64 %rd7, [lltorque2_param_5]; ld.param.u64 %rd8, [lltorque2_param_6]; ld.param.u64 %rd9, [lltorque2_param_7]; ld.param.u64 %rd10, [lltorque2_param_8]; ld.param.u64 %rd11, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra 
$L__BB0_4; cvta.to.global.u64 %rd12, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f1, [%rd14]; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f2, [%rd16]; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd13; ld.global.nc.f32 %f3, [%rd18]; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd13; ld.global.nc.f32 %f4, [%rd20]; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd13; ld.global.nc.f32 %f5, [%rd22]; cvta.to.global.u64 %rd23, %rd10; add.s64 %rd24, %rd23, %rd13; ld.global.nc.f32 %f6, [%rd24]; setp.eq.s64 %p2, %rd11, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd11; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f10, [%rd27]; mul.f32 %f38, %f10, %f38; $L__BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f32, %f22; mul.f32 %f36, %f33, %f22; mul.f32 %f37, %f34, %f22; cvta.to.global.u64 %rd28, %rd2; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; st.global.f32 [%rd30], %f35; cvta.to.global.u64 %rd31, %rd3; add.s64 %rd32, %rd31, %rd29; st.global.f32 [%rd32], %f36; cvta.to.global.u64 %rd33, %rd4; add.s64 %rd34, %rd33, %rd29; st.global.f32 [%rd34], %f37; $L__BB0_4: ret; } ` lltorque2_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 
lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<35>; ld.param.u64 %rd2, [lltorque2_param_0]; ld.param.u64 %rd3, [lltorque2_param_1]; ld.param.u64 %rd4, [lltorque2_param_2]; ld.param.u64 %rd5, [lltorque2_param_3]; ld.param.u64 %rd6, [lltorque2_param_4]; ld.param.u64 %rd7, [lltorque2_param_5]; ld.param.u64 %rd8, [lltorque2_param_6]; ld.param.u64 %rd9, [lltorque2_param_7]; ld.param.u64 %rd10, [lltorque2_param_8]; ld.param.u64 %rd11, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd12, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f1, [%rd14]; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f2, [%rd16]; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd13; ld.global.nc.f32 %f3, [%rd18]; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd13; ld.global.nc.f32 %f4, [%rd20]; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd13; ld.global.nc.f32 %f5, [%rd22]; cvta.to.global.u64 %rd23, %rd10; add.s64 %rd24, %rd23, %rd13; ld.global.nc.f32 %f6, [%rd24]; setp.eq.s64 %p2, %rd11, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd11; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f10, [%rd27]; mul.f32 %f38, %f10, %f38; $L__BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; 
sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f32, %f22; mul.f32 %f36, %f33, %f22; mul.f32 %f37, %f34, %f22; cvta.to.global.u64 %rd28, %rd2; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; st.global.f32 [%rd30], %f35; cvta.to.global.u64 %rd31, %rd3; add.s64 %rd32, %rd31, %rd29; st.global.f32 [%rd32], %f36; cvta.to.global.u64 %rd33, %rd4; add.s64 %rd34, %rd33, %rd29; st.global.f32 [%rd34], %f37; $L__BB0_4: ret; } ` lltorque2_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<35>; ld.param.u64 %rd2, [lltorque2_param_0]; ld.param.u64 %rd3, [lltorque2_param_1]; ld.param.u64 %rd4, [lltorque2_param_2]; ld.param.u64 %rd5, [lltorque2_param_3]; ld.param.u64 %rd6, [lltorque2_param_4]; ld.param.u64 %rd7, [lltorque2_param_5]; ld.param.u64 %rd8, [lltorque2_param_6]; ld.param.u64 %rd9, [lltorque2_param_7]; ld.param.u64 %rd10, [lltorque2_param_8]; ld.param.u64 %rd11, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; 
ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd12, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f1, [%rd14]; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f2, [%rd16]; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd13; ld.global.nc.f32 %f3, [%rd18]; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd13; ld.global.nc.f32 %f4, [%rd20]; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd13; ld.global.nc.f32 %f5, [%rd22]; cvta.to.global.u64 %rd23, %rd10; add.s64 %rd24, %rd23, %rd13; ld.global.nc.f32 %f6, [%rd24]; setp.eq.s64 %p2, %rd11, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd11; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f10, [%rd27]; mul.f32 %f38, %f10, %f38; $L__BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f32, %f22; mul.f32 %f36, %f33, %f22; mul.f32 %f37, %f34, %f22; cvta.to.global.u64 %rd28, %rd2; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; st.global.f32 [%rd30], %f35; cvta.to.global.u64 %rd31, %rd3; add.s64 %rd32, %rd31, %rd29; st.global.f32 [%rd32], 
%f36; cvta.to.global.u64 %rd33, %rd4; add.s64 %rd34, %rd33, %rd29; st.global.f32 [%rd34], %f37; $L__BB0_4: ret; } ` lltorque2_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<35>; ld.param.u64 %rd2, [lltorque2_param_0]; ld.param.u64 %rd3, [lltorque2_param_1]; ld.param.u64 %rd4, [lltorque2_param_2]; ld.param.u64 %rd5, [lltorque2_param_3]; ld.param.u64 %rd6, [lltorque2_param_4]; ld.param.u64 %rd7, [lltorque2_param_5]; ld.param.u64 %rd8, [lltorque2_param_6]; ld.param.u64 %rd9, [lltorque2_param_7]; ld.param.u64 %rd10, [lltorque2_param_8]; ld.param.u64 %rd11, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd12, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f1, [%rd14]; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f2, [%rd16]; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd13; ld.global.nc.f32 %f3, [%rd18]; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd13; ld.global.nc.f32 %f4, [%rd20]; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd13; ld.global.nc.f32 %f5, [%rd22]; cvta.to.global.u64 %rd23, %rd10; add.s64 %rd24, %rd23, %rd13; ld.global.nc.f32 %f6, [%rd24]; setp.eq.s64 
%p2, %rd11, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd11; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f10, [%rd27]; mul.f32 %f38, %f10, %f38; $L__BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f32, %f22; mul.f32 %f36, %f33, %f22; mul.f32 %f37, %f34, %f22; cvta.to.global.u64 %rd28, %rd2; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; st.global.f32 [%rd30], %f35; cvta.to.global.u64 %rd31, %rd3; add.s64 %rd32, %rd31, %rd29; st.global.f32 [%rd32], %f36; cvta.to.global.u64 %rd33, %rd4; add.s64 %rd34, %rd33, %rd29; st.global.f32 [%rd34], %f37; $L__BB0_4: ret; } ` lltorque2_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<35>; ld.param.u64 %rd2, [lltorque2_param_0]; ld.param.u64 %rd3, [lltorque2_param_1]; ld.param.u64 %rd4, [lltorque2_param_2]; ld.param.u64 %rd5, [lltorque2_param_3]; ld.param.u64 %rd6, [lltorque2_param_4]; 
ld.param.u64 %rd7, [lltorque2_param_5]; ld.param.u64 %rd8, [lltorque2_param_6]; ld.param.u64 %rd9, [lltorque2_param_7]; ld.param.u64 %rd10, [lltorque2_param_8]; ld.param.u64 %rd11, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd12, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f1, [%rd14]; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f2, [%rd16]; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd13; ld.global.nc.f32 %f3, [%rd18]; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd13; ld.global.nc.f32 %f4, [%rd20]; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd13; ld.global.nc.f32 %f5, [%rd22]; cvta.to.global.u64 %rd23, %rd10; add.s64 %rd24, %rd23, %rd13; ld.global.nc.f32 %f6, [%rd24]; setp.eq.s64 %p2, %rd11, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd11; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f10, [%rd27]; mul.f32 %f38, %f10, %f38; $L__BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f32, %f22; mul.f32 
%f36, %f33, %f22; mul.f32 %f37, %f34, %f22; cvta.to.global.u64 %rd28, %rd2; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; st.global.f32 [%rd30], %f35; cvta.to.global.u64 %rd31, %rd3; add.s64 %rd32, %rd31, %rd29; st.global.f32 [%rd32], %f36; cvta.to.global.u64 %rd33, %rd4; add.s64 %rd34, %rd33, %rd29; st.global.f32 [%rd34], %f37; $L__BB0_4: ret; } ` lltorque2_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<35>; ld.param.u64 %rd2, [lltorque2_param_0]; ld.param.u64 %rd3, [lltorque2_param_1]; ld.param.u64 %rd4, [lltorque2_param_2]; ld.param.u64 %rd5, [lltorque2_param_3]; ld.param.u64 %rd6, [lltorque2_param_4]; ld.param.u64 %rd7, [lltorque2_param_5]; ld.param.u64 %rd8, [lltorque2_param_6]; ld.param.u64 %rd9, [lltorque2_param_7]; ld.param.u64 %rd10, [lltorque2_param_8]; ld.param.u64 %rd11, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd12, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f1, [%rd14]; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f2, [%rd16]; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd13; ld.global.nc.f32 %f3, [%rd18]; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, 
%rd19, %rd13; ld.global.nc.f32 %f4, [%rd20]; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd13; ld.global.nc.f32 %f5, [%rd22]; cvta.to.global.u64 %rd23, %rd10; add.s64 %rd24, %rd23, %rd13; ld.global.nc.f32 %f6, [%rd24]; setp.eq.s64 %p2, %rd11, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd11; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f10, [%rd27]; mul.f32 %f38, %f10, %f38; $L__BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f32, %f22; mul.f32 %f36, %f33, %f22; mul.f32 %f37, %f34, %f22; cvta.to.global.u64 %rd28, %rd2; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; st.global.f32 [%rd30], %f35; cvta.to.global.u64 %rd31, %rd3; add.s64 %rd32, %rd31, %rd29; st.global.f32 [%rd32], %f36; cvta.to.global.u64 %rd33, %rd4; add.s64 %rd34, %rd33, %rd29; st.global.f32 [%rd34], %f37; $L__BB0_4: ret; } ` lltorque2_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; 
.reg .b32 %r<9>; .reg .b64 %rd<35>; ld.param.u64 %rd2, [lltorque2_param_0]; ld.param.u64 %rd3, [lltorque2_param_1]; ld.param.u64 %rd4, [lltorque2_param_2]; ld.param.u64 %rd5, [lltorque2_param_3]; ld.param.u64 %rd6, [lltorque2_param_4]; ld.param.u64 %rd7, [lltorque2_param_5]; ld.param.u64 %rd8, [lltorque2_param_6]; ld.param.u64 %rd9, [lltorque2_param_7]; ld.param.u64 %rd10, [lltorque2_param_8]; ld.param.u64 %rd11, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd12, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f1, [%rd14]; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f2, [%rd16]; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd13; ld.global.nc.f32 %f3, [%rd18]; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd13; ld.global.nc.f32 %f4, [%rd20]; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd13; ld.global.nc.f32 %f5, [%rd22]; cvta.to.global.u64 %rd23, %rd10; add.s64 %rd24, %rd23, %rd13; ld.global.nc.f32 %f6, [%rd24]; setp.eq.s64 %p2, %rd11, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd11; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f10, [%rd27]; mul.f32 %f38, %f10, %f38; $L__BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 
%f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f32, %f22; mul.f32 %f36, %f33, %f22; mul.f32 %f37, %f34, %f22; cvta.to.global.u64 %rd28, %rd2; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; st.global.f32 [%rd30], %f35; cvta.to.global.u64 %rd31, %rd3; add.s64 %rd32, %rd31, %rd29; st.global.f32 [%rd32], %f36; cvta.to.global.u64 %rd33, %rd4; add.s64 %rd34, %rd33, %rd29; st.global.f32 [%rd34], %f37; $L__BB0_4: ret; } ` lltorque2_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<35>; ld.param.u64 %rd2, [lltorque2_param_0]; ld.param.u64 %rd3, [lltorque2_param_1]; ld.param.u64 %rd4, [lltorque2_param_2]; ld.param.u64 %rd5, [lltorque2_param_3]; ld.param.u64 %rd6, [lltorque2_param_4]; ld.param.u64 %rd7, [lltorque2_param_5]; ld.param.u64 %rd8, [lltorque2_param_6]; ld.param.u64 %rd9, [lltorque2_param_7]; ld.param.u64 %rd10, [lltorque2_param_8]; ld.param.u64 %rd11, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd12, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f1, [%rd14]; 
cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f2, [%rd16]; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd13; ld.global.nc.f32 %f3, [%rd18]; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd13; ld.global.nc.f32 %f4, [%rd20]; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd13; ld.global.nc.f32 %f5, [%rd22]; cvta.to.global.u64 %rd23, %rd10; add.s64 %rd24, %rd23, %rd13; ld.global.nc.f32 %f6, [%rd24]; setp.eq.s64 %p2, %rd11, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd11; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f10, [%rd27]; mul.f32 %f38, %f10, %f38; $L__BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f32, %f22; mul.f32 %f36, %f33, %f22; mul.f32 %f37, %f34, %f22; cvta.to.global.u64 %rd28, %rd2; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; st.global.f32 [%rd30], %f35; cvta.to.global.u64 %rd31, %rd3; add.s64 %rd32, %rd31, %rd29; st.global.f32 [%rd32], %f36; cvta.to.global.u64 %rd33, %rd4; add.s64 %rd34, %rd33, %rd29; st.global.f32 [%rd34], %f37; $L__BB0_4: ret; } ` ) 3-3.11.1/cuda/lut.go000066400000000000000000000005711503346766200140250ustar00rootroot00000000000000package cuda // Look-up tables holding per-region parameter values. // LUT[regions[cellindex]] gives parameter value for cell. 
import "unsafe" type LUTPtr unsafe.Pointer // points to 256 float32's type LUTPtrs []unsafe.Pointer // elements point to 256 float32's type SymmLUT unsafe.Pointer // points to 256x256 symmetric matrix, only lower half stored. See exchange.cu 3-3.11.1/cuda/madd.go000066400000000000000000000117431503346766200141310ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) // multiply: dst[i] = a[i] * b[i] // a and b must have the same number of components func Mul(dst, a, b *data.Slice) { N := dst.Len() nComp := dst.NComp() util.Assert(a.Len() == N && a.NComp() == nComp && b.Len() == N && b.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_mul_async(dst.DevPtr(c), a.DevPtr(c), b.DevPtr(c), N, cfg) } } // divide: dst[i] = a[i] / b[i] // divide-by-zero yields zero. func Div(dst, a, b *data.Slice) { N := dst.Len() nComp := dst.NComp() util.Assert(a.Len() == N && a.NComp() == nComp && b.Len() == N && b.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_pointwise_div_async(dst.DevPtr(c), a.DevPtr(c), b.DevPtr(c), N, cfg) } } // Add: dst = src1 + src2. 
func Add(dst, src1, src2 *data.Slice) { Madd2(dst, src1, src2, 1, 1) } // multiply-add: dst[i] = src1[i] * factor1 + src2[i] * factor2 func Madd2(dst, src1, src2 *data.Slice, factor1, factor2 float32) { N := dst.Len() nComp := dst.NComp() util.Assert(src1.Len() == N && src2.Len() == N) util.Assert(src1.NComp() == nComp && src2.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_madd2_async(dst.DevPtr(c), src1.DevPtr(c), factor1, src2.DevPtr(c), factor2, N, cfg) } } // multiply-add: dst[i] = src1[i] * factor1 + src2[i] * factor2 + src3[i] * factor3 func Madd3(dst, src1, src2, src3 *data.Slice, factor1, factor2, factor3 float32) { N := dst.Len() nComp := dst.NComp() util.Assert(src1.Len() == N && src2.Len() == N && src3.Len() == N) util.Assert(src1.NComp() == nComp && src2.NComp() == nComp && src3.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_madd3_async(dst.DevPtr(c), src1.DevPtr(c), factor1, src2.DevPtr(c), factor2, src3.DevPtr(c), factor3, N, cfg) } } // multiply-add: dst[i] = src1[i] * factor1 + src2[i] * factor2 + src3[i] * factor3 + src4[i] * factor4 func Madd4(dst, src1, src2, src3, src4 *data.Slice, factor1, factor2, factor3, factor4 float32) { N := dst.Len() nComp := dst.NComp() util.Assert(src1.Len() == N && src2.Len() == N && src3.Len() == N && src4.Len() == N) util.Assert(src1.NComp() == nComp && src2.NComp() == nComp && src3.NComp() == nComp && src4.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_madd4_async(dst.DevPtr(c), src1.DevPtr(c), factor1, src2.DevPtr(c), factor2, src3.DevPtr(c), factor3, src4.DevPtr(c), factor4, N, cfg) } } // multiply-add: dst[i] = src1[i] * factor1 + src2[i] * factor2 + src3[i] * factor3 + src4[i] * factor4 + src5[i] * factor5 func Madd5(dst, src1, src2, src3, src4, src5 *data.Slice, factor1, factor2, factor3, factor4, factor5 float32) { N := dst.Len() nComp := dst.NComp() util.Assert(src1.Len() == N && src2.Len() == N && src3.Len() == N && src4.Len() == N && 
src5.Len() == N) util.Assert(src1.NComp() == nComp && src2.NComp() == nComp && src3.NComp() == nComp && src4.NComp() == nComp && src5.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_madd5_async(dst.DevPtr(c), src1.DevPtr(c), factor1, src2.DevPtr(c), factor2, src3.DevPtr(c), factor3, src4.DevPtr(c), factor4, src5.DevPtr(c), factor5, N, cfg) } } // multiply-add: dst[i] = src1[i] * factor1 + src2[i] * factor2 + src3[i] * factor3 + src4[i] * factor4 + src5[i] * factor5 + src6[i] * factor6 func Madd6(dst, src1, src2, src3, src4, src5, src6 *data.Slice, factor1, factor2, factor3, factor4, factor5, factor6 float32) { N := dst.Len() nComp := dst.NComp() util.Assert(src1.Len() == N && src2.Len() == N && src3.Len() == N && src4.Len() == N && src5.Len() == N && src6.Len() == N) util.Assert(src1.NComp() == nComp && src2.NComp() == nComp && src3.NComp() == nComp && src4.NComp() == nComp && src5.NComp() == nComp && src6.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_madd6_async(dst.DevPtr(c), src1.DevPtr(c), factor1, src2.DevPtr(c), factor2, src3.DevPtr(c), factor3, src4.DevPtr(c), factor4, src5.DevPtr(c), factor5, src6.DevPtr(c), factor6, N, cfg) } } // multiply-add: dst[i] = src1[i] * factor1 + src2[i] * factor2 + src3[i] * factor3 + src4[i] * factor4 + src5[i] * factor5 + src6[i] * factor6 + src7[i] * factor7 func Madd7(dst, src1, src2, src3, src4, src5, src6, src7 *data.Slice, factor1, factor2, factor3, factor4, factor5, factor6, factor7 float32) { N := dst.Len() nComp := dst.NComp() util.Assert(src1.Len() == N && src2.Len() == N && src3.Len() == N && src4.Len() == N && src5.Len() == N && src6.Len() == N && src7.Len() == N) util.Assert(src1.NComp() == nComp && src2.NComp() == nComp && src3.NComp() == nComp && src4.NComp() == nComp && src5.NComp() == nComp && src6.NComp() == nComp && src7.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_madd7_async(dst.DevPtr(c), src1.DevPtr(c), factor1, src2.DevPtr(c), 
factor2, src3.DevPtr(c), factor3, src4.DevPtr(c), factor4, src5.DevPtr(c), factor5, src6.DevPtr(c), factor6, src7.DevPtr(c), factor7, N, cfg) } } 3-3.11.1/cuda/madd2.cu000066400000000000000000000005401503346766200142060ustar00rootroot00000000000000 // dst[i] = fac1*src1[i] + fac2*src2[i]; extern "C" __global__ void madd2(float* __restrict__ dst, float* __restrict__ src1, float fac1, float* __restrict__ src2, float fac2, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if(i < N) { dst[i] = fac1*src1[i] + fac2*src2[i]; } } 3-3.11.1/cuda/madd2_wrapper.go000066400000000000000000000452711503346766200157560ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for madd2 kernel var madd2_code cu.Function // Stores the arguments for madd2 kernel invocation type madd2_args_t struct { arg_dst unsafe.Pointer arg_src1 unsafe.Pointer arg_fac1 float32 arg_src2 unsafe.Pointer arg_fac2 float32 arg_N int argptr [6]unsafe.Pointer sync.Mutex } // Stores the arguments for madd2 kernel invocation var madd2_args madd2_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. madd2_args.argptr[0] = unsafe.Pointer(&madd2_args.arg_dst) madd2_args.argptr[1] = unsafe.Pointer(&madd2_args.arg_src1) madd2_args.argptr[2] = unsafe.Pointer(&madd2_args.arg_fac1) madd2_args.argptr[3] = unsafe.Pointer(&madd2_args.arg_src2) madd2_args.argptr[4] = unsafe.Pointer(&madd2_args.arg_fac2) madd2_args.argptr[5] = unsafe.Pointer(&madd2_args.arg_N) } // Wrapper for madd2 CUDA kernel, asynchronous. 
func k_madd2_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("madd2") } madd2_args.Lock() defer madd2_args.Unlock() if madd2_code == 0 { madd2_code = fatbinLoad(madd2_map, "madd2") } madd2_args.arg_dst = dst madd2_args.arg_src1 = src1 madd2_args.arg_fac1 = fac1 madd2_args.arg_src2 = src2 madd2_args.arg_fac2 = fac2 madd2_args.arg_N = N args := madd2_args.argptr[:] cu.LaunchKernel(madd2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("madd2") } } // maps compute capability on PTX code for madd2 kernel. var madd2_map = map[int]string{0: "", 50: madd2_ptx_50, 52: madd2_ptx_52, 53: madd2_ptx_53, 60: madd2_ptx_60, 61: madd2_ptx_61, 62: madd2_ptx_62, 70: madd2_ptx_70, 72: madd2_ptx_72, 75: madd2_ptx_75, 80: madd2_ptx_80, 86: madd2_ptx_86, 87: madd2_ptx_87, 89: madd2_ptx_89, 90: madd2_ptx_90} // madd2 PTX code for various compute capabilities. 
const ( madd2_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; $L__BB0_2: ret; } ` madd2_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; 
ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; $L__BB0_2: ret; } ` madd2_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; $L__BB0_2: ret; } ` madd2_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, 
%nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; $L__BB0_2: ret; } ` madd2_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; $L__BB0_2: ret; } ` madd2_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; 
.reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; $L__BB0_2: ret; } ` madd2_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; $L__BB0_2: ret; } ` madd2_ptx_72 = ` .version 8.5 .target sm_72 
.address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; $L__BB0_2: ret; } ` madd2_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, 
%rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; $L__BB0_2: ret; } ` madd2_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; $L__BB0_2: ret; } ` madd2_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, 
%r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; $L__BB0_2: ret; } ` madd2_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; $L__BB0_2: ret; } ` madd2_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; 
ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; $L__BB0_2: ret; } ` madd2_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/madd3.cu000066400000000000000000000007511503346766200142130ustar00rootroot00000000000000 // 
dst[i] = fac1 * src1[i] + fac2 * src2[i] + fac3 * src3[i] extern "C" __global__ void madd3(float* __restrict__ dst, float* __restrict__ src1, float fac1, float* __restrict__ src2, float fac2, float* __restrict__ src3, float fac3, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if(i < N) { dst[i] = (fac1 * src1[i]) + (fac2 * src2[i] + fac3 * src3[i]); // parens for better accuracy heun solver. } } 3-3.11.1/cuda/madd3_wrapper.go000066400000000000000000000550511503346766200157540ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for madd3 kernel var madd3_code cu.Function // Stores the arguments for madd3 kernel invocation type madd3_args_t struct { arg_dst unsafe.Pointer arg_src1 unsafe.Pointer arg_fac1 float32 arg_src2 unsafe.Pointer arg_fac2 float32 arg_src3 unsafe.Pointer arg_fac3 float32 arg_N int argptr [8]unsafe.Pointer sync.Mutex } // Stores the arguments for madd3 kernel invocation var madd3_args madd3_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. madd3_args.argptr[0] = unsafe.Pointer(&madd3_args.arg_dst) madd3_args.argptr[1] = unsafe.Pointer(&madd3_args.arg_src1) madd3_args.argptr[2] = unsafe.Pointer(&madd3_args.arg_fac1) madd3_args.argptr[3] = unsafe.Pointer(&madd3_args.arg_src2) madd3_args.argptr[4] = unsafe.Pointer(&madd3_args.arg_fac2) madd3_args.argptr[5] = unsafe.Pointer(&madd3_args.arg_src3) madd3_args.argptr[6] = unsafe.Pointer(&madd3_args.arg_fac3) madd3_args.argptr[7] = unsafe.Pointer(&madd3_args.arg_N) } // Wrapper for madd3 CUDA kernel, asynchronous. 
func k_madd3_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, src3 unsafe.Pointer, fac3 float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("madd3") } madd3_args.Lock() defer madd3_args.Unlock() if madd3_code == 0 { madd3_code = fatbinLoad(madd3_map, "madd3") } madd3_args.arg_dst = dst madd3_args.arg_src1 = src1 madd3_args.arg_fac1 = fac1 madd3_args.arg_src2 = src2 madd3_args.arg_fac2 = fac2 madd3_args.arg_src3 = src3 madd3_args.arg_fac3 = fac3 madd3_args.arg_N = N args := madd3_args.argptr[:] cu.LaunchKernel(madd3_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("madd3") } } // maps compute capability on PTX code for madd3 kernel. var madd3_map = map[int]string{0: "", 50: madd3_ptx_50, 52: madd3_ptx_52, 53: madd3_ptx_53, 60: madd3_ptx_60, 61: madd3_ptx_61, 62: madd3_ptx_62, 70: madd3_ptx_70, 72: madd3_ptx_72, 75: madd3_ptx_75, 80: madd3_ptx_80, 86: madd3_ptx_86, 87: madd3_ptx_87, 89: madd3_ptx_89, 90: madd3_ptx_90} // madd3 PTX code for various compute capabilities. 
const ( madd3_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; $L__BB0_2: ret; } ` madd3_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; 
ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; $L__BB0_2: ret; } ` madd3_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 
%f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; $L__BB0_2: ret; } ` madd3_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; $L__BB0_2: ret; } ` madd3_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, 
[madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; $L__BB0_2: ret; } ` madd3_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, 
%rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; $L__BB0_2: ret; } ` madd3_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; $L__BB0_2: ret; } ` madd3_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param 
.u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; $L__BB0_2: ret; } ` madd3_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; 
cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; $L__BB0_2: ret; } ` madd3_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; $L__BB0_2: ret; } ` madd3_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, 
.param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; $L__BB0_2: ret; } ` madd3_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; 
mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; $L__BB0_2: ret; } ` madd3_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; 
$L__BB0_2: ret; } ` madd3_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/madd4.cu000066400000000000000000000010001503346766200142000ustar00rootroot00000000000000 // dst[i] = src1[i] * fac1 + src2[i] * fac2 + src3[i] * fac3 + src4[i] * fac4 extern "C" __global__ void madd4(float* __restrict__ dst, float* __restrict__ src1, float fac1, float* __restrict__ src2, float fac2, float* __restrict__ src3, float fac3, float* __restrict__ src4, float fac4, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if(i < N) { dst[i] = (fac1*src1[i]) + (fac2*src2[i]) + (fac3*src3[i]) + (fac4*src4[i]); } } 
3-3.11.1/cuda/madd4_wrapper.go000066400000000000000000000647401503346766200157620ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for madd4 kernel var madd4_code cu.Function // Stores the arguments for madd4 kernel invocation type madd4_args_t struct { arg_dst unsafe.Pointer arg_src1 unsafe.Pointer arg_fac1 float32 arg_src2 unsafe.Pointer arg_fac2 float32 arg_src3 unsafe.Pointer arg_fac3 float32 arg_src4 unsafe.Pointer arg_fac4 float32 arg_N int argptr [10]unsafe.Pointer sync.Mutex } // Stores the arguments for madd4 kernel invocation var madd4_args madd4_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. madd4_args.argptr[0] = unsafe.Pointer(&madd4_args.arg_dst) madd4_args.argptr[1] = unsafe.Pointer(&madd4_args.arg_src1) madd4_args.argptr[2] = unsafe.Pointer(&madd4_args.arg_fac1) madd4_args.argptr[3] = unsafe.Pointer(&madd4_args.arg_src2) madd4_args.argptr[4] = unsafe.Pointer(&madd4_args.arg_fac2) madd4_args.argptr[5] = unsafe.Pointer(&madd4_args.arg_src3) madd4_args.argptr[6] = unsafe.Pointer(&madd4_args.arg_fac3) madd4_args.argptr[7] = unsafe.Pointer(&madd4_args.arg_src4) madd4_args.argptr[8] = unsafe.Pointer(&madd4_args.arg_fac4) madd4_args.argptr[9] = unsafe.Pointer(&madd4_args.arg_N) } // Wrapper for madd4 CUDA kernel, asynchronous. 
func k_madd4_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, src3 unsafe.Pointer, fac3 float32, src4 unsafe.Pointer, fac4 float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("madd4") } madd4_args.Lock() defer madd4_args.Unlock() if madd4_code == 0 { madd4_code = fatbinLoad(madd4_map, "madd4") } madd4_args.arg_dst = dst madd4_args.arg_src1 = src1 madd4_args.arg_fac1 = fac1 madd4_args.arg_src2 = src2 madd4_args.arg_fac2 = fac2 madd4_args.arg_src3 = src3 madd4_args.arg_fac3 = fac3 madd4_args.arg_src4 = src4 madd4_args.arg_fac4 = fac4 madd4_args.arg_N = N args := madd4_args.argptr[:] cu.LaunchKernel(madd4_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("madd4") } } // maps compute capability on PTX code for madd4 kernel. var madd4_map = map[int]string{0: "", 50: madd4_ptx_50, 52: madd4_ptx_52, 53: madd4_ptx_53, 60: madd4_ptx_60, 61: madd4_ptx_61, 62: madd4_ptx_62, 70: madd4_ptx_70, 72: madd4_ptx_72, 75: madd4_ptx_75, 80: madd4_ptx_80, 86: madd4_ptx_86, 87: madd4_ptx_87, 89: madd4_ptx_89, 90: madd4_ptx_90} // madd4 PTX code for various compute capabilities. 
const ( madd4_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl madd4 .visible .entry madd4( .param .u64 madd4_param_0, .param .u64 madd4_param_1, .param .f32 madd4_param_2, .param .u64 madd4_param_3, .param .f32 madd4_param_4, .param .u64 madd4_param_5, .param .f32 madd4_param_6, .param .u64 madd4_param_7, .param .f32 madd4_param_8, .param .u32 madd4_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [madd4_param_0]; ld.param.u64 %rd2, [madd4_param_1]; ld.param.f32 %f1, [madd4_param_2]; ld.param.u64 %rd3, [madd4_param_3]; ld.param.f32 %f2, [madd4_param_4]; ld.param.u64 %rd4, [madd4_param_5]; ld.param.f32 %f3, [madd4_param_6]; ld.param.u64 %rd5, [madd4_param_7]; ld.param.f32 %f4, [madd4_param_8]; ld.param.u32 %r2, [madd4_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f5, [%rd8]; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd10, %rd9, %rd7; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f2; fma.rn.f32 %f8, %f5, %f1, %f7; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd7; ld.global.nc.f32 %f9, [%rd12]; fma.rn.f32 %f10, %f9, %f3, %f8; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd7; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f4, %f10; cvta.to.global.u64 %rd15, %rd1; add.s64 %rd16, %rd15, %rd7; st.global.f32 [%rd16], %f12; $L__BB0_2: ret; } ` madd4_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl madd4 .visible .entry madd4( .param .u64 madd4_param_0, .param .u64 madd4_param_1, .param .f32 madd4_param_2, .param .u64 madd4_param_3, .param .f32 madd4_param_4, .param .u64 madd4_param_5, .param .f32 madd4_param_6, .param .u64 madd4_param_7, .param .f32 madd4_param_8, .param .u32 
madd4_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [madd4_param_0]; ld.param.u64 %rd2, [madd4_param_1]; ld.param.f32 %f1, [madd4_param_2]; ld.param.u64 %rd3, [madd4_param_3]; ld.param.f32 %f2, [madd4_param_4]; ld.param.u64 %rd4, [madd4_param_5]; ld.param.f32 %f3, [madd4_param_6]; ld.param.u64 %rd5, [madd4_param_7]; ld.param.f32 %f4, [madd4_param_8]; ld.param.u32 %r2, [madd4_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f5, [%rd8]; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd10, %rd9, %rd7; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f2; fma.rn.f32 %f8, %f5, %f1, %f7; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd7; ld.global.nc.f32 %f9, [%rd12]; fma.rn.f32 %f10, %f9, %f3, %f8; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd7; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f4, %f10; cvta.to.global.u64 %rd15, %rd1; add.s64 %rd16, %rd15, %rd7; st.global.f32 [%rd16], %f12; $L__BB0_2: ret; } ` madd4_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl madd4 .visible .entry madd4( .param .u64 madd4_param_0, .param .u64 madd4_param_1, .param .f32 madd4_param_2, .param .u64 madd4_param_3, .param .f32 madd4_param_4, .param .u64 madd4_param_5, .param .f32 madd4_param_6, .param .u64 madd4_param_7, .param .f32 madd4_param_8, .param .u32 madd4_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [madd4_param_0]; ld.param.u64 %rd2, [madd4_param_1]; ld.param.f32 %f1, [madd4_param_2]; ld.param.u64 %rd3, [madd4_param_3]; ld.param.f32 %f2, [madd4_param_4]; ld.param.u64 %rd4, [madd4_param_5]; ld.param.f32 %f3, [madd4_param_6]; ld.param.u64 %rd5, 
[madd4_param_7]; ld.param.f32 %f4, [madd4_param_8]; ld.param.u32 %r2, [madd4_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f5, [%rd8]; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd10, %rd9, %rd7; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f2; fma.rn.f32 %f8, %f5, %f1, %f7; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd7; ld.global.nc.f32 %f9, [%rd12]; fma.rn.f32 %f10, %f9, %f3, %f8; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd7; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f4, %f10; cvta.to.global.u64 %rd15, %rd1; add.s64 %rd16, %rd15, %rd7; st.global.f32 [%rd16], %f12; $L__BB0_2: ret; } ` madd4_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl madd4 .visible .entry madd4( .param .u64 madd4_param_0, .param .u64 madd4_param_1, .param .f32 madd4_param_2, .param .u64 madd4_param_3, .param .f32 madd4_param_4, .param .u64 madd4_param_5, .param .f32 madd4_param_6, .param .u64 madd4_param_7, .param .f32 madd4_param_8, .param .u32 madd4_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [madd4_param_0]; ld.param.u64 %rd2, [madd4_param_1]; ld.param.f32 %f1, [madd4_param_2]; ld.param.u64 %rd3, [madd4_param_3]; ld.param.f32 %f2, [madd4_param_4]; ld.param.u64 %rd4, [madd4_param_5]; ld.param.f32 %f3, [madd4_param_6]; ld.param.u64 %rd5, [madd4_param_7]; ld.param.f32 %f4, [madd4_param_8]; ld.param.u32 %r2, [madd4_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, 
%r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f5, [%rd8]; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd10, %rd9, %rd7; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f2; fma.rn.f32 %f8, %f5, %f1, %f7; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd7; ld.global.nc.f32 %f9, [%rd12]; fma.rn.f32 %f10, %f9, %f3, %f8; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd7; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f4, %f10; cvta.to.global.u64 %rd15, %rd1; add.s64 %rd16, %rd15, %rd7; st.global.f32 [%rd16], %f12; $L__BB0_2: ret; } ` madd4_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl madd4 .visible .entry madd4( .param .u64 madd4_param_0, .param .u64 madd4_param_1, .param .f32 madd4_param_2, .param .u64 madd4_param_3, .param .f32 madd4_param_4, .param .u64 madd4_param_5, .param .f32 madd4_param_6, .param .u64 madd4_param_7, .param .f32 madd4_param_8, .param .u32 madd4_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [madd4_param_0]; ld.param.u64 %rd2, [madd4_param_1]; ld.param.f32 %f1, [madd4_param_2]; ld.param.u64 %rd3, [madd4_param_3]; ld.param.f32 %f2, [madd4_param_4]; ld.param.u64 %rd4, [madd4_param_5]; ld.param.f32 %f3, [madd4_param_6]; ld.param.u64 %rd5, [madd4_param_7]; ld.param.f32 %f4, [madd4_param_8]; ld.param.u32 %r2, [madd4_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f5, [%rd8]; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd10, %rd9, %rd7; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f2; fma.rn.f32 %f8, %f5, %f1, %f7; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd7; ld.global.nc.f32 %f9, [%rd12]; fma.rn.f32 %f10, %f9, %f3, %f8; cvta.to.global.u64 %rd13, %rd5; 
add.s64 %rd14, %rd13, %rd7; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f4, %f10; cvta.to.global.u64 %rd15, %rd1; add.s64 %rd16, %rd15, %rd7; st.global.f32 [%rd16], %f12; $L__BB0_2: ret; } ` madd4_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl madd4 .visible .entry madd4( .param .u64 madd4_param_0, .param .u64 madd4_param_1, .param .f32 madd4_param_2, .param .u64 madd4_param_3, .param .f32 madd4_param_4, .param .u64 madd4_param_5, .param .f32 madd4_param_6, .param .u64 madd4_param_7, .param .f32 madd4_param_8, .param .u32 madd4_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [madd4_param_0]; ld.param.u64 %rd2, [madd4_param_1]; ld.param.f32 %f1, [madd4_param_2]; ld.param.u64 %rd3, [madd4_param_3]; ld.param.f32 %f2, [madd4_param_4]; ld.param.u64 %rd4, [madd4_param_5]; ld.param.f32 %f3, [madd4_param_6]; ld.param.u64 %rd5, [madd4_param_7]; ld.param.f32 %f4, [madd4_param_8]; ld.param.u32 %r2, [madd4_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f5, [%rd8]; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd10, %rd9, %rd7; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f2; fma.rn.f32 %f8, %f5, %f1, %f7; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd7; ld.global.nc.f32 %f9, [%rd12]; fma.rn.f32 %f10, %f9, %f3, %f8; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd7; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f4, %f10; cvta.to.global.u64 %rd15, %rd1; add.s64 %rd16, %rd15, %rd7; st.global.f32 [%rd16], %f12; $L__BB0_2: ret; } ` madd4_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl madd4 .visible .entry madd4( .param .u64 madd4_param_0, .param .u64 madd4_param_1, 
.param .f32 madd4_param_2, .param .u64 madd4_param_3, .param .f32 madd4_param_4, .param .u64 madd4_param_5, .param .f32 madd4_param_6, .param .u64 madd4_param_7, .param .f32 madd4_param_8, .param .u32 madd4_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [madd4_param_0]; ld.param.u64 %rd2, [madd4_param_1]; ld.param.f32 %f1, [madd4_param_2]; ld.param.u64 %rd3, [madd4_param_3]; ld.param.f32 %f2, [madd4_param_4]; ld.param.u64 %rd4, [madd4_param_5]; ld.param.f32 %f3, [madd4_param_6]; ld.param.u64 %rd5, [madd4_param_7]; ld.param.f32 %f4, [madd4_param_8]; ld.param.u32 %r2, [madd4_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f5, [%rd8]; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd10, %rd9, %rd7; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f2; fma.rn.f32 %f8, %f5, %f1, %f7; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd7; ld.global.nc.f32 %f9, [%rd12]; fma.rn.f32 %f10, %f9, %f3, %f8; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd7; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f4, %f10; cvta.to.global.u64 %rd15, %rd1; add.s64 %rd16, %rd15, %rd7; st.global.f32 [%rd16], %f12; $L__BB0_2: ret; } ` madd4_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl madd4 .visible .entry madd4( .param .u64 madd4_param_0, .param .u64 madd4_param_1, .param .f32 madd4_param_2, .param .u64 madd4_param_3, .param .f32 madd4_param_4, .param .u64 madd4_param_5, .param .f32 madd4_param_6, .param .u64 madd4_param_7, .param .f32 madd4_param_8, .param .u32 madd4_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [madd4_param_0]; ld.param.u64 %rd2, [madd4_param_1]; 
ld.param.f32 %f1, [madd4_param_2]; ld.param.u64 %rd3, [madd4_param_3]; ld.param.f32 %f2, [madd4_param_4]; ld.param.u64 %rd4, [madd4_param_5]; ld.param.f32 %f3, [madd4_param_6]; ld.param.u64 %rd5, [madd4_param_7]; ld.param.f32 %f4, [madd4_param_8]; ld.param.u32 %r2, [madd4_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f5, [%rd8]; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd10, %rd9, %rd7; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f2; fma.rn.f32 %f8, %f5, %f1, %f7; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd7; ld.global.nc.f32 %f9, [%rd12]; fma.rn.f32 %f10, %f9, %f3, %f8; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd7; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f4, %f10; cvta.to.global.u64 %rd15, %rd1; add.s64 %rd16, %rd15, %rd7; st.global.f32 [%rd16], %f12; $L__BB0_2: ret; } ` madd4_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl madd4 .visible .entry madd4( .param .u64 madd4_param_0, .param .u64 madd4_param_1, .param .f32 madd4_param_2, .param .u64 madd4_param_3, .param .f32 madd4_param_4, .param .u64 madd4_param_5, .param .f32 madd4_param_6, .param .u64 madd4_param_7, .param .f32 madd4_param_8, .param .u32 madd4_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [madd4_param_0]; ld.param.u64 %rd2, [madd4_param_1]; ld.param.f32 %f1, [madd4_param_2]; ld.param.u64 %rd3, [madd4_param_3]; ld.param.f32 %f2, [madd4_param_4]; ld.param.u64 %rd4, [madd4_param_5]; ld.param.f32 %f3, [madd4_param_6]; ld.param.u64 %rd5, [madd4_param_7]; ld.param.f32 %f4, [madd4_param_8]; ld.param.u32 %r2, [madd4_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; 
mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f5, [%rd8]; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd10, %rd9, %rd7; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f2; fma.rn.f32 %f8, %f5, %f1, %f7; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd7; ld.global.nc.f32 %f9, [%rd12]; fma.rn.f32 %f10, %f9, %f3, %f8; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd7; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f4, %f10; cvta.to.global.u64 %rd15, %rd1; add.s64 %rd16, %rd15, %rd7; st.global.f32 [%rd16], %f12; $L__BB0_2: ret; } ` madd4_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl madd4 .visible .entry madd4( .param .u64 madd4_param_0, .param .u64 madd4_param_1, .param .f32 madd4_param_2, .param .u64 madd4_param_3, .param .f32 madd4_param_4, .param .u64 madd4_param_5, .param .f32 madd4_param_6, .param .u64 madd4_param_7, .param .f32 madd4_param_8, .param .u32 madd4_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [madd4_param_0]; ld.param.u64 %rd2, [madd4_param_1]; ld.param.f32 %f1, [madd4_param_2]; ld.param.u64 %rd3, [madd4_param_3]; ld.param.f32 %f2, [madd4_param_4]; ld.param.u64 %rd4, [madd4_param_5]; ld.param.f32 %f3, [madd4_param_6]; ld.param.u64 %rd5, [madd4_param_7]; ld.param.f32 %f4, [madd4_param_8]; ld.param.u32 %r2, [madd4_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f5, [%rd8]; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd10, %rd9, %rd7; ld.global.nc.f32 %f6, [%rd10]; mul.f32 
%f7, %f6, %f2; fma.rn.f32 %f8, %f5, %f1, %f7; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd7; ld.global.nc.f32 %f9, [%rd12]; fma.rn.f32 %f10, %f9, %f3, %f8; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd7; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f4, %f10; cvta.to.global.u64 %rd15, %rd1; add.s64 %rd16, %rd15, %rd7; st.global.f32 [%rd16], %f12; $L__BB0_2: ret; } ` madd4_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl madd4 .visible .entry madd4( .param .u64 madd4_param_0, .param .u64 madd4_param_1, .param .f32 madd4_param_2, .param .u64 madd4_param_3, .param .f32 madd4_param_4, .param .u64 madd4_param_5, .param .f32 madd4_param_6, .param .u64 madd4_param_7, .param .f32 madd4_param_8, .param .u32 madd4_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [madd4_param_0]; ld.param.u64 %rd2, [madd4_param_1]; ld.param.f32 %f1, [madd4_param_2]; ld.param.u64 %rd3, [madd4_param_3]; ld.param.f32 %f2, [madd4_param_4]; ld.param.u64 %rd4, [madd4_param_5]; ld.param.f32 %f3, [madd4_param_6]; ld.param.u64 %rd5, [madd4_param_7]; ld.param.f32 %f4, [madd4_param_8]; ld.param.u32 %r2, [madd4_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f5, [%rd8]; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd10, %rd9, %rd7; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f2; fma.rn.f32 %f8, %f5, %f1, %f7; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd7; ld.global.nc.f32 %f9, [%rd12]; fma.rn.f32 %f10, %f9, %f3, %f8; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd7; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f4, %f10; cvta.to.global.u64 %rd15, %rd1; add.s64 %rd16, %rd15, %rd7; 
st.global.f32 [%rd16], %f12; $L__BB0_2: ret; } ` madd4_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl madd4 .visible .entry madd4( .param .u64 madd4_param_0, .param .u64 madd4_param_1, .param .f32 madd4_param_2, .param .u64 madd4_param_3, .param .f32 madd4_param_4, .param .u64 madd4_param_5, .param .f32 madd4_param_6, .param .u64 madd4_param_7, .param .f32 madd4_param_8, .param .u32 madd4_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [madd4_param_0]; ld.param.u64 %rd2, [madd4_param_1]; ld.param.f32 %f1, [madd4_param_2]; ld.param.u64 %rd3, [madd4_param_3]; ld.param.f32 %f2, [madd4_param_4]; ld.param.u64 %rd4, [madd4_param_5]; ld.param.f32 %f3, [madd4_param_6]; ld.param.u64 %rd5, [madd4_param_7]; ld.param.f32 %f4, [madd4_param_8]; ld.param.u32 %r2, [madd4_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f5, [%rd8]; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd10, %rd9, %rd7; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f2; fma.rn.f32 %f8, %f5, %f1, %f7; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd7; ld.global.nc.f32 %f9, [%rd12]; fma.rn.f32 %f10, %f9, %f3, %f8; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd7; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f4, %f10; cvta.to.global.u64 %rd15, %rd1; add.s64 %rd16, %rd15, %rd7; st.global.f32 [%rd16], %f12; $L__BB0_2: ret; } ` madd4_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl madd4 .visible .entry madd4( .param .u64 madd4_param_0, .param .u64 madd4_param_1, .param .f32 madd4_param_2, .param .u64 madd4_param_3, .param .f32 madd4_param_4, .param .u64 madd4_param_5, .param .f32 madd4_param_6, .param .u64 
madd4_param_7, .param .f32 madd4_param_8, .param .u32 madd4_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [madd4_param_0]; ld.param.u64 %rd2, [madd4_param_1]; ld.param.f32 %f1, [madd4_param_2]; ld.param.u64 %rd3, [madd4_param_3]; ld.param.f32 %f2, [madd4_param_4]; ld.param.u64 %rd4, [madd4_param_5]; ld.param.f32 %f3, [madd4_param_6]; ld.param.u64 %rd5, [madd4_param_7]; ld.param.f32 %f4, [madd4_param_8]; ld.param.u32 %r2, [madd4_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f5, [%rd8]; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd10, %rd9, %rd7; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f2; fma.rn.f32 %f8, %f5, %f1, %f7; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd7; ld.global.nc.f32 %f9, [%rd12]; fma.rn.f32 %f10, %f9, %f3, %f8; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd7; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f4, %f10; cvta.to.global.u64 %rd15, %rd1; add.s64 %rd16, %rd15, %rd7; st.global.f32 [%rd16], %f12; $L__BB0_2: ret; } ` madd4_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl madd4 .visible .entry madd4( .param .u64 madd4_param_0, .param .u64 madd4_param_1, .param .f32 madd4_param_2, .param .u64 madd4_param_3, .param .f32 madd4_param_4, .param .u64 madd4_param_5, .param .f32 madd4_param_6, .param .u64 madd4_param_7, .param .f32 madd4_param_8, .param .u32 madd4_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [madd4_param_0]; ld.param.u64 %rd2, [madd4_param_1]; ld.param.f32 %f1, [madd4_param_2]; ld.param.u64 %rd3, [madd4_param_3]; ld.param.f32 %f2, [madd4_param_4]; ld.param.u64 %rd4, [madd4_param_5]; 
ld.param.f32 %f3, [madd4_param_6]; ld.param.u64 %rd5, [madd4_param_7]; ld.param.f32 %f4, [madd4_param_8]; ld.param.u32 %r2, [madd4_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f5, [%rd8]; cvta.to.global.u64 %rd9, %rd3; add.s64 %rd10, %rd9, %rd7; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f2; fma.rn.f32 %f8, %f5, %f1, %f7; cvta.to.global.u64 %rd11, %rd4; add.s64 %rd12, %rd11, %rd7; ld.global.nc.f32 %f9, [%rd12]; fma.rn.f32 %f10, %f9, %f3, %f8; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd7; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f4, %f10; cvta.to.global.u64 %rd15, %rd1; add.s64 %rd16, %rd15, %rd7; st.global.f32 [%rd16], %f12; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/madd5.cu000066400000000000000000000011161503346766200142110ustar00rootroot00000000000000 // dst[i] = src1[i] * fac1 + src2[i] * fac2 + src3[i] * fac3 + src4[i] * fac4 + src5[i] * fac5 extern "C" __global__ void madd5(float* __restrict__ dst, float* __restrict__ src1, float fac1, float* __restrict__ src2, float fac2, float* __restrict__ src3, float fac3, float* __restrict__ src4, float fac4, float* __restrict__ src5, float fac5, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if(i < N) { dst[i] = (fac1*src1[i]) + (fac2*src2[i]) + (fac3*src3[i]) + (fac4*src4[i]) + (fac5*src5[i]); } } 3-3.11.1/cuda/madd5_wrapper.go000066400000000000000000000747201503346766200157620ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for madd5 kernel var madd5_code cu.Function // Stores the arguments for madd5 kernel invocation type madd5_args_t struct { arg_dst unsafe.Pointer arg_src1 unsafe.Pointer arg_fac1 float32 arg_src2 unsafe.Pointer arg_fac2 float32 arg_src3 unsafe.Pointer arg_fac3 float32 arg_src4 unsafe.Pointer arg_fac4 float32 arg_src5 unsafe.Pointer arg_fac5 float32 arg_N int argptr [12]unsafe.Pointer sync.Mutex } // Stores the arguments for madd5 kernel invocation var madd5_args madd5_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. madd5_args.argptr[0] = unsafe.Pointer(&madd5_args.arg_dst) madd5_args.argptr[1] = unsafe.Pointer(&madd5_args.arg_src1) madd5_args.argptr[2] = unsafe.Pointer(&madd5_args.arg_fac1) madd5_args.argptr[3] = unsafe.Pointer(&madd5_args.arg_src2) madd5_args.argptr[4] = unsafe.Pointer(&madd5_args.arg_fac2) madd5_args.argptr[5] = unsafe.Pointer(&madd5_args.arg_src3) madd5_args.argptr[6] = unsafe.Pointer(&madd5_args.arg_fac3) madd5_args.argptr[7] = unsafe.Pointer(&madd5_args.arg_src4) madd5_args.argptr[8] = unsafe.Pointer(&madd5_args.arg_fac4) madd5_args.argptr[9] = unsafe.Pointer(&madd5_args.arg_src5) madd5_args.argptr[10] = unsafe.Pointer(&madd5_args.arg_fac5) madd5_args.argptr[11] = unsafe.Pointer(&madd5_args.arg_N) } // Wrapper for madd5 CUDA kernel, asynchronous. 
func k_madd5_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, src3 unsafe.Pointer, fac3 float32, src4 unsafe.Pointer, fac4 float32, src5 unsafe.Pointer, fac5 float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("madd5") } madd5_args.Lock() defer madd5_args.Unlock() if madd5_code == 0 { madd5_code = fatbinLoad(madd5_map, "madd5") } madd5_args.arg_dst = dst madd5_args.arg_src1 = src1 madd5_args.arg_fac1 = fac1 madd5_args.arg_src2 = src2 madd5_args.arg_fac2 = fac2 madd5_args.arg_src3 = src3 madd5_args.arg_fac3 = fac3 madd5_args.arg_src4 = src4 madd5_args.arg_fac4 = fac4 madd5_args.arg_src5 = src5 madd5_args.arg_fac5 = fac5 madd5_args.arg_N = N args := madd5_args.argptr[:] cu.LaunchKernel(madd5_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("madd5") } } // maps compute capability on PTX code for madd5 kernel. var madd5_map = map[int]string{0: "", 50: madd5_ptx_50, 52: madd5_ptx_52, 53: madd5_ptx_53, 60: madd5_ptx_60, 61: madd5_ptx_61, 62: madd5_ptx_62, 70: madd5_ptx_70, 72: madd5_ptx_72, 75: madd5_ptx_75, 80: madd5_ptx_80, 86: madd5_ptx_86, 87: madd5_ptx_87, 89: madd5_ptx_89, 90: madd5_ptx_90} // madd5 PTX code for various compute capabilities. 
const ( madd5_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; $L__BB0_2: ret; } ` madd5_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl madd5 .visible .entry 
madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; $L__BB0_2: ret; } ` madd5_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 
madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; $L__BB0_2: ret; } ` madd5_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param 
.u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; $L__BB0_2: ret; } ` madd5_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, 
.param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; $L__BB0_2: ret; } ` madd5_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; 
ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; $L__BB0_2: ret; } ` madd5_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, 
[madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; $L__BB0_2: ret; } ` madd5_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 
%rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; $L__BB0_2: ret; } ` madd5_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; 
ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; $L__BB0_2: ret; } ` madd5_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, 
[madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; $L__BB0_2: ret; } ` madd5_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; 
mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; $L__BB0_2: ret; } ` madd5_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; 
mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; $L__BB0_2: ret; } ` madd5_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd7, 
%rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; $L__BB0_2: ret; } ` madd5_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; 
cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/madd6.cu000066400000000000000000000012341503346766200142130ustar00rootroot00000000000000 // dst[i] = src1[i] * fac1 + src2[i] * fac2 + src3[i] * fac3 + src4[i] * fac4 + src5[i] * fac5 + src6[i] * fac6 extern "C" __global__ void madd6(float* __restrict__ dst, float* __restrict__ src1, float fac1, float* __restrict__ src2, float fac2, float* __restrict__ src3, float fac3, float* __restrict__ src4, float fac4, float* __restrict__ src5, float fac5, float* __restrict__ src6, float fac6, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if(i < N) { dst[i] = (fac1*src1[i]) + (fac2*src2[i]) + (fac3*src3[i]) + (fac4*src4[i]) + (fac5*src5[i]) + (fac6*src6[i]); } } 3-3.11.1/cuda/madd6_wrapper.go000066400000000000000000001047001503346766200157530ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for madd6 kernel var madd6_code cu.Function // Stores the arguments for madd6 kernel invocation type madd6_args_t struct { arg_dst unsafe.Pointer arg_src1 unsafe.Pointer arg_fac1 float32 arg_src2 unsafe.Pointer arg_fac2 float32 arg_src3 unsafe.Pointer arg_fac3 float32 arg_src4 unsafe.Pointer arg_fac4 float32 arg_src5 unsafe.Pointer arg_fac5 float32 arg_src6 unsafe.Pointer arg_fac6 float32 arg_N int argptr [14]unsafe.Pointer sync.Mutex } // Stores the arguments for madd6 kernel invocation var madd6_args madd6_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. madd6_args.argptr[0] = unsafe.Pointer(&madd6_args.arg_dst) madd6_args.argptr[1] = unsafe.Pointer(&madd6_args.arg_src1) madd6_args.argptr[2] = unsafe.Pointer(&madd6_args.arg_fac1) madd6_args.argptr[3] = unsafe.Pointer(&madd6_args.arg_src2) madd6_args.argptr[4] = unsafe.Pointer(&madd6_args.arg_fac2) madd6_args.argptr[5] = unsafe.Pointer(&madd6_args.arg_src3) madd6_args.argptr[6] = unsafe.Pointer(&madd6_args.arg_fac3) madd6_args.argptr[7] = unsafe.Pointer(&madd6_args.arg_src4) madd6_args.argptr[8] = unsafe.Pointer(&madd6_args.arg_fac4) madd6_args.argptr[9] = unsafe.Pointer(&madd6_args.arg_src5) madd6_args.argptr[10] = unsafe.Pointer(&madd6_args.arg_fac5) madd6_args.argptr[11] = unsafe.Pointer(&madd6_args.arg_src6) madd6_args.argptr[12] = unsafe.Pointer(&madd6_args.arg_fac6) madd6_args.argptr[13] = unsafe.Pointer(&madd6_args.arg_N) } // Wrapper for madd6 CUDA kernel, asynchronous. 
func k_madd6_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, src3 unsafe.Pointer, fac3 float32, src4 unsafe.Pointer, fac4 float32, src5 unsafe.Pointer, fac5 float32, src6 unsafe.Pointer, fac6 float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("madd6") } madd6_args.Lock() defer madd6_args.Unlock() if madd6_code == 0 { madd6_code = fatbinLoad(madd6_map, "madd6") } madd6_args.arg_dst = dst madd6_args.arg_src1 = src1 madd6_args.arg_fac1 = fac1 madd6_args.arg_src2 = src2 madd6_args.arg_fac2 = fac2 madd6_args.arg_src3 = src3 madd6_args.arg_fac3 = fac3 madd6_args.arg_src4 = src4 madd6_args.arg_fac4 = fac4 madd6_args.arg_src5 = src5 madd6_args.arg_fac5 = fac5 madd6_args.arg_src6 = src6 madd6_args.arg_fac6 = fac6 madd6_args.arg_N = N args := madd6_args.argptr[:] cu.LaunchKernel(madd6_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("madd6") } } // maps compute capability on PTX code for madd6 kernel. var madd6_map = map[int]string{0: "", 50: madd6_ptx_50, 52: madd6_ptx_52, 53: madd6_ptx_53, 60: madd6_ptx_60, 61: madd6_ptx_61, 62: madd6_ptx_62, 70: madd6_ptx_70, 72: madd6_ptx_72, 75: madd6_ptx_75, 80: madd6_ptx_80, 86: madd6_ptx_86, 87: madd6_ptx_87, 89: madd6_ptx_89, 90: madd6_ptx_90} // madd6 PTX code for various compute capabilities. 
const ( madd6_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; 
ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; $L__BB0_2: ret; } ` madd6_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, 
%rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; $L__BB0_2: ret; } ` madd6_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, 
%f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; $L__BB0_2: ret; } ` madd6_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, 
%f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; $L__BB0_2: ret; } ` madd6_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 
%rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; $L__BB0_2: ret; } ` madd6_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 
%r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; $L__BB0_2: ret; } ` madd6_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; 
ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; $L__BB0_2: ret; } ` madd6_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, 
[madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; $L__BB0_2: ret; } ` madd6_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; 
ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; $L__BB0_2: ret; } ` madd6_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg 
.b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; $L__BB0_2: ret; } ` madd6_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 
madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; $L__BB0_2: ret; } ` madd6_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 
madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; $L__BB0_2: ret; } ` madd6_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // 
.globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; 
cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; $L__BB0_2: ret; } ` madd6_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; 
fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/madd7.cu000066400000000000000000000013521503346766200142150ustar00rootroot00000000000000 // dst[i] = src1[i] * fac1 + src2[i] * fac2 + src3[i] * fac3 + src4[i] * fac4 + src5[i] * fac5 + src6[i] * fac6 + src7[i] * fac7 extern "C" __global__ void madd7(float* __restrict__ dst, float* __restrict__ src1, float fac1, float* __restrict__ src2, float fac2, float* __restrict__ src3, float fac3, float* __restrict__ src4, float fac4, float* __restrict__ src5, float fac5, float* __restrict__ src6, float fac6, float* __restrict__ src7, float fac7, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if(i < N) { dst[i] = (fac1*src1[i]) + (fac2*src2[i]) + (fac3*src3[i]) + (fac4*src4[i]) + (fac5*src5[i]) + (fac6*src6[i]) + (fac7*src7[i]); } } 3-3.11.1/cuda/madd7_wrapper.go000066400000000000000000001150221503346766200157530ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for madd7 kernel var madd7_code cu.Function // Stores the arguments for madd7 kernel invocation type madd7_args_t struct { arg_dst unsafe.Pointer arg_src1 unsafe.Pointer arg_fac1 float32 arg_src2 unsafe.Pointer arg_fac2 float32 arg_src3 unsafe.Pointer arg_fac3 float32 arg_src4 unsafe.Pointer arg_fac4 float32 arg_src5 unsafe.Pointer arg_fac5 float32 arg_src6 unsafe.Pointer arg_fac6 float32 arg_src7 unsafe.Pointer arg_fac7 float32 arg_N int argptr [16]unsafe.Pointer sync.Mutex } // Stores the arguments for madd7 kernel invocation var madd7_args madd7_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
madd7_args.argptr[0] = unsafe.Pointer(&madd7_args.arg_dst) madd7_args.argptr[1] = unsafe.Pointer(&madd7_args.arg_src1) madd7_args.argptr[2] = unsafe.Pointer(&madd7_args.arg_fac1) madd7_args.argptr[3] = unsafe.Pointer(&madd7_args.arg_src2) madd7_args.argptr[4] = unsafe.Pointer(&madd7_args.arg_fac2) madd7_args.argptr[5] = unsafe.Pointer(&madd7_args.arg_src3) madd7_args.argptr[6] = unsafe.Pointer(&madd7_args.arg_fac3) madd7_args.argptr[7] = unsafe.Pointer(&madd7_args.arg_src4) madd7_args.argptr[8] = unsafe.Pointer(&madd7_args.arg_fac4) madd7_args.argptr[9] = unsafe.Pointer(&madd7_args.arg_src5) madd7_args.argptr[10] = unsafe.Pointer(&madd7_args.arg_fac5) madd7_args.argptr[11] = unsafe.Pointer(&madd7_args.arg_src6) madd7_args.argptr[12] = unsafe.Pointer(&madd7_args.arg_fac6) madd7_args.argptr[13] = unsafe.Pointer(&madd7_args.arg_src7) madd7_args.argptr[14] = unsafe.Pointer(&madd7_args.arg_fac7) madd7_args.argptr[15] = unsafe.Pointer(&madd7_args.arg_N) } // Wrapper for madd7 CUDA kernel, asynchronous. 
func k_madd7_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, src3 unsafe.Pointer, fac3 float32, src4 unsafe.Pointer, fac4 float32, src5 unsafe.Pointer, fac5 float32, src6 unsafe.Pointer, fac6 float32, src7 unsafe.Pointer, fac7 float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("madd7") } madd7_args.Lock() defer madd7_args.Unlock() if madd7_code == 0 { madd7_code = fatbinLoad(madd7_map, "madd7") } madd7_args.arg_dst = dst madd7_args.arg_src1 = src1 madd7_args.arg_fac1 = fac1 madd7_args.arg_src2 = src2 madd7_args.arg_fac2 = fac2 madd7_args.arg_src3 = src3 madd7_args.arg_fac3 = fac3 madd7_args.arg_src4 = src4 madd7_args.arg_fac4 = fac4 madd7_args.arg_src5 = src5 madd7_args.arg_fac5 = fac5 madd7_args.arg_src6 = src6 madd7_args.arg_fac6 = fac6 madd7_args.arg_src7 = src7 madd7_args.arg_fac7 = fac7 madd7_args.arg_N = N args := madd7_args.argptr[:] cu.LaunchKernel(madd7_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("madd7") } } // maps compute capability on PTX code for madd7 kernel. var madd7_map = map[int]string{0: "", 50: madd7_ptx_50, 52: madd7_ptx_52, 53: madd7_ptx_53, 60: madd7_ptx_60, 61: madd7_ptx_61, 62: madd7_ptx_62, 70: madd7_ptx_70, 72: madd7_ptx_72, 75: madd7_ptx_75, 80: madd7_ptx_80, 86: madd7_ptx_86, 87: madd7_ptx_87, 89: madd7_ptx_89, 90: madd7_ptx_90} // madd7 PTX code for various compute capabilities. 
const ( madd7_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; 
ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; $L__BB0_2: ret; } ` madd7_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 
%rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; $L__BB0_2: ret; } ` madd7_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, 
[madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; $L__BB0_2: ret; } ` madd7_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, 
[madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; $L__BB0_2: ret; } ` madd7_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 
madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, 
[%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; $L__BB0_2: ret; } ` madd7_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 
%rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; $L__BB0_2: ret; } ` madd7_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; 
cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; $L__BB0_2: ret; } ` madd7_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, 
[madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; $L__BB0_2: ret; } ` madd7_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg 
.f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; $L__BB0_2: ret; } ` madd7_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 
madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 
%f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; $L__BB0_2: ret; } ` madd7_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; 
add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; $L__BB0_2: ret; } ` madd7_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, 
%r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; $L__BB0_2: ret; } ` madd7_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; 
ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; $L__BB0_2: ret; } ` madd7_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 
madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; $L__BB0_2: ret; } ` ) 
3-3.11.1/cuda/magnetoelastic.go000066400000000000000000000031071503346766200162160ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Add magneto-elastic coupling field to the effective field. // see magnetoelasticfield.cu func AddMagnetoelasticField(Beff, m *data.Slice, exx, eyy, ezz, exy, exz, eyz, B1, B2, Msat MSlice) { util.Argument(Beff.Size() == m.Size()) util.Argument(Beff.Size() == exx.Size()) util.Argument(Beff.Size() == eyy.Size()) util.Argument(Beff.Size() == ezz.Size()) util.Argument(Beff.Size() == exy.Size()) util.Argument(Beff.Size() == exz.Size()) util.Argument(Beff.Size() == eyz.Size()) N := Beff.Len() cfg := make1DConf(N) k_addmagnetoelasticfield_async(Beff.DevPtr(X), Beff.DevPtr(Y), Beff.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), exx.DevPtr(0), exx.Mul(0), eyy.DevPtr(0), eyy.Mul(0), ezz.DevPtr(0), ezz.Mul(0), exy.DevPtr(0), exy.Mul(0), exz.DevPtr(0), exz.Mul(0), eyz.DevPtr(0), eyz.Mul(0), B1.DevPtr(0), B1.Mul(0), B2.DevPtr(0), B2.Mul(0), Msat.DevPtr(0), Msat.Mul(0), N, cfg) } // Calculate magneto-elastic force density // see magnetoelasticforce.cu func GetMagnetoelasticForceDensity(out, m *data.Slice, B1, B2 MSlice, mesh *data.Mesh) { util.Argument(out.Size() == m.Size()) cellsize := mesh.CellSize() N := mesh.Size() cfg := make3DConf(N) rcsx := float32(1.0 / cellsize[X]) rcsy := float32(1.0 / cellsize[Y]) rcsz := float32(1.0 / cellsize[Z]) k_getmagnetoelasticforce_async(out.DevPtr(X), out.DevPtr(Y), out.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), B1.DevPtr(0), B1.Mul(0), B2.DevPtr(0), B2.Mul(0), rcsx, rcsy, rcsz, N[X], N[Y], N[Z], mesh.PBC_code(), cfg) } 3-3.11.1/cuda/magnetoelasticfield.cu000066400000000000000000000033211503346766200172220ustar00rootroot00000000000000#include #include #include "amul.h" #include "float3.h" // Add magneto-elastic coupling field to B. // H = - δUmel / δM, // where Umel is magneto-elastic energy density given by the eq. 
// (12.18) of Gurevich&Melkov "Magnetization Oscillations and Waves", CRC Press, 1996
//
// Each thread handles one cell index; threads whose index is >= N do nothing.
// For a cell with reduced magnetization m and strain components e_ij the
// kernel adds, in place:
//
//   B_i += -2 * ( (B1/Ms) * m_i * e_ii + (B2/Ms) * sum_{j != i} m_j * e_ij )
//
// Every material/strain input is passed as an (array, multiplier) pair and
// read through amul(); Ms is read through inv_Msat(). Both helpers come from
// amul.h. Judging by the PTX generated from this kernel, a null array means
// "use the multiplier alone" and a zero Ms yields a zero inverse - confirm
// against amul.h.
extern "C" __global__ void
addmagnetoelasticfield(float* __restrict__ Bx, float* __restrict__ By, float* __restrict__ Bz,
                       float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
                       float* __restrict__ exx_, float exx_mul,
                       float* __restrict__ eyy_, float eyy_mul,
                       float* __restrict__ ezz_, float ezz_mul,
                       float* __restrict__ exy_, float exy_mul,
                       float* __restrict__ exz_, float exz_mul,
                       float* __restrict__ eyz_, float eyz_mul,
                       float* __restrict__ B1_, float B1_mul,
                       float* __restrict__ B2_, float B2_mul,
                       float* __restrict__ Ms_, float Ms_mul,
                       int N) {

    int idx = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x;
    if (idx >= N) {
        return;
    }

    // Diagonal strain components.
    float exx = amul(exx_, exx_mul, idx);
    float eyy = amul(eyy_, eyy_mul, idx);
    float ezz = amul(ezz_, ezz_mul, idx);

    // Off-diagonal components; the tensor is symmetric, so e_yx == e_xy etc.
    float exy = amul(exy_, exy_mul, idx);
    float exz = amul(exz_, exz_mul, idx);
    float eyz = amul(eyz_, eyz_mul, idx);

    // Coupling parameters, pre-divided by Ms.
    float invMs = inv_Msat(Ms_, Ms_mul, idx);
    float b1 = amul(B1_, B1_mul, idx) * invMs;
    float b2 = amul(B2_, B2_mul, idx) * invMs;

    float3 m = {mx[idx], my[idx], mz[idx]};

    Bx[idx] += -2.0f*(b1*m.x*exx + b2*(m.y*exy + m.z*exz));
    By[idx] += -2.0f*(b1*m.y*eyy + b2*(m.x*exy + m.z*eyz));
    Bz[idx] += -2.0f*(b1*m.z*ezz + b2*(m.x*exz + m.y*eyz));
}
3-3.11.1/cuda/magnetoelasticfield_wrapper.go000066400000000000000000003007731503346766200207730ustar00rootroot00000000000000
package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for addmagnetoelasticfield kernel
// It stays zero until the first launch, which loads it with fatbinLoad.
var addmagnetoelasticfield_code cu.Function

// Stores the arguments for addmagnetoelasticfield kernel invocation
// There is one arg_* field per kernel parameter, declared in the same order
// as the parameters of the kernel wrapper below. argptr holds a pointer to
// each of those fields, and the embedded mutex guards the whole struct.
type addmagnetoelasticfield_args_t struct {
	arg_Bx      unsafe.Pointer
	arg_By      unsafe.Pointer
	arg_Bz      unsafe.Pointer
	arg_mx      unsafe.Pointer
	arg_my      unsafe.Pointer
	arg_mz      unsafe.Pointer
	arg_exx_    unsafe.Pointer
	arg_exx_mul float32
	arg_eyy_    unsafe.Pointer
	arg_eyy_mul float32
	arg_ezz_    unsafe.Pointer
	arg_ezz_mul float32
	arg_exy_    unsafe.Pointer
	arg_exy_mul float32
	arg_exz_    unsafe.Pointer
	arg_exz_mul float32
	arg_eyz_    unsafe.Pointer
	arg_eyz_mul float32
	arg_B1_     unsafe.Pointer
	arg_B1_mul  float32
	arg_B2_     unsafe.Pointer
	arg_B2_mul  float32
	arg_Ms_     unsafe.Pointer
	arg_Ms_mul  float32
	// NOTE(review): the PTX declares this parameter as .u32 while Go's int
	// may be 64 bits wide; presumably only the low 32 bits are read by the
	// kernel - confirm N always fits in 32 bits.
	arg_N  int
	argptr [25]unsafe.Pointer // argptr[i] points at the i-th arg_* field (set in init)
	sync.Mutex
}

// Stores the arguments for addmagnetoelasticfield kernel invocation
// This single package-level instance is shared by every launch.
var addmagnetoelasticfield_args addmagnetoelasticfield_args_t

// init fills argptr with the addresses of the arg_* fields. The addresses
// never change, so a launch only has to overwrite the field values.
func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	addmagnetoelasticfield_args.argptr[0] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_Bx)
	addmagnetoelasticfield_args.argptr[1] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_By)
	addmagnetoelasticfield_args.argptr[2] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_Bz)
	addmagnetoelasticfield_args.argptr[3] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_mx)
	addmagnetoelasticfield_args.argptr[4] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_my)
	addmagnetoelasticfield_args.argptr[5] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_mz)
	addmagnetoelasticfield_args.argptr[6] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exx_)
	addmagnetoelasticfield_args.argptr[7] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exx_mul)
	addmagnetoelasticfield_args.argptr[8] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_eyy_)
	addmagnetoelasticfield_args.argptr[9] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_eyy_mul)
	addmagnetoelasticfield_args.argptr[10] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_ezz_)
	addmagnetoelasticfield_args.argptr[11] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_ezz_mul)
	addmagnetoelasticfield_args.argptr[12] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exy_)
	addmagnetoelasticfield_args.argptr[13] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exy_mul)
	addmagnetoelasticfield_args.argptr[14] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exz_)
	addmagnetoelasticfield_args.argptr[15] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exz_mul)
	addmagnetoelasticfield_args.argptr[16] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_eyz_)
	addmagnetoelasticfield_args.argptr[17] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_eyz_mul)
	addmagnetoelasticfield_args.argptr[18] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_B1_)
	addmagnetoelasticfield_args.argptr[19] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_B1_mul)
	addmagnetoelasticfield_args.argptr[20] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_B2_)
	addmagnetoelasticfield_args.argptr[21] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_B2_mul)
	addmagnetoelasticfield_args.argptr[22] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_Ms_)
	addmagnetoelasticfield_args.argptr[23] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_Ms_mul)
	addmagnetoelasticfield_args.argptr[24] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_N)
}

// Wrapper for addmagnetoelasticfield CUDA kernel, asynchronous.
// It copies the arguments into the shared argument struct under its mutex,
// loads the kernel on first use, and launches it on stream0 with the grid
// and block dimensions taken from cfg. When the package-level Synchronous
// flag is set, the launch is bracketed by Sync() calls and timed under the
// name "addmagnetoelasticfield".
func k_addmagnetoelasticfield_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, exx_ unsafe.Pointer, exx_mul float32, eyy_ unsafe.Pointer, eyy_mul float32, ezz_ unsafe.Pointer, ezz_mul float32, exy_ unsafe.Pointer, exy_mul float32, exz_ unsafe.Pointer, exz_mul float32, eyz_ unsafe.Pointer, eyz_mul float32, B1_ unsafe.Pointer, B1_mul float32, B2_ unsafe.Pointer, B2_mul float32, Ms_ unsafe.Pointer, Ms_mul float32, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("addmagnetoelasticfield")
	}

	// The argument struct is shared, so hold its lock until the function returns.
	addmagnetoelasticfield_args.Lock()
	defer addmagnetoelasticfield_args.Unlock()

	// Load the kernel the first time it is needed.
	if addmagnetoelasticfield_code == 0 {
		addmagnetoelasticfield_code = fatbinLoad(addmagnetoelasticfield_map, "addmagnetoelasticfield")
	}

	addmagnetoelasticfield_args.arg_Bx = Bx
	addmagnetoelasticfield_args.arg_By = By
	addmagnetoelasticfield_args.arg_Bz = Bz
	addmagnetoelasticfield_args.arg_mx = mx
	addmagnetoelasticfield_args.arg_my = my
	addmagnetoelasticfield_args.arg_mz = mz
	addmagnetoelasticfield_args.arg_exx_ = exx_
	addmagnetoelasticfield_args.arg_exx_mul = exx_mul
	addmagnetoelasticfield_args.arg_eyy_ = eyy_
	addmagnetoelasticfield_args.arg_eyy_mul = eyy_mul
	addmagnetoelasticfield_args.arg_ezz_ = ezz_
	addmagnetoelasticfield_args.arg_ezz_mul = ezz_mul
	addmagnetoelasticfield_args.arg_exy_ = exy_
	addmagnetoelasticfield_args.arg_exy_mul = exy_mul
	addmagnetoelasticfield_args.arg_exz_ = exz_
	addmagnetoelasticfield_args.arg_exz_mul = exz_mul
	addmagnetoelasticfield_args.arg_eyz_ = eyz_
	addmagnetoelasticfield_args.arg_eyz_mul = eyz_mul
	addmagnetoelasticfield_args.arg_B1_ = B1_
	addmagnetoelasticfield_args.arg_B1_mul = B1_mul
	addmagnetoelasticfield_args.arg_B2_ = B2_
	addmagnetoelasticfield_args.arg_B2_mul = B2_mul
	addmagnetoelasticfield_args.arg_Ms_ = Ms_
	addmagnetoelasticfield_args.arg_Ms_mul = Ms_mul
	addmagnetoelasticfield_args.arg_N = N

	// No dynamic shared memory is requested (the 0 argument).
	args := addmagnetoelasticfield_args.argptr[:]
	cu.LaunchKernel(addmagnetoelasticfield_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("addmagnetoelasticfield")
	}
}

// maps compute capability on PTX code for addmagnetoelasticfield kernel.
// Key 0 maps to an empty string; the other keys are compute capabilities
// written without the dot (e.g. 75 for sm_75).
var addmagnetoelasticfield_map = map[int]string{0: "", 50: addmagnetoelasticfield_ptx_50, 52: addmagnetoelasticfield_ptx_52, 53: addmagnetoelasticfield_ptx_53, 60: addmagnetoelasticfield_ptx_60, 61: addmagnetoelasticfield_ptx_61, 62: addmagnetoelasticfield_ptx_62, 70: addmagnetoelasticfield_ptx_70, 72: addmagnetoelasticfield_ptx_72, 75: addmagnetoelasticfield_ptx_75, 80: addmagnetoelasticfield_ptx_80, 86: addmagnetoelasticfield_ptx_86, 87: addmagnetoelasticfield_ptx_87, 89: addmagnetoelasticfield_ptx_89, 90: addmagnetoelasticfield_ptx_90}

// addmagnetoelasticfield PTX code for various compute capabilities.
const ( addmagnetoelasticfield_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<79>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f69, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f70, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f71, [addmagnetoelasticfield_param_11]; 
ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f72, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f73, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f74, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f77, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f78, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f75, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f69, %f30, %f69; $L__BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f70, %f31, %f70; $L__BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f71, %f32, %f71; $L__BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f72, %f33, %f72; $L__BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f73, %f34, %f73; $L__BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra 
$L__BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f74, %f35, %f74; $L__BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f75, %f36, %f75; $L__BB0_15: setp.eq.f32 %p9, %f75, 0f00000000; mov.f32 %f76, 0f00000000; @%p9 bra $L__BB0_17; rcp.rn.f32 %f76, %f75; $L__BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f77, %f38, %f77; $L__BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f78, %f39, %f78; $L__BB0_21: mul.f32 %f40, %f76, %f77; cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; ld.global.nc.f32 %f41, [%rd45]; mul.f32 %f42, %f40, %f41; ld.global.nc.f32 %f43, [%rd47]; ld.global.nc.f32 %f44, [%rd49]; mul.f32 %f45, %f73, %f44; fma.rn.f32 %f46, %f72, %f43, %f45; mul.f32 %f47, %f76, %f78; mul.f32 %f48, %f47, %f46; fma.rn.f32 %f49, %f69, %f42, %f48; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f50, [%rd51]; add.f32 %f51, %f49, %f49; sub.f32 %f52, %f50, %f51; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f40, %f43; mul.f32 %f54, %f74, %f44; fma.rn.f32 %f55, %f72, %f41, %f54; mul.f32 %f56, %f47, %f55; fma.rn.f32 %f57, %f70, %f53, %f56; add.f32 %f58, %f57, %f57; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f59, [%rd53]; sub.f32 %f60, %f59, %f58; st.global.f32 [%rd53], %f60; mul.f32 %f61, %f40, %f44; mul.f32 %f62, %f74, %f43; fma.rn.f32 %f63, %f73, %f41, 
%f62; mul.f32 %f64, %f47, %f63; fma.rn.f32 %f65, %f71, %f61, %f64; add.f32 %f66, %f65, %f65; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f67, [%rd55]; sub.f32 %f68, %f67, %f66; st.global.f32 [%rd55], %f68; $L__BB0_22: ret; } ` addmagnetoelasticfield_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<79>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f69, 
[addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f70, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f71, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f72, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f73, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f74, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f77, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f78, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f75, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f69, %f30, %f69; $L__BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f70, %f31, %f70; $L__BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f71, %f32, %f71; $L__BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f72, %f33, %f72; $L__BB0_9: 
setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f73, %f34, %f73; $L__BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f74, %f35, %f74; $L__BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f75, %f36, %f75; $L__BB0_15: setp.eq.f32 %p9, %f75, 0f00000000; mov.f32 %f76, 0f00000000; @%p9 bra $L__BB0_17; rcp.rn.f32 %f76, %f75; $L__BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f77, %f38, %f77; $L__BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f78, %f39, %f78; $L__BB0_21: mul.f32 %f40, %f76, %f77; cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; ld.global.nc.f32 %f41, [%rd45]; mul.f32 %f42, %f40, %f41; ld.global.nc.f32 %f43, [%rd47]; ld.global.nc.f32 %f44, [%rd49]; mul.f32 %f45, %f73, %f44; fma.rn.f32 %f46, %f72, %f43, %f45; mul.f32 %f47, %f76, %f78; mul.f32 %f48, %f47, %f46; fma.rn.f32 %f49, %f69, %f42, %f48; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f50, [%rd51]; add.f32 %f51, %f49, %f49; sub.f32 %f52, %f50, %f51; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f40, %f43; mul.f32 %f54, %f74, %f44; fma.rn.f32 %f55, %f72, %f41, %f54; mul.f32 %f56, %f47, %f55; fma.rn.f32 %f57, %f70, %f53, %f56; add.f32 
%f58, %f57, %f57; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f59, [%rd53]; sub.f32 %f60, %f59, %f58; st.global.f32 [%rd53], %f60; mul.f32 %f61, %f40, %f44; mul.f32 %f62, %f74, %f43; fma.rn.f32 %f63, %f73, %f41, %f62; mul.f32 %f64, %f47, %f63; fma.rn.f32 %f65, %f71, %f61, %f64; add.f32 %f66, %f65, %f65; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f67, [%rd55]; sub.f32 %f68, %f67, %f66; st.global.f32 [%rd55], %f68; $L__BB0_22: ret; } ` addmagnetoelasticfield_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<79>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, 
[addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f69, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f70, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f71, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f72, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f73, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f74, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f77, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f78, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f75, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f69, %f30, %f69; $L__BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f70, %f31, %f70; $L__BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; 
ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f71, %f32, %f71; $L__BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f72, %f33, %f72; $L__BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f73, %f34, %f73; $L__BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f74, %f35, %f74; $L__BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f75, %f36, %f75; $L__BB0_15: setp.eq.f32 %p9, %f75, 0f00000000; mov.f32 %f76, 0f00000000; @%p9 bra $L__BB0_17; rcp.rn.f32 %f76, %f75; $L__BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f77, %f38, %f77; $L__BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f78, %f39, %f78; $L__BB0_21: mul.f32 %f40, %f76, %f77; cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; ld.global.nc.f32 %f41, [%rd45]; mul.f32 %f42, %f40, %f41; ld.global.nc.f32 %f43, [%rd47]; ld.global.nc.f32 %f44, [%rd49]; mul.f32 %f45, %f73, %f44; fma.rn.f32 %f46, %f72, %f43, %f45; mul.f32 %f47, %f76, %f78; mul.f32 %f48, %f47, %f46; fma.rn.f32 %f49, %f69, %f42, %f48; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, 
%rd44; ld.global.f32 %f50, [%rd51]; add.f32 %f51, %f49, %f49; sub.f32 %f52, %f50, %f51; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f40, %f43; mul.f32 %f54, %f74, %f44; fma.rn.f32 %f55, %f72, %f41, %f54; mul.f32 %f56, %f47, %f55; fma.rn.f32 %f57, %f70, %f53, %f56; add.f32 %f58, %f57, %f57; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f59, [%rd53]; sub.f32 %f60, %f59, %f58; st.global.f32 [%rd53], %f60; mul.f32 %f61, %f40, %f44; mul.f32 %f62, %f74, %f43; fma.rn.f32 %f63, %f73, %f41, %f62; mul.f32 %f64, %f47, %f63; fma.rn.f32 %f65, %f71, %f61, %f64; add.f32 %f66, %f65, %f65; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f67, [%rd55]; sub.f32 %f68, %f67, %f66; st.global.f32 [%rd55], %f68; $L__BB0_22: ret; } ` addmagnetoelasticfield_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, 
.param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<79>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f69, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f70, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f71, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f72, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f73, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f74, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f77, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f78, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f75, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f69, %f30, %f69; $L__BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; 
add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f70, %f31, %f70; $L__BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f71, %f32, %f71; $L__BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f72, %f33, %f72; $L__BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f73, %f34, %f73; $L__BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f74, %f35, %f74; $L__BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f75, %f36, %f75; $L__BB0_15: setp.eq.f32 %p9, %f75, 0f00000000; mov.f32 %f76, 0f00000000; @%p9 bra $L__BB0_17; rcp.rn.f32 %f76, %f75; $L__BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f77, %f38, %f77; $L__BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f78, %f39, %f78; $L__BB0_21: mul.f32 %f40, %f76, %f77; cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; ld.global.nc.f32 %f41, [%rd45]; mul.f32 %f42, %f40, %f41; ld.global.nc.f32 %f43, [%rd47]; 
ld.global.nc.f32 %f44, [%rd49]; mul.f32 %f45, %f73, %f44; fma.rn.f32 %f46, %f72, %f43, %f45; mul.f32 %f47, %f76, %f78; mul.f32 %f48, %f47, %f46; fma.rn.f32 %f49, %f69, %f42, %f48; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f50, [%rd51]; add.f32 %f51, %f49, %f49; sub.f32 %f52, %f50, %f51; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f40, %f43; mul.f32 %f54, %f74, %f44; fma.rn.f32 %f55, %f72, %f41, %f54; mul.f32 %f56, %f47, %f55; fma.rn.f32 %f57, %f70, %f53, %f56; add.f32 %f58, %f57, %f57; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f59, [%rd53]; sub.f32 %f60, %f59, %f58; st.global.f32 [%rd53], %f60; mul.f32 %f61, %f40, %f44; mul.f32 %f62, %f74, %f43; fma.rn.f32 %f63, %f73, %f41, %f62; mul.f32 %f64, %f47, %f63; fma.rn.f32 %f65, %f71, %f61, %f64; add.f32 %f66, %f65, %f65; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f67, [%rd55]; sub.f32 %f68, %f67, %f66; st.global.f32 [%rd55], %f68; $L__BB0_22: ret; } ` addmagnetoelasticfield_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 
addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<79>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f69, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f70, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f71, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f72, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f73, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f74, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f77, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f78, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f75, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; 
cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f69, %f30, %f69; $L__BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f70, %f31, %f70; $L__BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f71, %f32, %f71; $L__BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f72, %f33, %f72; $L__BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f73, %f34, %f73; $L__BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f74, %f35, %f74; $L__BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f75, %f36, %f75; $L__BB0_15: setp.eq.f32 %p9, %f75, 0f00000000; mov.f32 %f76, 0f00000000; @%p9 bra $L__BB0_17; rcp.rn.f32 %f76, %f75; $L__BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f77, %f38, %f77; $L__BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f78, %f39, %f78; $L__BB0_21: mul.f32 %f40, %f76, %f77; cvta.to.global.u64 %rd43, %rd4; 
mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; ld.global.nc.f32 %f41, [%rd45]; mul.f32 %f42, %f40, %f41; ld.global.nc.f32 %f43, [%rd47]; ld.global.nc.f32 %f44, [%rd49]; mul.f32 %f45, %f73, %f44; fma.rn.f32 %f46, %f72, %f43, %f45; mul.f32 %f47, %f76, %f78; mul.f32 %f48, %f47, %f46; fma.rn.f32 %f49, %f69, %f42, %f48; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f50, [%rd51]; add.f32 %f51, %f49, %f49; sub.f32 %f52, %f50, %f51; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f40, %f43; mul.f32 %f54, %f74, %f44; fma.rn.f32 %f55, %f72, %f41, %f54; mul.f32 %f56, %f47, %f55; fma.rn.f32 %f57, %f70, %f53, %f56; add.f32 %f58, %f57, %f57; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f59, [%rd53]; sub.f32 %f60, %f59, %f58; st.global.f32 [%rd53], %f60; mul.f32 %f61, %f40, %f44; mul.f32 %f62, %f74, %f43; fma.rn.f32 %f63, %f73, %f41, %f62; mul.f32 %f64, %f47, %f63; fma.rn.f32 %f65, %f71, %f61, %f64; add.f32 %f66, %f65, %f65; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f67, [%rd55]; sub.f32 %f68, %f67, %f66; st.global.f32 [%rd55], %f68; $L__BB0_22: ret; } ` addmagnetoelasticfield_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 
addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<79>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f69, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f70, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f71, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f72, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f73, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f74, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f77, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f78, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f75, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; 
mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f69, %f30, %f69; $L__BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f70, %f31, %f70; $L__BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f71, %f32, %f71; $L__BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f72, %f33, %f72; $L__BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f73, %f34, %f73; $L__BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f74, %f35, %f74; $L__BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f75, %f36, %f75; $L__BB0_15: setp.eq.f32 %p9, %f75, 0f00000000; mov.f32 %f76, 0f00000000; @%p9 bra $L__BB0_17; rcp.rn.f32 %f76, %f75; $L__BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f77, %f38, %f77; $L__BB0_19: 
setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f78, %f39, %f78; $L__BB0_21: mul.f32 %f40, %f76, %f77; cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; ld.global.nc.f32 %f41, [%rd45]; mul.f32 %f42, %f40, %f41; ld.global.nc.f32 %f43, [%rd47]; ld.global.nc.f32 %f44, [%rd49]; mul.f32 %f45, %f73, %f44; fma.rn.f32 %f46, %f72, %f43, %f45; mul.f32 %f47, %f76, %f78; mul.f32 %f48, %f47, %f46; fma.rn.f32 %f49, %f69, %f42, %f48; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f50, [%rd51]; add.f32 %f51, %f49, %f49; sub.f32 %f52, %f50, %f51; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f40, %f43; mul.f32 %f54, %f74, %f44; fma.rn.f32 %f55, %f72, %f41, %f54; mul.f32 %f56, %f47, %f55; fma.rn.f32 %f57, %f70, %f53, %f56; add.f32 %f58, %f57, %f57; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f59, [%rd53]; sub.f32 %f60, %f59, %f58; st.global.f32 [%rd53], %f60; mul.f32 %f61, %f40, %f44; mul.f32 %f62, %f74, %f43; fma.rn.f32 %f63, %f73, %f41, %f62; mul.f32 %f64, %f47, %f63; fma.rn.f32 %f65, %f71, %f61, %f64; add.f32 %f66, %f65, %f65; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f67, [%rd55]; sub.f32 %f68, %f67, %f66; st.global.f32 [%rd55], %f68; $L__BB0_22: ret; } ` addmagnetoelasticfield_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 
addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<79>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f69, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f70, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f71, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f72, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f73, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f74, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f77, [addmagnetoelasticfield_param_19]; 
ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f78, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f75, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f69, %f30, %f69; $L__BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f70, %f31, %f70; $L__BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f71, %f32, %f71; $L__BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f72, %f33, %f72; $L__BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f73, %f34, %f73; $L__BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f74, %f35, %f74; $L__BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f75, %f36, %f75; $L__BB0_15: setp.eq.f32 %p9, %f75, 0f00000000; mov.f32 %f76, 0f00000000; 
@%p9 bra $L__BB0_17; rcp.rn.f32 %f76, %f75; $L__BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f77, %f38, %f77; $L__BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f78, %f39, %f78; $L__BB0_21: mul.f32 %f40, %f76, %f77; cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; ld.global.nc.f32 %f41, [%rd45]; mul.f32 %f42, %f40, %f41; ld.global.nc.f32 %f43, [%rd47]; ld.global.nc.f32 %f44, [%rd49]; mul.f32 %f45, %f73, %f44; fma.rn.f32 %f46, %f72, %f43, %f45; mul.f32 %f47, %f76, %f78; mul.f32 %f48, %f47, %f46; fma.rn.f32 %f49, %f69, %f42, %f48; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f50, [%rd51]; add.f32 %f51, %f49, %f49; sub.f32 %f52, %f50, %f51; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f40, %f43; mul.f32 %f54, %f74, %f44; fma.rn.f32 %f55, %f72, %f41, %f54; mul.f32 %f56, %f47, %f55; fma.rn.f32 %f57, %f70, %f53, %f56; add.f32 %f58, %f57, %f57; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f59, [%rd53]; sub.f32 %f60, %f59, %f58; st.global.f32 [%rd53], %f60; mul.f32 %f61, %f40, %f44; mul.f32 %f62, %f74, %f43; fma.rn.f32 %f63, %f73, %f41, %f62; mul.f32 %f64, %f47, %f63; fma.rn.f32 %f65, %f71, %f61, %f64; add.f32 %f66, %f65, %f65; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f67, [%rd55]; sub.f32 %f68, %f67, %f66; st.global.f32 [%rd55], %f68; $L__BB0_22: ret; } ` addmagnetoelasticfield_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 
addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<79>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f69, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f70, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f71, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f72, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f73, 
[addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f74, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f77, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f78, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f75, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f69, %f30, %f69; $L__BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f70, %f31, %f70; $L__BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f71, %f32, %f71; $L__BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f72, %f33, %f72; $L__BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f73, %f34, %f73; $L__BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f74, %f35, %f74; $L__BB0_13: setp.eq.s64 %p8, 
%rd15, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f75, %f36, %f75; $L__BB0_15: setp.eq.f32 %p9, %f75, 0f00000000; mov.f32 %f76, 0f00000000; @%p9 bra $L__BB0_17; rcp.rn.f32 %f76, %f75; $L__BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f77, %f38, %f77; $L__BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f78, %f39, %f78; $L__BB0_21: mul.f32 %f40, %f76, %f77; cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; ld.global.nc.f32 %f41, [%rd45]; mul.f32 %f42, %f40, %f41; ld.global.nc.f32 %f43, [%rd47]; ld.global.nc.f32 %f44, [%rd49]; mul.f32 %f45, %f73, %f44; fma.rn.f32 %f46, %f72, %f43, %f45; mul.f32 %f47, %f76, %f78; mul.f32 %f48, %f47, %f46; fma.rn.f32 %f49, %f69, %f42, %f48; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f50, [%rd51]; add.f32 %f51, %f49, %f49; sub.f32 %f52, %f50, %f51; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f40, %f43; mul.f32 %f54, %f74, %f44; fma.rn.f32 %f55, %f72, %f41, %f54; mul.f32 %f56, %f47, %f55; fma.rn.f32 %f57, %f70, %f53, %f56; add.f32 %f58, %f57, %f57; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f59, [%rd53]; sub.f32 %f60, %f59, %f58; st.global.f32 [%rd53], %f60; mul.f32 %f61, %f40, %f44; mul.f32 %f62, %f74, %f43; fma.rn.f32 %f63, %f73, %f41, %f62; mul.f32 %f64, %f47, %f63; fma.rn.f32 %f65, %f71, %f61, %f64; add.f32 %f66, %f65, %f65; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f67, [%rd55]; sub.f32 
%f68, %f67, %f66; st.global.f32 [%rd55], %f68; $L__BB0_22: ret; } ` addmagnetoelasticfield_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<79>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f69, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f70, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; 
ld.param.f32 %f71, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f72, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f73, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f74, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f77, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f78, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f75, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f69, %f30, %f69; $L__BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f70, %f31, %f70; $L__BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f71, %f32, %f71; $L__BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f72, %f33, %f72; $L__BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f73, %f34, %f73; 
$L__BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f74, %f35, %f74; $L__BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f75, %f36, %f75; $L__BB0_15: setp.eq.f32 %p9, %f75, 0f00000000; mov.f32 %f76, 0f00000000; @%p9 bra $L__BB0_17; rcp.rn.f32 %f76, %f75; $L__BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f77, %f38, %f77; $L__BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f78, %f39, %f78; $L__BB0_21: mul.f32 %f40, %f76, %f77; cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; ld.global.nc.f32 %f41, [%rd45]; mul.f32 %f42, %f40, %f41; ld.global.nc.f32 %f43, [%rd47]; ld.global.nc.f32 %f44, [%rd49]; mul.f32 %f45, %f73, %f44; fma.rn.f32 %f46, %f72, %f43, %f45; mul.f32 %f47, %f76, %f78; mul.f32 %f48, %f47, %f46; fma.rn.f32 %f49, %f69, %f42, %f48; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f50, [%rd51]; add.f32 %f51, %f49, %f49; sub.f32 %f52, %f50, %f51; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f40, %f43; mul.f32 %f54, %f74, %f44; fma.rn.f32 %f55, %f72, %f41, %f54; mul.f32 %f56, %f47, %f55; fma.rn.f32 %f57, %f70, %f53, %f56; add.f32 %f58, %f57, %f57; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f59, [%rd53]; sub.f32 %f60, %f59, %f58; st.global.f32 [%rd53], %f60; mul.f32 %f61, %f40, %f44; mul.f32 
%f62, %f74, %f43; fma.rn.f32 %f63, %f73, %f41, %f62; mul.f32 %f64, %f47, %f63; fma.rn.f32 %f65, %f71, %f61, %f64; add.f32 %f66, %f65, %f65; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f67, [%rd55]; sub.f32 %f68, %f67, %f66; st.global.f32 [%rd55], %f68; $L__BB0_22: ret; } ` addmagnetoelasticfield_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<79>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, 
[addmagnetoelasticfield_param_6]; ld.param.f32 %f69, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f70, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f71, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f72, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f73, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f74, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f77, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f78, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f75, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f69, %f30, %f69; $L__BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f70, %f31, %f70; $L__BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f71, %f32, %f71; $L__BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 
%f33, [%rd27]; mul.f32 %f72, %f33, %f72; $L__BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f73, %f34, %f73; $L__BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f74, %f35, %f74; $L__BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f75, %f36, %f75; $L__BB0_15: setp.eq.f32 %p9, %f75, 0f00000000; mov.f32 %f76, 0f00000000; @%p9 bra $L__BB0_17; rcp.rn.f32 %f76, %f75; $L__BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f77, %f38, %f77; $L__BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f78, %f39, %f78; $L__BB0_21: mul.f32 %f40, %f76, %f77; cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; ld.global.nc.f32 %f41, [%rd45]; mul.f32 %f42, %f40, %f41; ld.global.nc.f32 %f43, [%rd47]; ld.global.nc.f32 %f44, [%rd49]; mul.f32 %f45, %f73, %f44; fma.rn.f32 %f46, %f72, %f43, %f45; mul.f32 %f47, %f76, %f78; mul.f32 %f48, %f47, %f46; fma.rn.f32 %f49, %f69, %f42, %f48; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f50, [%rd51]; add.f32 %f51, %f49, %f49; sub.f32 %f52, %f50, %f51; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f40, %f43; mul.f32 %f54, %f74, %f44; fma.rn.f32 %f55, %f72, %f41, %f54; mul.f32 %f56, %f47, 
%f55; fma.rn.f32 %f57, %f70, %f53, %f56; add.f32 %f58, %f57, %f57; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f59, [%rd53]; sub.f32 %f60, %f59, %f58; st.global.f32 [%rd53], %f60; mul.f32 %f61, %f40, %f44; mul.f32 %f62, %f74, %f43; fma.rn.f32 %f63, %f73, %f41, %f62; mul.f32 %f64, %f47, %f63; fma.rn.f32 %f65, %f71, %f61, %f64; add.f32 %f66, %f65, %f65; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f67, [%rd55]; sub.f32 %f68, %f67, %f66; st.global.f32 [%rd55], %f68; $L__BB0_22: ret; } ` addmagnetoelasticfield_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<79>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; 
ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f69, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f70, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f71, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f72, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f73, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f74, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f77, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f78, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f75, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f69, %f30, %f69; $L__BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f70, %f31, %f70; $L__BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, 
%rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f71, %f32, %f71; $L__BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f72, %f33, %f72; $L__BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f73, %f34, %f73; $L__BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f74, %f35, %f74; $L__BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f75, %f36, %f75; $L__BB0_15: setp.eq.f32 %p9, %f75, 0f00000000; mov.f32 %f76, 0f00000000; @%p9 bra $L__BB0_17; rcp.rn.f32 %f76, %f75; $L__BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f77, %f38, %f77; $L__BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f78, %f39, %f78; $L__BB0_21: mul.f32 %f40, %f76, %f77; cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; ld.global.nc.f32 %f41, [%rd45]; mul.f32 %f42, %f40, %f41; ld.global.nc.f32 %f43, [%rd47]; ld.global.nc.f32 %f44, [%rd49]; mul.f32 %f45, %f73, %f44; fma.rn.f32 %f46, %f72, %f43, %f45; mul.f32 %f47, %f76, %f78; mul.f32 %f48, %f47, %f46; fma.rn.f32 %f49, %f69, %f42, %f48; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, 
%rd50, %rd44; ld.global.f32 %f50, [%rd51]; add.f32 %f51, %f49, %f49; sub.f32 %f52, %f50, %f51; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f40, %f43; mul.f32 %f54, %f74, %f44; fma.rn.f32 %f55, %f72, %f41, %f54; mul.f32 %f56, %f47, %f55; fma.rn.f32 %f57, %f70, %f53, %f56; add.f32 %f58, %f57, %f57; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f59, [%rd53]; sub.f32 %f60, %f59, %f58; st.global.f32 [%rd53], %f60; mul.f32 %f61, %f40, %f44; mul.f32 %f62, %f74, %f43; fma.rn.f32 %f63, %f73, %f41, %f62; mul.f32 %f64, %f47, %f63; fma.rn.f32 %f65, %f71, %f61, %f64; add.f32 %f66, %f65, %f65; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f67, [%rd55]; sub.f32 %f68, %f67, %f66; st.global.f32 [%rd55], %f68; $L__BB0_22: ret; } ` addmagnetoelasticfield_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 
addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<79>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f69, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f70, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f71, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f72, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f73, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f74, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f77, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f78, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f75, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f69, %f30, %f69; $L__BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd19, 
%rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f70, %f31, %f70; $L__BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f71, %f32, %f71; $L__BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f72, %f33, %f72; $L__BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f73, %f34, %f73; $L__BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f74, %f35, %f74; $L__BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f75, %f36, %f75; $L__BB0_15: setp.eq.f32 %p9, %f75, 0f00000000; mov.f32 %f76, 0f00000000; @%p9 bra $L__BB0_17; rcp.rn.f32 %f76, %f75; $L__BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f77, %f38, %f77; $L__BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f78, %f39, %f78; $L__BB0_21: mul.f32 %f40, %f76, %f77; cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; ld.global.nc.f32 %f41, [%rd45]; mul.f32 %f42, %f40, %f41; 
ld.global.nc.f32 %f43, [%rd47]; ld.global.nc.f32 %f44, [%rd49]; mul.f32 %f45, %f73, %f44; fma.rn.f32 %f46, %f72, %f43, %f45; mul.f32 %f47, %f76, %f78; mul.f32 %f48, %f47, %f46; fma.rn.f32 %f49, %f69, %f42, %f48; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f50, [%rd51]; add.f32 %f51, %f49, %f49; sub.f32 %f52, %f50, %f51; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f40, %f43; mul.f32 %f54, %f74, %f44; fma.rn.f32 %f55, %f72, %f41, %f54; mul.f32 %f56, %f47, %f55; fma.rn.f32 %f57, %f70, %f53, %f56; add.f32 %f58, %f57, %f57; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f59, [%rd53]; sub.f32 %f60, %f59, %f58; st.global.f32 [%rd53], %f60; mul.f32 %f61, %f40, %f44; mul.f32 %f62, %f74, %f43; fma.rn.f32 %f63, %f73, %f41, %f62; mul.f32 %f64, %f47, %f63; fma.rn.f32 %f65, %f71, %f61, %f64; add.f32 %f66, %f65, %f65; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f67, [%rd55]; sub.f32 %f68, %f67, %f66; st.global.f32 [%rd55], %f68; $L__BB0_22: ret; } ` addmagnetoelasticfield_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param 
.u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<79>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f69, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f70, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f71, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f72, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f73, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f74, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f77, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f78, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f75, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; 
cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f69, %f30, %f69; $L__BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f70, %f31, %f70; $L__BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f71, %f32, %f71; $L__BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f72, %f33, %f72; $L__BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f73, %f34, %f73; $L__BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f74, %f35, %f74; $L__BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f75, %f36, %f75; $L__BB0_15: setp.eq.f32 %p9, %f75, 0f00000000; mov.f32 %f76, 0f00000000; @%p9 bra $L__BB0_17; rcp.rn.f32 %f76, %f75; $L__BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f77, %f38, %f77; $L__BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f78, %f39, %f78; $L__BB0_21: mul.f32 %f40, %f76, %f77; cvta.to.global.u64 %rd43, %rd4; 
mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; ld.global.nc.f32 %f41, [%rd45]; mul.f32 %f42, %f40, %f41; ld.global.nc.f32 %f43, [%rd47]; ld.global.nc.f32 %f44, [%rd49]; mul.f32 %f45, %f73, %f44; fma.rn.f32 %f46, %f72, %f43, %f45; mul.f32 %f47, %f76, %f78; mul.f32 %f48, %f47, %f46; fma.rn.f32 %f49, %f69, %f42, %f48; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f50, [%rd51]; add.f32 %f51, %f49, %f49; sub.f32 %f52, %f50, %f51; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f40, %f43; mul.f32 %f54, %f74, %f44; fma.rn.f32 %f55, %f72, %f41, %f54; mul.f32 %f56, %f47, %f55; fma.rn.f32 %f57, %f70, %f53, %f56; add.f32 %f58, %f57, %f57; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f59, [%rd53]; sub.f32 %f60, %f59, %f58; st.global.f32 [%rd53], %f60; mul.f32 %f61, %f40, %f44; mul.f32 %f62, %f74, %f43; fma.rn.f32 %f63, %f73, %f41, %f62; mul.f32 %f64, %f47, %f63; fma.rn.f32 %f65, %f71, %f61, %f64; add.f32 %f66, %f65, %f65; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f67, [%rd55]; sub.f32 %f68, %f67, %f66; st.global.f32 [%rd55], %f68; $L__BB0_22: ret; } ` addmagnetoelasticfield_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 
addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<79>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f69, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f70, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f71, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f72, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f73, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f74, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f77, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f78, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f75, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; 
mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f69, %f30, %f69; $L__BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f70, %f31, %f70; $L__BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f71, %f32, %f71; $L__BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f72, %f33, %f72; $L__BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f73, %f34, %f73; $L__BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f74, %f35, %f74; $L__BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f75, %f36, %f75; $L__BB0_15: setp.eq.f32 %p9, %f75, 0f00000000; mov.f32 %f76, 0f00000000; @%p9 bra $L__BB0_17; rcp.rn.f32 %f76, %f75; $L__BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f77, %f38, %f77; $L__BB0_19: 
setp.eq.s64 %p11, %rd14, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f78, %f39, %f78; $L__BB0_21: mul.f32 %f40, %f76, %f77; cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; ld.global.nc.f32 %f41, [%rd45]; mul.f32 %f42, %f40, %f41; ld.global.nc.f32 %f43, [%rd47]; ld.global.nc.f32 %f44, [%rd49]; mul.f32 %f45, %f73, %f44; fma.rn.f32 %f46, %f72, %f43, %f45; mul.f32 %f47, %f76, %f78; mul.f32 %f48, %f47, %f46; fma.rn.f32 %f49, %f69, %f42, %f48; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f50, [%rd51]; add.f32 %f51, %f49, %f49; sub.f32 %f52, %f50, %f51; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f40, %f43; mul.f32 %f54, %f74, %f44; fma.rn.f32 %f55, %f72, %f41, %f54; mul.f32 %f56, %f47, %f55; fma.rn.f32 %f57, %f70, %f53, %f56; add.f32 %f58, %f57, %f57; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f59, [%rd53]; sub.f32 %f60, %f59, %f58; st.global.f32 [%rd53], %f60; mul.f32 %f61, %f40, %f44; mul.f32 %f62, %f74, %f43; fma.rn.f32 %f63, %f73, %f41, %f62; mul.f32 %f64, %f47, %f63; fma.rn.f32 %f65, %f71, %f61, %f64; add.f32 %f66, %f65, %f65; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f67, [%rd55]; sub.f32 %f68, %f67, %f66; st.global.f32 [%rd55], %f68; $L__BB0_22: ret; } ` )
3-3.11.1/cuda/magnetoelasticforce.cu000066400000000000000000000174051503346766200172450ustar00rootroot00000000000000
#include <stdint.h> // uint8_t; header name was lost in extraction and is restored from usage
#include <stdio.h>  // NOTE(review): this header name was also lost in extraction; <stdio.h> is a guess -- confirm
#include "amul.h"
#include "float3.h"
#include "stencil.h"

// Calculate magneto-elastic force density
// fmelp = Σ ∂σpq / ∂xq (q = x, y, z) , σpq = ∂Umel / ∂epq,
// where epq is the strain tensor and
// Umel is the magneto-elastic energy density given by the eq. (12.18) of
// Gurevich&Melkov "Magnetization Oscillations and Waves", CRC Press, 1996
//
// NOTE: the Go wrapper for this kernel embeds PTX and is marked as generated
// by cuda2go; it has to be regenerated after editing this file.

// Reads the magnetization vector stored at linear cell index i.
static __device__ inline float3 melf_load(const float* __restrict__ mx,
                                          const float* __restrict__ my,
                                          const float* __restrict__ mz,
                                          int i) {
    return make_float3(mx[i], my[i], mz[i]);
}

// Finite-difference derivative of m along one axis, in units of the cell
// size (the caller applies the scale factor afterwards).
//
// m_m2, m_m1, m0, m_p1, m_p2 are the values at offsets -2, -1, 0, +1, +2.
// A neighbor for which is0() holds is treated as missing, and the widest
// stencil that only touches present neighbors is selected. In the patterns
// below '1' marks a present cell, '-' a missing one; the center is always
// the middle position. The order of the tests matters: each case relies on
// the earlier ones having been rejected.
static __device__ inline float3 melf_diff(float3 m_m2, float3 m_m1, float3 m0,
                                          float3 m_p1, float3 m_p2) {
    const bool no_m2 = is0(m_m2);
    const bool no_m1 = is0(m_m1);
    const bool no_p1 = is0(m_p1);
    const bool no_p2 = is0(m_p2);

    if (no_p1 && no_m1) {
        return make_float3(0.0f, 0.0f, 0.0f);              // --1-- zero
    }
    if ((no_m2 || no_p2) && !no_p1 && !no_m1) {
        return 0.5f * (m_p1 - m_m1);                        // -111-, 1111-, -1111 central difference, ε ~ h^2
    }
    if (no_p1 && no_m2) {
        return m0 - m_m1;                                   // -11-- backward difference, ε ~ h^1
    }
    if (no_m1 && no_p2) {
        return -m0 + m_p1;                                  // --11- forward difference, ε ~ h^1
    }
    if (!no_m2 && no_p1) {
        return 0.5f * m_m2 - 2.0f * m_m1 + 1.5f * m0;       // 111-- backward difference, ε ~ h^2
    }
    if (!no_p2 && no_m1) {
        return -0.5f * m_p2 + 2.0f * m_p1 - 1.5f * m0;      // --111 forward difference, ε ~ h^2
    }
    return (2.0f/3.0f)*(m_p1 - m_m1) + (1.0f/12.0f)*(m_m2 - m_p2); // 11111 central difference, ε ~ h^4
}

// Kernel: writes the magneto-elastic force density of every cell to (fx, fy, fz).
//
//   fx, fy, fz       output components, one value per cell (overwritten)
//   mx, my, mz       magnetization components, one value per cell (read only)
//   B1_, B1_mul      array and multiplier combined by amul() (amul.h) into the
//   B2_, B2_mul      per-cell B1 and B2 used in the force expression
//                    (by their names the magneto-elastic coupling constants -- confirm)
//   rcsx, rcsy, rcsz factors applied to the x, y, z differences
//                    (presumably reciprocal cell sizes -- confirm against the caller)
//   Nx, Ny, Nz       grid size; threads outside the grid return immediately
//   PBC              periodic-boundary flags, read through the PBCx/PBCy/PBCz
//                    macros of stencil.h
//
// Each thread handles the cell (ix, iy, iz) given by its 3D block/thread index.
// A neighbor outside the grid is taken as zero unless the PBC flag for that
// direction is set, in which case the index produced by the clamp macro is used.
extern "C" __global__ void
getmagnetoelasticforce(float* __restrict__ fx, float* __restrict__ fy, float* __restrict__ fz,
                       float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
                       float* __restrict__ B1_, float B1_mul,
                       float* __restrict__ B2_, float B2_mul,
                       float rcsx, float rcsy, float rcsz,
                       int Nx, int Ny, int Nz,
                       uint8_t PBC) {

    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    if (ix >= Nx || iy >= Ny || iz >= Nz) {
        return;
    }

    const float3 zero = make_float3(0.0f, 0.0f, 0.0f);

    int    I  = idx(ix, iy, iz);               // central cell index
    float3 m0 = melf_load(mx, my, mz, I);      // +0

    float3 dmdx; // ∂m/∂x
    float3 dmdy; // ∂m/∂y
    float3 dmdz; // ∂m/∂z

    // ∂m/∂x
    {
        // load neighbor m if inside grid (or periodic), keep 0 otherwise
        float3 m_m2 = (ix-2 >= 0 || PBCx) ? melf_load(mx, my, mz, idx(lclampx(ix-2), iy, iz)) : zero; // -2
        float3 m_m1 = (ix-1 >= 0 || PBCx) ? melf_load(mx, my, mz, idx(lclampx(ix-1), iy, iz)) : zero; // -1
        float3 m_p1 = (ix+1 < Nx || PBCx) ? melf_load(mx, my, mz, idx(hclampx(ix+1), iy, iz)) : zero; // +1
        float3 m_p2 = (ix+2 < Nx || PBCx) ? melf_load(mx, my, mz, idx(hclampx(ix+2), iy, iz)) : zero; // +2
        dmdx = melf_diff(m_m2, m_m1, m0, m_p1, m_p2);
    }

    // ∂m/∂y
    {
        float3 m_m2 = (iy-2 >= 0 || PBCy) ? melf_load(mx, my, mz, idx(ix, lclampy(iy-2), iz)) : zero;
        float3 m_m1 = (iy-1 >= 0 || PBCy) ? melf_load(mx, my, mz, idx(ix, lclampy(iy-1), iz)) : zero;
        float3 m_p1 = (iy+1 < Ny || PBCy) ? melf_load(mx, my, mz, idx(ix, hclampy(iy+1), iz)) : zero;
        float3 m_p2 = (iy+2 < Ny || PBCy) ? melf_load(mx, my, mz, idx(ix, hclampy(iy+2), iz)) : zero;
        dmdy = melf_diff(m_m2, m_m1, m0, m_p1, m_p2);
    }

    // ∂m/∂z
    {
        float3 m_m2 = (iz-2 >= 0 || PBCz) ? melf_load(mx, my, mz, idx(ix, iy, lclampz(iz-2))) : zero;
        float3 m_m1 = (iz-1 >= 0 || PBCz) ? melf_load(mx, my, mz, idx(ix, iy, lclampz(iz-1))) : zero;
        float3 m_p1 = (iz+1 < Nz || PBCz) ? melf_load(mx, my, mz, idx(ix, iy, hclampz(iz+1))) : zero;
        float3 m_p2 = (iz+2 < Nz || PBCz) ? melf_load(mx, my, mz, idx(ix, iy, hclampz(iz+2))) : zero;
        dmdz = melf_diff(m_m2, m_m1, m0, m_p1, m_p2);
    }

    // convert differences per cell into derivatives
    dmdx *= rcsx;
    dmdy *= rcsy;
    dmdz *= rcsz;

    float B1 = amul(B1_, B1_mul, I);
    float B2 = amul(B2_, B2_mul, I);

    // f_p = Σ_q ∂σ_pq/∂x_q, with the diagonal stress terms carrying B1 and
    // the off-diagonal ones B2; spatial variation of B1 and B2 is not included.
    fx[I] = 2.0f*B1*m0.x*dmdx.x + B2*(m0.x*(dmdy.y + dmdz.z) + m0.y*dmdy.x + m0.z*dmdz.x);
    fy[I] = 2.0f*B1*m0.y*dmdy.y + B2*(m0.x*dmdx.y + m0.y*(dmdx.x + dmdz.z) + m0.z*dmdz.y);
    fz[I] = 2.0f*B1*m0.z*dmdz.z + B2*(m0.x*dmdx.z + m0.y*dmdy.z + m0.z*(dmdx.x + dmdy.y));
}
3-3.11.1/cuda/magnetoelasticforce_wrapper.go000066400000000000000000012415431503346766200210060ustar00rootroot00000000000000package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// getmagnetoelasticforce_code is the CUDA function handle of the
// getmagnetoelasticforce kernel. It stays zero until the first launch loads it.
var getmagnetoelasticforce_code cu.Function

// getmagnetoelasticforce_args_t holds one value per kernel parameter,
// the table of pointers to those values that is passed to the launch call,
// and a mutex that the wrapper holds while it fills and uses them.
type getmagnetoelasticforce_args_t struct {
	arg_fx     unsafe.Pointer
	arg_fy     unsafe.Pointer
	arg_fz     unsafe.Pointer
	arg_mx     unsafe.Pointer
	arg_my     unsafe.Pointer
	arg_mz     unsafe.Pointer
	arg_B1_    unsafe.Pointer
	arg_B1_mul float32
	arg_B2_    unsafe.Pointer
	arg_B2_mul float32
	arg_rcsx   float32
	arg_rcsy   float32
	arg_rcsz   float32
	arg_Nx     int
	arg_Ny     int
	arg_Nz     int
	arg_PBC    byte
	argptr     [17]unsafe.Pointer
	sync.Mutex
}

// getmagnetoelasticforce_args is the single, shared argument record used by
// every launch of the getmagnetoelasticforce kernel.
var getmagnetoelasticforce_args getmagnetoelasticforce_args_t

func init() {
	// The kernel launch call takes a pointer to each argument. The addresses of
	// the arg_* fields never change, so the table is built once, in kernel
	// parameter order.
	a := &getmagnetoelasticforce_args
	a.argptr = [17]unsafe.Pointer{
		unsafe.Pointer(&a.arg_fx),
		unsafe.Pointer(&a.arg_fy),
		unsafe.Pointer(&a.arg_fz),
		unsafe.Pointer(&a.arg_mx),
		unsafe.Pointer(&a.arg_my),
		unsafe.Pointer(&a.arg_mz),
		unsafe.Pointer(&a.arg_B1_),
		unsafe.Pointer(&a.arg_B1_mul),
		unsafe.Pointer(&a.arg_B2_),
		unsafe.Pointer(&a.arg_B2_mul),
		unsafe.Pointer(&a.arg_rcsx),
		unsafe.Pointer(&a.arg_rcsy),
		unsafe.Pointer(&a.arg_rcsz),
		unsafe.Pointer(&a.arg_Nx),
		unsafe.Pointer(&a.arg_Ny),
		unsafe.Pointer(&a.arg_Nz),
		unsafe.Pointer(&a.arg_PBC),
	}
}

// k_getmagnetoelasticforce_async launches the getmagnetoelasticforce CUDA
// kernel on stream0 with the grid and block sizes taken from cfg, and returns
// without waiting for it. When Synchronous is set (debugging), it calls Sync
// before and after the launch and times the kernel under the name
// "getmagnetoelasticforce".
func k_getmagnetoelasticforce_async(fx unsafe.Pointer, fy unsafe.Pointer, fz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, B1_ unsafe.Pointer, B1_mul float32, B2_ unsafe.Pointer, B2_mul float32, rcsx float32, rcsy float32, rcsz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("getmagnetoelasticforce")
	}

	// The argument record is shared, so it stays locked from the first field
	// write until this function returns.
	a := &getmagnetoelasticforce_args
	a.Lock()
	defer a.Unlock()

	// Load the kernel on first use.
	if getmagnetoelasticforce_code == 0 {
		getmagnetoelasticforce_code = fatbinLoad(getmagnetoelasticforce_map, "getmagnetoelasticforce")
	}

	a.arg_fx, a.arg_fy, a.arg_fz = fx, fy, fz
	a.arg_mx, a.arg_my, a.arg_mz = mx, my, mz
	a.arg_B1_, a.arg_B1_mul = B1_, B1_mul
	a.arg_B2_, a.arg_B2_mul = B2_, B2_mul
	a.arg_rcsx, a.arg_rcsy, a.arg_rcsz = rcsx, rcsy, rcsz
	a.arg_Nx, a.arg_Ny, a.arg_Nz = Nx, Ny, Nz
	a.arg_PBC = PBC

	cu.LaunchKernel(getmagnetoelasticforce_code,
		cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z,
		cfg.Block.X, cfg.Block.Y, cfg.Block.Z,
		0, stream0, a.argptr[:])

	if Synchronous { // debug
		Sync()
		timer.Stop("getmagnetoelasticforce")
	}
}

// getmagnetoelasticforce_map maps a compute capability to the PTX code of the
// getmagnetoelasticforce kernel built for it.
var getmagnetoelasticforce_map = map[int]string{
	0:  "",
	50: getmagnetoelasticforce_ptx_50,
	52: getmagnetoelasticforce_ptx_52,
	53: getmagnetoelasticforce_ptx_53,
	60: getmagnetoelasticforce_ptx_60,
	61: getmagnetoelasticforce_ptx_61,
	62: getmagnetoelasticforce_ptx_62,
	70: getmagnetoelasticforce_ptx_70,
	72: getmagnetoelasticforce_ptx_72,
	75: getmagnetoelasticforce_ptx_75,
	80: getmagnetoelasticforce_ptx_80,
	86: getmagnetoelasticforce_ptx_86,
	87: getmagnetoelasticforce_ptx_87,
	89: getmagnetoelasticforce_ptx_89,
	90: getmagnetoelasticforce_ptx_90,
}

// getmagnetoelasticforce PTX code for various compute capabilities.
const (
	getmagnetoelasticforce_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<116>; .reg .b16 %rs<5>; .reg .f32 %f<489>; .reg .b32 %r<130>; .reg .b64 %rd<78>; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd7,
[getmagnetoelasticforce_param_2]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd12, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_6]; ld.param.f32 %f487, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_8]; ld.param.f32 %f488, [getmagnetoelasticforce_param_9]; ld.param.f32 %f211, [getmagnetoelasticforce_param_10]; ld.param.f32 %f212, [getmagnetoelasticforce_param_11]; ld.param.f32 %f213, [getmagnetoelasticforce_param_12]; ld.param.u32 %r57, [getmagnetoelasticforce_param_13]; ld.param.u32 %r58, [getmagnetoelasticforce_param_14]; ld.param.u32 %r59, [getmagnetoelasticforce_param_15]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_108; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; cvt.s64.s32 %rd4, %r70; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; add.s64 %rd15, %rd2, %rd13; ld.global.nc.f32 %f2, [%rd15]; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f3, [%rd16]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p6, %rs1, 0; add.s32 %r6, %r1, -2; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r118, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r118, %r72, %r57; $L__BB0_4: setp.lt.s32 %p8, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p9, %p8, %p6; mov.f32 %f8, %f7; mov.f32 %f9, %f7; 
@%p9 bra $L__BB0_6; add.s32 %r73, %r118, %r5; mul.wide.s32 %rd17, %r73, 4; add.s64 %rd18, %rd3, %rd17; add.s64 %rd19, %rd2, %rd17; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f9, [%rd20]; ld.global.nc.f32 %f8, [%rd19]; ld.global.nc.f32 %f7, [%rd18]; $L__BB0_6: add.s32 %r10, %r1, -1; @%p6 bra $L__BB0_8; bra.uni $L__BB0_7; $L__BB0_8: max.s32 %r119, %r10, 0; bra.uni $L__BB0_9; $L__BB0_7: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r119, %r75, %r57; $L__BB0_9: setp.lt.s32 %p11, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p13, %p11, %p6; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p13 bra $L__BB0_11; add.s32 %r76, %r119, %r5; mul.wide.s32 %rd21, %r76, 4; add.s64 %rd22, %rd3, %rd21; add.s64 %rd23, %rd2, %rd21; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f38, [%rd24]; ld.global.nc.f32 %f39, [%rd23]; ld.global.nc.f32 %f40, [%rd22]; $L__BB0_11: add.s32 %r14, %r1, 1; @%p6 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r79, %r57, -1; min.s32 %r120, %r14, %r79; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r120, %r78, %r57; $L__BB0_14: setp.ge.s32 %p15, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p17, %p15, %p6; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p17 bra $L__BB0_16; add.s32 %r80, %r120, %r5; mul.wide.s32 %rd25, %r80, 4; add.s64 %rd26, %rd3, %rd25; add.s64 %rd27, %rd2, %rd25; add.s64 %rd28, %rd1, %rd25; ld.global.nc.f32 %f21, [%rd28]; ld.global.nc.f32 %f20, [%rd27]; ld.global.nc.f32 %f19, [%rd26]; $L__BB0_16: add.s32 %r18, %r1, 2; @%p6 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r83, %r57, -1; min.s32 %r121, %r18, %r83; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r121, %r82, %r57; $L__BB0_19: add.s32 %r22, %r121, %r5; setp.ge.s32 %p19, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p21, %p19, %p6; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p21 bra $L__BB0_21; mul.wide.s32 %rd29, %r22, 4; add.s64 %rd30, %rd3, %rd29; add.s64 %rd31, 
%rd2, %rd29; add.s64 %rd32, %rd1, %rd29; ld.global.nc.f32 %f27, [%rd32]; ld.global.nc.f32 %f26, [%rd31]; ld.global.nc.f32 %f25, [%rd30]; $L__BB0_21: mul.f32 %f226, %f20, %f20; fma.rn.f32 %f227, %f19, %f19, %f226; fma.rn.f32 %f31, %f21, %f21, %f227; setp.eq.f32 %p22, %f31, 0f00000000; @%p22 bra $L__BB0_22; bra.uni $L__BB0_23; $L__BB0_22: mul.f32 %f231, %f39, %f39; fma.rn.f32 %f232, %f40, %f40, %f231; fma.rn.f32 %f233, %f38, %f38, %f232; setp.eq.f32 %p23, %f233, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f449, %f448; mov.f32 %f450, %f448; @%p23 bra $L__BB0_35; $L__BB0_23: mul.f32 %f234, %f8, %f8; fma.rn.f32 %f235, %f7, %f7, %f234; fma.rn.f32 %f44, %f9, %f9, %f235; setp.neu.f32 %p24, %f44, 0f00000000; mul.f32 %f236, %f26, %f26; fma.rn.f32 %f237, %f25, %f25, %f236; fma.rn.f32 %f48, %f27, %f27, %f237; setp.neu.f32 %p25, %f48, 0f00000000; and.pred %p26, %p24, %p25; or.pred %p28, %p22, %p26; @%p28 bra $L__BB0_25; mul.f32 %f238, %f39, %f39; fma.rn.f32 %f239, %f40, %f40, %f238; fma.rn.f32 %f240, %f38, %f38, %f239; setp.neu.f32 %p29, %f240, 0f00000000; @%p29 bra $L__BB0_34; bra.uni $L__BB0_25; $L__BB0_34: sub.f32 %f273, %f19, %f40; mul.f32 %f450, %f273, 0f3F000000; sub.f32 %f274, %f20, %f39; mul.f32 %f449, %f274, 0f3F000000; sub.f32 %f275, %f21, %f38; mul.f32 %f448, %f275, 0f3F000000; bra.uni $L__BB0_35; $L__BB0_25: setp.eq.f32 %p30, %f44, 0f00000000; and.pred %p32, %p30, %p22; @%p32 bra $L__BB0_33; bra.uni $L__BB0_26; $L__BB0_33: sub.f32 %f450, %f1, %f40; sub.f32 %f449, %f2, %f39; sub.f32 %f448, %f3, %f38; bra.uni $L__BB0_35; $L__BB0_26: setp.eq.f32 %p33, %f48, 0f00000000; mul.f32 %f241, %f39, %f39; fma.rn.f32 %f242, %f40, %f40, %f241; fma.rn.f32 %f49, %f38, %f38, %f242; setp.eq.f32 %p34, %f49, 0f00000000; and.pred %p35, %p34, %p33; @%p35 bra $L__BB0_32; bra.uni $L__BB0_27; $L__BB0_32: sub.f32 %f450, %f19, %f1; sub.f32 %f449, %f20, %f2; sub.f32 %f448, %f21, %f3; bra.uni $L__BB0_35; $L__BB0_27: setp.neu.f32 %p37, %f31, 0f00000000; or.pred %p38, %p30, %p37; @%p38 bra 
$L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: setp.neu.f32 %p39, %f49, 0f00000000; or.pred %p41, %p33, %p39; @%p41 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f264, %f19, %f40; sub.f32 %f265, %f20, %f39; sub.f32 %f266, %f21, %f38; sub.f32 %f267, %f7, %f25; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f8, %f26; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f9, %f27; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f450, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f449, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f448, %f266, 0f3F2AAAAB, %f272; bra.uni $L__BB0_35; $L__BB0_28: mul.f32 %f243, %f7, 0f3F000000; add.f32 %f244, %f40, %f40; sub.f32 %f245, %f243, %f244; add.f32 %f246, %f39, %f39; mul.f32 %f247, %f8, 0f3F000000; sub.f32 %f248, %f247, %f246; add.f32 %f249, %f38, %f38; mul.f32 %f250, %f9, 0f3F000000; sub.f32 %f251, %f250, %f249; fma.rn.f32 %f450, %f1, 0f3FC00000, %f245; fma.rn.f32 %f449, %f2, 0f3FC00000, %f248; fma.rn.f32 %f448, %f3, 0f3FC00000, %f251; bra.uni $L__BB0_35; $L__BB0_30: mul.f32 %f252, %f25, 0f3F000000; add.f32 %f253, %f19, %f19; sub.f32 %f254, %f253, %f252; add.f32 %f255, %f20, %f20; mul.f32 %f256, %f26, 0f3F000000; sub.f32 %f257, %f255, %f256; add.f32 %f258, %f21, %f21; mul.f32 %f259, %f27, 0f3F000000; sub.f32 %f260, %f258, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f450, %f254, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f449, %f257, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f448, %f260, %f263; $L__BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p42, %rs2, 0; add.s32 %r23, %r2, -2; @%p42 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: max.s32 %r122, %r23, 0; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r122, %r85, %r58; $L__BB0_38: setp.lt.s32 %p44, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p45, %p44, %p42; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p45 bra $L__BB0_40; add.s32 %r86, %r122, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd33, %r87, 4; add.s64 %rd34, %rd3, 
%rd33; add.s64 %rd35, %rd2, %rd33; add.s64 %rd36, %rd1, %rd33; ld.global.nc.f32 %f76, [%rd36]; ld.global.nc.f32 %f75, [%rd35]; ld.global.nc.f32 %f74, [%rd34]; $L__BB0_40: add.s32 %r27, %r2, -1; @%p42 bra $L__BB0_42; bra.uni $L__BB0_41; $L__BB0_42: max.s32 %r123, %r27, 0; bra.uni $L__BB0_43; $L__BB0_41: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r123, %r89, %r58; $L__BB0_43: setp.lt.s32 %p47, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p49, %p47, %p42; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p49 bra $L__BB0_45; add.s32 %r90, %r123, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd37, %r91, 4; add.s64 %rd38, %rd3, %rd37; add.s64 %rd39, %rd2, %rd37; add.s64 %rd40, %rd1, %rd37; ld.global.nc.f32 %f105, [%rd40]; ld.global.nc.f32 %f106, [%rd39]; ld.global.nc.f32 %f107, [%rd38]; $L__BB0_45: add.s32 %r31, %r2, 1; @%p42 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: add.s32 %r94, %r58, -1; min.s32 %r124, %r31, %r94; bra.uni $L__BB0_48; $L__BB0_46: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r124, %r93, %r58; $L__BB0_48: setp.ge.s32 %p51, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p53, %p51, %p42; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p53 bra $L__BB0_50; add.s32 %r95, %r124, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd41, %r96, 4; add.s64 %rd42, %rd3, %rd41; add.s64 %rd43, %rd2, %rd41; add.s64 %rd44, %rd1, %rd41; ld.global.nc.f32 %f88, [%rd44]; ld.global.nc.f32 %f87, [%rd43]; ld.global.nc.f32 %f86, [%rd42]; $L__BB0_50: add.s32 %r35, %r2, 2; @%p42 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r99, %r58, -1; min.s32 %r125, %r35, %r99; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r125, %r98, %r58; $L__BB0_53: add.s32 %r39, %r125, %r4; setp.ge.s32 %p55, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p57, %p55, %p42; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p57 bra $L__BB0_55; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd45, %r100, 4; add.s64 
%rd46, %rd3, %rd45; add.s64 %rd47, %rd2, %rd45; add.s64 %rd48, %rd1, %rd45; ld.global.nc.f32 %f94, [%rd48]; ld.global.nc.f32 %f93, [%rd47]; ld.global.nc.f32 %f92, [%rd46]; $L__BB0_55: mul.f32 %f288, %f87, %f87; fma.rn.f32 %f289, %f86, %f86, %f288; fma.rn.f32 %f98, %f88, %f88, %f289; setp.eq.f32 %p58, %f98, 0f00000000; @%p58 bra $L__BB0_56; bra.uni $L__BB0_57; $L__BB0_56: mul.f32 %f293, %f106, %f106; fma.rn.f32 %f294, %f107, %f107, %f293; fma.rn.f32 %f295, %f105, %f105, %f294; setp.eq.f32 %p59, %f295, 0f00000000; mov.f32 %f466, 0f00000000; mov.f32 %f467, %f466; mov.f32 %f468, %f466; @%p59 bra $L__BB0_69; $L__BB0_57: mul.f32 %f296, %f75, %f75; fma.rn.f32 %f297, %f74, %f74, %f296; fma.rn.f32 %f111, %f76, %f76, %f297; setp.neu.f32 %p60, %f111, 0f00000000; mul.f32 %f298, %f93, %f93; fma.rn.f32 %f299, %f92, %f92, %f298; fma.rn.f32 %f115, %f94, %f94, %f299; setp.neu.f32 %p61, %f115, 0f00000000; and.pred %p62, %p60, %p61; or.pred %p64, %p58, %p62; @%p64 bra $L__BB0_59; mul.f32 %f300, %f106, %f106; fma.rn.f32 %f301, %f107, %f107, %f300; fma.rn.f32 %f302, %f105, %f105, %f301; setp.neu.f32 %p65, %f302, 0f00000000; @%p65 bra $L__BB0_68; bra.uni $L__BB0_59; $L__BB0_68: sub.f32 %f335, %f86, %f107; mul.f32 %f468, %f335, 0f3F000000; sub.f32 %f336, %f87, %f106; mul.f32 %f467, %f336, 0f3F000000; sub.f32 %f337, %f88, %f105; mul.f32 %f466, %f337, 0f3F000000; bra.uni $L__BB0_69; $L__BB0_59: setp.eq.f32 %p66, %f111, 0f00000000; and.pred %p68, %p66, %p58; @%p68 bra $L__BB0_67; bra.uni $L__BB0_60; $L__BB0_67: sub.f32 %f468, %f1, %f107; sub.f32 %f467, %f2, %f106; sub.f32 %f466, %f3, %f105; bra.uni $L__BB0_69; $L__BB0_60: setp.eq.f32 %p69, %f115, 0f00000000; mul.f32 %f303, %f106, %f106; fma.rn.f32 %f304, %f107, %f107, %f303; fma.rn.f32 %f116, %f105, %f105, %f304; setp.eq.f32 %p70, %f116, 0f00000000; and.pred %p71, %p70, %p69; @%p71 bra $L__BB0_66; bra.uni $L__BB0_61; $L__BB0_66: sub.f32 %f468, %f86, %f1; sub.f32 %f467, %f87, %f2; sub.f32 %f466, %f88, %f3; bra.uni $L__BB0_69; $L__BB0_61: 
setp.neu.f32 %p73, %f98, 0f00000000; or.pred %p74, %p66, %p73; @%p74 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: setp.neu.f32 %p75, %f116, 0f00000000; or.pred %p77, %p69, %p75; @%p77 bra $L__BB0_65; bra.uni $L__BB0_64; $L__BB0_65: sub.f32 %f326, %f86, %f107; sub.f32 %f327, %f87, %f106; sub.f32 %f328, %f88, %f105; sub.f32 %f329, %f74, %f92; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f75, %f93; mul.f32 %f332, %f331, 0f3DAAAAAB; sub.f32 %f333, %f76, %f94; mul.f32 %f334, %f333, 0f3DAAAAAB; fma.rn.f32 %f468, %f326, 0f3F2AAAAB, %f330; fma.rn.f32 %f467, %f327, 0f3F2AAAAB, %f332; fma.rn.f32 %f466, %f328, 0f3F2AAAAB, %f334; bra.uni $L__BB0_69; $L__BB0_62: mul.f32 %f305, %f74, 0f3F000000; add.f32 %f306, %f107, %f107; sub.f32 %f307, %f305, %f306; add.f32 %f308, %f106, %f106; mul.f32 %f309, %f75, 0f3F000000; sub.f32 %f310, %f309, %f308; add.f32 %f311, %f105, %f105; mul.f32 %f312, %f76, 0f3F000000; sub.f32 %f313, %f312, %f311; fma.rn.f32 %f468, %f1, 0f3FC00000, %f307; fma.rn.f32 %f467, %f2, 0f3FC00000, %f310; fma.rn.f32 %f466, %f3, 0f3FC00000, %f313; bra.uni $L__BB0_69; $L__BB0_64: mul.f32 %f314, %f92, 0f3F000000; add.f32 %f315, %f86, %f86; sub.f32 %f316, %f315, %f314; add.f32 %f317, %f87, %f87; mul.f32 %f318, %f93, 0f3F000000; sub.f32 %f319, %f317, %f318; add.f32 %f320, %f88, %f88; mul.f32 %f321, %f94, 0f3F000000; sub.f32 %f322, %f320, %f321; mul.f32 %f323, %f1, 0f3FC00000; sub.f32 %f468, %f316, %f323; mul.f32 %f324, %f2, 0f3FC00000; sub.f32 %f467, %f319, %f324; mul.f32 %f325, %f3, 0f3FC00000; sub.f32 %f466, %f322, %f325; $L__BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p78, %rs3, 0; add.s32 %r40, %r3, -2; @%p78 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: max.s32 %r126, %r40, 0; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r126, %r102, %r59; $L__BB0_72: setp.lt.s32 %p80, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p81, %p80, %p78; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p81 bra $L__BB0_74; mad.lo.s32 
%r103, %r126, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd49, %r104, 4; add.s64 %rd50, %rd3, %rd49; add.s64 %rd51, %rd2, %rd49; add.s64 %rd52, %rd1, %rd49; ld.global.nc.f32 %f143, [%rd52]; ld.global.nc.f32 %f142, [%rd51]; ld.global.nc.f32 %f141, [%rd50]; $L__BB0_74: add.s32 %r44, %r3, -1; @%p78 bra $L__BB0_76; bra.uni $L__BB0_75; $L__BB0_76: max.s32 %r127, %r44, 0; bra.uni $L__BB0_77; $L__BB0_75: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r127, %r106, %r59; $L__BB0_77: setp.lt.s32 %p83, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p85, %p83, %p78; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p85 bra $L__BB0_79; mad.lo.s32 %r107, %r127, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd53, %r108, 4; add.s64 %rd54, %rd3, %rd53; add.s64 %rd55, %rd2, %rd53; add.s64 %rd56, %rd1, %rd53; ld.global.nc.f32 %f172, [%rd56]; ld.global.nc.f32 %f173, [%rd55]; ld.global.nc.f32 %f174, [%rd54]; $L__BB0_79: add.s32 %r48, %r3, 1; @%p78 bra $L__BB0_81; bra.uni $L__BB0_80; $L__BB0_81: add.s32 %r111, %r59, -1; min.s32 %r128, %r48, %r111; bra.uni $L__BB0_82; $L__BB0_80: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r128, %r110, %r59; $L__BB0_82: setp.ge.s32 %p87, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p89, %p87, %p78; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p89 bra $L__BB0_84; mad.lo.s32 %r112, %r128, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd57, %r113, 4; add.s64 %rd58, %rd3, %rd57; add.s64 %rd59, %rd2, %rd57; add.s64 %rd60, %rd1, %rd57; ld.global.nc.f32 %f155, [%rd60]; ld.global.nc.f32 %f154, [%rd59]; ld.global.nc.f32 %f153, [%rd58]; $L__BB0_84: add.s32 %r52, %r3, 2; @%p78 bra $L__BB0_86; bra.uni $L__BB0_85; $L__BB0_86: add.s32 %r116, %r59, -1; min.s32 %r129, %r52, %r116; bra.uni $L__BB0_87; $L__BB0_85: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r129, %r115, %r59; $L__BB0_87: mad.lo.s32 %r117, %r129, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 
%p91, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p93, %p91, %p78; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p93 bra $L__BB0_89; mul.wide.s32 %rd61, %r56, 4; add.s64 %rd62, %rd3, %rd61; add.s64 %rd63, %rd2, %rd61; add.s64 %rd64, %rd1, %rd61; ld.global.nc.f32 %f161, [%rd64]; ld.global.nc.f32 %f160, [%rd63]; ld.global.nc.f32 %f159, [%rd62]; $L__BB0_89: mul.f32 %f350, %f154, %f154; fma.rn.f32 %f351, %f153, %f153, %f350; fma.rn.f32 %f165, %f155, %f155, %f351; setp.eq.f32 %p94, %f165, 0f00000000; @%p94 bra $L__BB0_90; bra.uni $L__BB0_91; $L__BB0_90: mul.f32 %f355, %f173, %f173; fma.rn.f32 %f356, %f174, %f174, %f355; fma.rn.f32 %f357, %f172, %f172, %f356; setp.eq.f32 %p95, %f357, 0f00000000; mov.f32 %f484, 0f00000000; mov.f32 %f485, %f484; mov.f32 %f486, %f484; @%p95 bra $L__BB0_103; $L__BB0_91: mul.f32 %f358, %f142, %f142; fma.rn.f32 %f359, %f141, %f141, %f358; fma.rn.f32 %f178, %f143, %f143, %f359; setp.neu.f32 %p96, %f178, 0f00000000; mul.f32 %f360, %f160, %f160; fma.rn.f32 %f361, %f159, %f159, %f360; fma.rn.f32 %f182, %f161, %f161, %f361; setp.neu.f32 %p97, %f182, 0f00000000; and.pred %p98, %p96, %p97; or.pred %p100, %p94, %p98; @%p100 bra $L__BB0_93; mul.f32 %f362, %f173, %f173; fma.rn.f32 %f363, %f174, %f174, %f362; fma.rn.f32 %f364, %f172, %f172, %f363; setp.neu.f32 %p101, %f364, 0f00000000; @%p101 bra $L__BB0_102; bra.uni $L__BB0_93; $L__BB0_102: sub.f32 %f397, %f153, %f174; mul.f32 %f486, %f397, 0f3F000000; sub.f32 %f398, %f154, %f173; mul.f32 %f485, %f398, 0f3F000000; sub.f32 %f399, %f155, %f172; mul.f32 %f484, %f399, 0f3F000000; bra.uni $L__BB0_103; $L__BB0_93: setp.eq.f32 %p102, %f178, 0f00000000; and.pred %p104, %p102, %p94; @%p104 bra $L__BB0_101; bra.uni $L__BB0_94; $L__BB0_101: sub.f32 %f486, %f1, %f174; sub.f32 %f485, %f2, %f173; sub.f32 %f484, %f3, %f172; bra.uni $L__BB0_103; $L__BB0_94: setp.eq.f32 %p105, %f182, 0f00000000; mul.f32 %f365, %f173, %f173; fma.rn.f32 %f366, %f174, %f174, %f365; fma.rn.f32 %f183, %f172, %f172, %f366; setp.eq.f32 
%p106, %f183, 0f00000000; and.pred %p107, %p106, %p105; @%p107 bra $L__BB0_100; bra.uni $L__BB0_95; $L__BB0_100: sub.f32 %f486, %f153, %f1; sub.f32 %f485, %f154, %f2; sub.f32 %f484, %f155, %f3; bra.uni $L__BB0_103; $L__BB0_95: setp.neu.f32 %p109, %f165, 0f00000000; or.pred %p110, %p102, %p109; @%p110 bra $L__BB0_97; bra.uni $L__BB0_96; $L__BB0_97: setp.neu.f32 %p111, %f183, 0f00000000; or.pred %p113, %p105, %p111; @%p113 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: sub.f32 %f388, %f153, %f174; sub.f32 %f389, %f154, %f173; sub.f32 %f390, %f155, %f172; sub.f32 %f391, %f141, %f159; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f142, %f160; mul.f32 %f394, %f393, 0f3DAAAAAB; sub.f32 %f395, %f143, %f161; mul.f32 %f396, %f395, 0f3DAAAAAB; fma.rn.f32 %f486, %f388, 0f3F2AAAAB, %f392; fma.rn.f32 %f485, %f389, 0f3F2AAAAB, %f394; fma.rn.f32 %f484, %f390, 0f3F2AAAAB, %f396; bra.uni $L__BB0_103; $L__BB0_96: mul.f32 %f367, %f141, 0f3F000000; add.f32 %f368, %f174, %f174; sub.f32 %f369, %f367, %f368; add.f32 %f370, %f173, %f173; mul.f32 %f371, %f142, 0f3F000000; sub.f32 %f372, %f371, %f370; add.f32 %f373, %f172, %f172; mul.f32 %f374, %f143, 0f3F000000; sub.f32 %f375, %f374, %f373; fma.rn.f32 %f486, %f1, 0f3FC00000, %f369; fma.rn.f32 %f485, %f2, 0f3FC00000, %f372; fma.rn.f32 %f484, %f3, 0f3FC00000, %f375; bra.uni $L__BB0_103; $L__BB0_98: mul.f32 %f376, %f159, 0f3F000000; add.f32 %f377, %f153, %f153; sub.f32 %f378, %f377, %f376; add.f32 %f379, %f154, %f154; mul.f32 %f380, %f160, 0f3F000000; sub.f32 %f381, %f379, %f380; add.f32 %f382, %f155, %f155; mul.f32 %f383, %f161, 0f3F000000; sub.f32 %f384, %f382, %f383; mul.f32 %f385, %f1, 0f3FC00000; sub.f32 %f486, %f378, %f385; mul.f32 %f386, %f2, 0f3FC00000; sub.f32 %f485, %f381, %f386; mul.f32 %f387, %f3, 0f3FC00000; sub.f32 %f484, %f384, %f387; $L__BB0_103: setp.eq.s64 %p114, %rd8, 0; @%p114 bra $L__BB0_105; cvta.to.global.u64 %rd65, %rd8; shl.b64 %rd66, %rd4, 2; add.s64 %rd67, %rd65, %rd66; ld.global.nc.f32 %f400, [%rd67]; mul.f32 
%f487, %f400, %f487; $L__BB0_105: setp.eq.s64 %p115, %rd9, 0; @%p115 bra $L__BB0_107; cvta.to.global.u64 %rd68, %rd9; shl.b64 %rd69, %rd4, 2; add.s64 %rd70, %rd68, %rd69; ld.global.nc.f32 %f401, [%rd70]; mul.f32 %f488, %f401, %f488; $L__BB0_107: mul.f32 %f402, %f450, %f211; mul.f32 %f403, %f467, %f212; mul.f32 %f404, %f484, %f213; add.f32 %f405, %f487, %f487; mul.f32 %f406, %f1, %f405; add.f32 %f407, %f403, %f404; mul.f32 %f408, %f1, %f407; mul.f32 %f409, %f468, %f212; fma.rn.f32 %f410, %f2, %f409, %f408; mul.f32 %f411, %f486, %f213; fma.rn.f32 %f412, %f3, %f411, %f410; mul.f32 %f413, %f412, %f488; fma.rn.f32 %f414, %f402, %f406, %f413; cvta.to.global.u64 %rd71, %rd5; shl.b64 %rd72, %rd4, 2; add.s64 %rd73, %rd71, %rd72; st.global.f32 [%rd73], %f414; mul.f32 %f415, %f2, %f405; add.f32 %f416, %f402, %f404; mul.f32 %f417, %f2, %f416; mul.f32 %f418, %f449, %f211; fma.rn.f32 %f419, %f1, %f418, %f417; mul.f32 %f420, %f485, %f213; fma.rn.f32 %f421, %f3, %f420, %f419; mul.f32 %f422, %f421, %f488; fma.rn.f32 %f423, %f403, %f415, %f422; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd75, %rd74, %rd72; st.global.f32 [%rd75], %f423; mul.f32 %f424, %f3, %f405; mul.f32 %f425, %f466, %f212; mul.f32 %f426, %f2, %f425; mul.f32 %f427, %f448, %f211; fma.rn.f32 %f428, %f1, %f427, %f426; add.f32 %f429, %f402, %f403; fma.rn.f32 %f430, %f3, %f429, %f428; mul.f32 %f431, %f430, %f488; fma.rn.f32 %f432, %f404, %f424, %f431; cvta.to.global.u64 %rd76, %rd7; add.s64 %rd77, %rd76, %rd72; st.global.f32 [%rd77], %f432; $L__BB0_108: ret; } ` getmagnetoelasticforce_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param 
.f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<116>; .reg .b16 %rs<5>; .reg .f32 %f<489>; .reg .b32 %r<130>; .reg .b64 %rd<78>; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd12, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_6]; ld.param.f32 %f487, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_8]; ld.param.f32 %f488, [getmagnetoelasticforce_param_9]; ld.param.f32 %f211, [getmagnetoelasticforce_param_10]; ld.param.f32 %f212, [getmagnetoelasticforce_param_11]; ld.param.f32 %f213, [getmagnetoelasticforce_param_12]; ld.param.u32 %r57, [getmagnetoelasticforce_param_13]; ld.param.u32 %r58, [getmagnetoelasticforce_param_14]; ld.param.u32 %r59, [getmagnetoelasticforce_param_15]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra 
$L__BB0_108; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; cvt.s64.s32 %rd4, %r70; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; add.s64 %rd15, %rd2, %rd13; ld.global.nc.f32 %f2, [%rd15]; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f3, [%rd16]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p6, %rs1, 0; add.s32 %r6, %r1, -2; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r118, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r118, %r72, %r57; $L__BB0_4: setp.lt.s32 %p8, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p9, %p8, %p6; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p9 bra $L__BB0_6; add.s32 %r73, %r118, %r5; mul.wide.s32 %rd17, %r73, 4; add.s64 %rd18, %rd3, %rd17; add.s64 %rd19, %rd2, %rd17; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f9, [%rd20]; ld.global.nc.f32 %f8, [%rd19]; ld.global.nc.f32 %f7, [%rd18]; $L__BB0_6: add.s32 %r10, %r1, -1; @%p6 bra $L__BB0_8; bra.uni $L__BB0_7; $L__BB0_8: max.s32 %r119, %r10, 0; bra.uni $L__BB0_9; $L__BB0_7: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r119, %r75, %r57; $L__BB0_9: setp.lt.s32 %p11, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p13, %p11, %p6; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p13 bra $L__BB0_11; add.s32 %r76, %r119, %r5; mul.wide.s32 %rd21, %r76, 4; add.s64 %rd22, %rd3, %rd21; add.s64 %rd23, %rd2, %rd21; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f38, [%rd24]; ld.global.nc.f32 %f39, [%rd23]; ld.global.nc.f32 %f40, [%rd22]; $L__BB0_11: add.s32 %r14, %r1, 1; @%p6 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r79, %r57, -1; min.s32 %r120, %r14, %r79; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r120, %r78, %r57; $L__BB0_14: setp.ge.s32 %p15, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p17, %p15, %p6; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p17 bra $L__BB0_16; add.s32 %r80, %r120, 
%r5; mul.wide.s32 %rd25, %r80, 4; add.s64 %rd26, %rd3, %rd25; add.s64 %rd27, %rd2, %rd25; add.s64 %rd28, %rd1, %rd25; ld.global.nc.f32 %f21, [%rd28]; ld.global.nc.f32 %f20, [%rd27]; ld.global.nc.f32 %f19, [%rd26]; $L__BB0_16: add.s32 %r18, %r1, 2; @%p6 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r83, %r57, -1; min.s32 %r121, %r18, %r83; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r121, %r82, %r57; $L__BB0_19: add.s32 %r22, %r121, %r5; setp.ge.s32 %p19, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p21, %p19, %p6; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p21 bra $L__BB0_21; mul.wide.s32 %rd29, %r22, 4; add.s64 %rd30, %rd3, %rd29; add.s64 %rd31, %rd2, %rd29; add.s64 %rd32, %rd1, %rd29; ld.global.nc.f32 %f27, [%rd32]; ld.global.nc.f32 %f26, [%rd31]; ld.global.nc.f32 %f25, [%rd30]; $L__BB0_21: mul.f32 %f226, %f20, %f20; fma.rn.f32 %f227, %f19, %f19, %f226; fma.rn.f32 %f31, %f21, %f21, %f227; setp.eq.f32 %p22, %f31, 0f00000000; @%p22 bra $L__BB0_22; bra.uni $L__BB0_23; $L__BB0_22: mul.f32 %f231, %f39, %f39; fma.rn.f32 %f232, %f40, %f40, %f231; fma.rn.f32 %f233, %f38, %f38, %f232; setp.eq.f32 %p23, %f233, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f449, %f448; mov.f32 %f450, %f448; @%p23 bra $L__BB0_35; $L__BB0_23: mul.f32 %f234, %f8, %f8; fma.rn.f32 %f235, %f7, %f7, %f234; fma.rn.f32 %f44, %f9, %f9, %f235; setp.neu.f32 %p24, %f44, 0f00000000; mul.f32 %f236, %f26, %f26; fma.rn.f32 %f237, %f25, %f25, %f236; fma.rn.f32 %f48, %f27, %f27, %f237; setp.neu.f32 %p25, %f48, 0f00000000; and.pred %p26, %p24, %p25; or.pred %p28, %p22, %p26; @%p28 bra $L__BB0_25; mul.f32 %f238, %f39, %f39; fma.rn.f32 %f239, %f40, %f40, %f238; fma.rn.f32 %f240, %f38, %f38, %f239; setp.neu.f32 %p29, %f240, 0f00000000; @%p29 bra $L__BB0_34; bra.uni $L__BB0_25; $L__BB0_34: sub.f32 %f273, %f19, %f40; mul.f32 %f450, %f273, 0f3F000000; sub.f32 %f274, %f20, %f39; mul.f32 %f449, %f274, 0f3F000000; sub.f32 %f275, %f21, %f38; mul.f32 %f448, 
%f275, 0f3F000000; bra.uni $L__BB0_35; $L__BB0_25: setp.eq.f32 %p30, %f44, 0f00000000; and.pred %p32, %p30, %p22; @%p32 bra $L__BB0_33; bra.uni $L__BB0_26; $L__BB0_33: sub.f32 %f450, %f1, %f40; sub.f32 %f449, %f2, %f39; sub.f32 %f448, %f3, %f38; bra.uni $L__BB0_35; $L__BB0_26: setp.eq.f32 %p33, %f48, 0f00000000; mul.f32 %f241, %f39, %f39; fma.rn.f32 %f242, %f40, %f40, %f241; fma.rn.f32 %f49, %f38, %f38, %f242; setp.eq.f32 %p34, %f49, 0f00000000; and.pred %p35, %p34, %p33; @%p35 bra $L__BB0_32; bra.uni $L__BB0_27; $L__BB0_32: sub.f32 %f450, %f19, %f1; sub.f32 %f449, %f20, %f2; sub.f32 %f448, %f21, %f3; bra.uni $L__BB0_35; $L__BB0_27: setp.neu.f32 %p37, %f31, 0f00000000; or.pred %p38, %p30, %p37; @%p38 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: setp.neu.f32 %p39, %f49, 0f00000000; or.pred %p41, %p33, %p39; @%p41 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f264, %f19, %f40; sub.f32 %f265, %f20, %f39; sub.f32 %f266, %f21, %f38; sub.f32 %f267, %f7, %f25; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f8, %f26; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f9, %f27; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f450, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f449, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f448, %f266, 0f3F2AAAAB, %f272; bra.uni $L__BB0_35; $L__BB0_28: mul.f32 %f243, %f7, 0f3F000000; add.f32 %f244, %f40, %f40; sub.f32 %f245, %f243, %f244; add.f32 %f246, %f39, %f39; mul.f32 %f247, %f8, 0f3F000000; sub.f32 %f248, %f247, %f246; add.f32 %f249, %f38, %f38; mul.f32 %f250, %f9, 0f3F000000; sub.f32 %f251, %f250, %f249; fma.rn.f32 %f450, %f1, 0f3FC00000, %f245; fma.rn.f32 %f449, %f2, 0f3FC00000, %f248; fma.rn.f32 %f448, %f3, 0f3FC00000, %f251; bra.uni $L__BB0_35; $L__BB0_30: mul.f32 %f252, %f25, 0f3F000000; add.f32 %f253, %f19, %f19; sub.f32 %f254, %f253, %f252; add.f32 %f255, %f20, %f20; mul.f32 %f256, %f26, 0f3F000000; sub.f32 %f257, %f255, %f256; add.f32 %f258, %f21, %f21; mul.f32 %f259, %f27, 0f3F000000; sub.f32 %f260, %f258, %f259; mul.f32 
%f261, %f1, 0f3FC00000; sub.f32 %f450, %f254, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f449, %f257, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f448, %f260, %f263; $L__BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p42, %rs2, 0; add.s32 %r23, %r2, -2; @%p42 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: max.s32 %r122, %r23, 0; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r122, %r85, %r58; $L__BB0_38: setp.lt.s32 %p44, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p45, %p44, %p42; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p45 bra $L__BB0_40; add.s32 %r86, %r122, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd33, %r87, 4; add.s64 %rd34, %rd3, %rd33; add.s64 %rd35, %rd2, %rd33; add.s64 %rd36, %rd1, %rd33; ld.global.nc.f32 %f76, [%rd36]; ld.global.nc.f32 %f75, [%rd35]; ld.global.nc.f32 %f74, [%rd34]; $L__BB0_40: add.s32 %r27, %r2, -1; @%p42 bra $L__BB0_42; bra.uni $L__BB0_41; $L__BB0_42: max.s32 %r123, %r27, 0; bra.uni $L__BB0_43; $L__BB0_41: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r123, %r89, %r58; $L__BB0_43: setp.lt.s32 %p47, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p49, %p47, %p42; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p49 bra $L__BB0_45; add.s32 %r90, %r123, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd37, %r91, 4; add.s64 %rd38, %rd3, %rd37; add.s64 %rd39, %rd2, %rd37; add.s64 %rd40, %rd1, %rd37; ld.global.nc.f32 %f105, [%rd40]; ld.global.nc.f32 %f106, [%rd39]; ld.global.nc.f32 %f107, [%rd38]; $L__BB0_45: add.s32 %r31, %r2, 1; @%p42 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: add.s32 %r94, %r58, -1; min.s32 %r124, %r31, %r94; bra.uni $L__BB0_48; $L__BB0_46: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r124, %r93, %r58; $L__BB0_48: setp.ge.s32 %p51, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p53, %p51, %p42; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p53 bra $L__BB0_50; add.s32 %r95, %r124, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; 
mul.wide.s32 %rd41, %r96, 4; add.s64 %rd42, %rd3, %rd41; add.s64 %rd43, %rd2, %rd41; add.s64 %rd44, %rd1, %rd41; ld.global.nc.f32 %f88, [%rd44]; ld.global.nc.f32 %f87, [%rd43]; ld.global.nc.f32 %f86, [%rd42]; $L__BB0_50: add.s32 %r35, %r2, 2; @%p42 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r99, %r58, -1; min.s32 %r125, %r35, %r99; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r125, %r98, %r58; $L__BB0_53: add.s32 %r39, %r125, %r4; setp.ge.s32 %p55, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p57, %p55, %p42; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p57 bra $L__BB0_55; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd45, %r100, 4; add.s64 %rd46, %rd3, %rd45; add.s64 %rd47, %rd2, %rd45; add.s64 %rd48, %rd1, %rd45; ld.global.nc.f32 %f94, [%rd48]; ld.global.nc.f32 %f93, [%rd47]; ld.global.nc.f32 %f92, [%rd46]; $L__BB0_55: mul.f32 %f288, %f87, %f87; fma.rn.f32 %f289, %f86, %f86, %f288; fma.rn.f32 %f98, %f88, %f88, %f289; setp.eq.f32 %p58, %f98, 0f00000000; @%p58 bra $L__BB0_56; bra.uni $L__BB0_57; $L__BB0_56: mul.f32 %f293, %f106, %f106; fma.rn.f32 %f294, %f107, %f107, %f293; fma.rn.f32 %f295, %f105, %f105, %f294; setp.eq.f32 %p59, %f295, 0f00000000; mov.f32 %f466, 0f00000000; mov.f32 %f467, %f466; mov.f32 %f468, %f466; @%p59 bra $L__BB0_69; $L__BB0_57: mul.f32 %f296, %f75, %f75; fma.rn.f32 %f297, %f74, %f74, %f296; fma.rn.f32 %f111, %f76, %f76, %f297; setp.neu.f32 %p60, %f111, 0f00000000; mul.f32 %f298, %f93, %f93; fma.rn.f32 %f299, %f92, %f92, %f298; fma.rn.f32 %f115, %f94, %f94, %f299; setp.neu.f32 %p61, %f115, 0f00000000; and.pred %p62, %p60, %p61; or.pred %p64, %p58, %p62; @%p64 bra $L__BB0_59; mul.f32 %f300, %f106, %f106; fma.rn.f32 %f301, %f107, %f107, %f300; fma.rn.f32 %f302, %f105, %f105, %f301; setp.neu.f32 %p65, %f302, 0f00000000; @%p65 bra $L__BB0_68; bra.uni $L__BB0_59; $L__BB0_68: sub.f32 %f335, %f86, %f107; mul.f32 %f468, %f335, 0f3F000000; sub.f32 %f336, %f87, %f106; mul.f32 %f467, 
%f336, 0f3F000000; sub.f32 %f337, %f88, %f105; mul.f32 %f466, %f337, 0f3F000000; bra.uni $L__BB0_69; $L__BB0_59: setp.eq.f32 %p66, %f111, 0f00000000; and.pred %p68, %p66, %p58; @%p68 bra $L__BB0_67; bra.uni $L__BB0_60; $L__BB0_67: sub.f32 %f468, %f1, %f107; sub.f32 %f467, %f2, %f106; sub.f32 %f466, %f3, %f105; bra.uni $L__BB0_69; $L__BB0_60: setp.eq.f32 %p69, %f115, 0f00000000; mul.f32 %f303, %f106, %f106; fma.rn.f32 %f304, %f107, %f107, %f303; fma.rn.f32 %f116, %f105, %f105, %f304; setp.eq.f32 %p70, %f116, 0f00000000; and.pred %p71, %p70, %p69; @%p71 bra $L__BB0_66; bra.uni $L__BB0_61; $L__BB0_66: sub.f32 %f468, %f86, %f1; sub.f32 %f467, %f87, %f2; sub.f32 %f466, %f88, %f3; bra.uni $L__BB0_69; $L__BB0_61: setp.neu.f32 %p73, %f98, 0f00000000; or.pred %p74, %p66, %p73; @%p74 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: setp.neu.f32 %p75, %f116, 0f00000000; or.pred %p77, %p69, %p75; @%p77 bra $L__BB0_65; bra.uni $L__BB0_64; $L__BB0_65: sub.f32 %f326, %f86, %f107; sub.f32 %f327, %f87, %f106; sub.f32 %f328, %f88, %f105; sub.f32 %f329, %f74, %f92; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f75, %f93; mul.f32 %f332, %f331, 0f3DAAAAAB; sub.f32 %f333, %f76, %f94; mul.f32 %f334, %f333, 0f3DAAAAAB; fma.rn.f32 %f468, %f326, 0f3F2AAAAB, %f330; fma.rn.f32 %f467, %f327, 0f3F2AAAAB, %f332; fma.rn.f32 %f466, %f328, 0f3F2AAAAB, %f334; bra.uni $L__BB0_69; $L__BB0_62: mul.f32 %f305, %f74, 0f3F000000; add.f32 %f306, %f107, %f107; sub.f32 %f307, %f305, %f306; add.f32 %f308, %f106, %f106; mul.f32 %f309, %f75, 0f3F000000; sub.f32 %f310, %f309, %f308; add.f32 %f311, %f105, %f105; mul.f32 %f312, %f76, 0f3F000000; sub.f32 %f313, %f312, %f311; fma.rn.f32 %f468, %f1, 0f3FC00000, %f307; fma.rn.f32 %f467, %f2, 0f3FC00000, %f310; fma.rn.f32 %f466, %f3, 0f3FC00000, %f313; bra.uni $L__BB0_69; $L__BB0_64: mul.f32 %f314, %f92, 0f3F000000; add.f32 %f315, %f86, %f86; sub.f32 %f316, %f315, %f314; add.f32 %f317, %f87, %f87; mul.f32 %f318, %f93, 0f3F000000; sub.f32 %f319, %f317, %f318; add.f32 
%f320, %f88, %f88; mul.f32 %f321, %f94, 0f3F000000; sub.f32 %f322, %f320, %f321; mul.f32 %f323, %f1, 0f3FC00000; sub.f32 %f468, %f316, %f323; mul.f32 %f324, %f2, 0f3FC00000; sub.f32 %f467, %f319, %f324; mul.f32 %f325, %f3, 0f3FC00000; sub.f32 %f466, %f322, %f325; $L__BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p78, %rs3, 0; add.s32 %r40, %r3, -2; @%p78 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: max.s32 %r126, %r40, 0; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r126, %r102, %r59; $L__BB0_72: setp.lt.s32 %p80, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p81, %p80, %p78; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p81 bra $L__BB0_74; mad.lo.s32 %r103, %r126, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd49, %r104, 4; add.s64 %rd50, %rd3, %rd49; add.s64 %rd51, %rd2, %rd49; add.s64 %rd52, %rd1, %rd49; ld.global.nc.f32 %f143, [%rd52]; ld.global.nc.f32 %f142, [%rd51]; ld.global.nc.f32 %f141, [%rd50]; $L__BB0_74: add.s32 %r44, %r3, -1; @%p78 bra $L__BB0_76; bra.uni $L__BB0_75; $L__BB0_76: max.s32 %r127, %r44, 0; bra.uni $L__BB0_77; $L__BB0_75: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r127, %r106, %r59; $L__BB0_77: setp.lt.s32 %p83, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p85, %p83, %p78; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p85 bra $L__BB0_79; mad.lo.s32 %r107, %r127, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd53, %r108, 4; add.s64 %rd54, %rd3, %rd53; add.s64 %rd55, %rd2, %rd53; add.s64 %rd56, %rd1, %rd53; ld.global.nc.f32 %f172, [%rd56]; ld.global.nc.f32 %f173, [%rd55]; ld.global.nc.f32 %f174, [%rd54]; $L__BB0_79: add.s32 %r48, %r3, 1; @%p78 bra $L__BB0_81; bra.uni $L__BB0_80; $L__BB0_81: add.s32 %r111, %r59, -1; min.s32 %r128, %r48, %r111; bra.uni $L__BB0_82; $L__BB0_80: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r128, %r110, %r59; $L__BB0_82: setp.ge.s32 %p87, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p89, 
%p87, %p78; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p89 bra $L__BB0_84; mad.lo.s32 %r112, %r128, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd57, %r113, 4; add.s64 %rd58, %rd3, %rd57; add.s64 %rd59, %rd2, %rd57; add.s64 %rd60, %rd1, %rd57; ld.global.nc.f32 %f155, [%rd60]; ld.global.nc.f32 %f154, [%rd59]; ld.global.nc.f32 %f153, [%rd58]; $L__BB0_84: add.s32 %r52, %r3, 2; @%p78 bra $L__BB0_86; bra.uni $L__BB0_85; $L__BB0_86: add.s32 %r116, %r59, -1; min.s32 %r129, %r52, %r116; bra.uni $L__BB0_87; $L__BB0_85: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r129, %r115, %r59; $L__BB0_87: mad.lo.s32 %r117, %r129, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p91, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p93, %p91, %p78; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p93 bra $L__BB0_89; mul.wide.s32 %rd61, %r56, 4; add.s64 %rd62, %rd3, %rd61; add.s64 %rd63, %rd2, %rd61; add.s64 %rd64, %rd1, %rd61; ld.global.nc.f32 %f161, [%rd64]; ld.global.nc.f32 %f160, [%rd63]; ld.global.nc.f32 %f159, [%rd62]; $L__BB0_89: mul.f32 %f350, %f154, %f154; fma.rn.f32 %f351, %f153, %f153, %f350; fma.rn.f32 %f165, %f155, %f155, %f351; setp.eq.f32 %p94, %f165, 0f00000000; @%p94 bra $L__BB0_90; bra.uni $L__BB0_91; $L__BB0_90: mul.f32 %f355, %f173, %f173; fma.rn.f32 %f356, %f174, %f174, %f355; fma.rn.f32 %f357, %f172, %f172, %f356; setp.eq.f32 %p95, %f357, 0f00000000; mov.f32 %f484, 0f00000000; mov.f32 %f485, %f484; mov.f32 %f486, %f484; @%p95 bra $L__BB0_103; $L__BB0_91: mul.f32 %f358, %f142, %f142; fma.rn.f32 %f359, %f141, %f141, %f358; fma.rn.f32 %f178, %f143, %f143, %f359; setp.neu.f32 %p96, %f178, 0f00000000; mul.f32 %f360, %f160, %f160; fma.rn.f32 %f361, %f159, %f159, %f360; fma.rn.f32 %f182, %f161, %f161, %f361; setp.neu.f32 %p97, %f182, 0f00000000; and.pred %p98, %p96, %p97; or.pred %p100, %p94, %p98; @%p100 bra $L__BB0_93; mul.f32 %f362, %f173, %f173; fma.rn.f32 %f363, %f174, %f174, %f362; fma.rn.f32 %f364, %f172, %f172, %f363; 
setp.neu.f32 %p101, %f364, 0f00000000; @%p101 bra $L__BB0_102; bra.uni $L__BB0_93; $L__BB0_102: sub.f32 %f397, %f153, %f174; mul.f32 %f486, %f397, 0f3F000000; sub.f32 %f398, %f154, %f173; mul.f32 %f485, %f398, 0f3F000000; sub.f32 %f399, %f155, %f172; mul.f32 %f484, %f399, 0f3F000000; bra.uni $L__BB0_103; $L__BB0_93: setp.eq.f32 %p102, %f178, 0f00000000; and.pred %p104, %p102, %p94; @%p104 bra $L__BB0_101; bra.uni $L__BB0_94; $L__BB0_101: sub.f32 %f486, %f1, %f174; sub.f32 %f485, %f2, %f173; sub.f32 %f484, %f3, %f172; bra.uni $L__BB0_103; $L__BB0_94: setp.eq.f32 %p105, %f182, 0f00000000; mul.f32 %f365, %f173, %f173; fma.rn.f32 %f366, %f174, %f174, %f365; fma.rn.f32 %f183, %f172, %f172, %f366; setp.eq.f32 %p106, %f183, 0f00000000; and.pred %p107, %p106, %p105; @%p107 bra $L__BB0_100; bra.uni $L__BB0_95; $L__BB0_100: sub.f32 %f486, %f153, %f1; sub.f32 %f485, %f154, %f2; sub.f32 %f484, %f155, %f3; bra.uni $L__BB0_103; $L__BB0_95: setp.neu.f32 %p109, %f165, 0f00000000; or.pred %p110, %p102, %p109; @%p110 bra $L__BB0_97; bra.uni $L__BB0_96; $L__BB0_97: setp.neu.f32 %p111, %f183, 0f00000000; or.pred %p113, %p105, %p111; @%p113 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: sub.f32 %f388, %f153, %f174; sub.f32 %f389, %f154, %f173; sub.f32 %f390, %f155, %f172; sub.f32 %f391, %f141, %f159; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f142, %f160; mul.f32 %f394, %f393, 0f3DAAAAAB; sub.f32 %f395, %f143, %f161; mul.f32 %f396, %f395, 0f3DAAAAAB; fma.rn.f32 %f486, %f388, 0f3F2AAAAB, %f392; fma.rn.f32 %f485, %f389, 0f3F2AAAAB, %f394; fma.rn.f32 %f484, %f390, 0f3F2AAAAB, %f396; bra.uni $L__BB0_103; $L__BB0_96: mul.f32 %f367, %f141, 0f3F000000; add.f32 %f368, %f174, %f174; sub.f32 %f369, %f367, %f368; add.f32 %f370, %f173, %f173; mul.f32 %f371, %f142, 0f3F000000; sub.f32 %f372, %f371, %f370; add.f32 %f373, %f172, %f172; mul.f32 %f374, %f143, 0f3F000000; sub.f32 %f375, %f374, %f373; fma.rn.f32 %f486, %f1, 0f3FC00000, %f369; fma.rn.f32 %f485, %f2, 0f3FC00000, %f372; fma.rn.f32 
%f484, %f3, 0f3FC00000, %f375; bra.uni $L__BB0_103; $L__BB0_98: mul.f32 %f376, %f159, 0f3F000000; add.f32 %f377, %f153, %f153; sub.f32 %f378, %f377, %f376; add.f32 %f379, %f154, %f154; mul.f32 %f380, %f160, 0f3F000000; sub.f32 %f381, %f379, %f380; add.f32 %f382, %f155, %f155; mul.f32 %f383, %f161, 0f3F000000; sub.f32 %f384, %f382, %f383; mul.f32 %f385, %f1, 0f3FC00000; sub.f32 %f486, %f378, %f385; mul.f32 %f386, %f2, 0f3FC00000; sub.f32 %f485, %f381, %f386; mul.f32 %f387, %f3, 0f3FC00000; sub.f32 %f484, %f384, %f387; $L__BB0_103: setp.eq.s64 %p114, %rd8, 0; @%p114 bra $L__BB0_105; cvta.to.global.u64 %rd65, %rd8; shl.b64 %rd66, %rd4, 2; add.s64 %rd67, %rd65, %rd66; ld.global.nc.f32 %f400, [%rd67]; mul.f32 %f487, %f400, %f487; $L__BB0_105: setp.eq.s64 %p115, %rd9, 0; @%p115 bra $L__BB0_107; cvta.to.global.u64 %rd68, %rd9; shl.b64 %rd69, %rd4, 2; add.s64 %rd70, %rd68, %rd69; ld.global.nc.f32 %f401, [%rd70]; mul.f32 %f488, %f401, %f488; $L__BB0_107: mul.f32 %f402, %f450, %f211; mul.f32 %f403, %f467, %f212; mul.f32 %f404, %f484, %f213; add.f32 %f405, %f487, %f487; mul.f32 %f406, %f1, %f405; add.f32 %f407, %f403, %f404; mul.f32 %f408, %f1, %f407; mul.f32 %f409, %f468, %f212; fma.rn.f32 %f410, %f2, %f409, %f408; mul.f32 %f411, %f486, %f213; fma.rn.f32 %f412, %f3, %f411, %f410; mul.f32 %f413, %f412, %f488; fma.rn.f32 %f414, %f402, %f406, %f413; cvta.to.global.u64 %rd71, %rd5; shl.b64 %rd72, %rd4, 2; add.s64 %rd73, %rd71, %rd72; st.global.f32 [%rd73], %f414; mul.f32 %f415, %f2, %f405; add.f32 %f416, %f402, %f404; mul.f32 %f417, %f2, %f416; mul.f32 %f418, %f449, %f211; fma.rn.f32 %f419, %f1, %f418, %f417; mul.f32 %f420, %f485, %f213; fma.rn.f32 %f421, %f3, %f420, %f419; mul.f32 %f422, %f421, %f488; fma.rn.f32 %f423, %f403, %f415, %f422; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd75, %rd74, %rd72; st.global.f32 [%rd75], %f423; mul.f32 %f424, %f3, %f405; mul.f32 %f425, %f466, %f212; mul.f32 %f426, %f2, %f425; mul.f32 %f427, %f448, %f211; fma.rn.f32 %f428, %f1, %f427, %f426; 
add.f32 %f429, %f402, %f403; fma.rn.f32 %f430, %f3, %f429, %f428; mul.f32 %f431, %f430, %f488; fma.rn.f32 %f432, %f404, %f424, %f431; cvta.to.global.u64 %rd76, %rd7; add.s64 %rd77, %rd76, %rd72; st.global.f32 [%rd77], %f432; $L__BB0_108: ret; } ` getmagnetoelasticforce_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<116>; .reg .b16 %rs<5>; .reg .f32 %f<489>; .reg .b32 %r<130>; .reg .b64 %rd<78>; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd12, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_6]; ld.param.f32 %f487, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_8]; ld.param.f32 %f488, [getmagnetoelasticforce_param_9]; ld.param.f32 %f211, [getmagnetoelasticforce_param_10]; ld.param.f32 %f212, [getmagnetoelasticforce_param_11]; ld.param.f32 %f213, 
[getmagnetoelasticforce_param_12]; ld.param.u32 %r57, [getmagnetoelasticforce_param_13]; ld.param.u32 %r58, [getmagnetoelasticforce_param_14]; ld.param.u32 %r59, [getmagnetoelasticforce_param_15]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_108; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; cvt.s64.s32 %rd4, %r70; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; add.s64 %rd15, %rd2, %rd13; ld.global.nc.f32 %f2, [%rd15]; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f3, [%rd16]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p6, %rs1, 0; add.s32 %r6, %r1, -2; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r118, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r118, %r72, %r57; $L__BB0_4: setp.lt.s32 %p8, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p9, %p8, %p6; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p9 bra $L__BB0_6; add.s32 %r73, %r118, %r5; mul.wide.s32 %rd17, %r73, 4; add.s64 %rd18, %rd3, %rd17; add.s64 %rd19, %rd2, %rd17; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f9, [%rd20]; ld.global.nc.f32 %f8, [%rd19]; ld.global.nc.f32 %f7, [%rd18]; $L__BB0_6: add.s32 %r10, %r1, -1; @%p6 bra $L__BB0_8; bra.uni $L__BB0_7; $L__BB0_8: max.s32 %r119, %r10, 0; bra.uni $L__BB0_9; $L__BB0_7: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r119, %r75, %r57; $L__BB0_9: setp.lt.s32 %p11, %r1, 1; mov.f32 %f40, 0f00000000; 
and.pred %p13, %p11, %p6; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p13 bra $L__BB0_11; add.s32 %r76, %r119, %r5; mul.wide.s32 %rd21, %r76, 4; add.s64 %rd22, %rd3, %rd21; add.s64 %rd23, %rd2, %rd21; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f38, [%rd24]; ld.global.nc.f32 %f39, [%rd23]; ld.global.nc.f32 %f40, [%rd22]; $L__BB0_11: add.s32 %r14, %r1, 1; @%p6 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r79, %r57, -1; min.s32 %r120, %r14, %r79; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r120, %r78, %r57; $L__BB0_14: setp.ge.s32 %p15, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p17, %p15, %p6; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p17 bra $L__BB0_16; add.s32 %r80, %r120, %r5; mul.wide.s32 %rd25, %r80, 4; add.s64 %rd26, %rd3, %rd25; add.s64 %rd27, %rd2, %rd25; add.s64 %rd28, %rd1, %rd25; ld.global.nc.f32 %f21, [%rd28]; ld.global.nc.f32 %f20, [%rd27]; ld.global.nc.f32 %f19, [%rd26]; $L__BB0_16: add.s32 %r18, %r1, 2; @%p6 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r83, %r57, -1; min.s32 %r121, %r18, %r83; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r121, %r82, %r57; $L__BB0_19: add.s32 %r22, %r121, %r5; setp.ge.s32 %p19, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p21, %p19, %p6; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p21 bra $L__BB0_21; mul.wide.s32 %rd29, %r22, 4; add.s64 %rd30, %rd3, %rd29; add.s64 %rd31, %rd2, %rd29; add.s64 %rd32, %rd1, %rd29; ld.global.nc.f32 %f27, [%rd32]; ld.global.nc.f32 %f26, [%rd31]; ld.global.nc.f32 %f25, [%rd30]; $L__BB0_21: mul.f32 %f226, %f20, %f20; fma.rn.f32 %f227, %f19, %f19, %f226; fma.rn.f32 %f31, %f21, %f21, %f227; setp.eq.f32 %p22, %f31, 0f00000000; @%p22 bra $L__BB0_22; bra.uni $L__BB0_23; $L__BB0_22: mul.f32 %f231, %f39, %f39; fma.rn.f32 %f232, %f40, %f40, %f231; fma.rn.f32 %f233, %f38, %f38, %f232; setp.eq.f32 %p23, %f233, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f449, %f448; mov.f32 
%f450, %f448; @%p23 bra $L__BB0_35; $L__BB0_23: mul.f32 %f234, %f8, %f8; fma.rn.f32 %f235, %f7, %f7, %f234; fma.rn.f32 %f44, %f9, %f9, %f235; setp.neu.f32 %p24, %f44, 0f00000000; mul.f32 %f236, %f26, %f26; fma.rn.f32 %f237, %f25, %f25, %f236; fma.rn.f32 %f48, %f27, %f27, %f237; setp.neu.f32 %p25, %f48, 0f00000000; and.pred %p26, %p24, %p25; or.pred %p28, %p22, %p26; @%p28 bra $L__BB0_25; mul.f32 %f238, %f39, %f39; fma.rn.f32 %f239, %f40, %f40, %f238; fma.rn.f32 %f240, %f38, %f38, %f239; setp.neu.f32 %p29, %f240, 0f00000000; @%p29 bra $L__BB0_34; bra.uni $L__BB0_25; $L__BB0_34: sub.f32 %f273, %f19, %f40; mul.f32 %f450, %f273, 0f3F000000; sub.f32 %f274, %f20, %f39; mul.f32 %f449, %f274, 0f3F000000; sub.f32 %f275, %f21, %f38; mul.f32 %f448, %f275, 0f3F000000; bra.uni $L__BB0_35; $L__BB0_25: setp.eq.f32 %p30, %f44, 0f00000000; and.pred %p32, %p30, %p22; @%p32 bra $L__BB0_33; bra.uni $L__BB0_26; $L__BB0_33: sub.f32 %f450, %f1, %f40; sub.f32 %f449, %f2, %f39; sub.f32 %f448, %f3, %f38; bra.uni $L__BB0_35; $L__BB0_26: setp.eq.f32 %p33, %f48, 0f00000000; mul.f32 %f241, %f39, %f39; fma.rn.f32 %f242, %f40, %f40, %f241; fma.rn.f32 %f49, %f38, %f38, %f242; setp.eq.f32 %p34, %f49, 0f00000000; and.pred %p35, %p34, %p33; @%p35 bra $L__BB0_32; bra.uni $L__BB0_27; $L__BB0_32: sub.f32 %f450, %f19, %f1; sub.f32 %f449, %f20, %f2; sub.f32 %f448, %f21, %f3; bra.uni $L__BB0_35; $L__BB0_27: setp.neu.f32 %p37, %f31, 0f00000000; or.pred %p38, %p30, %p37; @%p38 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: setp.neu.f32 %p39, %f49, 0f00000000; or.pred %p41, %p33, %p39; @%p41 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f264, %f19, %f40; sub.f32 %f265, %f20, %f39; sub.f32 %f266, %f21, %f38; sub.f32 %f267, %f7, %f25; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f8, %f26; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f9, %f27; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f450, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f449, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f448, %f266, 
0f3F2AAAAB, %f272; bra.uni $L__BB0_35; $L__BB0_28: mul.f32 %f243, %f7, 0f3F000000; add.f32 %f244, %f40, %f40; sub.f32 %f245, %f243, %f244; add.f32 %f246, %f39, %f39; mul.f32 %f247, %f8, 0f3F000000; sub.f32 %f248, %f247, %f246; add.f32 %f249, %f38, %f38; mul.f32 %f250, %f9, 0f3F000000; sub.f32 %f251, %f250, %f249; fma.rn.f32 %f450, %f1, 0f3FC00000, %f245; fma.rn.f32 %f449, %f2, 0f3FC00000, %f248; fma.rn.f32 %f448, %f3, 0f3FC00000, %f251; bra.uni $L__BB0_35; $L__BB0_30: mul.f32 %f252, %f25, 0f3F000000; add.f32 %f253, %f19, %f19; sub.f32 %f254, %f253, %f252; add.f32 %f255, %f20, %f20; mul.f32 %f256, %f26, 0f3F000000; sub.f32 %f257, %f255, %f256; add.f32 %f258, %f21, %f21; mul.f32 %f259, %f27, 0f3F000000; sub.f32 %f260, %f258, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f450, %f254, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f449, %f257, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f448, %f260, %f263; $L__BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p42, %rs2, 0; add.s32 %r23, %r2, -2; @%p42 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: max.s32 %r122, %r23, 0; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r122, %r85, %r58; $L__BB0_38: setp.lt.s32 %p44, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p45, %p44, %p42; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p45 bra $L__BB0_40; add.s32 %r86, %r122, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd33, %r87, 4; add.s64 %rd34, %rd3, %rd33; add.s64 %rd35, %rd2, %rd33; add.s64 %rd36, %rd1, %rd33; ld.global.nc.f32 %f76, [%rd36]; ld.global.nc.f32 %f75, [%rd35]; ld.global.nc.f32 %f74, [%rd34]; $L__BB0_40: add.s32 %r27, %r2, -1; @%p42 bra $L__BB0_42; bra.uni $L__BB0_41; $L__BB0_42: max.s32 %r123, %r27, 0; bra.uni $L__BB0_43; $L__BB0_41: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r123, %r89, %r58; $L__BB0_43: setp.lt.s32 %p47, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p49, %p47, %p42; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p49 bra 
$L__BB0_45; add.s32 %r90, %r123, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd37, %r91, 4; add.s64 %rd38, %rd3, %rd37; add.s64 %rd39, %rd2, %rd37; add.s64 %rd40, %rd1, %rd37; ld.global.nc.f32 %f105, [%rd40]; ld.global.nc.f32 %f106, [%rd39]; ld.global.nc.f32 %f107, [%rd38]; $L__BB0_45: add.s32 %r31, %r2, 1; @%p42 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: add.s32 %r94, %r58, -1; min.s32 %r124, %r31, %r94; bra.uni $L__BB0_48; $L__BB0_46: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r124, %r93, %r58; $L__BB0_48: setp.ge.s32 %p51, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p53, %p51, %p42; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p53 bra $L__BB0_50; add.s32 %r95, %r124, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd41, %r96, 4; add.s64 %rd42, %rd3, %rd41; add.s64 %rd43, %rd2, %rd41; add.s64 %rd44, %rd1, %rd41; ld.global.nc.f32 %f88, [%rd44]; ld.global.nc.f32 %f87, [%rd43]; ld.global.nc.f32 %f86, [%rd42]; $L__BB0_50: add.s32 %r35, %r2, 2; @%p42 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r99, %r58, -1; min.s32 %r125, %r35, %r99; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r125, %r98, %r58; $L__BB0_53: add.s32 %r39, %r125, %r4; setp.ge.s32 %p55, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p57, %p55, %p42; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p57 bra $L__BB0_55; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd45, %r100, 4; add.s64 %rd46, %rd3, %rd45; add.s64 %rd47, %rd2, %rd45; add.s64 %rd48, %rd1, %rd45; ld.global.nc.f32 %f94, [%rd48]; ld.global.nc.f32 %f93, [%rd47]; ld.global.nc.f32 %f92, [%rd46]; $L__BB0_55: mul.f32 %f288, %f87, %f87; fma.rn.f32 %f289, %f86, %f86, %f288; fma.rn.f32 %f98, %f88, %f88, %f289; setp.eq.f32 %p58, %f98, 0f00000000; @%p58 bra $L__BB0_56; bra.uni $L__BB0_57; $L__BB0_56: mul.f32 %f293, %f106, %f106; fma.rn.f32 %f294, %f107, %f107, %f293; fma.rn.f32 %f295, %f105, %f105, %f294; setp.eq.f32 %p59, %f295, 0f00000000; mov.f32 %f466, 
0f00000000; mov.f32 %f467, %f466; mov.f32 %f468, %f466; @%p59 bra $L__BB0_69; $L__BB0_57: mul.f32 %f296, %f75, %f75; fma.rn.f32 %f297, %f74, %f74, %f296; fma.rn.f32 %f111, %f76, %f76, %f297; setp.neu.f32 %p60, %f111, 0f00000000; mul.f32 %f298, %f93, %f93; fma.rn.f32 %f299, %f92, %f92, %f298; fma.rn.f32 %f115, %f94, %f94, %f299; setp.neu.f32 %p61, %f115, 0f00000000; and.pred %p62, %p60, %p61; or.pred %p64, %p58, %p62; @%p64 bra $L__BB0_59; mul.f32 %f300, %f106, %f106; fma.rn.f32 %f301, %f107, %f107, %f300; fma.rn.f32 %f302, %f105, %f105, %f301; setp.neu.f32 %p65, %f302, 0f00000000; @%p65 bra $L__BB0_68; bra.uni $L__BB0_59; $L__BB0_68: sub.f32 %f335, %f86, %f107; mul.f32 %f468, %f335, 0f3F000000; sub.f32 %f336, %f87, %f106; mul.f32 %f467, %f336, 0f3F000000; sub.f32 %f337, %f88, %f105; mul.f32 %f466, %f337, 0f3F000000; bra.uni $L__BB0_69; $L__BB0_59: setp.eq.f32 %p66, %f111, 0f00000000; and.pred %p68, %p66, %p58; @%p68 bra $L__BB0_67; bra.uni $L__BB0_60; $L__BB0_67: sub.f32 %f468, %f1, %f107; sub.f32 %f467, %f2, %f106; sub.f32 %f466, %f3, %f105; bra.uni $L__BB0_69; $L__BB0_60: setp.eq.f32 %p69, %f115, 0f00000000; mul.f32 %f303, %f106, %f106; fma.rn.f32 %f304, %f107, %f107, %f303; fma.rn.f32 %f116, %f105, %f105, %f304; setp.eq.f32 %p70, %f116, 0f00000000; and.pred %p71, %p70, %p69; @%p71 bra $L__BB0_66; bra.uni $L__BB0_61; $L__BB0_66: sub.f32 %f468, %f86, %f1; sub.f32 %f467, %f87, %f2; sub.f32 %f466, %f88, %f3; bra.uni $L__BB0_69; $L__BB0_61: setp.neu.f32 %p73, %f98, 0f00000000; or.pred %p74, %p66, %p73; @%p74 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: setp.neu.f32 %p75, %f116, 0f00000000; or.pred %p77, %p69, %p75; @%p77 bra $L__BB0_65; bra.uni $L__BB0_64; $L__BB0_65: sub.f32 %f326, %f86, %f107; sub.f32 %f327, %f87, %f106; sub.f32 %f328, %f88, %f105; sub.f32 %f329, %f74, %f92; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f75, %f93; mul.f32 %f332, %f331, 0f3DAAAAAB; sub.f32 %f333, %f76, %f94; mul.f32 %f334, %f333, 0f3DAAAAAB; fma.rn.f32 %f468, %f326, 
0f3F2AAAAB, %f330; fma.rn.f32 %f467, %f327, 0f3F2AAAAB, %f332; fma.rn.f32 %f466, %f328, 0f3F2AAAAB, %f334; bra.uni $L__BB0_69; $L__BB0_62: mul.f32 %f305, %f74, 0f3F000000; add.f32 %f306, %f107, %f107; sub.f32 %f307, %f305, %f306; add.f32 %f308, %f106, %f106; mul.f32 %f309, %f75, 0f3F000000; sub.f32 %f310, %f309, %f308; add.f32 %f311, %f105, %f105; mul.f32 %f312, %f76, 0f3F000000; sub.f32 %f313, %f312, %f311; fma.rn.f32 %f468, %f1, 0f3FC00000, %f307; fma.rn.f32 %f467, %f2, 0f3FC00000, %f310; fma.rn.f32 %f466, %f3, 0f3FC00000, %f313; bra.uni $L__BB0_69; $L__BB0_64: mul.f32 %f314, %f92, 0f3F000000; add.f32 %f315, %f86, %f86; sub.f32 %f316, %f315, %f314; add.f32 %f317, %f87, %f87; mul.f32 %f318, %f93, 0f3F000000; sub.f32 %f319, %f317, %f318; add.f32 %f320, %f88, %f88; mul.f32 %f321, %f94, 0f3F000000; sub.f32 %f322, %f320, %f321; mul.f32 %f323, %f1, 0f3FC00000; sub.f32 %f468, %f316, %f323; mul.f32 %f324, %f2, 0f3FC00000; sub.f32 %f467, %f319, %f324; mul.f32 %f325, %f3, 0f3FC00000; sub.f32 %f466, %f322, %f325; $L__BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p78, %rs3, 0; add.s32 %r40, %r3, -2; @%p78 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: max.s32 %r126, %r40, 0; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r126, %r102, %r59; $L__BB0_72: setp.lt.s32 %p80, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p81, %p80, %p78; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p81 bra $L__BB0_74; mad.lo.s32 %r103, %r126, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd49, %r104, 4; add.s64 %rd50, %rd3, %rd49; add.s64 %rd51, %rd2, %rd49; add.s64 %rd52, %rd1, %rd49; ld.global.nc.f32 %f143, [%rd52]; ld.global.nc.f32 %f142, [%rd51]; ld.global.nc.f32 %f141, [%rd50]; $L__BB0_74: add.s32 %r44, %r3, -1; @%p78 bra $L__BB0_76; bra.uni $L__BB0_75; $L__BB0_76: max.s32 %r127, %r44, 0; bra.uni $L__BB0_77; $L__BB0_75: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r127, %r106, %r59; $L__BB0_77: setp.lt.s32 %p83, 
%r3, 1; mov.f32 %f174, 0f00000000; and.pred %p85, %p83, %p78; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p85 bra $L__BB0_79; mad.lo.s32 %r107, %r127, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd53, %r108, 4; add.s64 %rd54, %rd3, %rd53; add.s64 %rd55, %rd2, %rd53; add.s64 %rd56, %rd1, %rd53; ld.global.nc.f32 %f172, [%rd56]; ld.global.nc.f32 %f173, [%rd55]; ld.global.nc.f32 %f174, [%rd54]; $L__BB0_79: add.s32 %r48, %r3, 1; @%p78 bra $L__BB0_81; bra.uni $L__BB0_80; $L__BB0_81: add.s32 %r111, %r59, -1; min.s32 %r128, %r48, %r111; bra.uni $L__BB0_82; $L__BB0_80: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r128, %r110, %r59; $L__BB0_82: setp.ge.s32 %p87, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p89, %p87, %p78; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p89 bra $L__BB0_84; mad.lo.s32 %r112, %r128, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd57, %r113, 4; add.s64 %rd58, %rd3, %rd57; add.s64 %rd59, %rd2, %rd57; add.s64 %rd60, %rd1, %rd57; ld.global.nc.f32 %f155, [%rd60]; ld.global.nc.f32 %f154, [%rd59]; ld.global.nc.f32 %f153, [%rd58]; $L__BB0_84: add.s32 %r52, %r3, 2; @%p78 bra $L__BB0_86; bra.uni $L__BB0_85; $L__BB0_86: add.s32 %r116, %r59, -1; min.s32 %r129, %r52, %r116; bra.uni $L__BB0_87; $L__BB0_85: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r129, %r115, %r59; $L__BB0_87: mad.lo.s32 %r117, %r129, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p91, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p93, %p91, %p78; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p93 bra $L__BB0_89; mul.wide.s32 %rd61, %r56, 4; add.s64 %rd62, %rd3, %rd61; add.s64 %rd63, %rd2, %rd61; add.s64 %rd64, %rd1, %rd61; ld.global.nc.f32 %f161, [%rd64]; ld.global.nc.f32 %f160, [%rd63]; ld.global.nc.f32 %f159, [%rd62]; $L__BB0_89: mul.f32 %f350, %f154, %f154; fma.rn.f32 %f351, %f153, %f153, %f350; fma.rn.f32 %f165, %f155, %f155, %f351; setp.eq.f32 %p94, %f165, 0f00000000; @%p94 bra $L__BB0_90; 
bra.uni $L__BB0_91; $L__BB0_90: mul.f32 %f355, %f173, %f173; fma.rn.f32 %f356, %f174, %f174, %f355; fma.rn.f32 %f357, %f172, %f172, %f356; setp.eq.f32 %p95, %f357, 0f00000000; mov.f32 %f484, 0f00000000; mov.f32 %f485, %f484; mov.f32 %f486, %f484; @%p95 bra $L__BB0_103; $L__BB0_91: mul.f32 %f358, %f142, %f142; fma.rn.f32 %f359, %f141, %f141, %f358; fma.rn.f32 %f178, %f143, %f143, %f359; setp.neu.f32 %p96, %f178, 0f00000000; mul.f32 %f360, %f160, %f160; fma.rn.f32 %f361, %f159, %f159, %f360; fma.rn.f32 %f182, %f161, %f161, %f361; setp.neu.f32 %p97, %f182, 0f00000000; and.pred %p98, %p96, %p97; or.pred %p100, %p94, %p98; @%p100 bra $L__BB0_93; mul.f32 %f362, %f173, %f173; fma.rn.f32 %f363, %f174, %f174, %f362; fma.rn.f32 %f364, %f172, %f172, %f363; setp.neu.f32 %p101, %f364, 0f00000000; @%p101 bra $L__BB0_102; bra.uni $L__BB0_93; $L__BB0_102: sub.f32 %f397, %f153, %f174; mul.f32 %f486, %f397, 0f3F000000; sub.f32 %f398, %f154, %f173; mul.f32 %f485, %f398, 0f3F000000; sub.f32 %f399, %f155, %f172; mul.f32 %f484, %f399, 0f3F000000; bra.uni $L__BB0_103; $L__BB0_93: setp.eq.f32 %p102, %f178, 0f00000000; and.pred %p104, %p102, %p94; @%p104 bra $L__BB0_101; bra.uni $L__BB0_94; $L__BB0_101: sub.f32 %f486, %f1, %f174; sub.f32 %f485, %f2, %f173; sub.f32 %f484, %f3, %f172; bra.uni $L__BB0_103; $L__BB0_94: setp.eq.f32 %p105, %f182, 0f00000000; mul.f32 %f365, %f173, %f173; fma.rn.f32 %f366, %f174, %f174, %f365; fma.rn.f32 %f183, %f172, %f172, %f366; setp.eq.f32 %p106, %f183, 0f00000000; and.pred %p107, %p106, %p105; @%p107 bra $L__BB0_100; bra.uni $L__BB0_95; $L__BB0_100: sub.f32 %f486, %f153, %f1; sub.f32 %f485, %f154, %f2; sub.f32 %f484, %f155, %f3; bra.uni $L__BB0_103; $L__BB0_95: setp.neu.f32 %p109, %f165, 0f00000000; or.pred %p110, %p102, %p109; @%p110 bra $L__BB0_97; bra.uni $L__BB0_96; $L__BB0_97: setp.neu.f32 %p111, %f183, 0f00000000; or.pred %p113, %p105, %p111; @%p113 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: sub.f32 %f388, %f153, %f174; sub.f32 %f389, %f154, %f173; 
sub.f32 %f390, %f155, %f172; sub.f32 %f391, %f141, %f159; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f142, %f160; mul.f32 %f394, %f393, 0f3DAAAAAB; sub.f32 %f395, %f143, %f161; mul.f32 %f396, %f395, 0f3DAAAAAB; fma.rn.f32 %f486, %f388, 0f3F2AAAAB, %f392; fma.rn.f32 %f485, %f389, 0f3F2AAAAB, %f394; fma.rn.f32 %f484, %f390, 0f3F2AAAAB, %f396; bra.uni $L__BB0_103; $L__BB0_96: mul.f32 %f367, %f141, 0f3F000000; add.f32 %f368, %f174, %f174; sub.f32 %f369, %f367, %f368; add.f32 %f370, %f173, %f173; mul.f32 %f371, %f142, 0f3F000000; sub.f32 %f372, %f371, %f370; add.f32 %f373, %f172, %f172; mul.f32 %f374, %f143, 0f3F000000; sub.f32 %f375, %f374, %f373; fma.rn.f32 %f486, %f1, 0f3FC00000, %f369; fma.rn.f32 %f485, %f2, 0f3FC00000, %f372; fma.rn.f32 %f484, %f3, 0f3FC00000, %f375; bra.uni $L__BB0_103; $L__BB0_98: mul.f32 %f376, %f159, 0f3F000000; add.f32 %f377, %f153, %f153; sub.f32 %f378, %f377, %f376; add.f32 %f379, %f154, %f154; mul.f32 %f380, %f160, 0f3F000000; sub.f32 %f381, %f379, %f380; add.f32 %f382, %f155, %f155; mul.f32 %f383, %f161, 0f3F000000; sub.f32 %f384, %f382, %f383; mul.f32 %f385, %f1, 0f3FC00000; sub.f32 %f486, %f378, %f385; mul.f32 %f386, %f2, 0f3FC00000; sub.f32 %f485, %f381, %f386; mul.f32 %f387, %f3, 0f3FC00000; sub.f32 %f484, %f384, %f387; $L__BB0_103: setp.eq.s64 %p114, %rd8, 0; @%p114 bra $L__BB0_105; cvta.to.global.u64 %rd65, %rd8; shl.b64 %rd66, %rd4, 2; add.s64 %rd67, %rd65, %rd66; ld.global.nc.f32 %f400, [%rd67]; mul.f32 %f487, %f400, %f487; $L__BB0_105: setp.eq.s64 %p115, %rd9, 0; @%p115 bra $L__BB0_107; cvta.to.global.u64 %rd68, %rd9; shl.b64 %rd69, %rd4, 2; add.s64 %rd70, %rd68, %rd69; ld.global.nc.f32 %f401, [%rd70]; mul.f32 %f488, %f401, %f488; $L__BB0_107: mul.f32 %f402, %f450, %f211; mul.f32 %f403, %f467, %f212; mul.f32 %f404, %f484, %f213; add.f32 %f405, %f487, %f487; mul.f32 %f406, %f1, %f405; add.f32 %f407, %f403, %f404; mul.f32 %f408, %f1, %f407; mul.f32 %f409, %f468, %f212; fma.rn.f32 %f410, %f2, %f409, %f408; mul.f32 %f411, 
%f486, %f213; fma.rn.f32 %f412, %f3, %f411, %f410; mul.f32 %f413, %f412, %f488; fma.rn.f32 %f414, %f402, %f406, %f413; cvta.to.global.u64 %rd71, %rd5; shl.b64 %rd72, %rd4, 2; add.s64 %rd73, %rd71, %rd72; st.global.f32 [%rd73], %f414; mul.f32 %f415, %f2, %f405; add.f32 %f416, %f402, %f404; mul.f32 %f417, %f2, %f416; mul.f32 %f418, %f449, %f211; fma.rn.f32 %f419, %f1, %f418, %f417; mul.f32 %f420, %f485, %f213; fma.rn.f32 %f421, %f3, %f420, %f419; mul.f32 %f422, %f421, %f488; fma.rn.f32 %f423, %f403, %f415, %f422; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd75, %rd74, %rd72; st.global.f32 [%rd75], %f423; mul.f32 %f424, %f3, %f405; mul.f32 %f425, %f466, %f212; mul.f32 %f426, %f2, %f425; mul.f32 %f427, %f448, %f211; fma.rn.f32 %f428, %f1, %f427, %f426; add.f32 %f429, %f402, %f403; fma.rn.f32 %f430, %f3, %f429, %f428; mul.f32 %f431, %f430, %f488; fma.rn.f32 %f432, %f404, %f424, %f431; cvta.to.global.u64 %rd76, %rd7; add.s64 %rd77, %rd76, %rd72; st.global.f32 [%rd77], %f432; $L__BB0_108: ret; } ` getmagnetoelasticforce_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<116>; .reg .b16 %rs<5>; .reg .f32 %f<489>; .reg .b32 %r<130>; .reg .b64 
%rd<78>; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd12, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_6]; ld.param.f32 %f487, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_8]; ld.param.f32 %f488, [getmagnetoelasticforce_param_9]; ld.param.f32 %f211, [getmagnetoelasticforce_param_10]; ld.param.f32 %f212, [getmagnetoelasticforce_param_11]; ld.param.f32 %f213, [getmagnetoelasticforce_param_12]; ld.param.u32 %r57, [getmagnetoelasticforce_param_13]; ld.param.u32 %r58, [getmagnetoelasticforce_param_14]; ld.param.u32 %r59, [getmagnetoelasticforce_param_15]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_108; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; cvt.s64.s32 %rd4, %r70; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; add.s64 %rd15, %rd2, %rd13; ld.global.nc.f32 %f2, [%rd15]; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f3, [%rd16]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p6, %rs1, 0; add.s32 %r6, %r1, -2; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r118, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 
%r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r118, %r72, %r57; $L__BB0_4: setp.lt.s32 %p8, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p9, %p8, %p6; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p9 bra $L__BB0_6; add.s32 %r73, %r118, %r5; mul.wide.s32 %rd17, %r73, 4; add.s64 %rd18, %rd3, %rd17; add.s64 %rd19, %rd2, %rd17; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f9, [%rd20]; ld.global.nc.f32 %f8, [%rd19]; ld.global.nc.f32 %f7, [%rd18]; $L__BB0_6: add.s32 %r10, %r1, -1; @%p6 bra $L__BB0_8; bra.uni $L__BB0_7; $L__BB0_8: max.s32 %r119, %r10, 0; bra.uni $L__BB0_9; $L__BB0_7: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r119, %r75, %r57; $L__BB0_9: setp.lt.s32 %p11, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p13, %p11, %p6; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p13 bra $L__BB0_11; add.s32 %r76, %r119, %r5; mul.wide.s32 %rd21, %r76, 4; add.s64 %rd22, %rd3, %rd21; add.s64 %rd23, %rd2, %rd21; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f38, [%rd24]; ld.global.nc.f32 %f39, [%rd23]; ld.global.nc.f32 %f40, [%rd22]; $L__BB0_11: add.s32 %r14, %r1, 1; @%p6 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r79, %r57, -1; min.s32 %r120, %r14, %r79; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r120, %r78, %r57; $L__BB0_14: setp.ge.s32 %p15, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p17, %p15, %p6; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p17 bra $L__BB0_16; add.s32 %r80, %r120, %r5; mul.wide.s32 %rd25, %r80, 4; add.s64 %rd26, %rd3, %rd25; add.s64 %rd27, %rd2, %rd25; add.s64 %rd28, %rd1, %rd25; ld.global.nc.f32 %f21, [%rd28]; ld.global.nc.f32 %f20, [%rd27]; ld.global.nc.f32 %f19, [%rd26]; $L__BB0_16: add.s32 %r18, %r1, 2; @%p6 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r83, %r57, -1; min.s32 %r121, %r18, %r83; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r121, %r82, %r57; $L__BB0_19: add.s32 %r22, %r121, %r5; setp.ge.s32 %p19, %r18, 
%r57; mov.f32 %f25, 0f00000000; and.pred %p21, %p19, %p6; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p21 bra $L__BB0_21; mul.wide.s32 %rd29, %r22, 4; add.s64 %rd30, %rd3, %rd29; add.s64 %rd31, %rd2, %rd29; add.s64 %rd32, %rd1, %rd29; ld.global.nc.f32 %f27, [%rd32]; ld.global.nc.f32 %f26, [%rd31]; ld.global.nc.f32 %f25, [%rd30]; $L__BB0_21: mul.f32 %f226, %f20, %f20; fma.rn.f32 %f227, %f19, %f19, %f226; fma.rn.f32 %f31, %f21, %f21, %f227; setp.eq.f32 %p22, %f31, 0f00000000; @%p22 bra $L__BB0_22; bra.uni $L__BB0_23; $L__BB0_22: mul.f32 %f231, %f39, %f39; fma.rn.f32 %f232, %f40, %f40, %f231; fma.rn.f32 %f233, %f38, %f38, %f232; setp.eq.f32 %p23, %f233, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f449, %f448; mov.f32 %f450, %f448; @%p23 bra $L__BB0_35; $L__BB0_23: mul.f32 %f234, %f8, %f8; fma.rn.f32 %f235, %f7, %f7, %f234; fma.rn.f32 %f44, %f9, %f9, %f235; setp.neu.f32 %p24, %f44, 0f00000000; mul.f32 %f236, %f26, %f26; fma.rn.f32 %f237, %f25, %f25, %f236; fma.rn.f32 %f48, %f27, %f27, %f237; setp.neu.f32 %p25, %f48, 0f00000000; and.pred %p26, %p24, %p25; or.pred %p28, %p22, %p26; @%p28 bra $L__BB0_25; mul.f32 %f238, %f39, %f39; fma.rn.f32 %f239, %f40, %f40, %f238; fma.rn.f32 %f240, %f38, %f38, %f239; setp.neu.f32 %p29, %f240, 0f00000000; @%p29 bra $L__BB0_34; bra.uni $L__BB0_25; $L__BB0_34: sub.f32 %f273, %f19, %f40; mul.f32 %f450, %f273, 0f3F000000; sub.f32 %f274, %f20, %f39; mul.f32 %f449, %f274, 0f3F000000; sub.f32 %f275, %f21, %f38; mul.f32 %f448, %f275, 0f3F000000; bra.uni $L__BB0_35; $L__BB0_25: setp.eq.f32 %p30, %f44, 0f00000000; and.pred %p32, %p30, %p22; @%p32 bra $L__BB0_33; bra.uni $L__BB0_26; $L__BB0_33: sub.f32 %f450, %f1, %f40; sub.f32 %f449, %f2, %f39; sub.f32 %f448, %f3, %f38; bra.uni $L__BB0_35; $L__BB0_26: setp.eq.f32 %p33, %f48, 0f00000000; mul.f32 %f241, %f39, %f39; fma.rn.f32 %f242, %f40, %f40, %f241; fma.rn.f32 %f49, %f38, %f38, %f242; setp.eq.f32 %p34, %f49, 0f00000000; and.pred %p35, %p34, %p33; @%p35 bra $L__BB0_32; bra.uni $L__BB0_27; 
$L__BB0_32: sub.f32 %f450, %f19, %f1; sub.f32 %f449, %f20, %f2; sub.f32 %f448, %f21, %f3; bra.uni $L__BB0_35; $L__BB0_27: setp.neu.f32 %p37, %f31, 0f00000000; or.pred %p38, %p30, %p37; @%p38 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: setp.neu.f32 %p39, %f49, 0f00000000; or.pred %p41, %p33, %p39; @%p41 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f264, %f19, %f40; sub.f32 %f265, %f20, %f39; sub.f32 %f266, %f21, %f38; sub.f32 %f267, %f7, %f25; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f8, %f26; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f9, %f27; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f450, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f449, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f448, %f266, 0f3F2AAAAB, %f272; bra.uni $L__BB0_35; $L__BB0_28: mul.f32 %f243, %f7, 0f3F000000; add.f32 %f244, %f40, %f40; sub.f32 %f245, %f243, %f244; add.f32 %f246, %f39, %f39; mul.f32 %f247, %f8, 0f3F000000; sub.f32 %f248, %f247, %f246; add.f32 %f249, %f38, %f38; mul.f32 %f250, %f9, 0f3F000000; sub.f32 %f251, %f250, %f249; fma.rn.f32 %f450, %f1, 0f3FC00000, %f245; fma.rn.f32 %f449, %f2, 0f3FC00000, %f248; fma.rn.f32 %f448, %f3, 0f3FC00000, %f251; bra.uni $L__BB0_35; $L__BB0_30: mul.f32 %f252, %f25, 0f3F000000; add.f32 %f253, %f19, %f19; sub.f32 %f254, %f253, %f252; add.f32 %f255, %f20, %f20; mul.f32 %f256, %f26, 0f3F000000; sub.f32 %f257, %f255, %f256; add.f32 %f258, %f21, %f21; mul.f32 %f259, %f27, 0f3F000000; sub.f32 %f260, %f258, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f450, %f254, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f449, %f257, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f448, %f260, %f263; $L__BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p42, %rs2, 0; add.s32 %r23, %r2, -2; @%p42 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: max.s32 %r122, %r23, 0; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r122, %r85, %r58; $L__BB0_38: setp.lt.s32 %p44, %r2, 2; mov.f32 %f74, 0f00000000; and.pred 
%p45, %p44, %p42; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p45 bra $L__BB0_40; add.s32 %r86, %r122, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd33, %r87, 4; add.s64 %rd34, %rd3, %rd33; add.s64 %rd35, %rd2, %rd33; add.s64 %rd36, %rd1, %rd33; ld.global.nc.f32 %f76, [%rd36]; ld.global.nc.f32 %f75, [%rd35]; ld.global.nc.f32 %f74, [%rd34]; $L__BB0_40: add.s32 %r27, %r2, -1; @%p42 bra $L__BB0_42; bra.uni $L__BB0_41; $L__BB0_42: max.s32 %r123, %r27, 0; bra.uni $L__BB0_43; $L__BB0_41: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r123, %r89, %r58; $L__BB0_43: setp.lt.s32 %p47, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p49, %p47, %p42; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p49 bra $L__BB0_45; add.s32 %r90, %r123, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd37, %r91, 4; add.s64 %rd38, %rd3, %rd37; add.s64 %rd39, %rd2, %rd37; add.s64 %rd40, %rd1, %rd37; ld.global.nc.f32 %f105, [%rd40]; ld.global.nc.f32 %f106, [%rd39]; ld.global.nc.f32 %f107, [%rd38]; $L__BB0_45: add.s32 %r31, %r2, 1; @%p42 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: add.s32 %r94, %r58, -1; min.s32 %r124, %r31, %r94; bra.uni $L__BB0_48; $L__BB0_46: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r124, %r93, %r58; $L__BB0_48: setp.ge.s32 %p51, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p53, %p51, %p42; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p53 bra $L__BB0_50; add.s32 %r95, %r124, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd41, %r96, 4; add.s64 %rd42, %rd3, %rd41; add.s64 %rd43, %rd2, %rd41; add.s64 %rd44, %rd1, %rd41; ld.global.nc.f32 %f88, [%rd44]; ld.global.nc.f32 %f87, [%rd43]; ld.global.nc.f32 %f86, [%rd42]; $L__BB0_50: add.s32 %r35, %r2, 2; @%p42 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r99, %r58, -1; min.s32 %r125, %r35, %r99; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r125, %r98, %r58; $L__BB0_53: add.s32 %r39, %r125, %r4; setp.ge.s32 %p55, %r35, %r58; 
mov.f32 %f92, 0f00000000; and.pred %p57, %p55, %p42; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p57 bra $L__BB0_55; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd45, %r100, 4; add.s64 %rd46, %rd3, %rd45; add.s64 %rd47, %rd2, %rd45; add.s64 %rd48, %rd1, %rd45; ld.global.nc.f32 %f94, [%rd48]; ld.global.nc.f32 %f93, [%rd47]; ld.global.nc.f32 %f92, [%rd46]; $L__BB0_55: mul.f32 %f288, %f87, %f87; fma.rn.f32 %f289, %f86, %f86, %f288; fma.rn.f32 %f98, %f88, %f88, %f289; setp.eq.f32 %p58, %f98, 0f00000000; @%p58 bra $L__BB0_56; bra.uni $L__BB0_57; $L__BB0_56: mul.f32 %f293, %f106, %f106; fma.rn.f32 %f294, %f107, %f107, %f293; fma.rn.f32 %f295, %f105, %f105, %f294; setp.eq.f32 %p59, %f295, 0f00000000; mov.f32 %f466, 0f00000000; mov.f32 %f467, %f466; mov.f32 %f468, %f466; @%p59 bra $L__BB0_69; $L__BB0_57: mul.f32 %f296, %f75, %f75; fma.rn.f32 %f297, %f74, %f74, %f296; fma.rn.f32 %f111, %f76, %f76, %f297; setp.neu.f32 %p60, %f111, 0f00000000; mul.f32 %f298, %f93, %f93; fma.rn.f32 %f299, %f92, %f92, %f298; fma.rn.f32 %f115, %f94, %f94, %f299; setp.neu.f32 %p61, %f115, 0f00000000; and.pred %p62, %p60, %p61; or.pred %p64, %p58, %p62; @%p64 bra $L__BB0_59; mul.f32 %f300, %f106, %f106; fma.rn.f32 %f301, %f107, %f107, %f300; fma.rn.f32 %f302, %f105, %f105, %f301; setp.neu.f32 %p65, %f302, 0f00000000; @%p65 bra $L__BB0_68; bra.uni $L__BB0_59; $L__BB0_68: sub.f32 %f335, %f86, %f107; mul.f32 %f468, %f335, 0f3F000000; sub.f32 %f336, %f87, %f106; mul.f32 %f467, %f336, 0f3F000000; sub.f32 %f337, %f88, %f105; mul.f32 %f466, %f337, 0f3F000000; bra.uni $L__BB0_69; $L__BB0_59: setp.eq.f32 %p66, %f111, 0f00000000; and.pred %p68, %p66, %p58; @%p68 bra $L__BB0_67; bra.uni $L__BB0_60; $L__BB0_67: sub.f32 %f468, %f1, %f107; sub.f32 %f467, %f2, %f106; sub.f32 %f466, %f3, %f105; bra.uni $L__BB0_69; $L__BB0_60: setp.eq.f32 %p69, %f115, 0f00000000; mul.f32 %f303, %f106, %f106; fma.rn.f32 %f304, %f107, %f107, %f303; fma.rn.f32 %f116, %f105, %f105, %f304; setp.eq.f32 %p70, %f116, 0f00000000; 
and.pred %p71, %p70, %p69; @%p71 bra $L__BB0_66; bra.uni $L__BB0_61; $L__BB0_66: sub.f32 %f468, %f86, %f1; sub.f32 %f467, %f87, %f2; sub.f32 %f466, %f88, %f3; bra.uni $L__BB0_69; $L__BB0_61: setp.neu.f32 %p73, %f98, 0f00000000; or.pred %p74, %p66, %p73; @%p74 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: setp.neu.f32 %p75, %f116, 0f00000000; or.pred %p77, %p69, %p75; @%p77 bra $L__BB0_65; bra.uni $L__BB0_64; $L__BB0_65: sub.f32 %f326, %f86, %f107; sub.f32 %f327, %f87, %f106; sub.f32 %f328, %f88, %f105; sub.f32 %f329, %f74, %f92; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f75, %f93; mul.f32 %f332, %f331, 0f3DAAAAAB; sub.f32 %f333, %f76, %f94; mul.f32 %f334, %f333, 0f3DAAAAAB; fma.rn.f32 %f468, %f326, 0f3F2AAAAB, %f330; fma.rn.f32 %f467, %f327, 0f3F2AAAAB, %f332; fma.rn.f32 %f466, %f328, 0f3F2AAAAB, %f334; bra.uni $L__BB0_69; $L__BB0_62: mul.f32 %f305, %f74, 0f3F000000; add.f32 %f306, %f107, %f107; sub.f32 %f307, %f305, %f306; add.f32 %f308, %f106, %f106; mul.f32 %f309, %f75, 0f3F000000; sub.f32 %f310, %f309, %f308; add.f32 %f311, %f105, %f105; mul.f32 %f312, %f76, 0f3F000000; sub.f32 %f313, %f312, %f311; fma.rn.f32 %f468, %f1, 0f3FC00000, %f307; fma.rn.f32 %f467, %f2, 0f3FC00000, %f310; fma.rn.f32 %f466, %f3, 0f3FC00000, %f313; bra.uni $L__BB0_69; $L__BB0_64: mul.f32 %f314, %f92, 0f3F000000; add.f32 %f315, %f86, %f86; sub.f32 %f316, %f315, %f314; add.f32 %f317, %f87, %f87; mul.f32 %f318, %f93, 0f3F000000; sub.f32 %f319, %f317, %f318; add.f32 %f320, %f88, %f88; mul.f32 %f321, %f94, 0f3F000000; sub.f32 %f322, %f320, %f321; mul.f32 %f323, %f1, 0f3FC00000; sub.f32 %f468, %f316, %f323; mul.f32 %f324, %f2, 0f3FC00000; sub.f32 %f467, %f319, %f324; mul.f32 %f325, %f3, 0f3FC00000; sub.f32 %f466, %f322, %f325; $L__BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p78, %rs3, 0; add.s32 %r40, %r3, -2; @%p78 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: max.s32 %r126, %r40, 0; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 
%r126, %r102, %r59; $L__BB0_72: setp.lt.s32 %p80, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p81, %p80, %p78; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p81 bra $L__BB0_74; mad.lo.s32 %r103, %r126, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd49, %r104, 4; add.s64 %rd50, %rd3, %rd49; add.s64 %rd51, %rd2, %rd49; add.s64 %rd52, %rd1, %rd49; ld.global.nc.f32 %f143, [%rd52]; ld.global.nc.f32 %f142, [%rd51]; ld.global.nc.f32 %f141, [%rd50]; $L__BB0_74: add.s32 %r44, %r3, -1; @%p78 bra $L__BB0_76; bra.uni $L__BB0_75; $L__BB0_76: max.s32 %r127, %r44, 0; bra.uni $L__BB0_77; $L__BB0_75: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r127, %r106, %r59; $L__BB0_77: setp.lt.s32 %p83, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p85, %p83, %p78; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p85 bra $L__BB0_79; mad.lo.s32 %r107, %r127, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd53, %r108, 4; add.s64 %rd54, %rd3, %rd53; add.s64 %rd55, %rd2, %rd53; add.s64 %rd56, %rd1, %rd53; ld.global.nc.f32 %f172, [%rd56]; ld.global.nc.f32 %f173, [%rd55]; ld.global.nc.f32 %f174, [%rd54]; $L__BB0_79: add.s32 %r48, %r3, 1; @%p78 bra $L__BB0_81; bra.uni $L__BB0_80; $L__BB0_81: add.s32 %r111, %r59, -1; min.s32 %r128, %r48, %r111; bra.uni $L__BB0_82; $L__BB0_80: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r128, %r110, %r59; $L__BB0_82: setp.ge.s32 %p87, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p89, %p87, %p78; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p89 bra $L__BB0_84; mad.lo.s32 %r112, %r128, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd57, %r113, 4; add.s64 %rd58, %rd3, %rd57; add.s64 %rd59, %rd2, %rd57; add.s64 %rd60, %rd1, %rd57; ld.global.nc.f32 %f155, [%rd60]; ld.global.nc.f32 %f154, [%rd59]; ld.global.nc.f32 %f153, [%rd58]; $L__BB0_84: add.s32 %r52, %r3, 2; @%p78 bra $L__BB0_86; bra.uni $L__BB0_85; $L__BB0_86: add.s32 %r116, %r59, -1; min.s32 %r129, %r52, %r116; bra.uni $L__BB0_87; 
$L__BB0_85: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r129, %r115, %r59; $L__BB0_87: mad.lo.s32 %r117, %r129, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p91, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p93, %p91, %p78; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p93 bra $L__BB0_89; mul.wide.s32 %rd61, %r56, 4; add.s64 %rd62, %rd3, %rd61; add.s64 %rd63, %rd2, %rd61; add.s64 %rd64, %rd1, %rd61; ld.global.nc.f32 %f161, [%rd64]; ld.global.nc.f32 %f160, [%rd63]; ld.global.nc.f32 %f159, [%rd62]; $L__BB0_89: mul.f32 %f350, %f154, %f154; fma.rn.f32 %f351, %f153, %f153, %f350; fma.rn.f32 %f165, %f155, %f155, %f351; setp.eq.f32 %p94, %f165, 0f00000000; @%p94 bra $L__BB0_90; bra.uni $L__BB0_91; $L__BB0_90: mul.f32 %f355, %f173, %f173; fma.rn.f32 %f356, %f174, %f174, %f355; fma.rn.f32 %f357, %f172, %f172, %f356; setp.eq.f32 %p95, %f357, 0f00000000; mov.f32 %f484, 0f00000000; mov.f32 %f485, %f484; mov.f32 %f486, %f484; @%p95 bra $L__BB0_103; $L__BB0_91: mul.f32 %f358, %f142, %f142; fma.rn.f32 %f359, %f141, %f141, %f358; fma.rn.f32 %f178, %f143, %f143, %f359; setp.neu.f32 %p96, %f178, 0f00000000; mul.f32 %f360, %f160, %f160; fma.rn.f32 %f361, %f159, %f159, %f360; fma.rn.f32 %f182, %f161, %f161, %f361; setp.neu.f32 %p97, %f182, 0f00000000; and.pred %p98, %p96, %p97; or.pred %p100, %p94, %p98; @%p100 bra $L__BB0_93; mul.f32 %f362, %f173, %f173; fma.rn.f32 %f363, %f174, %f174, %f362; fma.rn.f32 %f364, %f172, %f172, %f363; setp.neu.f32 %p101, %f364, 0f00000000; @%p101 bra $L__BB0_102; bra.uni $L__BB0_93; $L__BB0_102: sub.f32 %f397, %f153, %f174; mul.f32 %f486, %f397, 0f3F000000; sub.f32 %f398, %f154, %f173; mul.f32 %f485, %f398, 0f3F000000; sub.f32 %f399, %f155, %f172; mul.f32 %f484, %f399, 0f3F000000; bra.uni $L__BB0_103; $L__BB0_93: setp.eq.f32 %p102, %f178, 0f00000000; and.pred %p104, %p102, %p94; @%p104 bra $L__BB0_101; bra.uni $L__BB0_94; $L__BB0_101: sub.f32 %f486, %f1, %f174; sub.f32 %f485, %f2, %f173; sub.f32 %f484, %f3, %f172; 
bra.uni $L__BB0_103; $L__BB0_94: setp.eq.f32 %p105, %f182, 0f00000000; mul.f32 %f365, %f173, %f173; fma.rn.f32 %f366, %f174, %f174, %f365; fma.rn.f32 %f183, %f172, %f172, %f366; setp.eq.f32 %p106, %f183, 0f00000000; and.pred %p107, %p106, %p105; @%p107 bra $L__BB0_100; bra.uni $L__BB0_95; $L__BB0_100: sub.f32 %f486, %f153, %f1; sub.f32 %f485, %f154, %f2; sub.f32 %f484, %f155, %f3; bra.uni $L__BB0_103; $L__BB0_95: setp.neu.f32 %p109, %f165, 0f00000000; or.pred %p110, %p102, %p109; @%p110 bra $L__BB0_97; bra.uni $L__BB0_96; $L__BB0_97: setp.neu.f32 %p111, %f183, 0f00000000; or.pred %p113, %p105, %p111; @%p113 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: sub.f32 %f388, %f153, %f174; sub.f32 %f389, %f154, %f173; sub.f32 %f390, %f155, %f172; sub.f32 %f391, %f141, %f159; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f142, %f160; mul.f32 %f394, %f393, 0f3DAAAAAB; sub.f32 %f395, %f143, %f161; mul.f32 %f396, %f395, 0f3DAAAAAB; fma.rn.f32 %f486, %f388, 0f3F2AAAAB, %f392; fma.rn.f32 %f485, %f389, 0f3F2AAAAB, %f394; fma.rn.f32 %f484, %f390, 0f3F2AAAAB, %f396; bra.uni $L__BB0_103; $L__BB0_96: mul.f32 %f367, %f141, 0f3F000000; add.f32 %f368, %f174, %f174; sub.f32 %f369, %f367, %f368; add.f32 %f370, %f173, %f173; mul.f32 %f371, %f142, 0f3F000000; sub.f32 %f372, %f371, %f370; add.f32 %f373, %f172, %f172; mul.f32 %f374, %f143, 0f3F000000; sub.f32 %f375, %f374, %f373; fma.rn.f32 %f486, %f1, 0f3FC00000, %f369; fma.rn.f32 %f485, %f2, 0f3FC00000, %f372; fma.rn.f32 %f484, %f3, 0f3FC00000, %f375; bra.uni $L__BB0_103; $L__BB0_98: mul.f32 %f376, %f159, 0f3F000000; add.f32 %f377, %f153, %f153; sub.f32 %f378, %f377, %f376; add.f32 %f379, %f154, %f154; mul.f32 %f380, %f160, 0f3F000000; sub.f32 %f381, %f379, %f380; add.f32 %f382, %f155, %f155; mul.f32 %f383, %f161, 0f3F000000; sub.f32 %f384, %f382, %f383; mul.f32 %f385, %f1, 0f3FC00000; sub.f32 %f486, %f378, %f385; mul.f32 %f386, %f2, 0f3FC00000; sub.f32 %f485, %f381, %f386; mul.f32 %f387, %f3, 0f3FC00000; sub.f32 %f484, %f384, %f387; 
$L__BB0_103: setp.eq.s64 %p114, %rd8, 0; @%p114 bra $L__BB0_105; cvta.to.global.u64 %rd65, %rd8; shl.b64 %rd66, %rd4, 2; add.s64 %rd67, %rd65, %rd66; ld.global.nc.f32 %f400, [%rd67]; mul.f32 %f487, %f400, %f487; $L__BB0_105: setp.eq.s64 %p115, %rd9, 0; @%p115 bra $L__BB0_107; cvta.to.global.u64 %rd68, %rd9; shl.b64 %rd69, %rd4, 2; add.s64 %rd70, %rd68, %rd69; ld.global.nc.f32 %f401, [%rd70]; mul.f32 %f488, %f401, %f488; $L__BB0_107: mul.f32 %f402, %f450, %f211; mul.f32 %f403, %f467, %f212; mul.f32 %f404, %f484, %f213; add.f32 %f405, %f487, %f487; mul.f32 %f406, %f1, %f405; add.f32 %f407, %f403, %f404; mul.f32 %f408, %f1, %f407; mul.f32 %f409, %f468, %f212; fma.rn.f32 %f410, %f2, %f409, %f408; mul.f32 %f411, %f486, %f213; fma.rn.f32 %f412, %f3, %f411, %f410; mul.f32 %f413, %f412, %f488; fma.rn.f32 %f414, %f402, %f406, %f413; cvta.to.global.u64 %rd71, %rd5; shl.b64 %rd72, %rd4, 2; add.s64 %rd73, %rd71, %rd72; st.global.f32 [%rd73], %f414; mul.f32 %f415, %f2, %f405; add.f32 %f416, %f402, %f404; mul.f32 %f417, %f2, %f416; mul.f32 %f418, %f449, %f211; fma.rn.f32 %f419, %f1, %f418, %f417; mul.f32 %f420, %f485, %f213; fma.rn.f32 %f421, %f3, %f420, %f419; mul.f32 %f422, %f421, %f488; fma.rn.f32 %f423, %f403, %f415, %f422; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd75, %rd74, %rd72; st.global.f32 [%rd75], %f423; mul.f32 %f424, %f3, %f405; mul.f32 %f425, %f466, %f212; mul.f32 %f426, %f2, %f425; mul.f32 %f427, %f448, %f211; fma.rn.f32 %f428, %f1, %f427, %f426; add.f32 %f429, %f402, %f403; fma.rn.f32 %f430, %f3, %f429, %f428; mul.f32 %f431, %f430, %f488; fma.rn.f32 %f432, %f404, %f424, %f431; cvta.to.global.u64 %rd76, %rd7; add.s64 %rd77, %rd76, %rd72; st.global.f32 [%rd77], %f432; $L__BB0_108: ret; } ` getmagnetoelasticforce_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 
getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<116>; .reg .b16 %rs<5>; .reg .f32 %f<489>; .reg .b32 %r<130>; .reg .b64 %rd<78>; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd12, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_6]; ld.param.f32 %f487, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_8]; ld.param.f32 %f488, [getmagnetoelasticforce_param_9]; ld.param.f32 %f211, [getmagnetoelasticforce_param_10]; ld.param.f32 %f212, [getmagnetoelasticforce_param_11]; ld.param.f32 %f213, [getmagnetoelasticforce_param_12]; ld.param.u32 %r57, [getmagnetoelasticforce_param_13]; ld.param.u32 %r58, [getmagnetoelasticforce_param_14]; ld.param.u32 %r59, [getmagnetoelasticforce_param_15]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 
%r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_108; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; cvt.s64.s32 %rd4, %r70; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; add.s64 %rd15, %rd2, %rd13; ld.global.nc.f32 %f2, [%rd15]; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f3, [%rd16]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p6, %rs1, 0; add.s32 %r6, %r1, -2; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r118, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r118, %r72, %r57; $L__BB0_4: setp.lt.s32 %p8, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p9, %p8, %p6; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p9 bra $L__BB0_6; add.s32 %r73, %r118, %r5; mul.wide.s32 %rd17, %r73, 4; add.s64 %rd18, %rd3, %rd17; add.s64 %rd19, %rd2, %rd17; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f9, [%rd20]; ld.global.nc.f32 %f8, [%rd19]; ld.global.nc.f32 %f7, [%rd18]; $L__BB0_6: add.s32 %r10, %r1, -1; @%p6 bra $L__BB0_8; bra.uni $L__BB0_7; $L__BB0_8: max.s32 %r119, %r10, 0; bra.uni $L__BB0_9; $L__BB0_7: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r119, %r75, %r57; $L__BB0_9: setp.lt.s32 %p11, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p13, %p11, %p6; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p13 bra $L__BB0_11; add.s32 %r76, %r119, %r5; mul.wide.s32 %rd21, %r76, 4; add.s64 %rd22, %rd3, %rd21; add.s64 %rd23, %rd2, %rd21; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f38, [%rd24]; ld.global.nc.f32 %f39, [%rd23]; ld.global.nc.f32 %f40, [%rd22]; $L__BB0_11: add.s32 %r14, %r1, 1; @%p6 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r79, %r57, -1; min.s32 %r120, %r14, %r79; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r77, %r14, %r57; add.s32 %r78, 
%r77, %r57; rem.s32 %r120, %r78, %r57; $L__BB0_14: setp.ge.s32 %p15, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p17, %p15, %p6; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p17 bra $L__BB0_16; add.s32 %r80, %r120, %r5; mul.wide.s32 %rd25, %r80, 4; add.s64 %rd26, %rd3, %rd25; add.s64 %rd27, %rd2, %rd25; add.s64 %rd28, %rd1, %rd25; ld.global.nc.f32 %f21, [%rd28]; ld.global.nc.f32 %f20, [%rd27]; ld.global.nc.f32 %f19, [%rd26]; $L__BB0_16: add.s32 %r18, %r1, 2; @%p6 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r83, %r57, -1; min.s32 %r121, %r18, %r83; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r121, %r82, %r57; $L__BB0_19: add.s32 %r22, %r121, %r5; setp.ge.s32 %p19, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p21, %p19, %p6; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p21 bra $L__BB0_21; mul.wide.s32 %rd29, %r22, 4; add.s64 %rd30, %rd3, %rd29; add.s64 %rd31, %rd2, %rd29; add.s64 %rd32, %rd1, %rd29; ld.global.nc.f32 %f27, [%rd32]; ld.global.nc.f32 %f26, [%rd31]; ld.global.nc.f32 %f25, [%rd30]; $L__BB0_21: mul.f32 %f226, %f20, %f20; fma.rn.f32 %f227, %f19, %f19, %f226; fma.rn.f32 %f31, %f21, %f21, %f227; setp.eq.f32 %p22, %f31, 0f00000000; @%p22 bra $L__BB0_22; bra.uni $L__BB0_23; $L__BB0_22: mul.f32 %f231, %f39, %f39; fma.rn.f32 %f232, %f40, %f40, %f231; fma.rn.f32 %f233, %f38, %f38, %f232; setp.eq.f32 %p23, %f233, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f449, %f448; mov.f32 %f450, %f448; @%p23 bra $L__BB0_35; $L__BB0_23: mul.f32 %f234, %f8, %f8; fma.rn.f32 %f235, %f7, %f7, %f234; fma.rn.f32 %f44, %f9, %f9, %f235; setp.neu.f32 %p24, %f44, 0f00000000; mul.f32 %f236, %f26, %f26; fma.rn.f32 %f237, %f25, %f25, %f236; fma.rn.f32 %f48, %f27, %f27, %f237; setp.neu.f32 %p25, %f48, 0f00000000; and.pred %p26, %p24, %p25; or.pred %p28, %p22, %p26; @%p28 bra $L__BB0_25; mul.f32 %f238, %f39, %f39; fma.rn.f32 %f239, %f40, %f40, %f238; fma.rn.f32 %f240, %f38, %f38, %f239; setp.neu.f32 %p29, %f240, 0f00000000; 
@%p29 bra $L__BB0_34; bra.uni $L__BB0_25; $L__BB0_34: sub.f32 %f273, %f19, %f40; mul.f32 %f450, %f273, 0f3F000000; sub.f32 %f274, %f20, %f39; mul.f32 %f449, %f274, 0f3F000000; sub.f32 %f275, %f21, %f38; mul.f32 %f448, %f275, 0f3F000000; bra.uni $L__BB0_35; $L__BB0_25: setp.eq.f32 %p30, %f44, 0f00000000; and.pred %p32, %p30, %p22; @%p32 bra $L__BB0_33; bra.uni $L__BB0_26; $L__BB0_33: sub.f32 %f450, %f1, %f40; sub.f32 %f449, %f2, %f39; sub.f32 %f448, %f3, %f38; bra.uni $L__BB0_35; $L__BB0_26: setp.eq.f32 %p33, %f48, 0f00000000; mul.f32 %f241, %f39, %f39; fma.rn.f32 %f242, %f40, %f40, %f241; fma.rn.f32 %f49, %f38, %f38, %f242; setp.eq.f32 %p34, %f49, 0f00000000; and.pred %p35, %p34, %p33; @%p35 bra $L__BB0_32; bra.uni $L__BB0_27; $L__BB0_32: sub.f32 %f450, %f19, %f1; sub.f32 %f449, %f20, %f2; sub.f32 %f448, %f21, %f3; bra.uni $L__BB0_35; $L__BB0_27: setp.neu.f32 %p37, %f31, 0f00000000; or.pred %p38, %p30, %p37; @%p38 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: setp.neu.f32 %p39, %f49, 0f00000000; or.pred %p41, %p33, %p39; @%p41 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f264, %f19, %f40; sub.f32 %f265, %f20, %f39; sub.f32 %f266, %f21, %f38; sub.f32 %f267, %f7, %f25; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f8, %f26; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f9, %f27; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f450, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f449, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f448, %f266, 0f3F2AAAAB, %f272; bra.uni $L__BB0_35; $L__BB0_28: mul.f32 %f243, %f7, 0f3F000000; add.f32 %f244, %f40, %f40; sub.f32 %f245, %f243, %f244; add.f32 %f246, %f39, %f39; mul.f32 %f247, %f8, 0f3F000000; sub.f32 %f248, %f247, %f246; add.f32 %f249, %f38, %f38; mul.f32 %f250, %f9, 0f3F000000; sub.f32 %f251, %f250, %f249; fma.rn.f32 %f450, %f1, 0f3FC00000, %f245; fma.rn.f32 %f449, %f2, 0f3FC00000, %f248; fma.rn.f32 %f448, %f3, 0f3FC00000, %f251; bra.uni $L__BB0_35; $L__BB0_30: mul.f32 %f252, %f25, 0f3F000000; add.f32 %f253, %f19, %f19; 
sub.f32 %f254, %f253, %f252; add.f32 %f255, %f20, %f20; mul.f32 %f256, %f26, 0f3F000000; sub.f32 %f257, %f255, %f256; add.f32 %f258, %f21, %f21; mul.f32 %f259, %f27, 0f3F000000; sub.f32 %f260, %f258, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f450, %f254, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f449, %f257, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f448, %f260, %f263; $L__BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p42, %rs2, 0; add.s32 %r23, %r2, -2; @%p42 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: max.s32 %r122, %r23, 0; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r122, %r85, %r58; $L__BB0_38: setp.lt.s32 %p44, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p45, %p44, %p42; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p45 bra $L__BB0_40; add.s32 %r86, %r122, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd33, %r87, 4; add.s64 %rd34, %rd3, %rd33; add.s64 %rd35, %rd2, %rd33; add.s64 %rd36, %rd1, %rd33; ld.global.nc.f32 %f76, [%rd36]; ld.global.nc.f32 %f75, [%rd35]; ld.global.nc.f32 %f74, [%rd34]; $L__BB0_40: add.s32 %r27, %r2, -1; @%p42 bra $L__BB0_42; bra.uni $L__BB0_41; $L__BB0_42: max.s32 %r123, %r27, 0; bra.uni $L__BB0_43; $L__BB0_41: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r123, %r89, %r58; $L__BB0_43: setp.lt.s32 %p47, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p49, %p47, %p42; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p49 bra $L__BB0_45; add.s32 %r90, %r123, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd37, %r91, 4; add.s64 %rd38, %rd3, %rd37; add.s64 %rd39, %rd2, %rd37; add.s64 %rd40, %rd1, %rd37; ld.global.nc.f32 %f105, [%rd40]; ld.global.nc.f32 %f106, [%rd39]; ld.global.nc.f32 %f107, [%rd38]; $L__BB0_45: add.s32 %r31, %r2, 1; @%p42 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: add.s32 %r94, %r58, -1; min.s32 %r124, %r31, %r94; bra.uni $L__BB0_48; $L__BB0_46: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r124, %r93, %r58; 
$L__BB0_48: setp.ge.s32 %p51, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p53, %p51, %p42; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p53 bra $L__BB0_50; add.s32 %r95, %r124, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd41, %r96, 4; add.s64 %rd42, %rd3, %rd41; add.s64 %rd43, %rd2, %rd41; add.s64 %rd44, %rd1, %rd41; ld.global.nc.f32 %f88, [%rd44]; ld.global.nc.f32 %f87, [%rd43]; ld.global.nc.f32 %f86, [%rd42]; $L__BB0_50: add.s32 %r35, %r2, 2; @%p42 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r99, %r58, -1; min.s32 %r125, %r35, %r99; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r125, %r98, %r58; $L__BB0_53: add.s32 %r39, %r125, %r4; setp.ge.s32 %p55, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p57, %p55, %p42; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p57 bra $L__BB0_55; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd45, %r100, 4; add.s64 %rd46, %rd3, %rd45; add.s64 %rd47, %rd2, %rd45; add.s64 %rd48, %rd1, %rd45; ld.global.nc.f32 %f94, [%rd48]; ld.global.nc.f32 %f93, [%rd47]; ld.global.nc.f32 %f92, [%rd46]; $L__BB0_55: mul.f32 %f288, %f87, %f87; fma.rn.f32 %f289, %f86, %f86, %f288; fma.rn.f32 %f98, %f88, %f88, %f289; setp.eq.f32 %p58, %f98, 0f00000000; @%p58 bra $L__BB0_56; bra.uni $L__BB0_57; $L__BB0_56: mul.f32 %f293, %f106, %f106; fma.rn.f32 %f294, %f107, %f107, %f293; fma.rn.f32 %f295, %f105, %f105, %f294; setp.eq.f32 %p59, %f295, 0f00000000; mov.f32 %f466, 0f00000000; mov.f32 %f467, %f466; mov.f32 %f468, %f466; @%p59 bra $L__BB0_69; $L__BB0_57: mul.f32 %f296, %f75, %f75; fma.rn.f32 %f297, %f74, %f74, %f296; fma.rn.f32 %f111, %f76, %f76, %f297; setp.neu.f32 %p60, %f111, 0f00000000; mul.f32 %f298, %f93, %f93; fma.rn.f32 %f299, %f92, %f92, %f298; fma.rn.f32 %f115, %f94, %f94, %f299; setp.neu.f32 %p61, %f115, 0f00000000; and.pred %p62, %p60, %p61; or.pred %p64, %p58, %p62; @%p64 bra $L__BB0_59; mul.f32 %f300, %f106, %f106; fma.rn.f32 %f301, %f107, %f107, %f300; fma.rn.f32 %f302, 
%f105, %f105, %f301; setp.neu.f32 %p65, %f302, 0f00000000; @%p65 bra $L__BB0_68; bra.uni $L__BB0_59; $L__BB0_68: sub.f32 %f335, %f86, %f107; mul.f32 %f468, %f335, 0f3F000000; sub.f32 %f336, %f87, %f106; mul.f32 %f467, %f336, 0f3F000000; sub.f32 %f337, %f88, %f105; mul.f32 %f466, %f337, 0f3F000000; bra.uni $L__BB0_69; $L__BB0_59: setp.eq.f32 %p66, %f111, 0f00000000; and.pred %p68, %p66, %p58; @%p68 bra $L__BB0_67; bra.uni $L__BB0_60; $L__BB0_67: sub.f32 %f468, %f1, %f107; sub.f32 %f467, %f2, %f106; sub.f32 %f466, %f3, %f105; bra.uni $L__BB0_69; $L__BB0_60: setp.eq.f32 %p69, %f115, 0f00000000; mul.f32 %f303, %f106, %f106; fma.rn.f32 %f304, %f107, %f107, %f303; fma.rn.f32 %f116, %f105, %f105, %f304; setp.eq.f32 %p70, %f116, 0f00000000; and.pred %p71, %p70, %p69; @%p71 bra $L__BB0_66; bra.uni $L__BB0_61; $L__BB0_66: sub.f32 %f468, %f86, %f1; sub.f32 %f467, %f87, %f2; sub.f32 %f466, %f88, %f3; bra.uni $L__BB0_69; $L__BB0_61: setp.neu.f32 %p73, %f98, 0f00000000; or.pred %p74, %p66, %p73; @%p74 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: setp.neu.f32 %p75, %f116, 0f00000000; or.pred %p77, %p69, %p75; @%p77 bra $L__BB0_65; bra.uni $L__BB0_64; $L__BB0_65: sub.f32 %f326, %f86, %f107; sub.f32 %f327, %f87, %f106; sub.f32 %f328, %f88, %f105; sub.f32 %f329, %f74, %f92; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f75, %f93; mul.f32 %f332, %f331, 0f3DAAAAAB; sub.f32 %f333, %f76, %f94; mul.f32 %f334, %f333, 0f3DAAAAAB; fma.rn.f32 %f468, %f326, 0f3F2AAAAB, %f330; fma.rn.f32 %f467, %f327, 0f3F2AAAAB, %f332; fma.rn.f32 %f466, %f328, 0f3F2AAAAB, %f334; bra.uni $L__BB0_69; $L__BB0_62: mul.f32 %f305, %f74, 0f3F000000; add.f32 %f306, %f107, %f107; sub.f32 %f307, %f305, %f306; add.f32 %f308, %f106, %f106; mul.f32 %f309, %f75, 0f3F000000; sub.f32 %f310, %f309, %f308; add.f32 %f311, %f105, %f105; mul.f32 %f312, %f76, 0f3F000000; sub.f32 %f313, %f312, %f311; fma.rn.f32 %f468, %f1, 0f3FC00000, %f307; fma.rn.f32 %f467, %f2, 0f3FC00000, %f310; fma.rn.f32 %f466, %f3, 0f3FC00000, %f313; 
bra.uni $L__BB0_69; $L__BB0_64: mul.f32 %f314, %f92, 0f3F000000; add.f32 %f315, %f86, %f86; sub.f32 %f316, %f315, %f314; add.f32 %f317, %f87, %f87; mul.f32 %f318, %f93, 0f3F000000; sub.f32 %f319, %f317, %f318; add.f32 %f320, %f88, %f88; mul.f32 %f321, %f94, 0f3F000000; sub.f32 %f322, %f320, %f321; mul.f32 %f323, %f1, 0f3FC00000; sub.f32 %f468, %f316, %f323; mul.f32 %f324, %f2, 0f3FC00000; sub.f32 %f467, %f319, %f324; mul.f32 %f325, %f3, 0f3FC00000; sub.f32 %f466, %f322, %f325; $L__BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p78, %rs3, 0; add.s32 %r40, %r3, -2; @%p78 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: max.s32 %r126, %r40, 0; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r126, %r102, %r59; $L__BB0_72: setp.lt.s32 %p80, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p81, %p80, %p78; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p81 bra $L__BB0_74; mad.lo.s32 %r103, %r126, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd49, %r104, 4; add.s64 %rd50, %rd3, %rd49; add.s64 %rd51, %rd2, %rd49; add.s64 %rd52, %rd1, %rd49; ld.global.nc.f32 %f143, [%rd52]; ld.global.nc.f32 %f142, [%rd51]; ld.global.nc.f32 %f141, [%rd50]; $L__BB0_74: add.s32 %r44, %r3, -1; @%p78 bra $L__BB0_76; bra.uni $L__BB0_75; $L__BB0_76: max.s32 %r127, %r44, 0; bra.uni $L__BB0_77; $L__BB0_75: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r127, %r106, %r59; $L__BB0_77: setp.lt.s32 %p83, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p85, %p83, %p78; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p85 bra $L__BB0_79; mad.lo.s32 %r107, %r127, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd53, %r108, 4; add.s64 %rd54, %rd3, %rd53; add.s64 %rd55, %rd2, %rd53; add.s64 %rd56, %rd1, %rd53; ld.global.nc.f32 %f172, [%rd56]; ld.global.nc.f32 %f173, [%rd55]; ld.global.nc.f32 %f174, [%rd54]; $L__BB0_79: add.s32 %r48, %r3, 1; @%p78 bra $L__BB0_81; bra.uni $L__BB0_80; $L__BB0_81: add.s32 %r111, %r59, -1; min.s32 
%r128, %r48, %r111; bra.uni $L__BB0_82; $L__BB0_80: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r128, %r110, %r59; $L__BB0_82: setp.ge.s32 %p87, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p89, %p87, %p78; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p89 bra $L__BB0_84; mad.lo.s32 %r112, %r128, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd57, %r113, 4; add.s64 %rd58, %rd3, %rd57; add.s64 %rd59, %rd2, %rd57; add.s64 %rd60, %rd1, %rd57; ld.global.nc.f32 %f155, [%rd60]; ld.global.nc.f32 %f154, [%rd59]; ld.global.nc.f32 %f153, [%rd58]; $L__BB0_84: add.s32 %r52, %r3, 2; @%p78 bra $L__BB0_86; bra.uni $L__BB0_85; $L__BB0_86: add.s32 %r116, %r59, -1; min.s32 %r129, %r52, %r116; bra.uni $L__BB0_87; $L__BB0_85: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r129, %r115, %r59; $L__BB0_87: mad.lo.s32 %r117, %r129, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p91, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p93, %p91, %p78; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p93 bra $L__BB0_89; mul.wide.s32 %rd61, %r56, 4; add.s64 %rd62, %rd3, %rd61; add.s64 %rd63, %rd2, %rd61; add.s64 %rd64, %rd1, %rd61; ld.global.nc.f32 %f161, [%rd64]; ld.global.nc.f32 %f160, [%rd63]; ld.global.nc.f32 %f159, [%rd62]; $L__BB0_89: mul.f32 %f350, %f154, %f154; fma.rn.f32 %f351, %f153, %f153, %f350; fma.rn.f32 %f165, %f155, %f155, %f351; setp.eq.f32 %p94, %f165, 0f00000000; @%p94 bra $L__BB0_90; bra.uni $L__BB0_91; $L__BB0_90: mul.f32 %f355, %f173, %f173; fma.rn.f32 %f356, %f174, %f174, %f355; fma.rn.f32 %f357, %f172, %f172, %f356; setp.eq.f32 %p95, %f357, 0f00000000; mov.f32 %f484, 0f00000000; mov.f32 %f485, %f484; mov.f32 %f486, %f484; @%p95 bra $L__BB0_103; $L__BB0_91: mul.f32 %f358, %f142, %f142; fma.rn.f32 %f359, %f141, %f141, %f358; fma.rn.f32 %f178, %f143, %f143, %f359; setp.neu.f32 %p96, %f178, 0f00000000; mul.f32 %f360, %f160, %f160; fma.rn.f32 %f361, %f159, %f159, %f360; fma.rn.f32 %f182, %f161, %f161, %f361; 
setp.neu.f32 %p97, %f182, 0f00000000; and.pred %p98, %p96, %p97; or.pred %p100, %p94, %p98; @%p100 bra $L__BB0_93; mul.f32 %f362, %f173, %f173; fma.rn.f32 %f363, %f174, %f174, %f362; fma.rn.f32 %f364, %f172, %f172, %f363; setp.neu.f32 %p101, %f364, 0f00000000; @%p101 bra $L__BB0_102; bra.uni $L__BB0_93; $L__BB0_102: sub.f32 %f397, %f153, %f174; mul.f32 %f486, %f397, 0f3F000000; sub.f32 %f398, %f154, %f173; mul.f32 %f485, %f398, 0f3F000000; sub.f32 %f399, %f155, %f172; mul.f32 %f484, %f399, 0f3F000000; bra.uni $L__BB0_103; $L__BB0_93: setp.eq.f32 %p102, %f178, 0f00000000; and.pred %p104, %p102, %p94; @%p104 bra $L__BB0_101; bra.uni $L__BB0_94; $L__BB0_101: sub.f32 %f486, %f1, %f174; sub.f32 %f485, %f2, %f173; sub.f32 %f484, %f3, %f172; bra.uni $L__BB0_103; $L__BB0_94: setp.eq.f32 %p105, %f182, 0f00000000; mul.f32 %f365, %f173, %f173; fma.rn.f32 %f366, %f174, %f174, %f365; fma.rn.f32 %f183, %f172, %f172, %f366; setp.eq.f32 %p106, %f183, 0f00000000; and.pred %p107, %p106, %p105; @%p107 bra $L__BB0_100; bra.uni $L__BB0_95; $L__BB0_100: sub.f32 %f486, %f153, %f1; sub.f32 %f485, %f154, %f2; sub.f32 %f484, %f155, %f3; bra.uni $L__BB0_103; $L__BB0_95: setp.neu.f32 %p109, %f165, 0f00000000; or.pred %p110, %p102, %p109; @%p110 bra $L__BB0_97; bra.uni $L__BB0_96; $L__BB0_97: setp.neu.f32 %p111, %f183, 0f00000000; or.pred %p113, %p105, %p111; @%p113 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: sub.f32 %f388, %f153, %f174; sub.f32 %f389, %f154, %f173; sub.f32 %f390, %f155, %f172; sub.f32 %f391, %f141, %f159; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f142, %f160; mul.f32 %f394, %f393, 0f3DAAAAAB; sub.f32 %f395, %f143, %f161; mul.f32 %f396, %f395, 0f3DAAAAAB; fma.rn.f32 %f486, %f388, 0f3F2AAAAB, %f392; fma.rn.f32 %f485, %f389, 0f3F2AAAAB, %f394; fma.rn.f32 %f484, %f390, 0f3F2AAAAB, %f396; bra.uni $L__BB0_103; $L__BB0_96: mul.f32 %f367, %f141, 0f3F000000; add.f32 %f368, %f174, %f174; sub.f32 %f369, %f367, %f368; add.f32 %f370, %f173, %f173; mul.f32 %f371, %f142, 
0f3F000000; sub.f32 %f372, %f371, %f370; add.f32 %f373, %f172, %f172; mul.f32 %f374, %f143, 0f3F000000; sub.f32 %f375, %f374, %f373; fma.rn.f32 %f486, %f1, 0f3FC00000, %f369; fma.rn.f32 %f485, %f2, 0f3FC00000, %f372; fma.rn.f32 %f484, %f3, 0f3FC00000, %f375; bra.uni $L__BB0_103; $L__BB0_98: mul.f32 %f376, %f159, 0f3F000000; add.f32 %f377, %f153, %f153; sub.f32 %f378, %f377, %f376; add.f32 %f379, %f154, %f154; mul.f32 %f380, %f160, 0f3F000000; sub.f32 %f381, %f379, %f380; add.f32 %f382, %f155, %f155; mul.f32 %f383, %f161, 0f3F000000; sub.f32 %f384, %f382, %f383; mul.f32 %f385, %f1, 0f3FC00000; sub.f32 %f486, %f378, %f385; mul.f32 %f386, %f2, 0f3FC00000; sub.f32 %f485, %f381, %f386; mul.f32 %f387, %f3, 0f3FC00000; sub.f32 %f484, %f384, %f387; $L__BB0_103: setp.eq.s64 %p114, %rd8, 0; @%p114 bra $L__BB0_105; cvta.to.global.u64 %rd65, %rd8; shl.b64 %rd66, %rd4, 2; add.s64 %rd67, %rd65, %rd66; ld.global.nc.f32 %f400, [%rd67]; mul.f32 %f487, %f400, %f487; $L__BB0_105: setp.eq.s64 %p115, %rd9, 0; @%p115 bra $L__BB0_107; cvta.to.global.u64 %rd68, %rd9; shl.b64 %rd69, %rd4, 2; add.s64 %rd70, %rd68, %rd69; ld.global.nc.f32 %f401, [%rd70]; mul.f32 %f488, %f401, %f488; $L__BB0_107: mul.f32 %f402, %f450, %f211; mul.f32 %f403, %f467, %f212; mul.f32 %f404, %f484, %f213; add.f32 %f405, %f487, %f487; mul.f32 %f406, %f1, %f405; add.f32 %f407, %f403, %f404; mul.f32 %f408, %f1, %f407; mul.f32 %f409, %f468, %f212; fma.rn.f32 %f410, %f2, %f409, %f408; mul.f32 %f411, %f486, %f213; fma.rn.f32 %f412, %f3, %f411, %f410; mul.f32 %f413, %f412, %f488; fma.rn.f32 %f414, %f402, %f406, %f413; cvta.to.global.u64 %rd71, %rd5; shl.b64 %rd72, %rd4, 2; add.s64 %rd73, %rd71, %rd72; st.global.f32 [%rd73], %f414; mul.f32 %f415, %f2, %f405; add.f32 %f416, %f402, %f404; mul.f32 %f417, %f2, %f416; mul.f32 %f418, %f449, %f211; fma.rn.f32 %f419, %f1, %f418, %f417; mul.f32 %f420, %f485, %f213; fma.rn.f32 %f421, %f3, %f420, %f419; mul.f32 %f422, %f421, %f488; fma.rn.f32 %f423, %f403, %f415, %f422; 
cvta.to.global.u64 %rd74, %rd6; add.s64 %rd75, %rd74, %rd72; st.global.f32 [%rd75], %f423; mul.f32 %f424, %f3, %f405; mul.f32 %f425, %f466, %f212; mul.f32 %f426, %f2, %f425; mul.f32 %f427, %f448, %f211; fma.rn.f32 %f428, %f1, %f427, %f426; add.f32 %f429, %f402, %f403; fma.rn.f32 %f430, %f3, %f429, %f428; mul.f32 %f431, %f430, %f488; fma.rn.f32 %f432, %f404, %f424, %f431; cvta.to.global.u64 %rd76, %rd7; add.s64 %rd77, %rd76, %rd72; st.global.f32 [%rd77], %f432; $L__BB0_108: ret; } ` getmagnetoelasticforce_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<116>; .reg .b16 %rs<5>; .reg .f32 %f<489>; .reg .b32 %r<130>; .reg .b64 %rd<78>; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd12, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_6]; ld.param.f32 %f487, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd9, 
[getmagnetoelasticforce_param_8]; ld.param.f32 %f488, [getmagnetoelasticforce_param_9]; ld.param.f32 %f211, [getmagnetoelasticforce_param_10]; ld.param.f32 %f212, [getmagnetoelasticforce_param_11]; ld.param.f32 %f213, [getmagnetoelasticforce_param_12]; ld.param.u32 %r57, [getmagnetoelasticforce_param_13]; ld.param.u32 %r58, [getmagnetoelasticforce_param_14]; ld.param.u32 %r59, [getmagnetoelasticforce_param_15]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_108; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; cvt.s64.s32 %rd4, %r70; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; add.s64 %rd15, %rd2, %rd13; ld.global.nc.f32 %f2, [%rd15]; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f3, [%rd16]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p6, %rs1, 0; add.s32 %r6, %r1, -2; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r118, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r118, %r72, %r57; $L__BB0_4: setp.lt.s32 %p8, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p9, %p8, %p6; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p9 bra $L__BB0_6; add.s32 %r73, %r118, %r5; mul.wide.s32 %rd17, %r73, 4; add.s64 %rd18, %rd3, %rd17; add.s64 %rd19, %rd2, %rd17; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f9, [%rd20]; ld.global.nc.f32 %f8, [%rd19]; ld.global.nc.f32 %f7, [%rd18]; $L__BB0_6: add.s32 %r10, %r1, -1; @%p6 bra $L__BB0_8; bra.uni 
$L__BB0_7; $L__BB0_8: max.s32 %r119, %r10, 0; bra.uni $L__BB0_9; $L__BB0_7: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r119, %r75, %r57; $L__BB0_9: setp.lt.s32 %p11, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p13, %p11, %p6; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p13 bra $L__BB0_11; add.s32 %r76, %r119, %r5; mul.wide.s32 %rd21, %r76, 4; add.s64 %rd22, %rd3, %rd21; add.s64 %rd23, %rd2, %rd21; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f38, [%rd24]; ld.global.nc.f32 %f39, [%rd23]; ld.global.nc.f32 %f40, [%rd22]; $L__BB0_11: add.s32 %r14, %r1, 1; @%p6 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r79, %r57, -1; min.s32 %r120, %r14, %r79; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r120, %r78, %r57; $L__BB0_14: setp.ge.s32 %p15, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p17, %p15, %p6; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p17 bra $L__BB0_16; add.s32 %r80, %r120, %r5; mul.wide.s32 %rd25, %r80, 4; add.s64 %rd26, %rd3, %rd25; add.s64 %rd27, %rd2, %rd25; add.s64 %rd28, %rd1, %rd25; ld.global.nc.f32 %f21, [%rd28]; ld.global.nc.f32 %f20, [%rd27]; ld.global.nc.f32 %f19, [%rd26]; $L__BB0_16: add.s32 %r18, %r1, 2; @%p6 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r83, %r57, -1; min.s32 %r121, %r18, %r83; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r121, %r82, %r57; $L__BB0_19: add.s32 %r22, %r121, %r5; setp.ge.s32 %p19, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p21, %p19, %p6; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p21 bra $L__BB0_21; mul.wide.s32 %rd29, %r22, 4; add.s64 %rd30, %rd3, %rd29; add.s64 %rd31, %rd2, %rd29; add.s64 %rd32, %rd1, %rd29; ld.global.nc.f32 %f27, [%rd32]; ld.global.nc.f32 %f26, [%rd31]; ld.global.nc.f32 %f25, [%rd30]; $L__BB0_21: mul.f32 %f226, %f20, %f20; fma.rn.f32 %f227, %f19, %f19, %f226; fma.rn.f32 %f31, %f21, %f21, %f227; setp.eq.f32 %p22, %f31, 0f00000000; @%p22 bra $L__BB0_22; bra.uni 
$L__BB0_23; $L__BB0_22: mul.f32 %f231, %f39, %f39; fma.rn.f32 %f232, %f40, %f40, %f231; fma.rn.f32 %f233, %f38, %f38, %f232; setp.eq.f32 %p23, %f233, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f449, %f448; mov.f32 %f450, %f448; @%p23 bra $L__BB0_35; $L__BB0_23: mul.f32 %f234, %f8, %f8; fma.rn.f32 %f235, %f7, %f7, %f234; fma.rn.f32 %f44, %f9, %f9, %f235; setp.neu.f32 %p24, %f44, 0f00000000; mul.f32 %f236, %f26, %f26; fma.rn.f32 %f237, %f25, %f25, %f236; fma.rn.f32 %f48, %f27, %f27, %f237; setp.neu.f32 %p25, %f48, 0f00000000; and.pred %p26, %p24, %p25; or.pred %p28, %p22, %p26; @%p28 bra $L__BB0_25; mul.f32 %f238, %f39, %f39; fma.rn.f32 %f239, %f40, %f40, %f238; fma.rn.f32 %f240, %f38, %f38, %f239; setp.neu.f32 %p29, %f240, 0f00000000; @%p29 bra $L__BB0_34; bra.uni $L__BB0_25; $L__BB0_34: sub.f32 %f273, %f19, %f40; mul.f32 %f450, %f273, 0f3F000000; sub.f32 %f274, %f20, %f39; mul.f32 %f449, %f274, 0f3F000000; sub.f32 %f275, %f21, %f38; mul.f32 %f448, %f275, 0f3F000000; bra.uni $L__BB0_35; $L__BB0_25: setp.eq.f32 %p30, %f44, 0f00000000; and.pred %p32, %p30, %p22; @%p32 bra $L__BB0_33; bra.uni $L__BB0_26; $L__BB0_33: sub.f32 %f450, %f1, %f40; sub.f32 %f449, %f2, %f39; sub.f32 %f448, %f3, %f38; bra.uni $L__BB0_35; $L__BB0_26: setp.eq.f32 %p33, %f48, 0f00000000; mul.f32 %f241, %f39, %f39; fma.rn.f32 %f242, %f40, %f40, %f241; fma.rn.f32 %f49, %f38, %f38, %f242; setp.eq.f32 %p34, %f49, 0f00000000; and.pred %p35, %p34, %p33; @%p35 bra $L__BB0_32; bra.uni $L__BB0_27; $L__BB0_32: sub.f32 %f450, %f19, %f1; sub.f32 %f449, %f20, %f2; sub.f32 %f448, %f21, %f3; bra.uni $L__BB0_35; $L__BB0_27: setp.neu.f32 %p37, %f31, 0f00000000; or.pred %p38, %p30, %p37; @%p38 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: setp.neu.f32 %p39, %f49, 0f00000000; or.pred %p41, %p33, %p39; @%p41 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f264, %f19, %f40; sub.f32 %f265, %f20, %f39; sub.f32 %f266, %f21, %f38; sub.f32 %f267, %f7, %f25; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, 
%f8, %f26; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f9, %f27; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f450, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f449, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f448, %f266, 0f3F2AAAAB, %f272; bra.uni $L__BB0_35; $L__BB0_28: mul.f32 %f243, %f7, 0f3F000000; add.f32 %f244, %f40, %f40; sub.f32 %f245, %f243, %f244; add.f32 %f246, %f39, %f39; mul.f32 %f247, %f8, 0f3F000000; sub.f32 %f248, %f247, %f246; add.f32 %f249, %f38, %f38; mul.f32 %f250, %f9, 0f3F000000; sub.f32 %f251, %f250, %f249; fma.rn.f32 %f450, %f1, 0f3FC00000, %f245; fma.rn.f32 %f449, %f2, 0f3FC00000, %f248; fma.rn.f32 %f448, %f3, 0f3FC00000, %f251; bra.uni $L__BB0_35; $L__BB0_30: mul.f32 %f252, %f25, 0f3F000000; add.f32 %f253, %f19, %f19; sub.f32 %f254, %f253, %f252; add.f32 %f255, %f20, %f20; mul.f32 %f256, %f26, 0f3F000000; sub.f32 %f257, %f255, %f256; add.f32 %f258, %f21, %f21; mul.f32 %f259, %f27, 0f3F000000; sub.f32 %f260, %f258, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f450, %f254, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f449, %f257, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f448, %f260, %f263; $L__BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p42, %rs2, 0; add.s32 %r23, %r2, -2; @%p42 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: max.s32 %r122, %r23, 0; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r122, %r85, %r58; $L__BB0_38: setp.lt.s32 %p44, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p45, %p44, %p42; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p45 bra $L__BB0_40; add.s32 %r86, %r122, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd33, %r87, 4; add.s64 %rd34, %rd3, %rd33; add.s64 %rd35, %rd2, %rd33; add.s64 %rd36, %rd1, %rd33; ld.global.nc.f32 %f76, [%rd36]; ld.global.nc.f32 %f75, [%rd35]; ld.global.nc.f32 %f74, [%rd34]; $L__BB0_40: add.s32 %r27, %r2, -1; @%p42 bra $L__BB0_42; bra.uni $L__BB0_41; $L__BB0_42: max.s32 %r123, %r27, 0; bra.uni $L__BB0_43; $L__BB0_41: rem.s32 %r88, 
%r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r123, %r89, %r58; $L__BB0_43: setp.lt.s32 %p47, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p49, %p47, %p42; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p49 bra $L__BB0_45; add.s32 %r90, %r123, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd37, %r91, 4; add.s64 %rd38, %rd3, %rd37; add.s64 %rd39, %rd2, %rd37; add.s64 %rd40, %rd1, %rd37; ld.global.nc.f32 %f105, [%rd40]; ld.global.nc.f32 %f106, [%rd39]; ld.global.nc.f32 %f107, [%rd38]; $L__BB0_45: add.s32 %r31, %r2, 1; @%p42 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: add.s32 %r94, %r58, -1; min.s32 %r124, %r31, %r94; bra.uni $L__BB0_48; $L__BB0_46: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r124, %r93, %r58; $L__BB0_48: setp.ge.s32 %p51, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p53, %p51, %p42; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p53 bra $L__BB0_50; add.s32 %r95, %r124, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd41, %r96, 4; add.s64 %rd42, %rd3, %rd41; add.s64 %rd43, %rd2, %rd41; add.s64 %rd44, %rd1, %rd41; ld.global.nc.f32 %f88, [%rd44]; ld.global.nc.f32 %f87, [%rd43]; ld.global.nc.f32 %f86, [%rd42]; $L__BB0_50: add.s32 %r35, %r2, 2; @%p42 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r99, %r58, -1; min.s32 %r125, %r35, %r99; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r125, %r98, %r58; $L__BB0_53: add.s32 %r39, %r125, %r4; setp.ge.s32 %p55, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p57, %p55, %p42; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p57 bra $L__BB0_55; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd45, %r100, 4; add.s64 %rd46, %rd3, %rd45; add.s64 %rd47, %rd2, %rd45; add.s64 %rd48, %rd1, %rd45; ld.global.nc.f32 %f94, [%rd48]; ld.global.nc.f32 %f93, [%rd47]; ld.global.nc.f32 %f92, [%rd46]; $L__BB0_55: mul.f32 %f288, %f87, %f87; fma.rn.f32 %f289, %f86, %f86, %f288; fma.rn.f32 %f98, %f88, %f88, %f289; setp.eq.f32 %p58, %f98, 0f00000000; 
@%p58 bra $L__BB0_56; bra.uni $L__BB0_57; $L__BB0_56: mul.f32 %f293, %f106, %f106; fma.rn.f32 %f294, %f107, %f107, %f293; fma.rn.f32 %f295, %f105, %f105, %f294; setp.eq.f32 %p59, %f295, 0f00000000; mov.f32 %f466, 0f00000000; mov.f32 %f467, %f466; mov.f32 %f468, %f466; @%p59 bra $L__BB0_69; $L__BB0_57: mul.f32 %f296, %f75, %f75; fma.rn.f32 %f297, %f74, %f74, %f296; fma.rn.f32 %f111, %f76, %f76, %f297; setp.neu.f32 %p60, %f111, 0f00000000; mul.f32 %f298, %f93, %f93; fma.rn.f32 %f299, %f92, %f92, %f298; fma.rn.f32 %f115, %f94, %f94, %f299; setp.neu.f32 %p61, %f115, 0f00000000; and.pred %p62, %p60, %p61; or.pred %p64, %p58, %p62; @%p64 bra $L__BB0_59; mul.f32 %f300, %f106, %f106; fma.rn.f32 %f301, %f107, %f107, %f300; fma.rn.f32 %f302, %f105, %f105, %f301; setp.neu.f32 %p65, %f302, 0f00000000; @%p65 bra $L__BB0_68; bra.uni $L__BB0_59; $L__BB0_68: sub.f32 %f335, %f86, %f107; mul.f32 %f468, %f335, 0f3F000000; sub.f32 %f336, %f87, %f106; mul.f32 %f467, %f336, 0f3F000000; sub.f32 %f337, %f88, %f105; mul.f32 %f466, %f337, 0f3F000000; bra.uni $L__BB0_69; $L__BB0_59: setp.eq.f32 %p66, %f111, 0f00000000; and.pred %p68, %p66, %p58; @%p68 bra $L__BB0_67; bra.uni $L__BB0_60; $L__BB0_67: sub.f32 %f468, %f1, %f107; sub.f32 %f467, %f2, %f106; sub.f32 %f466, %f3, %f105; bra.uni $L__BB0_69; $L__BB0_60: setp.eq.f32 %p69, %f115, 0f00000000; mul.f32 %f303, %f106, %f106; fma.rn.f32 %f304, %f107, %f107, %f303; fma.rn.f32 %f116, %f105, %f105, %f304; setp.eq.f32 %p70, %f116, 0f00000000; and.pred %p71, %p70, %p69; @%p71 bra $L__BB0_66; bra.uni $L__BB0_61; $L__BB0_66: sub.f32 %f468, %f86, %f1; sub.f32 %f467, %f87, %f2; sub.f32 %f466, %f88, %f3; bra.uni $L__BB0_69; $L__BB0_61: setp.neu.f32 %p73, %f98, 0f00000000; or.pred %p74, %p66, %p73; @%p74 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: setp.neu.f32 %p75, %f116, 0f00000000; or.pred %p77, %p69, %p75; @%p77 bra $L__BB0_65; bra.uni $L__BB0_64; $L__BB0_65: sub.f32 %f326, %f86, %f107; sub.f32 %f327, %f87, %f106; sub.f32 %f328, %f88, %f105; 
sub.f32 %f329, %f74, %f92; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f75, %f93; mul.f32 %f332, %f331, 0f3DAAAAAB; sub.f32 %f333, %f76, %f94; mul.f32 %f334, %f333, 0f3DAAAAAB; fma.rn.f32 %f468, %f326, 0f3F2AAAAB, %f330; fma.rn.f32 %f467, %f327, 0f3F2AAAAB, %f332; fma.rn.f32 %f466, %f328, 0f3F2AAAAB, %f334; bra.uni $L__BB0_69; $L__BB0_62: mul.f32 %f305, %f74, 0f3F000000; add.f32 %f306, %f107, %f107; sub.f32 %f307, %f305, %f306; add.f32 %f308, %f106, %f106; mul.f32 %f309, %f75, 0f3F000000; sub.f32 %f310, %f309, %f308; add.f32 %f311, %f105, %f105; mul.f32 %f312, %f76, 0f3F000000; sub.f32 %f313, %f312, %f311; fma.rn.f32 %f468, %f1, 0f3FC00000, %f307; fma.rn.f32 %f467, %f2, 0f3FC00000, %f310; fma.rn.f32 %f466, %f3, 0f3FC00000, %f313; bra.uni $L__BB0_69; $L__BB0_64: mul.f32 %f314, %f92, 0f3F000000; add.f32 %f315, %f86, %f86; sub.f32 %f316, %f315, %f314; add.f32 %f317, %f87, %f87; mul.f32 %f318, %f93, 0f3F000000; sub.f32 %f319, %f317, %f318; add.f32 %f320, %f88, %f88; mul.f32 %f321, %f94, 0f3F000000; sub.f32 %f322, %f320, %f321; mul.f32 %f323, %f1, 0f3FC00000; sub.f32 %f468, %f316, %f323; mul.f32 %f324, %f2, 0f3FC00000; sub.f32 %f467, %f319, %f324; mul.f32 %f325, %f3, 0f3FC00000; sub.f32 %f466, %f322, %f325; $L__BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p78, %rs3, 0; add.s32 %r40, %r3, -2; @%p78 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: max.s32 %r126, %r40, 0; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r126, %r102, %r59; $L__BB0_72: setp.lt.s32 %p80, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p81, %p80, %p78; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p81 bra $L__BB0_74; mad.lo.s32 %r103, %r126, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd49, %r104, 4; add.s64 %rd50, %rd3, %rd49; add.s64 %rd51, %rd2, %rd49; add.s64 %rd52, %rd1, %rd49; ld.global.nc.f32 %f143, [%rd52]; ld.global.nc.f32 %f142, [%rd51]; ld.global.nc.f32 %f141, [%rd50]; $L__BB0_74: add.s32 %r44, %r3, -1; @%p78 bra 
$L__BB0_76; bra.uni $L__BB0_75; $L__BB0_76: max.s32 %r127, %r44, 0; bra.uni $L__BB0_77; $L__BB0_75: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r127, %r106, %r59; $L__BB0_77: setp.lt.s32 %p83, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p85, %p83, %p78; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p85 bra $L__BB0_79; mad.lo.s32 %r107, %r127, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd53, %r108, 4; add.s64 %rd54, %rd3, %rd53; add.s64 %rd55, %rd2, %rd53; add.s64 %rd56, %rd1, %rd53; ld.global.nc.f32 %f172, [%rd56]; ld.global.nc.f32 %f173, [%rd55]; ld.global.nc.f32 %f174, [%rd54]; $L__BB0_79: add.s32 %r48, %r3, 1; @%p78 bra $L__BB0_81; bra.uni $L__BB0_80; $L__BB0_81: add.s32 %r111, %r59, -1; min.s32 %r128, %r48, %r111; bra.uni $L__BB0_82; $L__BB0_80: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r128, %r110, %r59; $L__BB0_82: setp.ge.s32 %p87, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p89, %p87, %p78; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p89 bra $L__BB0_84; mad.lo.s32 %r112, %r128, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd57, %r113, 4; add.s64 %rd58, %rd3, %rd57; add.s64 %rd59, %rd2, %rd57; add.s64 %rd60, %rd1, %rd57; ld.global.nc.f32 %f155, [%rd60]; ld.global.nc.f32 %f154, [%rd59]; ld.global.nc.f32 %f153, [%rd58]; $L__BB0_84: add.s32 %r52, %r3, 2; @%p78 bra $L__BB0_86; bra.uni $L__BB0_85; $L__BB0_86: add.s32 %r116, %r59, -1; min.s32 %r129, %r52, %r116; bra.uni $L__BB0_87; $L__BB0_85: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r129, %r115, %r59; $L__BB0_87: mad.lo.s32 %r117, %r129, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p91, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p93, %p91, %p78; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p93 bra $L__BB0_89; mul.wide.s32 %rd61, %r56, 4; add.s64 %rd62, %rd3, %rd61; add.s64 %rd63, %rd2, %rd61; add.s64 %rd64, %rd1, %rd61; ld.global.nc.f32 %f161, [%rd64]; ld.global.nc.f32 %f160, [%rd63]; 
ld.global.nc.f32 %f159, [%rd62]; $L__BB0_89: mul.f32 %f350, %f154, %f154; fma.rn.f32 %f351, %f153, %f153, %f350; fma.rn.f32 %f165, %f155, %f155, %f351; setp.eq.f32 %p94, %f165, 0f00000000; @%p94 bra $L__BB0_90; bra.uni $L__BB0_91; $L__BB0_90: mul.f32 %f355, %f173, %f173; fma.rn.f32 %f356, %f174, %f174, %f355; fma.rn.f32 %f357, %f172, %f172, %f356; setp.eq.f32 %p95, %f357, 0f00000000; mov.f32 %f484, 0f00000000; mov.f32 %f485, %f484; mov.f32 %f486, %f484; @%p95 bra $L__BB0_103; $L__BB0_91: mul.f32 %f358, %f142, %f142; fma.rn.f32 %f359, %f141, %f141, %f358; fma.rn.f32 %f178, %f143, %f143, %f359; setp.neu.f32 %p96, %f178, 0f00000000; mul.f32 %f360, %f160, %f160; fma.rn.f32 %f361, %f159, %f159, %f360; fma.rn.f32 %f182, %f161, %f161, %f361; setp.neu.f32 %p97, %f182, 0f00000000; and.pred %p98, %p96, %p97; or.pred %p100, %p94, %p98; @%p100 bra $L__BB0_93; mul.f32 %f362, %f173, %f173; fma.rn.f32 %f363, %f174, %f174, %f362; fma.rn.f32 %f364, %f172, %f172, %f363; setp.neu.f32 %p101, %f364, 0f00000000; @%p101 bra $L__BB0_102; bra.uni $L__BB0_93; $L__BB0_102: sub.f32 %f397, %f153, %f174; mul.f32 %f486, %f397, 0f3F000000; sub.f32 %f398, %f154, %f173; mul.f32 %f485, %f398, 0f3F000000; sub.f32 %f399, %f155, %f172; mul.f32 %f484, %f399, 0f3F000000; bra.uni $L__BB0_103; $L__BB0_93: setp.eq.f32 %p102, %f178, 0f00000000; and.pred %p104, %p102, %p94; @%p104 bra $L__BB0_101; bra.uni $L__BB0_94; $L__BB0_101: sub.f32 %f486, %f1, %f174; sub.f32 %f485, %f2, %f173; sub.f32 %f484, %f3, %f172; bra.uni $L__BB0_103; $L__BB0_94: setp.eq.f32 %p105, %f182, 0f00000000; mul.f32 %f365, %f173, %f173; fma.rn.f32 %f366, %f174, %f174, %f365; fma.rn.f32 %f183, %f172, %f172, %f366; setp.eq.f32 %p106, %f183, 0f00000000; and.pred %p107, %p106, %p105; @%p107 bra $L__BB0_100; bra.uni $L__BB0_95; $L__BB0_100: sub.f32 %f486, %f153, %f1; sub.f32 %f485, %f154, %f2; sub.f32 %f484, %f155, %f3; bra.uni $L__BB0_103; $L__BB0_95: setp.neu.f32 %p109, %f165, 0f00000000; or.pred %p110, %p102, %p109; @%p110 bra $L__BB0_97; 
bra.uni $L__BB0_96; $L__BB0_97: setp.neu.f32 %p111, %f183, 0f00000000; or.pred %p113, %p105, %p111; @%p113 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: sub.f32 %f388, %f153, %f174; sub.f32 %f389, %f154, %f173; sub.f32 %f390, %f155, %f172; sub.f32 %f391, %f141, %f159; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f142, %f160; mul.f32 %f394, %f393, 0f3DAAAAAB; sub.f32 %f395, %f143, %f161; mul.f32 %f396, %f395, 0f3DAAAAAB; fma.rn.f32 %f486, %f388, 0f3F2AAAAB, %f392; fma.rn.f32 %f485, %f389, 0f3F2AAAAB, %f394; fma.rn.f32 %f484, %f390, 0f3F2AAAAB, %f396; bra.uni $L__BB0_103; $L__BB0_96: mul.f32 %f367, %f141, 0f3F000000; add.f32 %f368, %f174, %f174; sub.f32 %f369, %f367, %f368; add.f32 %f370, %f173, %f173; mul.f32 %f371, %f142, 0f3F000000; sub.f32 %f372, %f371, %f370; add.f32 %f373, %f172, %f172; mul.f32 %f374, %f143, 0f3F000000; sub.f32 %f375, %f374, %f373; fma.rn.f32 %f486, %f1, 0f3FC00000, %f369; fma.rn.f32 %f485, %f2, 0f3FC00000, %f372; fma.rn.f32 %f484, %f3, 0f3FC00000, %f375; bra.uni $L__BB0_103; $L__BB0_98: mul.f32 %f376, %f159, 0f3F000000; add.f32 %f377, %f153, %f153; sub.f32 %f378, %f377, %f376; add.f32 %f379, %f154, %f154; mul.f32 %f380, %f160, 0f3F000000; sub.f32 %f381, %f379, %f380; add.f32 %f382, %f155, %f155; mul.f32 %f383, %f161, 0f3F000000; sub.f32 %f384, %f382, %f383; mul.f32 %f385, %f1, 0f3FC00000; sub.f32 %f486, %f378, %f385; mul.f32 %f386, %f2, 0f3FC00000; sub.f32 %f485, %f381, %f386; mul.f32 %f387, %f3, 0f3FC00000; sub.f32 %f484, %f384, %f387; $L__BB0_103: setp.eq.s64 %p114, %rd8, 0; @%p114 bra $L__BB0_105; cvta.to.global.u64 %rd65, %rd8; shl.b64 %rd66, %rd4, 2; add.s64 %rd67, %rd65, %rd66; ld.global.nc.f32 %f400, [%rd67]; mul.f32 %f487, %f400, %f487; $L__BB0_105: setp.eq.s64 %p115, %rd9, 0; @%p115 bra $L__BB0_107; cvta.to.global.u64 %rd68, %rd9; shl.b64 %rd69, %rd4, 2; add.s64 %rd70, %rd68, %rd69; ld.global.nc.f32 %f401, [%rd70]; mul.f32 %f488, %f401, %f488; $L__BB0_107: mul.f32 %f402, %f450, %f211; mul.f32 %f403, %f467, %f212; mul.f32 %f404, 
%f484, %f213; add.f32 %f405, %f487, %f487; mul.f32 %f406, %f1, %f405; add.f32 %f407, %f403, %f404; mul.f32 %f408, %f1, %f407; mul.f32 %f409, %f468, %f212; fma.rn.f32 %f410, %f2, %f409, %f408; mul.f32 %f411, %f486, %f213; fma.rn.f32 %f412, %f3, %f411, %f410; mul.f32 %f413, %f412, %f488; fma.rn.f32 %f414, %f402, %f406, %f413; cvta.to.global.u64 %rd71, %rd5; shl.b64 %rd72, %rd4, 2; add.s64 %rd73, %rd71, %rd72; st.global.f32 [%rd73], %f414; mul.f32 %f415, %f2, %f405; add.f32 %f416, %f402, %f404; mul.f32 %f417, %f2, %f416; mul.f32 %f418, %f449, %f211; fma.rn.f32 %f419, %f1, %f418, %f417; mul.f32 %f420, %f485, %f213; fma.rn.f32 %f421, %f3, %f420, %f419; mul.f32 %f422, %f421, %f488; fma.rn.f32 %f423, %f403, %f415, %f422; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd75, %rd74, %rd72; st.global.f32 [%rd75], %f423; mul.f32 %f424, %f3, %f405; mul.f32 %f425, %f466, %f212; mul.f32 %f426, %f2, %f425; mul.f32 %f427, %f448, %f211; fma.rn.f32 %f428, %f1, %f427, %f426; add.f32 %f429, %f402, %f403; fma.rn.f32 %f430, %f3, %f429, %f428; mul.f32 %f431, %f430, %f488; fma.rn.f32 %f432, %f404, %f424, %f431; cvta.to.global.u64 %rd76, %rd7; add.s64 %rd77, %rd76, %rd72; st.global.f32 [%rd77], %f432; $L__BB0_108: ret; } ` getmagnetoelasticforce_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 
getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<116>; .reg .b16 %rs<5>; .reg .f32 %f<489>; .reg .b32 %r<130>; .reg .b64 %rd<78>; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd12, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_6]; ld.param.f32 %f487, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_8]; ld.param.f32 %f488, [getmagnetoelasticforce_param_9]; ld.param.f32 %f211, [getmagnetoelasticforce_param_10]; ld.param.f32 %f212, [getmagnetoelasticforce_param_11]; ld.param.f32 %f213, [getmagnetoelasticforce_param_12]; ld.param.u32 %r57, [getmagnetoelasticforce_param_13]; ld.param.u32 %r58, [getmagnetoelasticforce_param_14]; ld.param.u32 %r59, [getmagnetoelasticforce_param_15]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_108; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; cvt.s64.s32 %rd4, %r70; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; add.s64 %rd15, %rd2, %rd13; ld.global.nc.f32 %f2, [%rd15]; add.s64 %rd16, %rd1, %rd13; 
ld.global.nc.f32 %f3, [%rd16]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p6, %rs1, 0; add.s32 %r6, %r1, -2; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r118, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r118, %r72, %r57; $L__BB0_4: setp.lt.s32 %p8, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p9, %p8, %p6; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p9 bra $L__BB0_6; add.s32 %r73, %r118, %r5; mul.wide.s32 %rd17, %r73, 4; add.s64 %rd18, %rd3, %rd17; add.s64 %rd19, %rd2, %rd17; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f9, [%rd20]; ld.global.nc.f32 %f8, [%rd19]; ld.global.nc.f32 %f7, [%rd18]; $L__BB0_6: add.s32 %r10, %r1, -1; @%p6 bra $L__BB0_8; bra.uni $L__BB0_7; $L__BB0_8: max.s32 %r119, %r10, 0; bra.uni $L__BB0_9; $L__BB0_7: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r119, %r75, %r57; $L__BB0_9: setp.lt.s32 %p11, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p13, %p11, %p6; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p13 bra $L__BB0_11; add.s32 %r76, %r119, %r5; mul.wide.s32 %rd21, %r76, 4; add.s64 %rd22, %rd3, %rd21; add.s64 %rd23, %rd2, %rd21; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f38, [%rd24]; ld.global.nc.f32 %f39, [%rd23]; ld.global.nc.f32 %f40, [%rd22]; $L__BB0_11: add.s32 %r14, %r1, 1; @%p6 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r79, %r57, -1; min.s32 %r120, %r14, %r79; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r120, %r78, %r57; $L__BB0_14: setp.ge.s32 %p15, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p17, %p15, %p6; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p17 bra $L__BB0_16; add.s32 %r80, %r120, %r5; mul.wide.s32 %rd25, %r80, 4; add.s64 %rd26, %rd3, %rd25; add.s64 %rd27, %rd2, %rd25; add.s64 %rd28, %rd1, %rd25; ld.global.nc.f32 %f21, [%rd28]; ld.global.nc.f32 %f20, [%rd27]; ld.global.nc.f32 %f19, [%rd26]; $L__BB0_16: add.s32 %r18, %r1, 2; @%p6 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r83, 
%r57, -1; min.s32 %r121, %r18, %r83; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r121, %r82, %r57; $L__BB0_19: add.s32 %r22, %r121, %r5; setp.ge.s32 %p19, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p21, %p19, %p6; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p21 bra $L__BB0_21; mul.wide.s32 %rd29, %r22, 4; add.s64 %rd30, %rd3, %rd29; add.s64 %rd31, %rd2, %rd29; add.s64 %rd32, %rd1, %rd29; ld.global.nc.f32 %f27, [%rd32]; ld.global.nc.f32 %f26, [%rd31]; ld.global.nc.f32 %f25, [%rd30]; $L__BB0_21: mul.f32 %f226, %f20, %f20; fma.rn.f32 %f227, %f19, %f19, %f226; fma.rn.f32 %f31, %f21, %f21, %f227; setp.eq.f32 %p22, %f31, 0f00000000; @%p22 bra $L__BB0_22; bra.uni $L__BB0_23; $L__BB0_22: mul.f32 %f231, %f39, %f39; fma.rn.f32 %f232, %f40, %f40, %f231; fma.rn.f32 %f233, %f38, %f38, %f232; setp.eq.f32 %p23, %f233, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f449, %f448; mov.f32 %f450, %f448; @%p23 bra $L__BB0_35; $L__BB0_23: mul.f32 %f234, %f8, %f8; fma.rn.f32 %f235, %f7, %f7, %f234; fma.rn.f32 %f44, %f9, %f9, %f235; setp.neu.f32 %p24, %f44, 0f00000000; mul.f32 %f236, %f26, %f26; fma.rn.f32 %f237, %f25, %f25, %f236; fma.rn.f32 %f48, %f27, %f27, %f237; setp.neu.f32 %p25, %f48, 0f00000000; and.pred %p26, %p24, %p25; or.pred %p28, %p22, %p26; @%p28 bra $L__BB0_25; mul.f32 %f238, %f39, %f39; fma.rn.f32 %f239, %f40, %f40, %f238; fma.rn.f32 %f240, %f38, %f38, %f239; setp.neu.f32 %p29, %f240, 0f00000000; @%p29 bra $L__BB0_34; bra.uni $L__BB0_25; $L__BB0_34: sub.f32 %f273, %f19, %f40; mul.f32 %f450, %f273, 0f3F000000; sub.f32 %f274, %f20, %f39; mul.f32 %f449, %f274, 0f3F000000; sub.f32 %f275, %f21, %f38; mul.f32 %f448, %f275, 0f3F000000; bra.uni $L__BB0_35; $L__BB0_25: setp.eq.f32 %p30, %f44, 0f00000000; and.pred %p32, %p30, %p22; @%p32 bra $L__BB0_33; bra.uni $L__BB0_26; $L__BB0_33: sub.f32 %f450, %f1, %f40; sub.f32 %f449, %f2, %f39; sub.f32 %f448, %f3, %f38; bra.uni $L__BB0_35; $L__BB0_26: setp.eq.f32 %p33, %f48, 0f00000000; 
mul.f32 %f241, %f39, %f39; fma.rn.f32 %f242, %f40, %f40, %f241; fma.rn.f32 %f49, %f38, %f38, %f242; setp.eq.f32 %p34, %f49, 0f00000000; and.pred %p35, %p34, %p33; @%p35 bra $L__BB0_32; bra.uni $L__BB0_27; $L__BB0_32: sub.f32 %f450, %f19, %f1; sub.f32 %f449, %f20, %f2; sub.f32 %f448, %f21, %f3; bra.uni $L__BB0_35; $L__BB0_27: setp.neu.f32 %p37, %f31, 0f00000000; or.pred %p38, %p30, %p37; @%p38 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: setp.neu.f32 %p39, %f49, 0f00000000; or.pred %p41, %p33, %p39; @%p41 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f264, %f19, %f40; sub.f32 %f265, %f20, %f39; sub.f32 %f266, %f21, %f38; sub.f32 %f267, %f7, %f25; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f8, %f26; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f9, %f27; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f450, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f449, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f448, %f266, 0f3F2AAAAB, %f272; bra.uni $L__BB0_35; $L__BB0_28: mul.f32 %f243, %f7, 0f3F000000; add.f32 %f244, %f40, %f40; sub.f32 %f245, %f243, %f244; add.f32 %f246, %f39, %f39; mul.f32 %f247, %f8, 0f3F000000; sub.f32 %f248, %f247, %f246; add.f32 %f249, %f38, %f38; mul.f32 %f250, %f9, 0f3F000000; sub.f32 %f251, %f250, %f249; fma.rn.f32 %f450, %f1, 0f3FC00000, %f245; fma.rn.f32 %f449, %f2, 0f3FC00000, %f248; fma.rn.f32 %f448, %f3, 0f3FC00000, %f251; bra.uni $L__BB0_35; $L__BB0_30: mul.f32 %f252, %f25, 0f3F000000; add.f32 %f253, %f19, %f19; sub.f32 %f254, %f253, %f252; add.f32 %f255, %f20, %f20; mul.f32 %f256, %f26, 0f3F000000; sub.f32 %f257, %f255, %f256; add.f32 %f258, %f21, %f21; mul.f32 %f259, %f27, 0f3F000000; sub.f32 %f260, %f258, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f450, %f254, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f449, %f257, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f448, %f260, %f263; $L__BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p42, %rs2, 0; add.s32 %r23, %r2, -2; @%p42 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: 
max.s32 %r122, %r23, 0; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r122, %r85, %r58; $L__BB0_38: setp.lt.s32 %p44, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p45, %p44, %p42; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p45 bra $L__BB0_40; add.s32 %r86, %r122, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd33, %r87, 4; add.s64 %rd34, %rd3, %rd33; add.s64 %rd35, %rd2, %rd33; add.s64 %rd36, %rd1, %rd33; ld.global.nc.f32 %f76, [%rd36]; ld.global.nc.f32 %f75, [%rd35]; ld.global.nc.f32 %f74, [%rd34]; $L__BB0_40: add.s32 %r27, %r2, -1; @%p42 bra $L__BB0_42; bra.uni $L__BB0_41; $L__BB0_42: max.s32 %r123, %r27, 0; bra.uni $L__BB0_43; $L__BB0_41: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r123, %r89, %r58; $L__BB0_43: setp.lt.s32 %p47, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p49, %p47, %p42; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p49 bra $L__BB0_45; add.s32 %r90, %r123, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd37, %r91, 4; add.s64 %rd38, %rd3, %rd37; add.s64 %rd39, %rd2, %rd37; add.s64 %rd40, %rd1, %rd37; ld.global.nc.f32 %f105, [%rd40]; ld.global.nc.f32 %f106, [%rd39]; ld.global.nc.f32 %f107, [%rd38]; $L__BB0_45: add.s32 %r31, %r2, 1; @%p42 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: add.s32 %r94, %r58, -1; min.s32 %r124, %r31, %r94; bra.uni $L__BB0_48; $L__BB0_46: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r124, %r93, %r58; $L__BB0_48: setp.ge.s32 %p51, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p53, %p51, %p42; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p53 bra $L__BB0_50; add.s32 %r95, %r124, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd41, %r96, 4; add.s64 %rd42, %rd3, %rd41; add.s64 %rd43, %rd2, %rd41; add.s64 %rd44, %rd1, %rd41; ld.global.nc.f32 %f88, [%rd44]; ld.global.nc.f32 %f87, [%rd43]; ld.global.nc.f32 %f86, [%rd42]; $L__BB0_50: add.s32 %r35, %r2, 2; @%p42 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r99, %r58, -1; 
min.s32 %r125, %r35, %r99; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r125, %r98, %r58; $L__BB0_53: add.s32 %r39, %r125, %r4; setp.ge.s32 %p55, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p57, %p55, %p42; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p57 bra $L__BB0_55; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd45, %r100, 4; add.s64 %rd46, %rd3, %rd45; add.s64 %rd47, %rd2, %rd45; add.s64 %rd48, %rd1, %rd45; ld.global.nc.f32 %f94, [%rd48]; ld.global.nc.f32 %f93, [%rd47]; ld.global.nc.f32 %f92, [%rd46]; $L__BB0_55: mul.f32 %f288, %f87, %f87; fma.rn.f32 %f289, %f86, %f86, %f288; fma.rn.f32 %f98, %f88, %f88, %f289; setp.eq.f32 %p58, %f98, 0f00000000; @%p58 bra $L__BB0_56; bra.uni $L__BB0_57; $L__BB0_56: mul.f32 %f293, %f106, %f106; fma.rn.f32 %f294, %f107, %f107, %f293; fma.rn.f32 %f295, %f105, %f105, %f294; setp.eq.f32 %p59, %f295, 0f00000000; mov.f32 %f466, 0f00000000; mov.f32 %f467, %f466; mov.f32 %f468, %f466; @%p59 bra $L__BB0_69; $L__BB0_57: mul.f32 %f296, %f75, %f75; fma.rn.f32 %f297, %f74, %f74, %f296; fma.rn.f32 %f111, %f76, %f76, %f297; setp.neu.f32 %p60, %f111, 0f00000000; mul.f32 %f298, %f93, %f93; fma.rn.f32 %f299, %f92, %f92, %f298; fma.rn.f32 %f115, %f94, %f94, %f299; setp.neu.f32 %p61, %f115, 0f00000000; and.pred %p62, %p60, %p61; or.pred %p64, %p58, %p62; @%p64 bra $L__BB0_59; mul.f32 %f300, %f106, %f106; fma.rn.f32 %f301, %f107, %f107, %f300; fma.rn.f32 %f302, %f105, %f105, %f301; setp.neu.f32 %p65, %f302, 0f00000000; @%p65 bra $L__BB0_68; bra.uni $L__BB0_59; $L__BB0_68: sub.f32 %f335, %f86, %f107; mul.f32 %f468, %f335, 0f3F000000; sub.f32 %f336, %f87, %f106; mul.f32 %f467, %f336, 0f3F000000; sub.f32 %f337, %f88, %f105; mul.f32 %f466, %f337, 0f3F000000; bra.uni $L__BB0_69; $L__BB0_59: setp.eq.f32 %p66, %f111, 0f00000000; and.pred %p68, %p66, %p58; @%p68 bra $L__BB0_67; bra.uni $L__BB0_60; $L__BB0_67: sub.f32 %f468, %f1, %f107; sub.f32 %f467, %f2, %f106; sub.f32 %f466, %f3, %f105; bra.uni 
$L__BB0_69; $L__BB0_60: setp.eq.f32 %p69, %f115, 0f00000000; mul.f32 %f303, %f106, %f106; fma.rn.f32 %f304, %f107, %f107, %f303; fma.rn.f32 %f116, %f105, %f105, %f304; setp.eq.f32 %p70, %f116, 0f00000000; and.pred %p71, %p70, %p69; @%p71 bra $L__BB0_66; bra.uni $L__BB0_61; $L__BB0_66: sub.f32 %f468, %f86, %f1; sub.f32 %f467, %f87, %f2; sub.f32 %f466, %f88, %f3; bra.uni $L__BB0_69; $L__BB0_61: setp.neu.f32 %p73, %f98, 0f00000000; or.pred %p74, %p66, %p73; @%p74 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: setp.neu.f32 %p75, %f116, 0f00000000; or.pred %p77, %p69, %p75; @%p77 bra $L__BB0_65; bra.uni $L__BB0_64; $L__BB0_65: sub.f32 %f326, %f86, %f107; sub.f32 %f327, %f87, %f106; sub.f32 %f328, %f88, %f105; sub.f32 %f329, %f74, %f92; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f75, %f93; mul.f32 %f332, %f331, 0f3DAAAAAB; sub.f32 %f333, %f76, %f94; mul.f32 %f334, %f333, 0f3DAAAAAB; fma.rn.f32 %f468, %f326, 0f3F2AAAAB, %f330; fma.rn.f32 %f467, %f327, 0f3F2AAAAB, %f332; fma.rn.f32 %f466, %f328, 0f3F2AAAAB, %f334; bra.uni $L__BB0_69; $L__BB0_62: mul.f32 %f305, %f74, 0f3F000000; add.f32 %f306, %f107, %f107; sub.f32 %f307, %f305, %f306; add.f32 %f308, %f106, %f106; mul.f32 %f309, %f75, 0f3F000000; sub.f32 %f310, %f309, %f308; add.f32 %f311, %f105, %f105; mul.f32 %f312, %f76, 0f3F000000; sub.f32 %f313, %f312, %f311; fma.rn.f32 %f468, %f1, 0f3FC00000, %f307; fma.rn.f32 %f467, %f2, 0f3FC00000, %f310; fma.rn.f32 %f466, %f3, 0f3FC00000, %f313; bra.uni $L__BB0_69; $L__BB0_64: mul.f32 %f314, %f92, 0f3F000000; add.f32 %f315, %f86, %f86; sub.f32 %f316, %f315, %f314; add.f32 %f317, %f87, %f87; mul.f32 %f318, %f93, 0f3F000000; sub.f32 %f319, %f317, %f318; add.f32 %f320, %f88, %f88; mul.f32 %f321, %f94, 0f3F000000; sub.f32 %f322, %f320, %f321; mul.f32 %f323, %f1, 0f3FC00000; sub.f32 %f468, %f316, %f323; mul.f32 %f324, %f2, 0f3FC00000; sub.f32 %f467, %f319, %f324; mul.f32 %f325, %f3, 0f3FC00000; sub.f32 %f466, %f322, %f325; $L__BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p78, 
%rs3, 0; add.s32 %r40, %r3, -2; @%p78 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: max.s32 %r126, %r40, 0; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r126, %r102, %r59; $L__BB0_72: setp.lt.s32 %p80, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p81, %p80, %p78; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p81 bra $L__BB0_74; mad.lo.s32 %r103, %r126, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd49, %r104, 4; add.s64 %rd50, %rd3, %rd49; add.s64 %rd51, %rd2, %rd49; add.s64 %rd52, %rd1, %rd49; ld.global.nc.f32 %f143, [%rd52]; ld.global.nc.f32 %f142, [%rd51]; ld.global.nc.f32 %f141, [%rd50]; $L__BB0_74: add.s32 %r44, %r3, -1; @%p78 bra $L__BB0_76; bra.uni $L__BB0_75; $L__BB0_76: max.s32 %r127, %r44, 0; bra.uni $L__BB0_77; $L__BB0_75: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r127, %r106, %r59; $L__BB0_77: setp.lt.s32 %p83, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p85, %p83, %p78; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p85 bra $L__BB0_79; mad.lo.s32 %r107, %r127, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd53, %r108, 4; add.s64 %rd54, %rd3, %rd53; add.s64 %rd55, %rd2, %rd53; add.s64 %rd56, %rd1, %rd53; ld.global.nc.f32 %f172, [%rd56]; ld.global.nc.f32 %f173, [%rd55]; ld.global.nc.f32 %f174, [%rd54]; $L__BB0_79: add.s32 %r48, %r3, 1; @%p78 bra $L__BB0_81; bra.uni $L__BB0_80; $L__BB0_81: add.s32 %r111, %r59, -1; min.s32 %r128, %r48, %r111; bra.uni $L__BB0_82; $L__BB0_80: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r128, %r110, %r59; $L__BB0_82: setp.ge.s32 %p87, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p89, %p87, %p78; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p89 bra $L__BB0_84; mad.lo.s32 %r112, %r128, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd57, %r113, 4; add.s64 %rd58, %rd3, %rd57; add.s64 %rd59, %rd2, %rd57; add.s64 %rd60, %rd1, %rd57; ld.global.nc.f32 %f155, [%rd60]; ld.global.nc.f32 %f154, 
[%rd59]; ld.global.nc.f32 %f153, [%rd58]; $L__BB0_84: add.s32 %r52, %r3, 2; @%p78 bra $L__BB0_86; bra.uni $L__BB0_85; $L__BB0_86: add.s32 %r116, %r59, -1; min.s32 %r129, %r52, %r116; bra.uni $L__BB0_87; $L__BB0_85: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r129, %r115, %r59; $L__BB0_87: mad.lo.s32 %r117, %r129, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p91, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p93, %p91, %p78; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p93 bra $L__BB0_89; mul.wide.s32 %rd61, %r56, 4; add.s64 %rd62, %rd3, %rd61; add.s64 %rd63, %rd2, %rd61; add.s64 %rd64, %rd1, %rd61; ld.global.nc.f32 %f161, [%rd64]; ld.global.nc.f32 %f160, [%rd63]; ld.global.nc.f32 %f159, [%rd62]; $L__BB0_89: mul.f32 %f350, %f154, %f154; fma.rn.f32 %f351, %f153, %f153, %f350; fma.rn.f32 %f165, %f155, %f155, %f351; setp.eq.f32 %p94, %f165, 0f00000000; @%p94 bra $L__BB0_90; bra.uni $L__BB0_91; $L__BB0_90: mul.f32 %f355, %f173, %f173; fma.rn.f32 %f356, %f174, %f174, %f355; fma.rn.f32 %f357, %f172, %f172, %f356; setp.eq.f32 %p95, %f357, 0f00000000; mov.f32 %f484, 0f00000000; mov.f32 %f485, %f484; mov.f32 %f486, %f484; @%p95 bra $L__BB0_103; $L__BB0_91: mul.f32 %f358, %f142, %f142; fma.rn.f32 %f359, %f141, %f141, %f358; fma.rn.f32 %f178, %f143, %f143, %f359; setp.neu.f32 %p96, %f178, 0f00000000; mul.f32 %f360, %f160, %f160; fma.rn.f32 %f361, %f159, %f159, %f360; fma.rn.f32 %f182, %f161, %f161, %f361; setp.neu.f32 %p97, %f182, 0f00000000; and.pred %p98, %p96, %p97; or.pred %p100, %p94, %p98; @%p100 bra $L__BB0_93; mul.f32 %f362, %f173, %f173; fma.rn.f32 %f363, %f174, %f174, %f362; fma.rn.f32 %f364, %f172, %f172, %f363; setp.neu.f32 %p101, %f364, 0f00000000; @%p101 bra $L__BB0_102; bra.uni $L__BB0_93; $L__BB0_102: sub.f32 %f397, %f153, %f174; mul.f32 %f486, %f397, 0f3F000000; sub.f32 %f398, %f154, %f173; mul.f32 %f485, %f398, 0f3F000000; sub.f32 %f399, %f155, %f172; mul.f32 %f484, %f399, 0f3F000000; bra.uni $L__BB0_103; $L__BB0_93: 
setp.eq.f32 %p102, %f178, 0f00000000; and.pred %p104, %p102, %p94; @%p104 bra $L__BB0_101; bra.uni $L__BB0_94; $L__BB0_101: sub.f32 %f486, %f1, %f174; sub.f32 %f485, %f2, %f173; sub.f32 %f484, %f3, %f172; bra.uni $L__BB0_103; $L__BB0_94: setp.eq.f32 %p105, %f182, 0f00000000; mul.f32 %f365, %f173, %f173; fma.rn.f32 %f366, %f174, %f174, %f365; fma.rn.f32 %f183, %f172, %f172, %f366; setp.eq.f32 %p106, %f183, 0f00000000; and.pred %p107, %p106, %p105; @%p107 bra $L__BB0_100; bra.uni $L__BB0_95; $L__BB0_100: sub.f32 %f486, %f153, %f1; sub.f32 %f485, %f154, %f2; sub.f32 %f484, %f155, %f3; bra.uni $L__BB0_103; $L__BB0_95: setp.neu.f32 %p109, %f165, 0f00000000; or.pred %p110, %p102, %p109; @%p110 bra $L__BB0_97; bra.uni $L__BB0_96; $L__BB0_97: setp.neu.f32 %p111, %f183, 0f00000000; or.pred %p113, %p105, %p111; @%p113 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: sub.f32 %f388, %f153, %f174; sub.f32 %f389, %f154, %f173; sub.f32 %f390, %f155, %f172; sub.f32 %f391, %f141, %f159; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f142, %f160; mul.f32 %f394, %f393, 0f3DAAAAAB; sub.f32 %f395, %f143, %f161; mul.f32 %f396, %f395, 0f3DAAAAAB; fma.rn.f32 %f486, %f388, 0f3F2AAAAB, %f392; fma.rn.f32 %f485, %f389, 0f3F2AAAAB, %f394; fma.rn.f32 %f484, %f390, 0f3F2AAAAB, %f396; bra.uni $L__BB0_103; $L__BB0_96: mul.f32 %f367, %f141, 0f3F000000; add.f32 %f368, %f174, %f174; sub.f32 %f369, %f367, %f368; add.f32 %f370, %f173, %f173; mul.f32 %f371, %f142, 0f3F000000; sub.f32 %f372, %f371, %f370; add.f32 %f373, %f172, %f172; mul.f32 %f374, %f143, 0f3F000000; sub.f32 %f375, %f374, %f373; fma.rn.f32 %f486, %f1, 0f3FC00000, %f369; fma.rn.f32 %f485, %f2, 0f3FC00000, %f372; fma.rn.f32 %f484, %f3, 0f3FC00000, %f375; bra.uni $L__BB0_103; $L__BB0_98: mul.f32 %f376, %f159, 0f3F000000; add.f32 %f377, %f153, %f153; sub.f32 %f378, %f377, %f376; add.f32 %f379, %f154, %f154; mul.f32 %f380, %f160, 0f3F000000; sub.f32 %f381, %f379, %f380; add.f32 %f382, %f155, %f155; mul.f32 %f383, %f161, 0f3F000000; sub.f32 
%f384, %f382, %f383; mul.f32 %f385, %f1, 0f3FC00000; sub.f32 %f486, %f378, %f385; mul.f32 %f386, %f2, 0f3FC00000; sub.f32 %f485, %f381, %f386; mul.f32 %f387, %f3, 0f3FC00000; sub.f32 %f484, %f384, %f387; $L__BB0_103: setp.eq.s64 %p114, %rd8, 0; @%p114 bra $L__BB0_105; cvta.to.global.u64 %rd65, %rd8; shl.b64 %rd66, %rd4, 2; add.s64 %rd67, %rd65, %rd66; ld.global.nc.f32 %f400, [%rd67]; mul.f32 %f487, %f400, %f487; $L__BB0_105: setp.eq.s64 %p115, %rd9, 0; @%p115 bra $L__BB0_107; cvta.to.global.u64 %rd68, %rd9; shl.b64 %rd69, %rd4, 2; add.s64 %rd70, %rd68, %rd69; ld.global.nc.f32 %f401, [%rd70]; mul.f32 %f488, %f401, %f488; $L__BB0_107: mul.f32 %f402, %f450, %f211; mul.f32 %f403, %f467, %f212; mul.f32 %f404, %f484, %f213; add.f32 %f405, %f487, %f487; mul.f32 %f406, %f1, %f405; add.f32 %f407, %f403, %f404; mul.f32 %f408, %f1, %f407; mul.f32 %f409, %f468, %f212; fma.rn.f32 %f410, %f2, %f409, %f408; mul.f32 %f411, %f486, %f213; fma.rn.f32 %f412, %f3, %f411, %f410; mul.f32 %f413, %f412, %f488; fma.rn.f32 %f414, %f402, %f406, %f413; cvta.to.global.u64 %rd71, %rd5; shl.b64 %rd72, %rd4, 2; add.s64 %rd73, %rd71, %rd72; st.global.f32 [%rd73], %f414; mul.f32 %f415, %f2, %f405; add.f32 %f416, %f402, %f404; mul.f32 %f417, %f2, %f416; mul.f32 %f418, %f449, %f211; fma.rn.f32 %f419, %f1, %f418, %f417; mul.f32 %f420, %f485, %f213; fma.rn.f32 %f421, %f3, %f420, %f419; mul.f32 %f422, %f421, %f488; fma.rn.f32 %f423, %f403, %f415, %f422; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd75, %rd74, %rd72; st.global.f32 [%rd75], %f423; mul.f32 %f424, %f3, %f405; mul.f32 %f425, %f466, %f212; mul.f32 %f426, %f2, %f425; mul.f32 %f427, %f448, %f211; fma.rn.f32 %f428, %f1, %f427, %f426; add.f32 %f429, %f402, %f403; fma.rn.f32 %f430, %f3, %f429, %f428; mul.f32 %f431, %f430, %f488; fma.rn.f32 %f432, %f404, %f424, %f431; cvta.to.global.u64 %rd76, %rd7; add.s64 %rd77, %rd76, %rd72; st.global.f32 [%rd77], %f432; $L__BB0_108: ret; } ` getmagnetoelasticforce_ptx_72 = ` .version 8.5 .target sm_72 .address_size 
64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<116>; .reg .b16 %rs<5>; .reg .f32 %f<489>; .reg .b32 %r<130>; .reg .b64 %rd<78>; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd12, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_6]; ld.param.f32 %f487, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_8]; ld.param.f32 %f488, [getmagnetoelasticforce_param_9]; ld.param.f32 %f211, [getmagnetoelasticforce_param_10]; ld.param.f32 %f212, [getmagnetoelasticforce_param_11]; ld.param.f32 %f213, [getmagnetoelasticforce_param_12]; ld.param.u32 %r57, [getmagnetoelasticforce_param_13]; ld.param.u32 %r58, [getmagnetoelasticforce_param_14]; ld.param.u32 %r59, [getmagnetoelasticforce_param_15]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, 
%tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_108; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; cvt.s64.s32 %rd4, %r70; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; add.s64 %rd15, %rd2, %rd13; ld.global.nc.f32 %f2, [%rd15]; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f3, [%rd16]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p6, %rs1, 0; add.s32 %r6, %r1, -2; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r118, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r118, %r72, %r57; $L__BB0_4: setp.lt.s32 %p8, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p9, %p8, %p6; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p9 bra $L__BB0_6; add.s32 %r73, %r118, %r5; mul.wide.s32 %rd17, %r73, 4; add.s64 %rd18, %rd3, %rd17; add.s64 %rd19, %rd2, %rd17; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f9, [%rd20]; ld.global.nc.f32 %f8, [%rd19]; ld.global.nc.f32 %f7, [%rd18]; $L__BB0_6: add.s32 %r10, %r1, -1; @%p6 bra $L__BB0_8; bra.uni $L__BB0_7; $L__BB0_8: max.s32 %r119, %r10, 0; bra.uni $L__BB0_9; $L__BB0_7: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r119, %r75, %r57; $L__BB0_9: setp.lt.s32 %p11, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p13, %p11, %p6; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p13 bra $L__BB0_11; add.s32 %r76, %r119, %r5; mul.wide.s32 %rd21, %r76, 4; add.s64 %rd22, %rd3, %rd21; add.s64 %rd23, %rd2, %rd21; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f38, [%rd24]; ld.global.nc.f32 %f39, [%rd23]; ld.global.nc.f32 %f40, [%rd22]; $L__BB0_11: add.s32 %r14, %r1, 1; @%p6 
bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r79, %r57, -1; min.s32 %r120, %r14, %r79; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r120, %r78, %r57; $L__BB0_14: setp.ge.s32 %p15, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p17, %p15, %p6; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p17 bra $L__BB0_16; add.s32 %r80, %r120, %r5; mul.wide.s32 %rd25, %r80, 4; add.s64 %rd26, %rd3, %rd25; add.s64 %rd27, %rd2, %rd25; add.s64 %rd28, %rd1, %rd25; ld.global.nc.f32 %f21, [%rd28]; ld.global.nc.f32 %f20, [%rd27]; ld.global.nc.f32 %f19, [%rd26]; $L__BB0_16: add.s32 %r18, %r1, 2; @%p6 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r83, %r57, -1; min.s32 %r121, %r18, %r83; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r121, %r82, %r57; $L__BB0_19: add.s32 %r22, %r121, %r5; setp.ge.s32 %p19, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p21, %p19, %p6; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p21 bra $L__BB0_21; mul.wide.s32 %rd29, %r22, 4; add.s64 %rd30, %rd3, %rd29; add.s64 %rd31, %rd2, %rd29; add.s64 %rd32, %rd1, %rd29; ld.global.nc.f32 %f27, [%rd32]; ld.global.nc.f32 %f26, [%rd31]; ld.global.nc.f32 %f25, [%rd30]; $L__BB0_21: mul.f32 %f226, %f20, %f20; fma.rn.f32 %f227, %f19, %f19, %f226; fma.rn.f32 %f31, %f21, %f21, %f227; setp.eq.f32 %p22, %f31, 0f00000000; @%p22 bra $L__BB0_22; bra.uni $L__BB0_23; $L__BB0_22: mul.f32 %f231, %f39, %f39; fma.rn.f32 %f232, %f40, %f40, %f231; fma.rn.f32 %f233, %f38, %f38, %f232; setp.eq.f32 %p23, %f233, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f449, %f448; mov.f32 %f450, %f448; @%p23 bra $L__BB0_35; $L__BB0_23: mul.f32 %f234, %f8, %f8; fma.rn.f32 %f235, %f7, %f7, %f234; fma.rn.f32 %f44, %f9, %f9, %f235; setp.neu.f32 %p24, %f44, 0f00000000; mul.f32 %f236, %f26, %f26; fma.rn.f32 %f237, %f25, %f25, %f236; fma.rn.f32 %f48, %f27, %f27, %f237; setp.neu.f32 %p25, %f48, 0f00000000; and.pred %p26, %p24, %p25; or.pred %p28, 
%p22, %p26; @%p28 bra $L__BB0_25; mul.f32 %f238, %f39, %f39; fma.rn.f32 %f239, %f40, %f40, %f238; fma.rn.f32 %f240, %f38, %f38, %f239; setp.neu.f32 %p29, %f240, 0f00000000; @%p29 bra $L__BB0_34; bra.uni $L__BB0_25; $L__BB0_34: sub.f32 %f273, %f19, %f40; mul.f32 %f450, %f273, 0f3F000000; sub.f32 %f274, %f20, %f39; mul.f32 %f449, %f274, 0f3F000000; sub.f32 %f275, %f21, %f38; mul.f32 %f448, %f275, 0f3F000000; bra.uni $L__BB0_35; $L__BB0_25: setp.eq.f32 %p30, %f44, 0f00000000; and.pred %p32, %p30, %p22; @%p32 bra $L__BB0_33; bra.uni $L__BB0_26; $L__BB0_33: sub.f32 %f450, %f1, %f40; sub.f32 %f449, %f2, %f39; sub.f32 %f448, %f3, %f38; bra.uni $L__BB0_35; $L__BB0_26: setp.eq.f32 %p33, %f48, 0f00000000; mul.f32 %f241, %f39, %f39; fma.rn.f32 %f242, %f40, %f40, %f241; fma.rn.f32 %f49, %f38, %f38, %f242; setp.eq.f32 %p34, %f49, 0f00000000; and.pred %p35, %p34, %p33; @%p35 bra $L__BB0_32; bra.uni $L__BB0_27; $L__BB0_32: sub.f32 %f450, %f19, %f1; sub.f32 %f449, %f20, %f2; sub.f32 %f448, %f21, %f3; bra.uni $L__BB0_35; $L__BB0_27: setp.neu.f32 %p37, %f31, 0f00000000; or.pred %p38, %p30, %p37; @%p38 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: setp.neu.f32 %p39, %f49, 0f00000000; or.pred %p41, %p33, %p39; @%p41 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f264, %f19, %f40; sub.f32 %f265, %f20, %f39; sub.f32 %f266, %f21, %f38; sub.f32 %f267, %f7, %f25; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f8, %f26; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f9, %f27; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f450, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f449, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f448, %f266, 0f3F2AAAAB, %f272; bra.uni $L__BB0_35; $L__BB0_28: mul.f32 %f243, %f7, 0f3F000000; add.f32 %f244, %f40, %f40; sub.f32 %f245, %f243, %f244; add.f32 %f246, %f39, %f39; mul.f32 %f247, %f8, 0f3F000000; sub.f32 %f248, %f247, %f246; add.f32 %f249, %f38, %f38; mul.f32 %f250, %f9, 0f3F000000; sub.f32 %f251, %f250, %f249; fma.rn.f32 %f450, %f1, 0f3FC00000, %f245; 
fma.rn.f32 %f449, %f2, 0f3FC00000, %f248; fma.rn.f32 %f448, %f3, 0f3FC00000, %f251; bra.uni $L__BB0_35; $L__BB0_30: mul.f32 %f252, %f25, 0f3F000000; add.f32 %f253, %f19, %f19; sub.f32 %f254, %f253, %f252; add.f32 %f255, %f20, %f20; mul.f32 %f256, %f26, 0f3F000000; sub.f32 %f257, %f255, %f256; add.f32 %f258, %f21, %f21; mul.f32 %f259, %f27, 0f3F000000; sub.f32 %f260, %f258, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f450, %f254, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f449, %f257, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f448, %f260, %f263; $L__BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p42, %rs2, 0; add.s32 %r23, %r2, -2; @%p42 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: max.s32 %r122, %r23, 0; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r122, %r85, %r58; $L__BB0_38: setp.lt.s32 %p44, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p45, %p44, %p42; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p45 bra $L__BB0_40; add.s32 %r86, %r122, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd33, %r87, 4; add.s64 %rd34, %rd3, %rd33; add.s64 %rd35, %rd2, %rd33; add.s64 %rd36, %rd1, %rd33; ld.global.nc.f32 %f76, [%rd36]; ld.global.nc.f32 %f75, [%rd35]; ld.global.nc.f32 %f74, [%rd34]; $L__BB0_40: add.s32 %r27, %r2, -1; @%p42 bra $L__BB0_42; bra.uni $L__BB0_41; $L__BB0_42: max.s32 %r123, %r27, 0; bra.uni $L__BB0_43; $L__BB0_41: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r123, %r89, %r58; $L__BB0_43: setp.lt.s32 %p47, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p49, %p47, %p42; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p49 bra $L__BB0_45; add.s32 %r90, %r123, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd37, %r91, 4; add.s64 %rd38, %rd3, %rd37; add.s64 %rd39, %rd2, %rd37; add.s64 %rd40, %rd1, %rd37; ld.global.nc.f32 %f105, [%rd40]; ld.global.nc.f32 %f106, [%rd39]; ld.global.nc.f32 %f107, [%rd38]; $L__BB0_45: add.s32 %r31, %r2, 1; @%p42 bra $L__BB0_47; bra.uni $L__BB0_46; 
$L__BB0_47: add.s32 %r94, %r58, -1; min.s32 %r124, %r31, %r94; bra.uni $L__BB0_48; $L__BB0_46: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r124, %r93, %r58; $L__BB0_48: setp.ge.s32 %p51, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p53, %p51, %p42; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p53 bra $L__BB0_50; add.s32 %r95, %r124, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd41, %r96, 4; add.s64 %rd42, %rd3, %rd41; add.s64 %rd43, %rd2, %rd41; add.s64 %rd44, %rd1, %rd41; ld.global.nc.f32 %f88, [%rd44]; ld.global.nc.f32 %f87, [%rd43]; ld.global.nc.f32 %f86, [%rd42]; $L__BB0_50: add.s32 %r35, %r2, 2; @%p42 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r99, %r58, -1; min.s32 %r125, %r35, %r99; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r125, %r98, %r58; $L__BB0_53: add.s32 %r39, %r125, %r4; setp.ge.s32 %p55, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p57, %p55, %p42; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p57 bra $L__BB0_55; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd45, %r100, 4; add.s64 %rd46, %rd3, %rd45; add.s64 %rd47, %rd2, %rd45; add.s64 %rd48, %rd1, %rd45; ld.global.nc.f32 %f94, [%rd48]; ld.global.nc.f32 %f93, [%rd47]; ld.global.nc.f32 %f92, [%rd46]; $L__BB0_55: mul.f32 %f288, %f87, %f87; fma.rn.f32 %f289, %f86, %f86, %f288; fma.rn.f32 %f98, %f88, %f88, %f289; setp.eq.f32 %p58, %f98, 0f00000000; @%p58 bra $L__BB0_56; bra.uni $L__BB0_57; $L__BB0_56: mul.f32 %f293, %f106, %f106; fma.rn.f32 %f294, %f107, %f107, %f293; fma.rn.f32 %f295, %f105, %f105, %f294; setp.eq.f32 %p59, %f295, 0f00000000; mov.f32 %f466, 0f00000000; mov.f32 %f467, %f466; mov.f32 %f468, %f466; @%p59 bra $L__BB0_69; $L__BB0_57: mul.f32 %f296, %f75, %f75; fma.rn.f32 %f297, %f74, %f74, %f296; fma.rn.f32 %f111, %f76, %f76, %f297; setp.neu.f32 %p60, %f111, 0f00000000; mul.f32 %f298, %f93, %f93; fma.rn.f32 %f299, %f92, %f92, %f298; fma.rn.f32 %f115, %f94, %f94, %f299; setp.neu.f32 %p61, %f115, 
0f00000000; and.pred %p62, %p60, %p61; or.pred %p64, %p58, %p62; @%p64 bra $L__BB0_59; mul.f32 %f300, %f106, %f106; fma.rn.f32 %f301, %f107, %f107, %f300; fma.rn.f32 %f302, %f105, %f105, %f301; setp.neu.f32 %p65, %f302, 0f00000000; @%p65 bra $L__BB0_68; bra.uni $L__BB0_59; $L__BB0_68: sub.f32 %f335, %f86, %f107; mul.f32 %f468, %f335, 0f3F000000; sub.f32 %f336, %f87, %f106; mul.f32 %f467, %f336, 0f3F000000; sub.f32 %f337, %f88, %f105; mul.f32 %f466, %f337, 0f3F000000; bra.uni $L__BB0_69; $L__BB0_59: setp.eq.f32 %p66, %f111, 0f00000000; and.pred %p68, %p66, %p58; @%p68 bra $L__BB0_67; bra.uni $L__BB0_60; $L__BB0_67: sub.f32 %f468, %f1, %f107; sub.f32 %f467, %f2, %f106; sub.f32 %f466, %f3, %f105; bra.uni $L__BB0_69; $L__BB0_60: setp.eq.f32 %p69, %f115, 0f00000000; mul.f32 %f303, %f106, %f106; fma.rn.f32 %f304, %f107, %f107, %f303; fma.rn.f32 %f116, %f105, %f105, %f304; setp.eq.f32 %p70, %f116, 0f00000000; and.pred %p71, %p70, %p69; @%p71 bra $L__BB0_66; bra.uni $L__BB0_61; $L__BB0_66: sub.f32 %f468, %f86, %f1; sub.f32 %f467, %f87, %f2; sub.f32 %f466, %f88, %f3; bra.uni $L__BB0_69; $L__BB0_61: setp.neu.f32 %p73, %f98, 0f00000000; or.pred %p74, %p66, %p73; @%p74 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: setp.neu.f32 %p75, %f116, 0f00000000; or.pred %p77, %p69, %p75; @%p77 bra $L__BB0_65; bra.uni $L__BB0_64; $L__BB0_65: sub.f32 %f326, %f86, %f107; sub.f32 %f327, %f87, %f106; sub.f32 %f328, %f88, %f105; sub.f32 %f329, %f74, %f92; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f75, %f93; mul.f32 %f332, %f331, 0f3DAAAAAB; sub.f32 %f333, %f76, %f94; mul.f32 %f334, %f333, 0f3DAAAAAB; fma.rn.f32 %f468, %f326, 0f3F2AAAAB, %f330; fma.rn.f32 %f467, %f327, 0f3F2AAAAB, %f332; fma.rn.f32 %f466, %f328, 0f3F2AAAAB, %f334; bra.uni $L__BB0_69; $L__BB0_62: mul.f32 %f305, %f74, 0f3F000000; add.f32 %f306, %f107, %f107; sub.f32 %f307, %f305, %f306; add.f32 %f308, %f106, %f106; mul.f32 %f309, %f75, 0f3F000000; sub.f32 %f310, %f309, %f308; add.f32 %f311, %f105, %f105; mul.f32 %f312, 
%f76, 0f3F000000; sub.f32 %f313, %f312, %f311; fma.rn.f32 %f468, %f1, 0f3FC00000, %f307; fma.rn.f32 %f467, %f2, 0f3FC00000, %f310; fma.rn.f32 %f466, %f3, 0f3FC00000, %f313; bra.uni $L__BB0_69; $L__BB0_64: mul.f32 %f314, %f92, 0f3F000000; add.f32 %f315, %f86, %f86; sub.f32 %f316, %f315, %f314; add.f32 %f317, %f87, %f87; mul.f32 %f318, %f93, 0f3F000000; sub.f32 %f319, %f317, %f318; add.f32 %f320, %f88, %f88; mul.f32 %f321, %f94, 0f3F000000; sub.f32 %f322, %f320, %f321; mul.f32 %f323, %f1, 0f3FC00000; sub.f32 %f468, %f316, %f323; mul.f32 %f324, %f2, 0f3FC00000; sub.f32 %f467, %f319, %f324; mul.f32 %f325, %f3, 0f3FC00000; sub.f32 %f466, %f322, %f325; $L__BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p78, %rs3, 0; add.s32 %r40, %r3, -2; @%p78 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: max.s32 %r126, %r40, 0; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r126, %r102, %r59; $L__BB0_72: setp.lt.s32 %p80, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p81, %p80, %p78; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p81 bra $L__BB0_74; mad.lo.s32 %r103, %r126, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd49, %r104, 4; add.s64 %rd50, %rd3, %rd49; add.s64 %rd51, %rd2, %rd49; add.s64 %rd52, %rd1, %rd49; ld.global.nc.f32 %f143, [%rd52]; ld.global.nc.f32 %f142, [%rd51]; ld.global.nc.f32 %f141, [%rd50]; $L__BB0_74: add.s32 %r44, %r3, -1; @%p78 bra $L__BB0_76; bra.uni $L__BB0_75; $L__BB0_76: max.s32 %r127, %r44, 0; bra.uni $L__BB0_77; $L__BB0_75: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r127, %r106, %r59; $L__BB0_77: setp.lt.s32 %p83, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p85, %p83, %p78; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p85 bra $L__BB0_79; mad.lo.s32 %r107, %r127, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd53, %r108, 4; add.s64 %rd54, %rd3, %rd53; add.s64 %rd55, %rd2, %rd53; add.s64 %rd56, %rd1, %rd53; ld.global.nc.f32 %f172, [%rd56]; ld.global.nc.f32 
%f173, [%rd55]; ld.global.nc.f32 %f174, [%rd54]; $L__BB0_79: add.s32 %r48, %r3, 1; @%p78 bra $L__BB0_81; bra.uni $L__BB0_80; $L__BB0_81: add.s32 %r111, %r59, -1; min.s32 %r128, %r48, %r111; bra.uni $L__BB0_82; $L__BB0_80: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r128, %r110, %r59; $L__BB0_82: setp.ge.s32 %p87, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p89, %p87, %p78; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p89 bra $L__BB0_84; mad.lo.s32 %r112, %r128, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd57, %r113, 4; add.s64 %rd58, %rd3, %rd57; add.s64 %rd59, %rd2, %rd57; add.s64 %rd60, %rd1, %rd57; ld.global.nc.f32 %f155, [%rd60]; ld.global.nc.f32 %f154, [%rd59]; ld.global.nc.f32 %f153, [%rd58]; $L__BB0_84: add.s32 %r52, %r3, 2; @%p78 bra $L__BB0_86; bra.uni $L__BB0_85; $L__BB0_86: add.s32 %r116, %r59, -1; min.s32 %r129, %r52, %r116; bra.uni $L__BB0_87; $L__BB0_85: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r129, %r115, %r59; $L__BB0_87: mad.lo.s32 %r117, %r129, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p91, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p93, %p91, %p78; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p93 bra $L__BB0_89; mul.wide.s32 %rd61, %r56, 4; add.s64 %rd62, %rd3, %rd61; add.s64 %rd63, %rd2, %rd61; add.s64 %rd64, %rd1, %rd61; ld.global.nc.f32 %f161, [%rd64]; ld.global.nc.f32 %f160, [%rd63]; ld.global.nc.f32 %f159, [%rd62]; $L__BB0_89: mul.f32 %f350, %f154, %f154; fma.rn.f32 %f351, %f153, %f153, %f350; fma.rn.f32 %f165, %f155, %f155, %f351; setp.eq.f32 %p94, %f165, 0f00000000; @%p94 bra $L__BB0_90; bra.uni $L__BB0_91; $L__BB0_90: mul.f32 %f355, %f173, %f173; fma.rn.f32 %f356, %f174, %f174, %f355; fma.rn.f32 %f357, %f172, %f172, %f356; setp.eq.f32 %p95, %f357, 0f00000000; mov.f32 %f484, 0f00000000; mov.f32 %f485, %f484; mov.f32 %f486, %f484; @%p95 bra $L__BB0_103; $L__BB0_91: mul.f32 %f358, %f142, %f142; fma.rn.f32 %f359, %f141, %f141, %f358; fma.rn.f32 %f178, 
%f143, %f143, %f359; setp.neu.f32 %p96, %f178, 0f00000000; mul.f32 %f360, %f160, %f160; fma.rn.f32 %f361, %f159, %f159, %f360; fma.rn.f32 %f182, %f161, %f161, %f361; setp.neu.f32 %p97, %f182, 0f00000000; and.pred %p98, %p96, %p97; or.pred %p100, %p94, %p98; @%p100 bra $L__BB0_93; mul.f32 %f362, %f173, %f173; fma.rn.f32 %f363, %f174, %f174, %f362; fma.rn.f32 %f364, %f172, %f172, %f363; setp.neu.f32 %p101, %f364, 0f00000000; @%p101 bra $L__BB0_102; bra.uni $L__BB0_93; $L__BB0_102: sub.f32 %f397, %f153, %f174; mul.f32 %f486, %f397, 0f3F000000; sub.f32 %f398, %f154, %f173; mul.f32 %f485, %f398, 0f3F000000; sub.f32 %f399, %f155, %f172; mul.f32 %f484, %f399, 0f3F000000; bra.uni $L__BB0_103; $L__BB0_93: setp.eq.f32 %p102, %f178, 0f00000000; and.pred %p104, %p102, %p94; @%p104 bra $L__BB0_101; bra.uni $L__BB0_94; $L__BB0_101: sub.f32 %f486, %f1, %f174; sub.f32 %f485, %f2, %f173; sub.f32 %f484, %f3, %f172; bra.uni $L__BB0_103; $L__BB0_94: setp.eq.f32 %p105, %f182, 0f00000000; mul.f32 %f365, %f173, %f173; fma.rn.f32 %f366, %f174, %f174, %f365; fma.rn.f32 %f183, %f172, %f172, %f366; setp.eq.f32 %p106, %f183, 0f00000000; and.pred %p107, %p106, %p105; @%p107 bra $L__BB0_100; bra.uni $L__BB0_95; $L__BB0_100: sub.f32 %f486, %f153, %f1; sub.f32 %f485, %f154, %f2; sub.f32 %f484, %f155, %f3; bra.uni $L__BB0_103; $L__BB0_95: setp.neu.f32 %p109, %f165, 0f00000000; or.pred %p110, %p102, %p109; @%p110 bra $L__BB0_97; bra.uni $L__BB0_96; $L__BB0_97: setp.neu.f32 %p111, %f183, 0f00000000; or.pred %p113, %p105, %p111; @%p113 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: sub.f32 %f388, %f153, %f174; sub.f32 %f389, %f154, %f173; sub.f32 %f390, %f155, %f172; sub.f32 %f391, %f141, %f159; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f142, %f160; mul.f32 %f394, %f393, 0f3DAAAAAB; sub.f32 %f395, %f143, %f161; mul.f32 %f396, %f395, 0f3DAAAAAB; fma.rn.f32 %f486, %f388, 0f3F2AAAAB, %f392; fma.rn.f32 %f485, %f389, 0f3F2AAAAB, %f394; fma.rn.f32 %f484, %f390, 0f3F2AAAAB, %f396; bra.uni 
$L__BB0_103; $L__BB0_96: mul.f32 %f367, %f141, 0f3F000000; add.f32 %f368, %f174, %f174; sub.f32 %f369, %f367, %f368; add.f32 %f370, %f173, %f173; mul.f32 %f371, %f142, 0f3F000000; sub.f32 %f372, %f371, %f370; add.f32 %f373, %f172, %f172; mul.f32 %f374, %f143, 0f3F000000; sub.f32 %f375, %f374, %f373; fma.rn.f32 %f486, %f1, 0f3FC00000, %f369; fma.rn.f32 %f485, %f2, 0f3FC00000, %f372; fma.rn.f32 %f484, %f3, 0f3FC00000, %f375; bra.uni $L__BB0_103; $L__BB0_98: mul.f32 %f376, %f159, 0f3F000000; add.f32 %f377, %f153, %f153; sub.f32 %f378, %f377, %f376; add.f32 %f379, %f154, %f154; mul.f32 %f380, %f160, 0f3F000000; sub.f32 %f381, %f379, %f380; add.f32 %f382, %f155, %f155; mul.f32 %f383, %f161, 0f3F000000; sub.f32 %f384, %f382, %f383; mul.f32 %f385, %f1, 0f3FC00000; sub.f32 %f486, %f378, %f385; mul.f32 %f386, %f2, 0f3FC00000; sub.f32 %f485, %f381, %f386; mul.f32 %f387, %f3, 0f3FC00000; sub.f32 %f484, %f384, %f387; $L__BB0_103: setp.eq.s64 %p114, %rd8, 0; @%p114 bra $L__BB0_105; cvta.to.global.u64 %rd65, %rd8; shl.b64 %rd66, %rd4, 2; add.s64 %rd67, %rd65, %rd66; ld.global.nc.f32 %f400, [%rd67]; mul.f32 %f487, %f400, %f487; $L__BB0_105: setp.eq.s64 %p115, %rd9, 0; @%p115 bra $L__BB0_107; cvta.to.global.u64 %rd68, %rd9; shl.b64 %rd69, %rd4, 2; add.s64 %rd70, %rd68, %rd69; ld.global.nc.f32 %f401, [%rd70]; mul.f32 %f488, %f401, %f488; $L__BB0_107: mul.f32 %f402, %f450, %f211; mul.f32 %f403, %f467, %f212; mul.f32 %f404, %f484, %f213; add.f32 %f405, %f487, %f487; mul.f32 %f406, %f1, %f405; add.f32 %f407, %f403, %f404; mul.f32 %f408, %f1, %f407; mul.f32 %f409, %f468, %f212; fma.rn.f32 %f410, %f2, %f409, %f408; mul.f32 %f411, %f486, %f213; fma.rn.f32 %f412, %f3, %f411, %f410; mul.f32 %f413, %f412, %f488; fma.rn.f32 %f414, %f402, %f406, %f413; cvta.to.global.u64 %rd71, %rd5; shl.b64 %rd72, %rd4, 2; add.s64 %rd73, %rd71, %rd72; st.global.f32 [%rd73], %f414; mul.f32 %f415, %f2, %f405; add.f32 %f416, %f402, %f404; mul.f32 %f417, %f2, %f416; mul.f32 %f418, %f449, %f211; fma.rn.f32 %f419, 
%f1, %f418, %f417; mul.f32 %f420, %f485, %f213; fma.rn.f32 %f421, %f3, %f420, %f419; mul.f32 %f422, %f421, %f488; fma.rn.f32 %f423, %f403, %f415, %f422; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd75, %rd74, %rd72; st.global.f32 [%rd75], %f423; mul.f32 %f424, %f3, %f405; mul.f32 %f425, %f466, %f212; mul.f32 %f426, %f2, %f425; mul.f32 %f427, %f448, %f211; fma.rn.f32 %f428, %f1, %f427, %f426; add.f32 %f429, %f402, %f403; fma.rn.f32 %f430, %f3, %f429, %f428; mul.f32 %f431, %f430, %f488; fma.rn.f32 %f432, %f404, %f424, %f431; cvta.to.global.u64 %rd76, %rd7; add.s64 %rd77, %rd76, %rd72; st.global.f32 [%rd77], %f432; $L__BB0_108: ret; } ` getmagnetoelasticforce_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<116>; .reg .b16 %rs<5>; .reg .f32 %f<489>; .reg .b32 %r<130>; .reg .b64 %rd<78>; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd12, 
[getmagnetoelasticforce_param_5]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_6]; ld.param.f32 %f487, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_8]; ld.param.f32 %f488, [getmagnetoelasticforce_param_9]; ld.param.f32 %f211, [getmagnetoelasticforce_param_10]; ld.param.f32 %f212, [getmagnetoelasticforce_param_11]; ld.param.f32 %f213, [getmagnetoelasticforce_param_12]; ld.param.u32 %r57, [getmagnetoelasticforce_param_13]; ld.param.u32 %r58, [getmagnetoelasticforce_param_14]; ld.param.u32 %r59, [getmagnetoelasticforce_param_15]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_108; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; cvt.s64.s32 %rd4, %r70; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; add.s64 %rd15, %rd2, %rd13; ld.global.nc.f32 %f2, [%rd15]; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f3, [%rd16]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p6, %rs1, 0; add.s32 %r6, %r1, -2; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r118, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r118, %r72, %r57; $L__BB0_4: setp.lt.s32 %p8, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p9, %p8, %p6; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p9 bra $L__BB0_6; add.s32 %r73, %r118, %r5; mul.wide.s32 %rd17, %r73, 4; add.s64 %rd18, %rd3, %rd17; add.s64 %rd19, %rd2, %rd17; add.s64 %rd20, %rd1, %rd17; 
ld.global.nc.f32 %f9, [%rd20]; ld.global.nc.f32 %f8, [%rd19]; ld.global.nc.f32 %f7, [%rd18]; $L__BB0_6: add.s32 %r10, %r1, -1; @%p6 bra $L__BB0_8; bra.uni $L__BB0_7; $L__BB0_8: max.s32 %r119, %r10, 0; bra.uni $L__BB0_9; $L__BB0_7: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r119, %r75, %r57; $L__BB0_9: setp.lt.s32 %p11, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p13, %p11, %p6; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p13 bra $L__BB0_11; add.s32 %r76, %r119, %r5; mul.wide.s32 %rd21, %r76, 4; add.s64 %rd22, %rd3, %rd21; add.s64 %rd23, %rd2, %rd21; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f38, [%rd24]; ld.global.nc.f32 %f39, [%rd23]; ld.global.nc.f32 %f40, [%rd22]; $L__BB0_11: add.s32 %r14, %r1, 1; @%p6 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r79, %r57, -1; min.s32 %r120, %r14, %r79; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r120, %r78, %r57; $L__BB0_14: setp.ge.s32 %p15, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p17, %p15, %p6; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p17 bra $L__BB0_16; add.s32 %r80, %r120, %r5; mul.wide.s32 %rd25, %r80, 4; add.s64 %rd26, %rd3, %rd25; add.s64 %rd27, %rd2, %rd25; add.s64 %rd28, %rd1, %rd25; ld.global.nc.f32 %f21, [%rd28]; ld.global.nc.f32 %f20, [%rd27]; ld.global.nc.f32 %f19, [%rd26]; $L__BB0_16: add.s32 %r18, %r1, 2; @%p6 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r83, %r57, -1; min.s32 %r121, %r18, %r83; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r121, %r82, %r57; $L__BB0_19: add.s32 %r22, %r121, %r5; setp.ge.s32 %p19, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p21, %p19, %p6; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p21 bra $L__BB0_21; mul.wide.s32 %rd29, %r22, 4; add.s64 %rd30, %rd3, %rd29; add.s64 %rd31, %rd2, %rd29; add.s64 %rd32, %rd1, %rd29; ld.global.nc.f32 %f27, [%rd32]; ld.global.nc.f32 %f26, [%rd31]; ld.global.nc.f32 %f25, [%rd30]; $L__BB0_21: mul.f32 
%f226, %f20, %f20; fma.rn.f32 %f227, %f19, %f19, %f226; fma.rn.f32 %f31, %f21, %f21, %f227; setp.eq.f32 %p22, %f31, 0f00000000; @%p22 bra $L__BB0_22; bra.uni $L__BB0_23; $L__BB0_22: mul.f32 %f231, %f39, %f39; fma.rn.f32 %f232, %f40, %f40, %f231; fma.rn.f32 %f233, %f38, %f38, %f232; setp.eq.f32 %p23, %f233, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f449, %f448; mov.f32 %f450, %f448; @%p23 bra $L__BB0_35; $L__BB0_23: mul.f32 %f234, %f8, %f8; fma.rn.f32 %f235, %f7, %f7, %f234; fma.rn.f32 %f44, %f9, %f9, %f235; setp.neu.f32 %p24, %f44, 0f00000000; mul.f32 %f236, %f26, %f26; fma.rn.f32 %f237, %f25, %f25, %f236; fma.rn.f32 %f48, %f27, %f27, %f237; setp.neu.f32 %p25, %f48, 0f00000000; and.pred %p26, %p24, %p25; or.pred %p28, %p22, %p26; @%p28 bra $L__BB0_25; mul.f32 %f238, %f39, %f39; fma.rn.f32 %f239, %f40, %f40, %f238; fma.rn.f32 %f240, %f38, %f38, %f239; setp.neu.f32 %p29, %f240, 0f00000000; @%p29 bra $L__BB0_34; bra.uni $L__BB0_25; $L__BB0_34: sub.f32 %f273, %f19, %f40; mul.f32 %f450, %f273, 0f3F000000; sub.f32 %f274, %f20, %f39; mul.f32 %f449, %f274, 0f3F000000; sub.f32 %f275, %f21, %f38; mul.f32 %f448, %f275, 0f3F000000; bra.uni $L__BB0_35; $L__BB0_25: setp.eq.f32 %p30, %f44, 0f00000000; and.pred %p32, %p30, %p22; @%p32 bra $L__BB0_33; bra.uni $L__BB0_26; $L__BB0_33: sub.f32 %f450, %f1, %f40; sub.f32 %f449, %f2, %f39; sub.f32 %f448, %f3, %f38; bra.uni $L__BB0_35; $L__BB0_26: setp.eq.f32 %p33, %f48, 0f00000000; mul.f32 %f241, %f39, %f39; fma.rn.f32 %f242, %f40, %f40, %f241; fma.rn.f32 %f49, %f38, %f38, %f242; setp.eq.f32 %p34, %f49, 0f00000000; and.pred %p35, %p34, %p33; @%p35 bra $L__BB0_32; bra.uni $L__BB0_27; $L__BB0_32: sub.f32 %f450, %f19, %f1; sub.f32 %f449, %f20, %f2; sub.f32 %f448, %f21, %f3; bra.uni $L__BB0_35; $L__BB0_27: setp.neu.f32 %p37, %f31, 0f00000000; or.pred %p38, %p30, %p37; @%p38 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: setp.neu.f32 %p39, %f49, 0f00000000; or.pred %p41, %p33, %p39; @%p41 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: 
sub.f32 %f264, %f19, %f40; sub.f32 %f265, %f20, %f39; sub.f32 %f266, %f21, %f38; sub.f32 %f267, %f7, %f25; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f8, %f26; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f9, %f27; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f450, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f449, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f448, %f266, 0f3F2AAAAB, %f272; bra.uni $L__BB0_35; $L__BB0_28: mul.f32 %f243, %f7, 0f3F000000; add.f32 %f244, %f40, %f40; sub.f32 %f245, %f243, %f244; add.f32 %f246, %f39, %f39; mul.f32 %f247, %f8, 0f3F000000; sub.f32 %f248, %f247, %f246; add.f32 %f249, %f38, %f38; mul.f32 %f250, %f9, 0f3F000000; sub.f32 %f251, %f250, %f249; fma.rn.f32 %f450, %f1, 0f3FC00000, %f245; fma.rn.f32 %f449, %f2, 0f3FC00000, %f248; fma.rn.f32 %f448, %f3, 0f3FC00000, %f251; bra.uni $L__BB0_35; $L__BB0_30: mul.f32 %f252, %f25, 0f3F000000; add.f32 %f253, %f19, %f19; sub.f32 %f254, %f253, %f252; add.f32 %f255, %f20, %f20; mul.f32 %f256, %f26, 0f3F000000; sub.f32 %f257, %f255, %f256; add.f32 %f258, %f21, %f21; mul.f32 %f259, %f27, 0f3F000000; sub.f32 %f260, %f258, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f450, %f254, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f449, %f257, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f448, %f260, %f263; $L__BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p42, %rs2, 0; add.s32 %r23, %r2, -2; @%p42 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: max.s32 %r122, %r23, 0; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r122, %r85, %r58; $L__BB0_38: setp.lt.s32 %p44, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p45, %p44, %p42; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p45 bra $L__BB0_40; add.s32 %r86, %r122, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd33, %r87, 4; add.s64 %rd34, %rd3, %rd33; add.s64 %rd35, %rd2, %rd33; add.s64 %rd36, %rd1, %rd33; ld.global.nc.f32 %f76, [%rd36]; ld.global.nc.f32 %f75, [%rd35]; ld.global.nc.f32 %f74, [%rd34]; 
$L__BB0_40: add.s32 %r27, %r2, -1; @%p42 bra $L__BB0_42; bra.uni $L__BB0_41; $L__BB0_42: max.s32 %r123, %r27, 0; bra.uni $L__BB0_43; $L__BB0_41: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r123, %r89, %r58; $L__BB0_43: setp.lt.s32 %p47, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p49, %p47, %p42; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p49 bra $L__BB0_45; add.s32 %r90, %r123, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd37, %r91, 4; add.s64 %rd38, %rd3, %rd37; add.s64 %rd39, %rd2, %rd37; add.s64 %rd40, %rd1, %rd37; ld.global.nc.f32 %f105, [%rd40]; ld.global.nc.f32 %f106, [%rd39]; ld.global.nc.f32 %f107, [%rd38]; $L__BB0_45: add.s32 %r31, %r2, 1; @%p42 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: add.s32 %r94, %r58, -1; min.s32 %r124, %r31, %r94; bra.uni $L__BB0_48; $L__BB0_46: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r124, %r93, %r58; $L__BB0_48: setp.ge.s32 %p51, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p53, %p51, %p42; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p53 bra $L__BB0_50; add.s32 %r95, %r124, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd41, %r96, 4; add.s64 %rd42, %rd3, %rd41; add.s64 %rd43, %rd2, %rd41; add.s64 %rd44, %rd1, %rd41; ld.global.nc.f32 %f88, [%rd44]; ld.global.nc.f32 %f87, [%rd43]; ld.global.nc.f32 %f86, [%rd42]; $L__BB0_50: add.s32 %r35, %r2, 2; @%p42 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r99, %r58, -1; min.s32 %r125, %r35, %r99; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r125, %r98, %r58; $L__BB0_53: add.s32 %r39, %r125, %r4; setp.ge.s32 %p55, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p57, %p55, %p42; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p57 bra $L__BB0_55; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd45, %r100, 4; add.s64 %rd46, %rd3, %rd45; add.s64 %rd47, %rd2, %rd45; add.s64 %rd48, %rd1, %rd45; ld.global.nc.f32 %f94, [%rd48]; ld.global.nc.f32 %f93, [%rd47]; ld.global.nc.f32 %f92, 
[%rd46]; $L__BB0_55: mul.f32 %f288, %f87, %f87; fma.rn.f32 %f289, %f86, %f86, %f288; fma.rn.f32 %f98, %f88, %f88, %f289; setp.eq.f32 %p58, %f98, 0f00000000; @%p58 bra $L__BB0_56; bra.uni $L__BB0_57; $L__BB0_56: mul.f32 %f293, %f106, %f106; fma.rn.f32 %f294, %f107, %f107, %f293; fma.rn.f32 %f295, %f105, %f105, %f294; setp.eq.f32 %p59, %f295, 0f00000000; mov.f32 %f466, 0f00000000; mov.f32 %f467, %f466; mov.f32 %f468, %f466; @%p59 bra $L__BB0_69; $L__BB0_57: mul.f32 %f296, %f75, %f75; fma.rn.f32 %f297, %f74, %f74, %f296; fma.rn.f32 %f111, %f76, %f76, %f297; setp.neu.f32 %p60, %f111, 0f00000000; mul.f32 %f298, %f93, %f93; fma.rn.f32 %f299, %f92, %f92, %f298; fma.rn.f32 %f115, %f94, %f94, %f299; setp.neu.f32 %p61, %f115, 0f00000000; and.pred %p62, %p60, %p61; or.pred %p64, %p58, %p62; @%p64 bra $L__BB0_59; mul.f32 %f300, %f106, %f106; fma.rn.f32 %f301, %f107, %f107, %f300; fma.rn.f32 %f302, %f105, %f105, %f301; setp.neu.f32 %p65, %f302, 0f00000000; @%p65 bra $L__BB0_68; bra.uni $L__BB0_59; $L__BB0_68: sub.f32 %f335, %f86, %f107; mul.f32 %f468, %f335, 0f3F000000; sub.f32 %f336, %f87, %f106; mul.f32 %f467, %f336, 0f3F000000; sub.f32 %f337, %f88, %f105; mul.f32 %f466, %f337, 0f3F000000; bra.uni $L__BB0_69; $L__BB0_59: setp.eq.f32 %p66, %f111, 0f00000000; and.pred %p68, %p66, %p58; @%p68 bra $L__BB0_67; bra.uni $L__BB0_60; $L__BB0_67: sub.f32 %f468, %f1, %f107; sub.f32 %f467, %f2, %f106; sub.f32 %f466, %f3, %f105; bra.uni $L__BB0_69; $L__BB0_60: setp.eq.f32 %p69, %f115, 0f00000000; mul.f32 %f303, %f106, %f106; fma.rn.f32 %f304, %f107, %f107, %f303; fma.rn.f32 %f116, %f105, %f105, %f304; setp.eq.f32 %p70, %f116, 0f00000000; and.pred %p71, %p70, %p69; @%p71 bra $L__BB0_66; bra.uni $L__BB0_61; $L__BB0_66: sub.f32 %f468, %f86, %f1; sub.f32 %f467, %f87, %f2; sub.f32 %f466, %f88, %f3; bra.uni $L__BB0_69; $L__BB0_61: setp.neu.f32 %p73, %f98, 0f00000000; or.pred %p74, %p66, %p73; @%p74 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: setp.neu.f32 %p75, %f116, 0f00000000; or.pred 
%p77, %p69, %p75; @%p77 bra $L__BB0_65; bra.uni $L__BB0_64; $L__BB0_65: sub.f32 %f326, %f86, %f107; sub.f32 %f327, %f87, %f106; sub.f32 %f328, %f88, %f105; sub.f32 %f329, %f74, %f92; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f75, %f93; mul.f32 %f332, %f331, 0f3DAAAAAB; sub.f32 %f333, %f76, %f94; mul.f32 %f334, %f333, 0f3DAAAAAB; fma.rn.f32 %f468, %f326, 0f3F2AAAAB, %f330; fma.rn.f32 %f467, %f327, 0f3F2AAAAB, %f332; fma.rn.f32 %f466, %f328, 0f3F2AAAAB, %f334; bra.uni $L__BB0_69; $L__BB0_62: mul.f32 %f305, %f74, 0f3F000000; add.f32 %f306, %f107, %f107; sub.f32 %f307, %f305, %f306; add.f32 %f308, %f106, %f106; mul.f32 %f309, %f75, 0f3F000000; sub.f32 %f310, %f309, %f308; add.f32 %f311, %f105, %f105; mul.f32 %f312, %f76, 0f3F000000; sub.f32 %f313, %f312, %f311; fma.rn.f32 %f468, %f1, 0f3FC00000, %f307; fma.rn.f32 %f467, %f2, 0f3FC00000, %f310; fma.rn.f32 %f466, %f3, 0f3FC00000, %f313; bra.uni $L__BB0_69; $L__BB0_64: mul.f32 %f314, %f92, 0f3F000000; add.f32 %f315, %f86, %f86; sub.f32 %f316, %f315, %f314; add.f32 %f317, %f87, %f87; mul.f32 %f318, %f93, 0f3F000000; sub.f32 %f319, %f317, %f318; add.f32 %f320, %f88, %f88; mul.f32 %f321, %f94, 0f3F000000; sub.f32 %f322, %f320, %f321; mul.f32 %f323, %f1, 0f3FC00000; sub.f32 %f468, %f316, %f323; mul.f32 %f324, %f2, 0f3FC00000; sub.f32 %f467, %f319, %f324; mul.f32 %f325, %f3, 0f3FC00000; sub.f32 %f466, %f322, %f325; $L__BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p78, %rs3, 0; add.s32 %r40, %r3, -2; @%p78 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: max.s32 %r126, %r40, 0; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r126, %r102, %r59; $L__BB0_72: setp.lt.s32 %p80, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p81, %p80, %p78; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p81 bra $L__BB0_74; mad.lo.s32 %r103, %r126, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd49, %r104, 4; add.s64 %rd50, %rd3, %rd49; add.s64 %rd51, %rd2, %rd49; add.s64 %rd52, %rd1, 
%rd49; ld.global.nc.f32 %f143, [%rd52]; ld.global.nc.f32 %f142, [%rd51]; ld.global.nc.f32 %f141, [%rd50]; $L__BB0_74: add.s32 %r44, %r3, -1; @%p78 bra $L__BB0_76; bra.uni $L__BB0_75; $L__BB0_76: max.s32 %r127, %r44, 0; bra.uni $L__BB0_77; $L__BB0_75: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r127, %r106, %r59; $L__BB0_77: setp.lt.s32 %p83, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p85, %p83, %p78; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p85 bra $L__BB0_79; mad.lo.s32 %r107, %r127, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd53, %r108, 4; add.s64 %rd54, %rd3, %rd53; add.s64 %rd55, %rd2, %rd53; add.s64 %rd56, %rd1, %rd53; ld.global.nc.f32 %f172, [%rd56]; ld.global.nc.f32 %f173, [%rd55]; ld.global.nc.f32 %f174, [%rd54]; $L__BB0_79: add.s32 %r48, %r3, 1; @%p78 bra $L__BB0_81; bra.uni $L__BB0_80; $L__BB0_81: add.s32 %r111, %r59, -1; min.s32 %r128, %r48, %r111; bra.uni $L__BB0_82; $L__BB0_80: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r128, %r110, %r59; $L__BB0_82: setp.ge.s32 %p87, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p89, %p87, %p78; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p89 bra $L__BB0_84; mad.lo.s32 %r112, %r128, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd57, %r113, 4; add.s64 %rd58, %rd3, %rd57; add.s64 %rd59, %rd2, %rd57; add.s64 %rd60, %rd1, %rd57; ld.global.nc.f32 %f155, [%rd60]; ld.global.nc.f32 %f154, [%rd59]; ld.global.nc.f32 %f153, [%rd58]; $L__BB0_84: add.s32 %r52, %r3, 2; @%p78 bra $L__BB0_86; bra.uni $L__BB0_85; $L__BB0_86: add.s32 %r116, %r59, -1; min.s32 %r129, %r52, %r116; bra.uni $L__BB0_87; $L__BB0_85: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r129, %r115, %r59; $L__BB0_87: mad.lo.s32 %r117, %r129, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p91, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p93, %p91, %p78; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p93 bra $L__BB0_89; mul.wide.s32 %rd61, %r56, 4; 
add.s64 %rd62, %rd3, %rd61; add.s64 %rd63, %rd2, %rd61; add.s64 %rd64, %rd1, %rd61; ld.global.nc.f32 %f161, [%rd64]; ld.global.nc.f32 %f160, [%rd63]; ld.global.nc.f32 %f159, [%rd62]; $L__BB0_89: mul.f32 %f350, %f154, %f154; fma.rn.f32 %f351, %f153, %f153, %f350; fma.rn.f32 %f165, %f155, %f155, %f351; setp.eq.f32 %p94, %f165, 0f00000000; @%p94 bra $L__BB0_90; bra.uni $L__BB0_91; $L__BB0_90: mul.f32 %f355, %f173, %f173; fma.rn.f32 %f356, %f174, %f174, %f355; fma.rn.f32 %f357, %f172, %f172, %f356; setp.eq.f32 %p95, %f357, 0f00000000; mov.f32 %f484, 0f00000000; mov.f32 %f485, %f484; mov.f32 %f486, %f484; @%p95 bra $L__BB0_103; $L__BB0_91: mul.f32 %f358, %f142, %f142; fma.rn.f32 %f359, %f141, %f141, %f358; fma.rn.f32 %f178, %f143, %f143, %f359; setp.neu.f32 %p96, %f178, 0f00000000; mul.f32 %f360, %f160, %f160; fma.rn.f32 %f361, %f159, %f159, %f360; fma.rn.f32 %f182, %f161, %f161, %f361; setp.neu.f32 %p97, %f182, 0f00000000; and.pred %p98, %p96, %p97; or.pred %p100, %p94, %p98; @%p100 bra $L__BB0_93; mul.f32 %f362, %f173, %f173; fma.rn.f32 %f363, %f174, %f174, %f362; fma.rn.f32 %f364, %f172, %f172, %f363; setp.neu.f32 %p101, %f364, 0f00000000; @%p101 bra $L__BB0_102; bra.uni $L__BB0_93; $L__BB0_102: sub.f32 %f397, %f153, %f174; mul.f32 %f486, %f397, 0f3F000000; sub.f32 %f398, %f154, %f173; mul.f32 %f485, %f398, 0f3F000000; sub.f32 %f399, %f155, %f172; mul.f32 %f484, %f399, 0f3F000000; bra.uni $L__BB0_103; $L__BB0_93: setp.eq.f32 %p102, %f178, 0f00000000; and.pred %p104, %p102, %p94; @%p104 bra $L__BB0_101; bra.uni $L__BB0_94; $L__BB0_101: sub.f32 %f486, %f1, %f174; sub.f32 %f485, %f2, %f173; sub.f32 %f484, %f3, %f172; bra.uni $L__BB0_103; $L__BB0_94: setp.eq.f32 %p105, %f182, 0f00000000; mul.f32 %f365, %f173, %f173; fma.rn.f32 %f366, %f174, %f174, %f365; fma.rn.f32 %f183, %f172, %f172, %f366; setp.eq.f32 %p106, %f183, 0f00000000; and.pred %p107, %p106, %p105; @%p107 bra $L__BB0_100; bra.uni $L__BB0_95; $L__BB0_100: sub.f32 %f486, %f153, %f1; sub.f32 %f485, %f154, %f2; 
sub.f32 %f484, %f155, %f3; bra.uni $L__BB0_103; $L__BB0_95: setp.neu.f32 %p109, %f165, 0f00000000; or.pred %p110, %p102, %p109; @%p110 bra $L__BB0_97; bra.uni $L__BB0_96; $L__BB0_97: setp.neu.f32 %p111, %f183, 0f00000000; or.pred %p113, %p105, %p111; @%p113 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: sub.f32 %f388, %f153, %f174; sub.f32 %f389, %f154, %f173; sub.f32 %f390, %f155, %f172; sub.f32 %f391, %f141, %f159; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f142, %f160; mul.f32 %f394, %f393, 0f3DAAAAAB; sub.f32 %f395, %f143, %f161; mul.f32 %f396, %f395, 0f3DAAAAAB; fma.rn.f32 %f486, %f388, 0f3F2AAAAB, %f392; fma.rn.f32 %f485, %f389, 0f3F2AAAAB, %f394; fma.rn.f32 %f484, %f390, 0f3F2AAAAB, %f396; bra.uni $L__BB0_103; $L__BB0_96: mul.f32 %f367, %f141, 0f3F000000; add.f32 %f368, %f174, %f174; sub.f32 %f369, %f367, %f368; add.f32 %f370, %f173, %f173; mul.f32 %f371, %f142, 0f3F000000; sub.f32 %f372, %f371, %f370; add.f32 %f373, %f172, %f172; mul.f32 %f374, %f143, 0f3F000000; sub.f32 %f375, %f374, %f373; fma.rn.f32 %f486, %f1, 0f3FC00000, %f369; fma.rn.f32 %f485, %f2, 0f3FC00000, %f372; fma.rn.f32 %f484, %f3, 0f3FC00000, %f375; bra.uni $L__BB0_103; $L__BB0_98: mul.f32 %f376, %f159, 0f3F000000; add.f32 %f377, %f153, %f153; sub.f32 %f378, %f377, %f376; add.f32 %f379, %f154, %f154; mul.f32 %f380, %f160, 0f3F000000; sub.f32 %f381, %f379, %f380; add.f32 %f382, %f155, %f155; mul.f32 %f383, %f161, 0f3F000000; sub.f32 %f384, %f382, %f383; mul.f32 %f385, %f1, 0f3FC00000; sub.f32 %f486, %f378, %f385; mul.f32 %f386, %f2, 0f3FC00000; sub.f32 %f485, %f381, %f386; mul.f32 %f387, %f3, 0f3FC00000; sub.f32 %f484, %f384, %f387; $L__BB0_103: setp.eq.s64 %p114, %rd8, 0; @%p114 bra $L__BB0_105; cvta.to.global.u64 %rd65, %rd8; shl.b64 %rd66, %rd4, 2; add.s64 %rd67, %rd65, %rd66; ld.global.nc.f32 %f400, [%rd67]; mul.f32 %f487, %f400, %f487; $L__BB0_105: setp.eq.s64 %p115, %rd9, 0; @%p115 bra $L__BB0_107; cvta.to.global.u64 %rd68, %rd9; shl.b64 %rd69, %rd4, 2; add.s64 %rd70, %rd68, 
%rd69; ld.global.nc.f32 %f401, [%rd70]; mul.f32 %f488, %f401, %f488; $L__BB0_107: mul.f32 %f402, %f450, %f211; mul.f32 %f403, %f467, %f212; mul.f32 %f404, %f484, %f213; add.f32 %f405, %f487, %f487; mul.f32 %f406, %f1, %f405; add.f32 %f407, %f403, %f404; mul.f32 %f408, %f1, %f407; mul.f32 %f409, %f468, %f212; fma.rn.f32 %f410, %f2, %f409, %f408; mul.f32 %f411, %f486, %f213; fma.rn.f32 %f412, %f3, %f411, %f410; mul.f32 %f413, %f412, %f488; fma.rn.f32 %f414, %f402, %f406, %f413; cvta.to.global.u64 %rd71, %rd5; shl.b64 %rd72, %rd4, 2; add.s64 %rd73, %rd71, %rd72; st.global.f32 [%rd73], %f414; mul.f32 %f415, %f2, %f405; add.f32 %f416, %f402, %f404; mul.f32 %f417, %f2, %f416; mul.f32 %f418, %f449, %f211; fma.rn.f32 %f419, %f1, %f418, %f417; mul.f32 %f420, %f485, %f213; fma.rn.f32 %f421, %f3, %f420, %f419; mul.f32 %f422, %f421, %f488; fma.rn.f32 %f423, %f403, %f415, %f422; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd75, %rd74, %rd72; st.global.f32 [%rd75], %f423; mul.f32 %f424, %f3, %f405; mul.f32 %f425, %f466, %f212; mul.f32 %f426, %f2, %f425; mul.f32 %f427, %f448, %f211; fma.rn.f32 %f428, %f1, %f427, %f426; add.f32 %f429, %f402, %f403; fma.rn.f32 %f430, %f3, %f429, %f428; mul.f32 %f431, %f430, %f488; fma.rn.f32 %f432, %f404, %f424, %f431; cvta.to.global.u64 %rd76, %rd7; add.s64 %rd77, %rd76, %rd72; st.global.f32 [%rd77], %f432; $L__BB0_108: ret; } ` getmagnetoelasticforce_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 
getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<116>; .reg .b16 %rs<5>; .reg .f32 %f<489>; .reg .b32 %r<130>; .reg .b64 %rd<78>; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd12, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_6]; ld.param.f32 %f487, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_8]; ld.param.f32 %f488, [getmagnetoelasticforce_param_9]; ld.param.f32 %f211, [getmagnetoelasticforce_param_10]; ld.param.f32 %f212, [getmagnetoelasticforce_param_11]; ld.param.f32 %f213, [getmagnetoelasticforce_param_12]; ld.param.u32 %r57, [getmagnetoelasticforce_param_13]; ld.param.u32 %r58, [getmagnetoelasticforce_param_14]; ld.param.u32 %r59, [getmagnetoelasticforce_param_15]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_108; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; cvt.s64.s32 %rd4, %r70; 
mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; add.s64 %rd15, %rd2, %rd13; ld.global.nc.f32 %f2, [%rd15]; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f3, [%rd16]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p6, %rs1, 0; add.s32 %r6, %r1, -2; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r118, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r118, %r72, %r57; $L__BB0_4: setp.lt.s32 %p8, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p9, %p8, %p6; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p9 bra $L__BB0_6; add.s32 %r73, %r118, %r5; mul.wide.s32 %rd17, %r73, 4; add.s64 %rd18, %rd3, %rd17; add.s64 %rd19, %rd2, %rd17; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f9, [%rd20]; ld.global.nc.f32 %f8, [%rd19]; ld.global.nc.f32 %f7, [%rd18]; $L__BB0_6: add.s32 %r10, %r1, -1; @%p6 bra $L__BB0_8; bra.uni $L__BB0_7; $L__BB0_8: max.s32 %r119, %r10, 0; bra.uni $L__BB0_9; $L__BB0_7: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r119, %r75, %r57; $L__BB0_9: setp.lt.s32 %p11, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p13, %p11, %p6; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p13 bra $L__BB0_11; add.s32 %r76, %r119, %r5; mul.wide.s32 %rd21, %r76, 4; add.s64 %rd22, %rd3, %rd21; add.s64 %rd23, %rd2, %rd21; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f38, [%rd24]; ld.global.nc.f32 %f39, [%rd23]; ld.global.nc.f32 %f40, [%rd22]; $L__BB0_11: add.s32 %r14, %r1, 1; @%p6 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r79, %r57, -1; min.s32 %r120, %r14, %r79; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r120, %r78, %r57; $L__BB0_14: setp.ge.s32 %p15, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p17, %p15, %p6; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p17 bra $L__BB0_16; add.s32 %r80, %r120, %r5; mul.wide.s32 %rd25, %r80, 4; add.s64 %rd26, %rd3, %rd25; add.s64 %rd27, %rd2, %rd25; add.s64 %rd28, %rd1, %rd25; ld.global.nc.f32 
%f21, [%rd28]; ld.global.nc.f32 %f20, [%rd27]; ld.global.nc.f32 %f19, [%rd26]; $L__BB0_16: add.s32 %r18, %r1, 2; @%p6 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r83, %r57, -1; min.s32 %r121, %r18, %r83; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r121, %r82, %r57; $L__BB0_19: add.s32 %r22, %r121, %r5; setp.ge.s32 %p19, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p21, %p19, %p6; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p21 bra $L__BB0_21; mul.wide.s32 %rd29, %r22, 4; add.s64 %rd30, %rd3, %rd29; add.s64 %rd31, %rd2, %rd29; add.s64 %rd32, %rd1, %rd29; ld.global.nc.f32 %f27, [%rd32]; ld.global.nc.f32 %f26, [%rd31]; ld.global.nc.f32 %f25, [%rd30]; $L__BB0_21: mul.f32 %f226, %f20, %f20; fma.rn.f32 %f227, %f19, %f19, %f226; fma.rn.f32 %f31, %f21, %f21, %f227; setp.eq.f32 %p22, %f31, 0f00000000; @%p22 bra $L__BB0_22; bra.uni $L__BB0_23; $L__BB0_22: mul.f32 %f231, %f39, %f39; fma.rn.f32 %f232, %f40, %f40, %f231; fma.rn.f32 %f233, %f38, %f38, %f232; setp.eq.f32 %p23, %f233, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f449, %f448; mov.f32 %f450, %f448; @%p23 bra $L__BB0_35; $L__BB0_23: mul.f32 %f234, %f8, %f8; fma.rn.f32 %f235, %f7, %f7, %f234; fma.rn.f32 %f44, %f9, %f9, %f235; setp.neu.f32 %p24, %f44, 0f00000000; mul.f32 %f236, %f26, %f26; fma.rn.f32 %f237, %f25, %f25, %f236; fma.rn.f32 %f48, %f27, %f27, %f237; setp.neu.f32 %p25, %f48, 0f00000000; and.pred %p26, %p24, %p25; or.pred %p28, %p22, %p26; @%p28 bra $L__BB0_25; mul.f32 %f238, %f39, %f39; fma.rn.f32 %f239, %f40, %f40, %f238; fma.rn.f32 %f240, %f38, %f38, %f239; setp.neu.f32 %p29, %f240, 0f00000000; @%p29 bra $L__BB0_34; bra.uni $L__BB0_25; $L__BB0_34: sub.f32 %f273, %f19, %f40; mul.f32 %f450, %f273, 0f3F000000; sub.f32 %f274, %f20, %f39; mul.f32 %f449, %f274, 0f3F000000; sub.f32 %f275, %f21, %f38; mul.f32 %f448, %f275, 0f3F000000; bra.uni $L__BB0_35; $L__BB0_25: setp.eq.f32 %p30, %f44, 0f00000000; and.pred %p32, %p30, %p22; @%p32 bra $L__BB0_33; 
bra.uni $L__BB0_26; $L__BB0_33: sub.f32 %f450, %f1, %f40; sub.f32 %f449, %f2, %f39; sub.f32 %f448, %f3, %f38; bra.uni $L__BB0_35; $L__BB0_26: setp.eq.f32 %p33, %f48, 0f00000000; mul.f32 %f241, %f39, %f39; fma.rn.f32 %f242, %f40, %f40, %f241; fma.rn.f32 %f49, %f38, %f38, %f242; setp.eq.f32 %p34, %f49, 0f00000000; and.pred %p35, %p34, %p33; @%p35 bra $L__BB0_32; bra.uni $L__BB0_27; $L__BB0_32: sub.f32 %f450, %f19, %f1; sub.f32 %f449, %f20, %f2; sub.f32 %f448, %f21, %f3; bra.uni $L__BB0_35; $L__BB0_27: setp.neu.f32 %p37, %f31, 0f00000000; or.pred %p38, %p30, %p37; @%p38 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: setp.neu.f32 %p39, %f49, 0f00000000; or.pred %p41, %p33, %p39; @%p41 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f264, %f19, %f40; sub.f32 %f265, %f20, %f39; sub.f32 %f266, %f21, %f38; sub.f32 %f267, %f7, %f25; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f8, %f26; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f9, %f27; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f450, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f449, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f448, %f266, 0f3F2AAAAB, %f272; bra.uni $L__BB0_35; $L__BB0_28: mul.f32 %f243, %f7, 0f3F000000; add.f32 %f244, %f40, %f40; sub.f32 %f245, %f243, %f244; add.f32 %f246, %f39, %f39; mul.f32 %f247, %f8, 0f3F000000; sub.f32 %f248, %f247, %f246; add.f32 %f249, %f38, %f38; mul.f32 %f250, %f9, 0f3F000000; sub.f32 %f251, %f250, %f249; fma.rn.f32 %f450, %f1, 0f3FC00000, %f245; fma.rn.f32 %f449, %f2, 0f3FC00000, %f248; fma.rn.f32 %f448, %f3, 0f3FC00000, %f251; bra.uni $L__BB0_35; $L__BB0_30: mul.f32 %f252, %f25, 0f3F000000; add.f32 %f253, %f19, %f19; sub.f32 %f254, %f253, %f252; add.f32 %f255, %f20, %f20; mul.f32 %f256, %f26, 0f3F000000; sub.f32 %f257, %f255, %f256; add.f32 %f258, %f21, %f21; mul.f32 %f259, %f27, 0f3F000000; sub.f32 %f260, %f258, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f450, %f254, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f449, %f257, %f262; mul.f32 %f263, %f3, 
0f3FC00000; sub.f32 %f448, %f260, %f263; $L__BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p42, %rs2, 0; add.s32 %r23, %r2, -2; @%p42 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: max.s32 %r122, %r23, 0; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r122, %r85, %r58; $L__BB0_38: setp.lt.s32 %p44, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p45, %p44, %p42; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p45 bra $L__BB0_40; add.s32 %r86, %r122, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd33, %r87, 4; add.s64 %rd34, %rd3, %rd33; add.s64 %rd35, %rd2, %rd33; add.s64 %rd36, %rd1, %rd33; ld.global.nc.f32 %f76, [%rd36]; ld.global.nc.f32 %f75, [%rd35]; ld.global.nc.f32 %f74, [%rd34]; $L__BB0_40: add.s32 %r27, %r2, -1; @%p42 bra $L__BB0_42; bra.uni $L__BB0_41; $L__BB0_42: max.s32 %r123, %r27, 0; bra.uni $L__BB0_43; $L__BB0_41: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r123, %r89, %r58; $L__BB0_43: setp.lt.s32 %p47, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p49, %p47, %p42; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p49 bra $L__BB0_45; add.s32 %r90, %r123, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd37, %r91, 4; add.s64 %rd38, %rd3, %rd37; add.s64 %rd39, %rd2, %rd37; add.s64 %rd40, %rd1, %rd37; ld.global.nc.f32 %f105, [%rd40]; ld.global.nc.f32 %f106, [%rd39]; ld.global.nc.f32 %f107, [%rd38]; $L__BB0_45: add.s32 %r31, %r2, 1; @%p42 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: add.s32 %r94, %r58, -1; min.s32 %r124, %r31, %r94; bra.uni $L__BB0_48; $L__BB0_46: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r124, %r93, %r58; $L__BB0_48: setp.ge.s32 %p51, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p53, %p51, %p42; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p53 bra $L__BB0_50; add.s32 %r95, %r124, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd41, %r96, 4; add.s64 %rd42, %rd3, %rd41; add.s64 %rd43, %rd2, %rd41; add.s64 %rd44, %rd1, %rd41; ld.global.nc.f32 %f88, 
[%rd44]; ld.global.nc.f32 %f87, [%rd43]; ld.global.nc.f32 %f86, [%rd42]; $L__BB0_50: add.s32 %r35, %r2, 2; @%p42 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r99, %r58, -1; min.s32 %r125, %r35, %r99; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r125, %r98, %r58; $L__BB0_53: add.s32 %r39, %r125, %r4; setp.ge.s32 %p55, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p57, %p55, %p42; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p57 bra $L__BB0_55; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd45, %r100, 4; add.s64 %rd46, %rd3, %rd45; add.s64 %rd47, %rd2, %rd45; add.s64 %rd48, %rd1, %rd45; ld.global.nc.f32 %f94, [%rd48]; ld.global.nc.f32 %f93, [%rd47]; ld.global.nc.f32 %f92, [%rd46]; $L__BB0_55: mul.f32 %f288, %f87, %f87; fma.rn.f32 %f289, %f86, %f86, %f288; fma.rn.f32 %f98, %f88, %f88, %f289; setp.eq.f32 %p58, %f98, 0f00000000; @%p58 bra $L__BB0_56; bra.uni $L__BB0_57; $L__BB0_56: mul.f32 %f293, %f106, %f106; fma.rn.f32 %f294, %f107, %f107, %f293; fma.rn.f32 %f295, %f105, %f105, %f294; setp.eq.f32 %p59, %f295, 0f00000000; mov.f32 %f466, 0f00000000; mov.f32 %f467, %f466; mov.f32 %f468, %f466; @%p59 bra $L__BB0_69; $L__BB0_57: mul.f32 %f296, %f75, %f75; fma.rn.f32 %f297, %f74, %f74, %f296; fma.rn.f32 %f111, %f76, %f76, %f297; setp.neu.f32 %p60, %f111, 0f00000000; mul.f32 %f298, %f93, %f93; fma.rn.f32 %f299, %f92, %f92, %f298; fma.rn.f32 %f115, %f94, %f94, %f299; setp.neu.f32 %p61, %f115, 0f00000000; and.pred %p62, %p60, %p61; or.pred %p64, %p58, %p62; @%p64 bra $L__BB0_59; mul.f32 %f300, %f106, %f106; fma.rn.f32 %f301, %f107, %f107, %f300; fma.rn.f32 %f302, %f105, %f105, %f301; setp.neu.f32 %p65, %f302, 0f00000000; @%p65 bra $L__BB0_68; bra.uni $L__BB0_59; $L__BB0_68: sub.f32 %f335, %f86, %f107; mul.f32 %f468, %f335, 0f3F000000; sub.f32 %f336, %f87, %f106; mul.f32 %f467, %f336, 0f3F000000; sub.f32 %f337, %f88, %f105; mul.f32 %f466, %f337, 0f3F000000; bra.uni $L__BB0_69; $L__BB0_59: setp.eq.f32 %p66, %f111, 
0f00000000; and.pred %p68, %p66, %p58; @%p68 bra $L__BB0_67; bra.uni $L__BB0_60; $L__BB0_67: sub.f32 %f468, %f1, %f107; sub.f32 %f467, %f2, %f106; sub.f32 %f466, %f3, %f105; bra.uni $L__BB0_69; $L__BB0_60: setp.eq.f32 %p69, %f115, 0f00000000; mul.f32 %f303, %f106, %f106; fma.rn.f32 %f304, %f107, %f107, %f303; fma.rn.f32 %f116, %f105, %f105, %f304; setp.eq.f32 %p70, %f116, 0f00000000; and.pred %p71, %p70, %p69; @%p71 bra $L__BB0_66; bra.uni $L__BB0_61; $L__BB0_66: sub.f32 %f468, %f86, %f1; sub.f32 %f467, %f87, %f2; sub.f32 %f466, %f88, %f3; bra.uni $L__BB0_69; $L__BB0_61: setp.neu.f32 %p73, %f98, 0f00000000; or.pred %p74, %p66, %p73; @%p74 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: setp.neu.f32 %p75, %f116, 0f00000000; or.pred %p77, %p69, %p75; @%p77 bra $L__BB0_65; bra.uni $L__BB0_64; $L__BB0_65: sub.f32 %f326, %f86, %f107; sub.f32 %f327, %f87, %f106; sub.f32 %f328, %f88, %f105; sub.f32 %f329, %f74, %f92; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f75, %f93; mul.f32 %f332, %f331, 0f3DAAAAAB; sub.f32 %f333, %f76, %f94; mul.f32 %f334, %f333, 0f3DAAAAAB; fma.rn.f32 %f468, %f326, 0f3F2AAAAB, %f330; fma.rn.f32 %f467, %f327, 0f3F2AAAAB, %f332; fma.rn.f32 %f466, %f328, 0f3F2AAAAB, %f334; bra.uni $L__BB0_69; $L__BB0_62: mul.f32 %f305, %f74, 0f3F000000; add.f32 %f306, %f107, %f107; sub.f32 %f307, %f305, %f306; add.f32 %f308, %f106, %f106; mul.f32 %f309, %f75, 0f3F000000; sub.f32 %f310, %f309, %f308; add.f32 %f311, %f105, %f105; mul.f32 %f312, %f76, 0f3F000000; sub.f32 %f313, %f312, %f311; fma.rn.f32 %f468, %f1, 0f3FC00000, %f307; fma.rn.f32 %f467, %f2, 0f3FC00000, %f310; fma.rn.f32 %f466, %f3, 0f3FC00000, %f313; bra.uni $L__BB0_69; $L__BB0_64: mul.f32 %f314, %f92, 0f3F000000; add.f32 %f315, %f86, %f86; sub.f32 %f316, %f315, %f314; add.f32 %f317, %f87, %f87; mul.f32 %f318, %f93, 0f3F000000; sub.f32 %f319, %f317, %f318; add.f32 %f320, %f88, %f88; mul.f32 %f321, %f94, 0f3F000000; sub.f32 %f322, %f320, %f321; mul.f32 %f323, %f1, 0f3FC00000; sub.f32 %f468, %f316, 
%f323; mul.f32 %f324, %f2, 0f3FC00000; sub.f32 %f467, %f319, %f324; mul.f32 %f325, %f3, 0f3FC00000; sub.f32 %f466, %f322, %f325; $L__BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p78, %rs3, 0; add.s32 %r40, %r3, -2; @%p78 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: max.s32 %r126, %r40, 0; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r126, %r102, %r59; $L__BB0_72: setp.lt.s32 %p80, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p81, %p80, %p78; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p81 bra $L__BB0_74; mad.lo.s32 %r103, %r126, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd49, %r104, 4; add.s64 %rd50, %rd3, %rd49; add.s64 %rd51, %rd2, %rd49; add.s64 %rd52, %rd1, %rd49; ld.global.nc.f32 %f143, [%rd52]; ld.global.nc.f32 %f142, [%rd51]; ld.global.nc.f32 %f141, [%rd50]; $L__BB0_74: add.s32 %r44, %r3, -1; @%p78 bra $L__BB0_76; bra.uni $L__BB0_75; $L__BB0_76: max.s32 %r127, %r44, 0; bra.uni $L__BB0_77; $L__BB0_75: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r127, %r106, %r59; $L__BB0_77: setp.lt.s32 %p83, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p85, %p83, %p78; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p85 bra $L__BB0_79; mad.lo.s32 %r107, %r127, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd53, %r108, 4; add.s64 %rd54, %rd3, %rd53; add.s64 %rd55, %rd2, %rd53; add.s64 %rd56, %rd1, %rd53; ld.global.nc.f32 %f172, [%rd56]; ld.global.nc.f32 %f173, [%rd55]; ld.global.nc.f32 %f174, [%rd54]; $L__BB0_79: add.s32 %r48, %r3, 1; @%p78 bra $L__BB0_81; bra.uni $L__BB0_80; $L__BB0_81: add.s32 %r111, %r59, -1; min.s32 %r128, %r48, %r111; bra.uni $L__BB0_82; $L__BB0_80: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r128, %r110, %r59; $L__BB0_82: setp.ge.s32 %p87, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p89, %p87, %p78; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p89 bra $L__BB0_84; mad.lo.s32 %r112, %r128, %r58, %r2; mad.lo.s32 %r113, %r112, 
%r57, %r1; mul.wide.s32 %rd57, %r113, 4; add.s64 %rd58, %rd3, %rd57; add.s64 %rd59, %rd2, %rd57; add.s64 %rd60, %rd1, %rd57; ld.global.nc.f32 %f155, [%rd60]; ld.global.nc.f32 %f154, [%rd59]; ld.global.nc.f32 %f153, [%rd58]; $L__BB0_84: add.s32 %r52, %r3, 2; @%p78 bra $L__BB0_86; bra.uni $L__BB0_85; $L__BB0_86: add.s32 %r116, %r59, -1; min.s32 %r129, %r52, %r116; bra.uni $L__BB0_87; $L__BB0_85: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r129, %r115, %r59; $L__BB0_87: mad.lo.s32 %r117, %r129, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p91, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p93, %p91, %p78; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p93 bra $L__BB0_89; mul.wide.s32 %rd61, %r56, 4; add.s64 %rd62, %rd3, %rd61; add.s64 %rd63, %rd2, %rd61; add.s64 %rd64, %rd1, %rd61; ld.global.nc.f32 %f161, [%rd64]; ld.global.nc.f32 %f160, [%rd63]; ld.global.nc.f32 %f159, [%rd62]; $L__BB0_89: mul.f32 %f350, %f154, %f154; fma.rn.f32 %f351, %f153, %f153, %f350; fma.rn.f32 %f165, %f155, %f155, %f351; setp.eq.f32 %p94, %f165, 0f00000000; @%p94 bra $L__BB0_90; bra.uni $L__BB0_91; $L__BB0_90: mul.f32 %f355, %f173, %f173; fma.rn.f32 %f356, %f174, %f174, %f355; fma.rn.f32 %f357, %f172, %f172, %f356; setp.eq.f32 %p95, %f357, 0f00000000; mov.f32 %f484, 0f00000000; mov.f32 %f485, %f484; mov.f32 %f486, %f484; @%p95 bra $L__BB0_103; $L__BB0_91: mul.f32 %f358, %f142, %f142; fma.rn.f32 %f359, %f141, %f141, %f358; fma.rn.f32 %f178, %f143, %f143, %f359; setp.neu.f32 %p96, %f178, 0f00000000; mul.f32 %f360, %f160, %f160; fma.rn.f32 %f361, %f159, %f159, %f360; fma.rn.f32 %f182, %f161, %f161, %f361; setp.neu.f32 %p97, %f182, 0f00000000; and.pred %p98, %p96, %p97; or.pred %p100, %p94, %p98; @%p100 bra $L__BB0_93; mul.f32 %f362, %f173, %f173; fma.rn.f32 %f363, %f174, %f174, %f362; fma.rn.f32 %f364, %f172, %f172, %f363; setp.neu.f32 %p101, %f364, 0f00000000; @%p101 bra $L__BB0_102; bra.uni $L__BB0_93; $L__BB0_102: sub.f32 %f397, %f153, %f174; mul.f32 %f486, 
%f397, 0f3F000000; sub.f32 %f398, %f154, %f173; mul.f32 %f485, %f398, 0f3F000000; sub.f32 %f399, %f155, %f172; mul.f32 %f484, %f399, 0f3F000000; bra.uni $L__BB0_103; $L__BB0_93: setp.eq.f32 %p102, %f178, 0f00000000; and.pred %p104, %p102, %p94; @%p104 bra $L__BB0_101; bra.uni $L__BB0_94; $L__BB0_101: sub.f32 %f486, %f1, %f174; sub.f32 %f485, %f2, %f173; sub.f32 %f484, %f3, %f172; bra.uni $L__BB0_103; $L__BB0_94: setp.eq.f32 %p105, %f182, 0f00000000; mul.f32 %f365, %f173, %f173; fma.rn.f32 %f366, %f174, %f174, %f365; fma.rn.f32 %f183, %f172, %f172, %f366; setp.eq.f32 %p106, %f183, 0f00000000; and.pred %p107, %p106, %p105; @%p107 bra $L__BB0_100; bra.uni $L__BB0_95; $L__BB0_100: sub.f32 %f486, %f153, %f1; sub.f32 %f485, %f154, %f2; sub.f32 %f484, %f155, %f3; bra.uni $L__BB0_103; $L__BB0_95: setp.neu.f32 %p109, %f165, 0f00000000; or.pred %p110, %p102, %p109; @%p110 bra $L__BB0_97; bra.uni $L__BB0_96; $L__BB0_97: setp.neu.f32 %p111, %f183, 0f00000000; or.pred %p113, %p105, %p111; @%p113 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: sub.f32 %f388, %f153, %f174; sub.f32 %f389, %f154, %f173; sub.f32 %f390, %f155, %f172; sub.f32 %f391, %f141, %f159; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f142, %f160; mul.f32 %f394, %f393, 0f3DAAAAAB; sub.f32 %f395, %f143, %f161; mul.f32 %f396, %f395, 0f3DAAAAAB; fma.rn.f32 %f486, %f388, 0f3F2AAAAB, %f392; fma.rn.f32 %f485, %f389, 0f3F2AAAAB, %f394; fma.rn.f32 %f484, %f390, 0f3F2AAAAB, %f396; bra.uni $L__BB0_103; $L__BB0_96: mul.f32 %f367, %f141, 0f3F000000; add.f32 %f368, %f174, %f174; sub.f32 %f369, %f367, %f368; add.f32 %f370, %f173, %f173; mul.f32 %f371, %f142, 0f3F000000; sub.f32 %f372, %f371, %f370; add.f32 %f373, %f172, %f172; mul.f32 %f374, %f143, 0f3F000000; sub.f32 %f375, %f374, %f373; fma.rn.f32 %f486, %f1, 0f3FC00000, %f369; fma.rn.f32 %f485, %f2, 0f3FC00000, %f372; fma.rn.f32 %f484, %f3, 0f3FC00000, %f375; bra.uni $L__BB0_103; $L__BB0_98: mul.f32 %f376, %f159, 0f3F000000; add.f32 %f377, %f153, %f153; sub.f32 %f378, 
%f377, %f376; add.f32 %f379, %f154, %f154; mul.f32 %f380, %f160, 0f3F000000; sub.f32 %f381, %f379, %f380; add.f32 %f382, %f155, %f155; mul.f32 %f383, %f161, 0f3F000000; sub.f32 %f384, %f382, %f383; mul.f32 %f385, %f1, 0f3FC00000; sub.f32 %f486, %f378, %f385; mul.f32 %f386, %f2, 0f3FC00000; sub.f32 %f485, %f381, %f386; mul.f32 %f387, %f3, 0f3FC00000; sub.f32 %f484, %f384, %f387; $L__BB0_103: setp.eq.s64 %p114, %rd8, 0; @%p114 bra $L__BB0_105; cvta.to.global.u64 %rd65, %rd8; shl.b64 %rd66, %rd4, 2; add.s64 %rd67, %rd65, %rd66; ld.global.nc.f32 %f400, [%rd67]; mul.f32 %f487, %f400, %f487; $L__BB0_105: setp.eq.s64 %p115, %rd9, 0; @%p115 bra $L__BB0_107; cvta.to.global.u64 %rd68, %rd9; shl.b64 %rd69, %rd4, 2; add.s64 %rd70, %rd68, %rd69; ld.global.nc.f32 %f401, [%rd70]; mul.f32 %f488, %f401, %f488; $L__BB0_107: mul.f32 %f402, %f450, %f211; mul.f32 %f403, %f467, %f212; mul.f32 %f404, %f484, %f213; add.f32 %f405, %f487, %f487; mul.f32 %f406, %f1, %f405; add.f32 %f407, %f403, %f404; mul.f32 %f408, %f1, %f407; mul.f32 %f409, %f468, %f212; fma.rn.f32 %f410, %f2, %f409, %f408; mul.f32 %f411, %f486, %f213; fma.rn.f32 %f412, %f3, %f411, %f410; mul.f32 %f413, %f412, %f488; fma.rn.f32 %f414, %f402, %f406, %f413; cvta.to.global.u64 %rd71, %rd5; shl.b64 %rd72, %rd4, 2; add.s64 %rd73, %rd71, %rd72; st.global.f32 [%rd73], %f414; mul.f32 %f415, %f2, %f405; add.f32 %f416, %f402, %f404; mul.f32 %f417, %f2, %f416; mul.f32 %f418, %f449, %f211; fma.rn.f32 %f419, %f1, %f418, %f417; mul.f32 %f420, %f485, %f213; fma.rn.f32 %f421, %f3, %f420, %f419; mul.f32 %f422, %f421, %f488; fma.rn.f32 %f423, %f403, %f415, %f422; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd75, %rd74, %rd72; st.global.f32 [%rd75], %f423; mul.f32 %f424, %f3, %f405; mul.f32 %f425, %f466, %f212; mul.f32 %f426, %f2, %f425; mul.f32 %f427, %f448, %f211; fma.rn.f32 %f428, %f1, %f427, %f426; add.f32 %f429, %f402, %f403; fma.rn.f32 %f430, %f3, %f429, %f428; mul.f32 %f431, %f430, %f488; fma.rn.f32 %f432, %f404, %f424, %f431; 
cvta.to.global.u64 %rd76, %rd7; add.s64 %rd77, %rd76, %rd72; st.global.f32 [%rd77], %f432; $L__BB0_108: ret; } ` getmagnetoelasticforce_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<116>; .reg .b16 %rs<5>; .reg .f32 %f<489>; .reg .b32 %r<130>; .reg .b64 %rd<78>; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd12, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_6]; ld.param.f32 %f487, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_8]; ld.param.f32 %f488, [getmagnetoelasticforce_param_9]; ld.param.f32 %f211, [getmagnetoelasticforce_param_10]; ld.param.f32 %f212, [getmagnetoelasticforce_param_11]; ld.param.f32 %f213, [getmagnetoelasticforce_param_12]; ld.param.u32 %r57, [getmagnetoelasticforce_param_13]; ld.param.u32 %r58, [getmagnetoelasticforce_param_14]; ld.param.u32 %r59, 
[getmagnetoelasticforce_param_15]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_108; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; cvt.s64.s32 %rd4, %r70; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; add.s64 %rd15, %rd2, %rd13; ld.global.nc.f32 %f2, [%rd15]; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f3, [%rd16]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p6, %rs1, 0; add.s32 %r6, %r1, -2; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r118, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r118, %r72, %r57; $L__BB0_4: setp.lt.s32 %p8, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p9, %p8, %p6; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p9 bra $L__BB0_6; add.s32 %r73, %r118, %r5; mul.wide.s32 %rd17, %r73, 4; add.s64 %rd18, %rd3, %rd17; add.s64 %rd19, %rd2, %rd17; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f9, [%rd20]; ld.global.nc.f32 %f8, [%rd19]; ld.global.nc.f32 %f7, [%rd18]; $L__BB0_6: add.s32 %r10, %r1, -1; @%p6 bra $L__BB0_8; bra.uni $L__BB0_7; $L__BB0_8: max.s32 %r119, %r10, 0; bra.uni $L__BB0_9; $L__BB0_7: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r119, %r75, %r57; $L__BB0_9: setp.lt.s32 %p11, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p13, %p11, %p6; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p13 bra $L__BB0_11; add.s32 %r76, %r119, %r5; mul.wide.s32 %rd21, %r76, 4; add.s64 %rd22, %rd3, 
%rd21; add.s64 %rd23, %rd2, %rd21; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f38, [%rd24]; ld.global.nc.f32 %f39, [%rd23]; ld.global.nc.f32 %f40, [%rd22]; $L__BB0_11: add.s32 %r14, %r1, 1; @%p6 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r79, %r57, -1; min.s32 %r120, %r14, %r79; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r120, %r78, %r57; $L__BB0_14: setp.ge.s32 %p15, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p17, %p15, %p6; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p17 bra $L__BB0_16; add.s32 %r80, %r120, %r5; mul.wide.s32 %rd25, %r80, 4; add.s64 %rd26, %rd3, %rd25; add.s64 %rd27, %rd2, %rd25; add.s64 %rd28, %rd1, %rd25; ld.global.nc.f32 %f21, [%rd28]; ld.global.nc.f32 %f20, [%rd27]; ld.global.nc.f32 %f19, [%rd26]; $L__BB0_16: add.s32 %r18, %r1, 2; @%p6 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r83, %r57, -1; min.s32 %r121, %r18, %r83; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r121, %r82, %r57; $L__BB0_19: add.s32 %r22, %r121, %r5; setp.ge.s32 %p19, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p21, %p19, %p6; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p21 bra $L__BB0_21; mul.wide.s32 %rd29, %r22, 4; add.s64 %rd30, %rd3, %rd29; add.s64 %rd31, %rd2, %rd29; add.s64 %rd32, %rd1, %rd29; ld.global.nc.f32 %f27, [%rd32]; ld.global.nc.f32 %f26, [%rd31]; ld.global.nc.f32 %f25, [%rd30]; $L__BB0_21: mul.f32 %f226, %f20, %f20; fma.rn.f32 %f227, %f19, %f19, %f226; fma.rn.f32 %f31, %f21, %f21, %f227; setp.eq.f32 %p22, %f31, 0f00000000; @%p22 bra $L__BB0_22; bra.uni $L__BB0_23; $L__BB0_22: mul.f32 %f231, %f39, %f39; fma.rn.f32 %f232, %f40, %f40, %f231; fma.rn.f32 %f233, %f38, %f38, %f232; setp.eq.f32 %p23, %f233, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f449, %f448; mov.f32 %f450, %f448; @%p23 bra $L__BB0_35; $L__BB0_23: mul.f32 %f234, %f8, %f8; fma.rn.f32 %f235, %f7, %f7, %f234; fma.rn.f32 %f44, %f9, %f9, %f235; setp.neu.f32 %p24, 
%f44, 0f00000000; mul.f32 %f236, %f26, %f26; fma.rn.f32 %f237, %f25, %f25, %f236; fma.rn.f32 %f48, %f27, %f27, %f237; setp.neu.f32 %p25, %f48, 0f00000000; and.pred %p26, %p24, %p25; or.pred %p28, %p22, %p26; @%p28 bra $L__BB0_25; mul.f32 %f238, %f39, %f39; fma.rn.f32 %f239, %f40, %f40, %f238; fma.rn.f32 %f240, %f38, %f38, %f239; setp.neu.f32 %p29, %f240, 0f00000000; @%p29 bra $L__BB0_34; bra.uni $L__BB0_25; $L__BB0_34: sub.f32 %f273, %f19, %f40; mul.f32 %f450, %f273, 0f3F000000; sub.f32 %f274, %f20, %f39; mul.f32 %f449, %f274, 0f3F000000; sub.f32 %f275, %f21, %f38; mul.f32 %f448, %f275, 0f3F000000; bra.uni $L__BB0_35; $L__BB0_25: setp.eq.f32 %p30, %f44, 0f00000000; and.pred %p32, %p30, %p22; @%p32 bra $L__BB0_33; bra.uni $L__BB0_26; $L__BB0_33: sub.f32 %f450, %f1, %f40; sub.f32 %f449, %f2, %f39; sub.f32 %f448, %f3, %f38; bra.uni $L__BB0_35; $L__BB0_26: setp.eq.f32 %p33, %f48, 0f00000000; mul.f32 %f241, %f39, %f39; fma.rn.f32 %f242, %f40, %f40, %f241; fma.rn.f32 %f49, %f38, %f38, %f242; setp.eq.f32 %p34, %f49, 0f00000000; and.pred %p35, %p34, %p33; @%p35 bra $L__BB0_32; bra.uni $L__BB0_27; $L__BB0_32: sub.f32 %f450, %f19, %f1; sub.f32 %f449, %f20, %f2; sub.f32 %f448, %f21, %f3; bra.uni $L__BB0_35; $L__BB0_27: setp.neu.f32 %p37, %f31, 0f00000000; or.pred %p38, %p30, %p37; @%p38 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: setp.neu.f32 %p39, %f49, 0f00000000; or.pred %p41, %p33, %p39; @%p41 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f264, %f19, %f40; sub.f32 %f265, %f20, %f39; sub.f32 %f266, %f21, %f38; sub.f32 %f267, %f7, %f25; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f8, %f26; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f9, %f27; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f450, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f449, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f448, %f266, 0f3F2AAAAB, %f272; bra.uni $L__BB0_35; $L__BB0_28: mul.f32 %f243, %f7, 0f3F000000; add.f32 %f244, %f40, %f40; sub.f32 %f245, %f243, %f244; add.f32 %f246, %f39, 
%f39; mul.f32 %f247, %f8, 0f3F000000; sub.f32 %f248, %f247, %f246; add.f32 %f249, %f38, %f38; mul.f32 %f250, %f9, 0f3F000000; sub.f32 %f251, %f250, %f249; fma.rn.f32 %f450, %f1, 0f3FC00000, %f245; fma.rn.f32 %f449, %f2, 0f3FC00000, %f248; fma.rn.f32 %f448, %f3, 0f3FC00000, %f251; bra.uni $L__BB0_35; $L__BB0_30: mul.f32 %f252, %f25, 0f3F000000; add.f32 %f253, %f19, %f19; sub.f32 %f254, %f253, %f252; add.f32 %f255, %f20, %f20; mul.f32 %f256, %f26, 0f3F000000; sub.f32 %f257, %f255, %f256; add.f32 %f258, %f21, %f21; mul.f32 %f259, %f27, 0f3F000000; sub.f32 %f260, %f258, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f450, %f254, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f449, %f257, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f448, %f260, %f263; $L__BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p42, %rs2, 0; add.s32 %r23, %r2, -2; @%p42 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: max.s32 %r122, %r23, 0; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r122, %r85, %r58; $L__BB0_38: setp.lt.s32 %p44, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p45, %p44, %p42; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p45 bra $L__BB0_40; add.s32 %r86, %r122, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd33, %r87, 4; add.s64 %rd34, %rd3, %rd33; add.s64 %rd35, %rd2, %rd33; add.s64 %rd36, %rd1, %rd33; ld.global.nc.f32 %f76, [%rd36]; ld.global.nc.f32 %f75, [%rd35]; ld.global.nc.f32 %f74, [%rd34]; $L__BB0_40: add.s32 %r27, %r2, -1; @%p42 bra $L__BB0_42; bra.uni $L__BB0_41; $L__BB0_42: max.s32 %r123, %r27, 0; bra.uni $L__BB0_43; $L__BB0_41: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r123, %r89, %r58; $L__BB0_43: setp.lt.s32 %p47, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p49, %p47, %p42; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p49 bra $L__BB0_45; add.s32 %r90, %r123, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd37, %r91, 4; add.s64 %rd38, %rd3, %rd37; add.s64 %rd39, %rd2, %rd37; add.s64 
%rd40, %rd1, %rd37; ld.global.nc.f32 %f105, [%rd40]; ld.global.nc.f32 %f106, [%rd39]; ld.global.nc.f32 %f107, [%rd38]; $L__BB0_45: add.s32 %r31, %r2, 1; @%p42 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: add.s32 %r94, %r58, -1; min.s32 %r124, %r31, %r94; bra.uni $L__BB0_48; $L__BB0_46: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r124, %r93, %r58; $L__BB0_48: setp.ge.s32 %p51, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p53, %p51, %p42; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p53 bra $L__BB0_50; add.s32 %r95, %r124, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd41, %r96, 4; add.s64 %rd42, %rd3, %rd41; add.s64 %rd43, %rd2, %rd41; add.s64 %rd44, %rd1, %rd41; ld.global.nc.f32 %f88, [%rd44]; ld.global.nc.f32 %f87, [%rd43]; ld.global.nc.f32 %f86, [%rd42]; $L__BB0_50: add.s32 %r35, %r2, 2; @%p42 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r99, %r58, -1; min.s32 %r125, %r35, %r99; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r125, %r98, %r58; $L__BB0_53: add.s32 %r39, %r125, %r4; setp.ge.s32 %p55, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p57, %p55, %p42; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p57 bra $L__BB0_55; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd45, %r100, 4; add.s64 %rd46, %rd3, %rd45; add.s64 %rd47, %rd2, %rd45; add.s64 %rd48, %rd1, %rd45; ld.global.nc.f32 %f94, [%rd48]; ld.global.nc.f32 %f93, [%rd47]; ld.global.nc.f32 %f92, [%rd46]; $L__BB0_55: mul.f32 %f288, %f87, %f87; fma.rn.f32 %f289, %f86, %f86, %f288; fma.rn.f32 %f98, %f88, %f88, %f289; setp.eq.f32 %p58, %f98, 0f00000000; @%p58 bra $L__BB0_56; bra.uni $L__BB0_57; $L__BB0_56: mul.f32 %f293, %f106, %f106; fma.rn.f32 %f294, %f107, %f107, %f293; fma.rn.f32 %f295, %f105, %f105, %f294; setp.eq.f32 %p59, %f295, 0f00000000; mov.f32 %f466, 0f00000000; mov.f32 %f467, %f466; mov.f32 %f468, %f466; @%p59 bra $L__BB0_69; $L__BB0_57: mul.f32 %f296, %f75, %f75; fma.rn.f32 %f297, %f74, %f74, %f296; fma.rn.f32 
%f111, %f76, %f76, %f297; setp.neu.f32 %p60, %f111, 0f00000000; mul.f32 %f298, %f93, %f93; fma.rn.f32 %f299, %f92, %f92, %f298; fma.rn.f32 %f115, %f94, %f94, %f299; setp.neu.f32 %p61, %f115, 0f00000000; and.pred %p62, %p60, %p61; or.pred %p64, %p58, %p62; @%p64 bra $L__BB0_59; mul.f32 %f300, %f106, %f106; fma.rn.f32 %f301, %f107, %f107, %f300; fma.rn.f32 %f302, %f105, %f105, %f301; setp.neu.f32 %p65, %f302, 0f00000000; @%p65 bra $L__BB0_68; bra.uni $L__BB0_59; $L__BB0_68: sub.f32 %f335, %f86, %f107; mul.f32 %f468, %f335, 0f3F000000; sub.f32 %f336, %f87, %f106; mul.f32 %f467, %f336, 0f3F000000; sub.f32 %f337, %f88, %f105; mul.f32 %f466, %f337, 0f3F000000; bra.uni $L__BB0_69; $L__BB0_59: setp.eq.f32 %p66, %f111, 0f00000000; and.pred %p68, %p66, %p58; @%p68 bra $L__BB0_67; bra.uni $L__BB0_60; $L__BB0_67: sub.f32 %f468, %f1, %f107; sub.f32 %f467, %f2, %f106; sub.f32 %f466, %f3, %f105; bra.uni $L__BB0_69; $L__BB0_60: setp.eq.f32 %p69, %f115, 0f00000000; mul.f32 %f303, %f106, %f106; fma.rn.f32 %f304, %f107, %f107, %f303; fma.rn.f32 %f116, %f105, %f105, %f304; setp.eq.f32 %p70, %f116, 0f00000000; and.pred %p71, %p70, %p69; @%p71 bra $L__BB0_66; bra.uni $L__BB0_61; $L__BB0_66: sub.f32 %f468, %f86, %f1; sub.f32 %f467, %f87, %f2; sub.f32 %f466, %f88, %f3; bra.uni $L__BB0_69; $L__BB0_61: setp.neu.f32 %p73, %f98, 0f00000000; or.pred %p74, %p66, %p73; @%p74 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: setp.neu.f32 %p75, %f116, 0f00000000; or.pred %p77, %p69, %p75; @%p77 bra $L__BB0_65; bra.uni $L__BB0_64; $L__BB0_65: sub.f32 %f326, %f86, %f107; sub.f32 %f327, %f87, %f106; sub.f32 %f328, %f88, %f105; sub.f32 %f329, %f74, %f92; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f75, %f93; mul.f32 %f332, %f331, 0f3DAAAAAB; sub.f32 %f333, %f76, %f94; mul.f32 %f334, %f333, 0f3DAAAAAB; fma.rn.f32 %f468, %f326, 0f3F2AAAAB, %f330; fma.rn.f32 %f467, %f327, 0f3F2AAAAB, %f332; fma.rn.f32 %f466, %f328, 0f3F2AAAAB, %f334; bra.uni $L__BB0_69; $L__BB0_62: mul.f32 %f305, %f74, 0f3F000000; 
add.f32 %f306, %f107, %f107; sub.f32 %f307, %f305, %f306; add.f32 %f308, %f106, %f106; mul.f32 %f309, %f75, 0f3F000000; sub.f32 %f310, %f309, %f308; add.f32 %f311, %f105, %f105; mul.f32 %f312, %f76, 0f3F000000; sub.f32 %f313, %f312, %f311; fma.rn.f32 %f468, %f1, 0f3FC00000, %f307; fma.rn.f32 %f467, %f2, 0f3FC00000, %f310; fma.rn.f32 %f466, %f3, 0f3FC00000, %f313; bra.uni $L__BB0_69; $L__BB0_64: mul.f32 %f314, %f92, 0f3F000000; add.f32 %f315, %f86, %f86; sub.f32 %f316, %f315, %f314; add.f32 %f317, %f87, %f87; mul.f32 %f318, %f93, 0f3F000000; sub.f32 %f319, %f317, %f318; add.f32 %f320, %f88, %f88; mul.f32 %f321, %f94, 0f3F000000; sub.f32 %f322, %f320, %f321; mul.f32 %f323, %f1, 0f3FC00000; sub.f32 %f468, %f316, %f323; mul.f32 %f324, %f2, 0f3FC00000; sub.f32 %f467, %f319, %f324; mul.f32 %f325, %f3, 0f3FC00000; sub.f32 %f466, %f322, %f325; $L__BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p78, %rs3, 0; add.s32 %r40, %r3, -2; @%p78 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: max.s32 %r126, %r40, 0; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r126, %r102, %r59; $L__BB0_72: setp.lt.s32 %p80, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p81, %p80, %p78; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p81 bra $L__BB0_74; mad.lo.s32 %r103, %r126, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd49, %r104, 4; add.s64 %rd50, %rd3, %rd49; add.s64 %rd51, %rd2, %rd49; add.s64 %rd52, %rd1, %rd49; ld.global.nc.f32 %f143, [%rd52]; ld.global.nc.f32 %f142, [%rd51]; ld.global.nc.f32 %f141, [%rd50]; $L__BB0_74: add.s32 %r44, %r3, -1; @%p78 bra $L__BB0_76; bra.uni $L__BB0_75; $L__BB0_76: max.s32 %r127, %r44, 0; bra.uni $L__BB0_77; $L__BB0_75: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r127, %r106, %r59; $L__BB0_77: setp.lt.s32 %p83, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p85, %p83, %p78; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p85 bra $L__BB0_79; mad.lo.s32 %r107, %r127, %r58, %r2; 
mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd53, %r108, 4; add.s64 %rd54, %rd3, %rd53; add.s64 %rd55, %rd2, %rd53; add.s64 %rd56, %rd1, %rd53; ld.global.nc.f32 %f172, [%rd56]; ld.global.nc.f32 %f173, [%rd55]; ld.global.nc.f32 %f174, [%rd54]; $L__BB0_79: add.s32 %r48, %r3, 1; @%p78 bra $L__BB0_81; bra.uni $L__BB0_80; $L__BB0_81: add.s32 %r111, %r59, -1; min.s32 %r128, %r48, %r111; bra.uni $L__BB0_82; $L__BB0_80: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r128, %r110, %r59; $L__BB0_82: setp.ge.s32 %p87, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p89, %p87, %p78; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p89 bra $L__BB0_84; mad.lo.s32 %r112, %r128, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd57, %r113, 4; add.s64 %rd58, %rd3, %rd57; add.s64 %rd59, %rd2, %rd57; add.s64 %rd60, %rd1, %rd57; ld.global.nc.f32 %f155, [%rd60]; ld.global.nc.f32 %f154, [%rd59]; ld.global.nc.f32 %f153, [%rd58]; $L__BB0_84: add.s32 %r52, %r3, 2; @%p78 bra $L__BB0_86; bra.uni $L__BB0_85; $L__BB0_86: add.s32 %r116, %r59, -1; min.s32 %r129, %r52, %r116; bra.uni $L__BB0_87; $L__BB0_85: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r129, %r115, %r59; $L__BB0_87: mad.lo.s32 %r117, %r129, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p91, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p93, %p91, %p78; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p93 bra $L__BB0_89; mul.wide.s32 %rd61, %r56, 4; add.s64 %rd62, %rd3, %rd61; add.s64 %rd63, %rd2, %rd61; add.s64 %rd64, %rd1, %rd61; ld.global.nc.f32 %f161, [%rd64]; ld.global.nc.f32 %f160, [%rd63]; ld.global.nc.f32 %f159, [%rd62]; $L__BB0_89: mul.f32 %f350, %f154, %f154; fma.rn.f32 %f351, %f153, %f153, %f350; fma.rn.f32 %f165, %f155, %f155, %f351; setp.eq.f32 %p94, %f165, 0f00000000; @%p94 bra $L__BB0_90; bra.uni $L__BB0_91; $L__BB0_90: mul.f32 %f355, %f173, %f173; fma.rn.f32 %f356, %f174, %f174, %f355; fma.rn.f32 %f357, %f172, %f172, %f356; setp.eq.f32 %p95, %f357, 
0f00000000; mov.f32 %f484, 0f00000000; mov.f32 %f485, %f484; mov.f32 %f486, %f484; @%p95 bra $L__BB0_103; $L__BB0_91: mul.f32 %f358, %f142, %f142; fma.rn.f32 %f359, %f141, %f141, %f358; fma.rn.f32 %f178, %f143, %f143, %f359; setp.neu.f32 %p96, %f178, 0f00000000; mul.f32 %f360, %f160, %f160; fma.rn.f32 %f361, %f159, %f159, %f360; fma.rn.f32 %f182, %f161, %f161, %f361; setp.neu.f32 %p97, %f182, 0f00000000; and.pred %p98, %p96, %p97; or.pred %p100, %p94, %p98; @%p100 bra $L__BB0_93; mul.f32 %f362, %f173, %f173; fma.rn.f32 %f363, %f174, %f174, %f362; fma.rn.f32 %f364, %f172, %f172, %f363; setp.neu.f32 %p101, %f364, 0f00000000; @%p101 bra $L__BB0_102; bra.uni $L__BB0_93; $L__BB0_102: sub.f32 %f397, %f153, %f174; mul.f32 %f486, %f397, 0f3F000000; sub.f32 %f398, %f154, %f173; mul.f32 %f485, %f398, 0f3F000000; sub.f32 %f399, %f155, %f172; mul.f32 %f484, %f399, 0f3F000000; bra.uni $L__BB0_103; $L__BB0_93: setp.eq.f32 %p102, %f178, 0f00000000; and.pred %p104, %p102, %p94; @%p104 bra $L__BB0_101; bra.uni $L__BB0_94; $L__BB0_101: sub.f32 %f486, %f1, %f174; sub.f32 %f485, %f2, %f173; sub.f32 %f484, %f3, %f172; bra.uni $L__BB0_103; $L__BB0_94: setp.eq.f32 %p105, %f182, 0f00000000; mul.f32 %f365, %f173, %f173; fma.rn.f32 %f366, %f174, %f174, %f365; fma.rn.f32 %f183, %f172, %f172, %f366; setp.eq.f32 %p106, %f183, 0f00000000; and.pred %p107, %p106, %p105; @%p107 bra $L__BB0_100; bra.uni $L__BB0_95; $L__BB0_100: sub.f32 %f486, %f153, %f1; sub.f32 %f485, %f154, %f2; sub.f32 %f484, %f155, %f3; bra.uni $L__BB0_103; $L__BB0_95: setp.neu.f32 %p109, %f165, 0f00000000; or.pred %p110, %p102, %p109; @%p110 bra $L__BB0_97; bra.uni $L__BB0_96; $L__BB0_97: setp.neu.f32 %p111, %f183, 0f00000000; or.pred %p113, %p105, %p111; @%p113 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: sub.f32 %f388, %f153, %f174; sub.f32 %f389, %f154, %f173; sub.f32 %f390, %f155, %f172; sub.f32 %f391, %f141, %f159; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f142, %f160; mul.f32 %f394, %f393, 0f3DAAAAAB; sub.f32 
%f395, %f143, %f161; mul.f32 %f396, %f395, 0f3DAAAAAB; fma.rn.f32 %f486, %f388, 0f3F2AAAAB, %f392; fma.rn.f32 %f485, %f389, 0f3F2AAAAB, %f394; fma.rn.f32 %f484, %f390, 0f3F2AAAAB, %f396; bra.uni $L__BB0_103; $L__BB0_96: mul.f32 %f367, %f141, 0f3F000000; add.f32 %f368, %f174, %f174; sub.f32 %f369, %f367, %f368; add.f32 %f370, %f173, %f173; mul.f32 %f371, %f142, 0f3F000000; sub.f32 %f372, %f371, %f370; add.f32 %f373, %f172, %f172; mul.f32 %f374, %f143, 0f3F000000; sub.f32 %f375, %f374, %f373; fma.rn.f32 %f486, %f1, 0f3FC00000, %f369; fma.rn.f32 %f485, %f2, 0f3FC00000, %f372; fma.rn.f32 %f484, %f3, 0f3FC00000, %f375; bra.uni $L__BB0_103; $L__BB0_98: mul.f32 %f376, %f159, 0f3F000000; add.f32 %f377, %f153, %f153; sub.f32 %f378, %f377, %f376; add.f32 %f379, %f154, %f154; mul.f32 %f380, %f160, 0f3F000000; sub.f32 %f381, %f379, %f380; add.f32 %f382, %f155, %f155; mul.f32 %f383, %f161, 0f3F000000; sub.f32 %f384, %f382, %f383; mul.f32 %f385, %f1, 0f3FC00000; sub.f32 %f486, %f378, %f385; mul.f32 %f386, %f2, 0f3FC00000; sub.f32 %f485, %f381, %f386; mul.f32 %f387, %f3, 0f3FC00000; sub.f32 %f484, %f384, %f387; $L__BB0_103: setp.eq.s64 %p114, %rd8, 0; @%p114 bra $L__BB0_105; cvta.to.global.u64 %rd65, %rd8; shl.b64 %rd66, %rd4, 2; add.s64 %rd67, %rd65, %rd66; ld.global.nc.f32 %f400, [%rd67]; mul.f32 %f487, %f400, %f487; $L__BB0_105: setp.eq.s64 %p115, %rd9, 0; @%p115 bra $L__BB0_107; cvta.to.global.u64 %rd68, %rd9; shl.b64 %rd69, %rd4, 2; add.s64 %rd70, %rd68, %rd69; ld.global.nc.f32 %f401, [%rd70]; mul.f32 %f488, %f401, %f488; $L__BB0_107: mul.f32 %f402, %f450, %f211; mul.f32 %f403, %f467, %f212; mul.f32 %f404, %f484, %f213; add.f32 %f405, %f487, %f487; mul.f32 %f406, %f1, %f405; add.f32 %f407, %f403, %f404; mul.f32 %f408, %f1, %f407; mul.f32 %f409, %f468, %f212; fma.rn.f32 %f410, %f2, %f409, %f408; mul.f32 %f411, %f486, %f213; fma.rn.f32 %f412, %f3, %f411, %f410; mul.f32 %f413, %f412, %f488; fma.rn.f32 %f414, %f402, %f406, %f413; cvta.to.global.u64 %rd71, %rd5; shl.b64 %rd72, 
%rd4, 2; add.s64 %rd73, %rd71, %rd72; st.global.f32 [%rd73], %f414; mul.f32 %f415, %f2, %f405; add.f32 %f416, %f402, %f404; mul.f32 %f417, %f2, %f416; mul.f32 %f418, %f449, %f211; fma.rn.f32 %f419, %f1, %f418, %f417; mul.f32 %f420, %f485, %f213; fma.rn.f32 %f421, %f3, %f420, %f419; mul.f32 %f422, %f421, %f488; fma.rn.f32 %f423, %f403, %f415, %f422; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd75, %rd74, %rd72; st.global.f32 [%rd75], %f423; mul.f32 %f424, %f3, %f405; mul.f32 %f425, %f466, %f212; mul.f32 %f426, %f2, %f425; mul.f32 %f427, %f448, %f211; fma.rn.f32 %f428, %f1, %f427, %f426; add.f32 %f429, %f402, %f403; fma.rn.f32 %f430, %f3, %f429, %f428; mul.f32 %f431, %f430, %f488; fma.rn.f32 %f432, %f404, %f424, %f431; cvta.to.global.u64 %rd76, %rd7; add.s64 %rd77, %rd76, %rd72; st.global.f32 [%rd77], %f432; $L__BB0_108: ret; } ` getmagnetoelasticforce_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<116>; .reg .b16 %rs<5>; .reg .f32 %f<489>; .reg .b32 %r<130>; .reg .b64 %rd<78>; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_1]; 
ld.param.u64 %rd7, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd12, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_6]; ld.param.f32 %f487, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_8]; ld.param.f32 %f488, [getmagnetoelasticforce_param_9]; ld.param.f32 %f211, [getmagnetoelasticforce_param_10]; ld.param.f32 %f212, [getmagnetoelasticforce_param_11]; ld.param.f32 %f213, [getmagnetoelasticforce_param_12]; ld.param.u32 %r57, [getmagnetoelasticforce_param_13]; ld.param.u32 %r58, [getmagnetoelasticforce_param_14]; ld.param.u32 %r59, [getmagnetoelasticforce_param_15]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_108; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; cvt.s64.s32 %rd4, %r70; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; add.s64 %rd15, %rd2, %rd13; ld.global.nc.f32 %f2, [%rd15]; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f3, [%rd16]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p6, %rs1, 0; add.s32 %r6, %r1, -2; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r118, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r118, %r72, %r57; $L__BB0_4: setp.lt.s32 %p8, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p9, %p8, %p6; mov.f32 %f8, 
%f7; mov.f32 %f9, %f7; @%p9 bra $L__BB0_6; add.s32 %r73, %r118, %r5; mul.wide.s32 %rd17, %r73, 4; add.s64 %rd18, %rd3, %rd17; add.s64 %rd19, %rd2, %rd17; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f9, [%rd20]; ld.global.nc.f32 %f8, [%rd19]; ld.global.nc.f32 %f7, [%rd18]; $L__BB0_6: add.s32 %r10, %r1, -1; @%p6 bra $L__BB0_8; bra.uni $L__BB0_7; $L__BB0_8: max.s32 %r119, %r10, 0; bra.uni $L__BB0_9; $L__BB0_7: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r119, %r75, %r57; $L__BB0_9: setp.lt.s32 %p11, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p13, %p11, %p6; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p13 bra $L__BB0_11; add.s32 %r76, %r119, %r5; mul.wide.s32 %rd21, %r76, 4; add.s64 %rd22, %rd3, %rd21; add.s64 %rd23, %rd2, %rd21; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f38, [%rd24]; ld.global.nc.f32 %f39, [%rd23]; ld.global.nc.f32 %f40, [%rd22]; $L__BB0_11: add.s32 %r14, %r1, 1; @%p6 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r79, %r57, -1; min.s32 %r120, %r14, %r79; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r120, %r78, %r57; $L__BB0_14: setp.ge.s32 %p15, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p17, %p15, %p6; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p17 bra $L__BB0_16; add.s32 %r80, %r120, %r5; mul.wide.s32 %rd25, %r80, 4; add.s64 %rd26, %rd3, %rd25; add.s64 %rd27, %rd2, %rd25; add.s64 %rd28, %rd1, %rd25; ld.global.nc.f32 %f21, [%rd28]; ld.global.nc.f32 %f20, [%rd27]; ld.global.nc.f32 %f19, [%rd26]; $L__BB0_16: add.s32 %r18, %r1, 2; @%p6 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r83, %r57, -1; min.s32 %r121, %r18, %r83; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r121, %r82, %r57; $L__BB0_19: add.s32 %r22, %r121, %r5; setp.ge.s32 %p19, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p21, %p19, %p6; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p21 bra $L__BB0_21; mul.wide.s32 %rd29, %r22, 4; add.s64 %rd30, %rd3, 
%rd29; add.s64 %rd31, %rd2, %rd29; add.s64 %rd32, %rd1, %rd29; ld.global.nc.f32 %f27, [%rd32]; ld.global.nc.f32 %f26, [%rd31]; ld.global.nc.f32 %f25, [%rd30]; $L__BB0_21: mul.f32 %f226, %f20, %f20; fma.rn.f32 %f227, %f19, %f19, %f226; fma.rn.f32 %f31, %f21, %f21, %f227; setp.eq.f32 %p22, %f31, 0f00000000; @%p22 bra $L__BB0_22; bra.uni $L__BB0_23; $L__BB0_22: mul.f32 %f231, %f39, %f39; fma.rn.f32 %f232, %f40, %f40, %f231; fma.rn.f32 %f233, %f38, %f38, %f232; setp.eq.f32 %p23, %f233, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f449, %f448; mov.f32 %f450, %f448; @%p23 bra $L__BB0_35; $L__BB0_23: mul.f32 %f234, %f8, %f8; fma.rn.f32 %f235, %f7, %f7, %f234; fma.rn.f32 %f44, %f9, %f9, %f235; setp.neu.f32 %p24, %f44, 0f00000000; mul.f32 %f236, %f26, %f26; fma.rn.f32 %f237, %f25, %f25, %f236; fma.rn.f32 %f48, %f27, %f27, %f237; setp.neu.f32 %p25, %f48, 0f00000000; and.pred %p26, %p24, %p25; or.pred %p28, %p22, %p26; @%p28 bra $L__BB0_25; mul.f32 %f238, %f39, %f39; fma.rn.f32 %f239, %f40, %f40, %f238; fma.rn.f32 %f240, %f38, %f38, %f239; setp.neu.f32 %p29, %f240, 0f00000000; @%p29 bra $L__BB0_34; bra.uni $L__BB0_25; $L__BB0_34: sub.f32 %f273, %f19, %f40; mul.f32 %f450, %f273, 0f3F000000; sub.f32 %f274, %f20, %f39; mul.f32 %f449, %f274, 0f3F000000; sub.f32 %f275, %f21, %f38; mul.f32 %f448, %f275, 0f3F000000; bra.uni $L__BB0_35; $L__BB0_25: setp.eq.f32 %p30, %f44, 0f00000000; and.pred %p32, %p30, %p22; @%p32 bra $L__BB0_33; bra.uni $L__BB0_26; $L__BB0_33: sub.f32 %f450, %f1, %f40; sub.f32 %f449, %f2, %f39; sub.f32 %f448, %f3, %f38; bra.uni $L__BB0_35; $L__BB0_26: setp.eq.f32 %p33, %f48, 0f00000000; mul.f32 %f241, %f39, %f39; fma.rn.f32 %f242, %f40, %f40, %f241; fma.rn.f32 %f49, %f38, %f38, %f242; setp.eq.f32 %p34, %f49, 0f00000000; and.pred %p35, %p34, %p33; @%p35 bra $L__BB0_32; bra.uni $L__BB0_27; $L__BB0_32: sub.f32 %f450, %f19, %f1; sub.f32 %f449, %f20, %f2; sub.f32 %f448, %f21, %f3; bra.uni $L__BB0_35; $L__BB0_27: setp.neu.f32 %p37, %f31, 0f00000000; or.pred %p38, 
%p30, %p37; @%p38 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: setp.neu.f32 %p39, %f49, 0f00000000; or.pred %p41, %p33, %p39; @%p41 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f264, %f19, %f40; sub.f32 %f265, %f20, %f39; sub.f32 %f266, %f21, %f38; sub.f32 %f267, %f7, %f25; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f8, %f26; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f9, %f27; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f450, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f449, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f448, %f266, 0f3F2AAAAB, %f272; bra.uni $L__BB0_35; $L__BB0_28: mul.f32 %f243, %f7, 0f3F000000; add.f32 %f244, %f40, %f40; sub.f32 %f245, %f243, %f244; add.f32 %f246, %f39, %f39; mul.f32 %f247, %f8, 0f3F000000; sub.f32 %f248, %f247, %f246; add.f32 %f249, %f38, %f38; mul.f32 %f250, %f9, 0f3F000000; sub.f32 %f251, %f250, %f249; fma.rn.f32 %f450, %f1, 0f3FC00000, %f245; fma.rn.f32 %f449, %f2, 0f3FC00000, %f248; fma.rn.f32 %f448, %f3, 0f3FC00000, %f251; bra.uni $L__BB0_35; $L__BB0_30: mul.f32 %f252, %f25, 0f3F000000; add.f32 %f253, %f19, %f19; sub.f32 %f254, %f253, %f252; add.f32 %f255, %f20, %f20; mul.f32 %f256, %f26, 0f3F000000; sub.f32 %f257, %f255, %f256; add.f32 %f258, %f21, %f21; mul.f32 %f259, %f27, 0f3F000000; sub.f32 %f260, %f258, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f450, %f254, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f449, %f257, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f448, %f260, %f263; $L__BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p42, %rs2, 0; add.s32 %r23, %r2, -2; @%p42 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: max.s32 %r122, %r23, 0; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r122, %r85, %r58; $L__BB0_38: setp.lt.s32 %p44, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p45, %p44, %p42; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p45 bra $L__BB0_40; add.s32 %r86, %r122, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd33, %r87, 4; 
add.s64 %rd34, %rd3, %rd33; add.s64 %rd35, %rd2, %rd33; add.s64 %rd36, %rd1, %rd33; ld.global.nc.f32 %f76, [%rd36]; ld.global.nc.f32 %f75, [%rd35]; ld.global.nc.f32 %f74, [%rd34]; $L__BB0_40: add.s32 %r27, %r2, -1; @%p42 bra $L__BB0_42; bra.uni $L__BB0_41; $L__BB0_42: max.s32 %r123, %r27, 0; bra.uni $L__BB0_43; $L__BB0_41: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r123, %r89, %r58; $L__BB0_43: setp.lt.s32 %p47, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p49, %p47, %p42; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p49 bra $L__BB0_45; add.s32 %r90, %r123, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd37, %r91, 4; add.s64 %rd38, %rd3, %rd37; add.s64 %rd39, %rd2, %rd37; add.s64 %rd40, %rd1, %rd37; ld.global.nc.f32 %f105, [%rd40]; ld.global.nc.f32 %f106, [%rd39]; ld.global.nc.f32 %f107, [%rd38]; $L__BB0_45: add.s32 %r31, %r2, 1; @%p42 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: add.s32 %r94, %r58, -1; min.s32 %r124, %r31, %r94; bra.uni $L__BB0_48; $L__BB0_46: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r124, %r93, %r58; $L__BB0_48: setp.ge.s32 %p51, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p53, %p51, %p42; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p53 bra $L__BB0_50; add.s32 %r95, %r124, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd41, %r96, 4; add.s64 %rd42, %rd3, %rd41; add.s64 %rd43, %rd2, %rd41; add.s64 %rd44, %rd1, %rd41; ld.global.nc.f32 %f88, [%rd44]; ld.global.nc.f32 %f87, [%rd43]; ld.global.nc.f32 %f86, [%rd42]; $L__BB0_50: add.s32 %r35, %r2, 2; @%p42 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r99, %r58, -1; min.s32 %r125, %r35, %r99; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r125, %r98, %r58; $L__BB0_53: add.s32 %r39, %r125, %r4; setp.ge.s32 %p55, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p57, %p55, %p42; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p57 bra $L__BB0_55; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd45, 
%r100, 4; add.s64 %rd46, %rd3, %rd45; add.s64 %rd47, %rd2, %rd45; add.s64 %rd48, %rd1, %rd45; ld.global.nc.f32 %f94, [%rd48]; ld.global.nc.f32 %f93, [%rd47]; ld.global.nc.f32 %f92, [%rd46]; $L__BB0_55: mul.f32 %f288, %f87, %f87; fma.rn.f32 %f289, %f86, %f86, %f288; fma.rn.f32 %f98, %f88, %f88, %f289; setp.eq.f32 %p58, %f98, 0f00000000; @%p58 bra $L__BB0_56; bra.uni $L__BB0_57; $L__BB0_56: mul.f32 %f293, %f106, %f106; fma.rn.f32 %f294, %f107, %f107, %f293; fma.rn.f32 %f295, %f105, %f105, %f294; setp.eq.f32 %p59, %f295, 0f00000000; mov.f32 %f466, 0f00000000; mov.f32 %f467, %f466; mov.f32 %f468, %f466; @%p59 bra $L__BB0_69; $L__BB0_57: mul.f32 %f296, %f75, %f75; fma.rn.f32 %f297, %f74, %f74, %f296; fma.rn.f32 %f111, %f76, %f76, %f297; setp.neu.f32 %p60, %f111, 0f00000000; mul.f32 %f298, %f93, %f93; fma.rn.f32 %f299, %f92, %f92, %f298; fma.rn.f32 %f115, %f94, %f94, %f299; setp.neu.f32 %p61, %f115, 0f00000000; and.pred %p62, %p60, %p61; or.pred %p64, %p58, %p62; @%p64 bra $L__BB0_59; mul.f32 %f300, %f106, %f106; fma.rn.f32 %f301, %f107, %f107, %f300; fma.rn.f32 %f302, %f105, %f105, %f301; setp.neu.f32 %p65, %f302, 0f00000000; @%p65 bra $L__BB0_68; bra.uni $L__BB0_59; $L__BB0_68: sub.f32 %f335, %f86, %f107; mul.f32 %f468, %f335, 0f3F000000; sub.f32 %f336, %f87, %f106; mul.f32 %f467, %f336, 0f3F000000; sub.f32 %f337, %f88, %f105; mul.f32 %f466, %f337, 0f3F000000; bra.uni $L__BB0_69; $L__BB0_59: setp.eq.f32 %p66, %f111, 0f00000000; and.pred %p68, %p66, %p58; @%p68 bra $L__BB0_67; bra.uni $L__BB0_60; $L__BB0_67: sub.f32 %f468, %f1, %f107; sub.f32 %f467, %f2, %f106; sub.f32 %f466, %f3, %f105; bra.uni $L__BB0_69; $L__BB0_60: setp.eq.f32 %p69, %f115, 0f00000000; mul.f32 %f303, %f106, %f106; fma.rn.f32 %f304, %f107, %f107, %f303; fma.rn.f32 %f116, %f105, %f105, %f304; setp.eq.f32 %p70, %f116, 0f00000000; and.pred %p71, %p70, %p69; @%p71 bra $L__BB0_66; bra.uni $L__BB0_61; $L__BB0_66: sub.f32 %f468, %f86, %f1; sub.f32 %f467, %f87, %f2; sub.f32 %f466, %f88, %f3; bra.uni 
$L__BB0_69; $L__BB0_61: setp.neu.f32 %p73, %f98, 0f00000000; or.pred %p74, %p66, %p73; @%p74 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: setp.neu.f32 %p75, %f116, 0f00000000; or.pred %p77, %p69, %p75; @%p77 bra $L__BB0_65; bra.uni $L__BB0_64; $L__BB0_65: sub.f32 %f326, %f86, %f107; sub.f32 %f327, %f87, %f106; sub.f32 %f328, %f88, %f105; sub.f32 %f329, %f74, %f92; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f75, %f93; mul.f32 %f332, %f331, 0f3DAAAAAB; sub.f32 %f333, %f76, %f94; mul.f32 %f334, %f333, 0f3DAAAAAB; fma.rn.f32 %f468, %f326, 0f3F2AAAAB, %f330; fma.rn.f32 %f467, %f327, 0f3F2AAAAB, %f332; fma.rn.f32 %f466, %f328, 0f3F2AAAAB, %f334; bra.uni $L__BB0_69; $L__BB0_62: mul.f32 %f305, %f74, 0f3F000000; add.f32 %f306, %f107, %f107; sub.f32 %f307, %f305, %f306; add.f32 %f308, %f106, %f106; mul.f32 %f309, %f75, 0f3F000000; sub.f32 %f310, %f309, %f308; add.f32 %f311, %f105, %f105; mul.f32 %f312, %f76, 0f3F000000; sub.f32 %f313, %f312, %f311; fma.rn.f32 %f468, %f1, 0f3FC00000, %f307; fma.rn.f32 %f467, %f2, 0f3FC00000, %f310; fma.rn.f32 %f466, %f3, 0f3FC00000, %f313; bra.uni $L__BB0_69; $L__BB0_64: mul.f32 %f314, %f92, 0f3F000000; add.f32 %f315, %f86, %f86; sub.f32 %f316, %f315, %f314; add.f32 %f317, %f87, %f87; mul.f32 %f318, %f93, 0f3F000000; sub.f32 %f319, %f317, %f318; add.f32 %f320, %f88, %f88; mul.f32 %f321, %f94, 0f3F000000; sub.f32 %f322, %f320, %f321; mul.f32 %f323, %f1, 0f3FC00000; sub.f32 %f468, %f316, %f323; mul.f32 %f324, %f2, 0f3FC00000; sub.f32 %f467, %f319, %f324; mul.f32 %f325, %f3, 0f3FC00000; sub.f32 %f466, %f322, %f325; $L__BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p78, %rs3, 0; add.s32 %r40, %r3, -2; @%p78 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: max.s32 %r126, %r40, 0; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r126, %r102, %r59; $L__BB0_72: setp.lt.s32 %p80, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p81, %p80, %p78; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p81 bra 
$L__BB0_74; mad.lo.s32 %r103, %r126, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd49, %r104, 4; add.s64 %rd50, %rd3, %rd49; add.s64 %rd51, %rd2, %rd49; add.s64 %rd52, %rd1, %rd49; ld.global.nc.f32 %f143, [%rd52]; ld.global.nc.f32 %f142, [%rd51]; ld.global.nc.f32 %f141, [%rd50]; $L__BB0_74: add.s32 %r44, %r3, -1; @%p78 bra $L__BB0_76; bra.uni $L__BB0_75; $L__BB0_76: max.s32 %r127, %r44, 0; bra.uni $L__BB0_77; $L__BB0_75: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r127, %r106, %r59; $L__BB0_77: setp.lt.s32 %p83, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p85, %p83, %p78; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p85 bra $L__BB0_79; mad.lo.s32 %r107, %r127, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd53, %r108, 4; add.s64 %rd54, %rd3, %rd53; add.s64 %rd55, %rd2, %rd53; add.s64 %rd56, %rd1, %rd53; ld.global.nc.f32 %f172, [%rd56]; ld.global.nc.f32 %f173, [%rd55]; ld.global.nc.f32 %f174, [%rd54]; $L__BB0_79: add.s32 %r48, %r3, 1; @%p78 bra $L__BB0_81; bra.uni $L__BB0_80; $L__BB0_81: add.s32 %r111, %r59, -1; min.s32 %r128, %r48, %r111; bra.uni $L__BB0_82; $L__BB0_80: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r128, %r110, %r59; $L__BB0_82: setp.ge.s32 %p87, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p89, %p87, %p78; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p89 bra $L__BB0_84; mad.lo.s32 %r112, %r128, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd57, %r113, 4; add.s64 %rd58, %rd3, %rd57; add.s64 %rd59, %rd2, %rd57; add.s64 %rd60, %rd1, %rd57; ld.global.nc.f32 %f155, [%rd60]; ld.global.nc.f32 %f154, [%rd59]; ld.global.nc.f32 %f153, [%rd58]; $L__BB0_84: add.s32 %r52, %r3, 2; @%p78 bra $L__BB0_86; bra.uni $L__BB0_85; $L__BB0_86: add.s32 %r116, %r59, -1; min.s32 %r129, %r52, %r116; bra.uni $L__BB0_87; $L__BB0_85: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r129, %r115, %r59; $L__BB0_87: mad.lo.s32 %r117, %r129, %r58, %r2; mad.lo.s32 %r56, %r117, 
%r57, %r1; setp.ge.s32 %p91, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p93, %p91, %p78; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p93 bra $L__BB0_89; mul.wide.s32 %rd61, %r56, 4; add.s64 %rd62, %rd3, %rd61; add.s64 %rd63, %rd2, %rd61; add.s64 %rd64, %rd1, %rd61; ld.global.nc.f32 %f161, [%rd64]; ld.global.nc.f32 %f160, [%rd63]; ld.global.nc.f32 %f159, [%rd62]; $L__BB0_89: mul.f32 %f350, %f154, %f154; fma.rn.f32 %f351, %f153, %f153, %f350; fma.rn.f32 %f165, %f155, %f155, %f351; setp.eq.f32 %p94, %f165, 0f00000000; @%p94 bra $L__BB0_90; bra.uni $L__BB0_91; $L__BB0_90: mul.f32 %f355, %f173, %f173; fma.rn.f32 %f356, %f174, %f174, %f355; fma.rn.f32 %f357, %f172, %f172, %f356; setp.eq.f32 %p95, %f357, 0f00000000; mov.f32 %f484, 0f00000000; mov.f32 %f485, %f484; mov.f32 %f486, %f484; @%p95 bra $L__BB0_103; $L__BB0_91: mul.f32 %f358, %f142, %f142; fma.rn.f32 %f359, %f141, %f141, %f358; fma.rn.f32 %f178, %f143, %f143, %f359; setp.neu.f32 %p96, %f178, 0f00000000; mul.f32 %f360, %f160, %f160; fma.rn.f32 %f361, %f159, %f159, %f360; fma.rn.f32 %f182, %f161, %f161, %f361; setp.neu.f32 %p97, %f182, 0f00000000; and.pred %p98, %p96, %p97; or.pred %p100, %p94, %p98; @%p100 bra $L__BB0_93; mul.f32 %f362, %f173, %f173; fma.rn.f32 %f363, %f174, %f174, %f362; fma.rn.f32 %f364, %f172, %f172, %f363; setp.neu.f32 %p101, %f364, 0f00000000; @%p101 bra $L__BB0_102; bra.uni $L__BB0_93; $L__BB0_102: sub.f32 %f397, %f153, %f174; mul.f32 %f486, %f397, 0f3F000000; sub.f32 %f398, %f154, %f173; mul.f32 %f485, %f398, 0f3F000000; sub.f32 %f399, %f155, %f172; mul.f32 %f484, %f399, 0f3F000000; bra.uni $L__BB0_103; $L__BB0_93: setp.eq.f32 %p102, %f178, 0f00000000; and.pred %p104, %p102, %p94; @%p104 bra $L__BB0_101; bra.uni $L__BB0_94; $L__BB0_101: sub.f32 %f486, %f1, %f174; sub.f32 %f485, %f2, %f173; sub.f32 %f484, %f3, %f172; bra.uni $L__BB0_103; $L__BB0_94: setp.eq.f32 %p105, %f182, 0f00000000; mul.f32 %f365, %f173, %f173; fma.rn.f32 %f366, %f174, %f174, %f365; fma.rn.f32 %f183, %f172, 
%f172, %f366; setp.eq.f32 %p106, %f183, 0f00000000; and.pred %p107, %p106, %p105; @%p107 bra $L__BB0_100; bra.uni $L__BB0_95; $L__BB0_100: sub.f32 %f486, %f153, %f1; sub.f32 %f485, %f154, %f2; sub.f32 %f484, %f155, %f3; bra.uni $L__BB0_103; $L__BB0_95: setp.neu.f32 %p109, %f165, 0f00000000; or.pred %p110, %p102, %p109; @%p110 bra $L__BB0_97; bra.uni $L__BB0_96; $L__BB0_97: setp.neu.f32 %p111, %f183, 0f00000000; or.pred %p113, %p105, %p111; @%p113 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: sub.f32 %f388, %f153, %f174; sub.f32 %f389, %f154, %f173; sub.f32 %f390, %f155, %f172; sub.f32 %f391, %f141, %f159; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f142, %f160; mul.f32 %f394, %f393, 0f3DAAAAAB; sub.f32 %f395, %f143, %f161; mul.f32 %f396, %f395, 0f3DAAAAAB; fma.rn.f32 %f486, %f388, 0f3F2AAAAB, %f392; fma.rn.f32 %f485, %f389, 0f3F2AAAAB, %f394; fma.rn.f32 %f484, %f390, 0f3F2AAAAB, %f396; bra.uni $L__BB0_103; $L__BB0_96: mul.f32 %f367, %f141, 0f3F000000; add.f32 %f368, %f174, %f174; sub.f32 %f369, %f367, %f368; add.f32 %f370, %f173, %f173; mul.f32 %f371, %f142, 0f3F000000; sub.f32 %f372, %f371, %f370; add.f32 %f373, %f172, %f172; mul.f32 %f374, %f143, 0f3F000000; sub.f32 %f375, %f374, %f373; fma.rn.f32 %f486, %f1, 0f3FC00000, %f369; fma.rn.f32 %f485, %f2, 0f3FC00000, %f372; fma.rn.f32 %f484, %f3, 0f3FC00000, %f375; bra.uni $L__BB0_103; $L__BB0_98: mul.f32 %f376, %f159, 0f3F000000; add.f32 %f377, %f153, %f153; sub.f32 %f378, %f377, %f376; add.f32 %f379, %f154, %f154; mul.f32 %f380, %f160, 0f3F000000; sub.f32 %f381, %f379, %f380; add.f32 %f382, %f155, %f155; mul.f32 %f383, %f161, 0f3F000000; sub.f32 %f384, %f382, %f383; mul.f32 %f385, %f1, 0f3FC00000; sub.f32 %f486, %f378, %f385; mul.f32 %f386, %f2, 0f3FC00000; sub.f32 %f485, %f381, %f386; mul.f32 %f387, %f3, 0f3FC00000; sub.f32 %f484, %f384, %f387; $L__BB0_103: setp.eq.s64 %p114, %rd8, 0; @%p114 bra $L__BB0_105; cvta.to.global.u64 %rd65, %rd8; shl.b64 %rd66, %rd4, 2; add.s64 %rd67, %rd65, %rd66; 
ld.global.nc.f32 %f400, [%rd67]; mul.f32 %f487, %f400, %f487; $L__BB0_105: setp.eq.s64 %p115, %rd9, 0; @%p115 bra $L__BB0_107; cvta.to.global.u64 %rd68, %rd9; shl.b64 %rd69, %rd4, 2; add.s64 %rd70, %rd68, %rd69; ld.global.nc.f32 %f401, [%rd70]; mul.f32 %f488, %f401, %f488; $L__BB0_107: mul.f32 %f402, %f450, %f211; mul.f32 %f403, %f467, %f212; mul.f32 %f404, %f484, %f213; add.f32 %f405, %f487, %f487; mul.f32 %f406, %f1, %f405; add.f32 %f407, %f403, %f404; mul.f32 %f408, %f1, %f407; mul.f32 %f409, %f468, %f212; fma.rn.f32 %f410, %f2, %f409, %f408; mul.f32 %f411, %f486, %f213; fma.rn.f32 %f412, %f3, %f411, %f410; mul.f32 %f413, %f412, %f488; fma.rn.f32 %f414, %f402, %f406, %f413; cvta.to.global.u64 %rd71, %rd5; shl.b64 %rd72, %rd4, 2; add.s64 %rd73, %rd71, %rd72; st.global.f32 [%rd73], %f414; mul.f32 %f415, %f2, %f405; add.f32 %f416, %f402, %f404; mul.f32 %f417, %f2, %f416; mul.f32 %f418, %f449, %f211; fma.rn.f32 %f419, %f1, %f418, %f417; mul.f32 %f420, %f485, %f213; fma.rn.f32 %f421, %f3, %f420, %f419; mul.f32 %f422, %f421, %f488; fma.rn.f32 %f423, %f403, %f415, %f422; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd75, %rd74, %rd72; st.global.f32 [%rd75], %f423; mul.f32 %f424, %f3, %f405; mul.f32 %f425, %f466, %f212; mul.f32 %f426, %f2, %f425; mul.f32 %f427, %f448, %f211; fma.rn.f32 %f428, %f1, %f427, %f426; add.f32 %f429, %f402, %f403; fma.rn.f32 %f430, %f3, %f429, %f428; mul.f32 %f431, %f430, %f488; fma.rn.f32 %f432, %f404, %f424, %f431; cvta.to.global.u64 %rd76, %rd7; add.s64 %rd77, %rd76, %rd72; st.global.f32 [%rd77], %f432; $L__BB0_108: ret; } ` getmagnetoelasticforce_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 
getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<116>; .reg .b16 %rs<5>; .reg .f32 %f<489>; .reg .b32 %r<130>; .reg .b64 %rd<78>; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd12, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_6]; ld.param.f32 %f487, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_8]; ld.param.f32 %f488, [getmagnetoelasticforce_param_9]; ld.param.f32 %f211, [getmagnetoelasticforce_param_10]; ld.param.f32 %f212, [getmagnetoelasticforce_param_11]; ld.param.f32 %f213, [getmagnetoelasticforce_param_12]; ld.param.u32 %r57, [getmagnetoelasticforce_param_13]; ld.param.u32 %r58, [getmagnetoelasticforce_param_14]; ld.param.u32 %r59, [getmagnetoelasticforce_param_15]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, 
%r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_108; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; cvt.s64.s32 %rd4, %r70; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; add.s64 %rd15, %rd2, %rd13; ld.global.nc.f32 %f2, [%rd15]; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f3, [%rd16]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p6, %rs1, 0; add.s32 %r6, %r1, -2; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r118, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r118, %r72, %r57; $L__BB0_4: setp.lt.s32 %p8, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p9, %p8, %p6; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p9 bra $L__BB0_6; add.s32 %r73, %r118, %r5; mul.wide.s32 %rd17, %r73, 4; add.s64 %rd18, %rd3, %rd17; add.s64 %rd19, %rd2, %rd17; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f9, [%rd20]; ld.global.nc.f32 %f8, [%rd19]; ld.global.nc.f32 %f7, [%rd18]; $L__BB0_6: add.s32 %r10, %r1, -1; @%p6 bra $L__BB0_8; bra.uni $L__BB0_7; $L__BB0_8: max.s32 %r119, %r10, 0; bra.uni $L__BB0_9; $L__BB0_7: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r119, %r75, %r57; $L__BB0_9: setp.lt.s32 %p11, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p13, %p11, %p6; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p13 bra $L__BB0_11; add.s32 %r76, %r119, %r5; mul.wide.s32 %rd21, %r76, 4; add.s64 %rd22, %rd3, %rd21; add.s64 %rd23, %rd2, %rd21; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f38, [%rd24]; ld.global.nc.f32 %f39, [%rd23]; ld.global.nc.f32 %f40, [%rd22]; $L__BB0_11: add.s32 %r14, %r1, 1; @%p6 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r79, %r57, -1; min.s32 %r120, %r14, %r79; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r120, %r78, %r57; $L__BB0_14: setp.ge.s32 %p15, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p17, %p15, %p6; mov.f32 %f20, %f19; mov.f32 %f21, %f19; 
@%p17 bra $L__BB0_16; add.s32 %r80, %r120, %r5; mul.wide.s32 %rd25, %r80, 4; add.s64 %rd26, %rd3, %rd25; add.s64 %rd27, %rd2, %rd25; add.s64 %rd28, %rd1, %rd25; ld.global.nc.f32 %f21, [%rd28]; ld.global.nc.f32 %f20, [%rd27]; ld.global.nc.f32 %f19, [%rd26]; $L__BB0_16: add.s32 %r18, %r1, 2; @%p6 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r83, %r57, -1; min.s32 %r121, %r18, %r83; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r121, %r82, %r57; $L__BB0_19: add.s32 %r22, %r121, %r5; setp.ge.s32 %p19, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p21, %p19, %p6; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p21 bra $L__BB0_21; mul.wide.s32 %rd29, %r22, 4; add.s64 %rd30, %rd3, %rd29; add.s64 %rd31, %rd2, %rd29; add.s64 %rd32, %rd1, %rd29; ld.global.nc.f32 %f27, [%rd32]; ld.global.nc.f32 %f26, [%rd31]; ld.global.nc.f32 %f25, [%rd30]; $L__BB0_21: mul.f32 %f226, %f20, %f20; fma.rn.f32 %f227, %f19, %f19, %f226; fma.rn.f32 %f31, %f21, %f21, %f227; setp.eq.f32 %p22, %f31, 0f00000000; @%p22 bra $L__BB0_22; bra.uni $L__BB0_23; $L__BB0_22: mul.f32 %f231, %f39, %f39; fma.rn.f32 %f232, %f40, %f40, %f231; fma.rn.f32 %f233, %f38, %f38, %f232; setp.eq.f32 %p23, %f233, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f449, %f448; mov.f32 %f450, %f448; @%p23 bra $L__BB0_35; $L__BB0_23: mul.f32 %f234, %f8, %f8; fma.rn.f32 %f235, %f7, %f7, %f234; fma.rn.f32 %f44, %f9, %f9, %f235; setp.neu.f32 %p24, %f44, 0f00000000; mul.f32 %f236, %f26, %f26; fma.rn.f32 %f237, %f25, %f25, %f236; fma.rn.f32 %f48, %f27, %f27, %f237; setp.neu.f32 %p25, %f48, 0f00000000; and.pred %p26, %p24, %p25; or.pred %p28, %p22, %p26; @%p28 bra $L__BB0_25; mul.f32 %f238, %f39, %f39; fma.rn.f32 %f239, %f40, %f40, %f238; fma.rn.f32 %f240, %f38, %f38, %f239; setp.neu.f32 %p29, %f240, 0f00000000; @%p29 bra $L__BB0_34; bra.uni $L__BB0_25; $L__BB0_34: sub.f32 %f273, %f19, %f40; mul.f32 %f450, %f273, 0f3F000000; sub.f32 %f274, %f20, %f39; mul.f32 %f449, %f274, 0f3F000000; 
sub.f32 %f275, %f21, %f38; mul.f32 %f448, %f275, 0f3F000000; bra.uni $L__BB0_35; $L__BB0_25: setp.eq.f32 %p30, %f44, 0f00000000; and.pred %p32, %p30, %p22; @%p32 bra $L__BB0_33; bra.uni $L__BB0_26; $L__BB0_33: sub.f32 %f450, %f1, %f40; sub.f32 %f449, %f2, %f39; sub.f32 %f448, %f3, %f38; bra.uni $L__BB0_35; $L__BB0_26: setp.eq.f32 %p33, %f48, 0f00000000; mul.f32 %f241, %f39, %f39; fma.rn.f32 %f242, %f40, %f40, %f241; fma.rn.f32 %f49, %f38, %f38, %f242; setp.eq.f32 %p34, %f49, 0f00000000; and.pred %p35, %p34, %p33; @%p35 bra $L__BB0_32; bra.uni $L__BB0_27; $L__BB0_32: sub.f32 %f450, %f19, %f1; sub.f32 %f449, %f20, %f2; sub.f32 %f448, %f21, %f3; bra.uni $L__BB0_35; $L__BB0_27: setp.neu.f32 %p37, %f31, 0f00000000; or.pred %p38, %p30, %p37; @%p38 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: setp.neu.f32 %p39, %f49, 0f00000000; or.pred %p41, %p33, %p39; @%p41 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f264, %f19, %f40; sub.f32 %f265, %f20, %f39; sub.f32 %f266, %f21, %f38; sub.f32 %f267, %f7, %f25; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f8, %f26; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f9, %f27; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f450, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f449, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f448, %f266, 0f3F2AAAAB, %f272; bra.uni $L__BB0_35; $L__BB0_28: mul.f32 %f243, %f7, 0f3F000000; add.f32 %f244, %f40, %f40; sub.f32 %f245, %f243, %f244; add.f32 %f246, %f39, %f39; mul.f32 %f247, %f8, 0f3F000000; sub.f32 %f248, %f247, %f246; add.f32 %f249, %f38, %f38; mul.f32 %f250, %f9, 0f3F000000; sub.f32 %f251, %f250, %f249; fma.rn.f32 %f450, %f1, 0f3FC00000, %f245; fma.rn.f32 %f449, %f2, 0f3FC00000, %f248; fma.rn.f32 %f448, %f3, 0f3FC00000, %f251; bra.uni $L__BB0_35; $L__BB0_30: mul.f32 %f252, %f25, 0f3F000000; add.f32 %f253, %f19, %f19; sub.f32 %f254, %f253, %f252; add.f32 %f255, %f20, %f20; mul.f32 %f256, %f26, 0f3F000000; sub.f32 %f257, %f255, %f256; add.f32 %f258, %f21, %f21; mul.f32 %f259, %f27, 
0f3F000000; sub.f32 %f260, %f258, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f450, %f254, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f449, %f257, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f448, %f260, %f263; $L__BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p42, %rs2, 0; add.s32 %r23, %r2, -2; @%p42 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: max.s32 %r122, %r23, 0; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r122, %r85, %r58; $L__BB0_38: setp.lt.s32 %p44, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p45, %p44, %p42; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p45 bra $L__BB0_40; add.s32 %r86, %r122, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd33, %r87, 4; add.s64 %rd34, %rd3, %rd33; add.s64 %rd35, %rd2, %rd33; add.s64 %rd36, %rd1, %rd33; ld.global.nc.f32 %f76, [%rd36]; ld.global.nc.f32 %f75, [%rd35]; ld.global.nc.f32 %f74, [%rd34]; $L__BB0_40: add.s32 %r27, %r2, -1; @%p42 bra $L__BB0_42; bra.uni $L__BB0_41; $L__BB0_42: max.s32 %r123, %r27, 0; bra.uni $L__BB0_43; $L__BB0_41: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r123, %r89, %r58; $L__BB0_43: setp.lt.s32 %p47, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p49, %p47, %p42; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p49 bra $L__BB0_45; add.s32 %r90, %r123, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd37, %r91, 4; add.s64 %rd38, %rd3, %rd37; add.s64 %rd39, %rd2, %rd37; add.s64 %rd40, %rd1, %rd37; ld.global.nc.f32 %f105, [%rd40]; ld.global.nc.f32 %f106, [%rd39]; ld.global.nc.f32 %f107, [%rd38]; $L__BB0_45: add.s32 %r31, %r2, 1; @%p42 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: add.s32 %r94, %r58, -1; min.s32 %r124, %r31, %r94; bra.uni $L__BB0_48; $L__BB0_46: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r124, %r93, %r58; $L__BB0_48: setp.ge.s32 %p51, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p53, %p51, %p42; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p53 bra $L__BB0_50; add.s32 %r95, 
%r124, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd41, %r96, 4; add.s64 %rd42, %rd3, %rd41; add.s64 %rd43, %rd2, %rd41; add.s64 %rd44, %rd1, %rd41; ld.global.nc.f32 %f88, [%rd44]; ld.global.nc.f32 %f87, [%rd43]; ld.global.nc.f32 %f86, [%rd42]; $L__BB0_50: add.s32 %r35, %r2, 2; @%p42 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r99, %r58, -1; min.s32 %r125, %r35, %r99; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r125, %r98, %r58; $L__BB0_53: add.s32 %r39, %r125, %r4; setp.ge.s32 %p55, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p57, %p55, %p42; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p57 bra $L__BB0_55; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd45, %r100, 4; add.s64 %rd46, %rd3, %rd45; add.s64 %rd47, %rd2, %rd45; add.s64 %rd48, %rd1, %rd45; ld.global.nc.f32 %f94, [%rd48]; ld.global.nc.f32 %f93, [%rd47]; ld.global.nc.f32 %f92, [%rd46]; $L__BB0_55: mul.f32 %f288, %f87, %f87; fma.rn.f32 %f289, %f86, %f86, %f288; fma.rn.f32 %f98, %f88, %f88, %f289; setp.eq.f32 %p58, %f98, 0f00000000; @%p58 bra $L__BB0_56; bra.uni $L__BB0_57; $L__BB0_56: mul.f32 %f293, %f106, %f106; fma.rn.f32 %f294, %f107, %f107, %f293; fma.rn.f32 %f295, %f105, %f105, %f294; setp.eq.f32 %p59, %f295, 0f00000000; mov.f32 %f466, 0f00000000; mov.f32 %f467, %f466; mov.f32 %f468, %f466; @%p59 bra $L__BB0_69; $L__BB0_57: mul.f32 %f296, %f75, %f75; fma.rn.f32 %f297, %f74, %f74, %f296; fma.rn.f32 %f111, %f76, %f76, %f297; setp.neu.f32 %p60, %f111, 0f00000000; mul.f32 %f298, %f93, %f93; fma.rn.f32 %f299, %f92, %f92, %f298; fma.rn.f32 %f115, %f94, %f94, %f299; setp.neu.f32 %p61, %f115, 0f00000000; and.pred %p62, %p60, %p61; or.pred %p64, %p58, %p62; @%p64 bra $L__BB0_59; mul.f32 %f300, %f106, %f106; fma.rn.f32 %f301, %f107, %f107, %f300; fma.rn.f32 %f302, %f105, %f105, %f301; setp.neu.f32 %p65, %f302, 0f00000000; @%p65 bra $L__BB0_68; bra.uni $L__BB0_59; $L__BB0_68: sub.f32 %f335, %f86, %f107; mul.f32 %f468, %f335, 0f3F000000; 
sub.f32 %f336, %f87, %f106; mul.f32 %f467, %f336, 0f3F000000; sub.f32 %f337, %f88, %f105; mul.f32 %f466, %f337, 0f3F000000; bra.uni $L__BB0_69; $L__BB0_59: setp.eq.f32 %p66, %f111, 0f00000000; and.pred %p68, %p66, %p58; @%p68 bra $L__BB0_67; bra.uni $L__BB0_60; $L__BB0_67: sub.f32 %f468, %f1, %f107; sub.f32 %f467, %f2, %f106; sub.f32 %f466, %f3, %f105; bra.uni $L__BB0_69; $L__BB0_60: setp.eq.f32 %p69, %f115, 0f00000000; mul.f32 %f303, %f106, %f106; fma.rn.f32 %f304, %f107, %f107, %f303; fma.rn.f32 %f116, %f105, %f105, %f304; setp.eq.f32 %p70, %f116, 0f00000000; and.pred %p71, %p70, %p69; @%p71 bra $L__BB0_66; bra.uni $L__BB0_61; $L__BB0_66: sub.f32 %f468, %f86, %f1; sub.f32 %f467, %f87, %f2; sub.f32 %f466, %f88, %f3; bra.uni $L__BB0_69; $L__BB0_61: setp.neu.f32 %p73, %f98, 0f00000000; or.pred %p74, %p66, %p73; @%p74 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: setp.neu.f32 %p75, %f116, 0f00000000; or.pred %p77, %p69, %p75; @%p77 bra $L__BB0_65; bra.uni $L__BB0_64; $L__BB0_65: sub.f32 %f326, %f86, %f107; sub.f32 %f327, %f87, %f106; sub.f32 %f328, %f88, %f105; sub.f32 %f329, %f74, %f92; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f75, %f93; mul.f32 %f332, %f331, 0f3DAAAAAB; sub.f32 %f333, %f76, %f94; mul.f32 %f334, %f333, 0f3DAAAAAB; fma.rn.f32 %f468, %f326, 0f3F2AAAAB, %f330; fma.rn.f32 %f467, %f327, 0f3F2AAAAB, %f332; fma.rn.f32 %f466, %f328, 0f3F2AAAAB, %f334; bra.uni $L__BB0_69; $L__BB0_62: mul.f32 %f305, %f74, 0f3F000000; add.f32 %f306, %f107, %f107; sub.f32 %f307, %f305, %f306; add.f32 %f308, %f106, %f106; mul.f32 %f309, %f75, 0f3F000000; sub.f32 %f310, %f309, %f308; add.f32 %f311, %f105, %f105; mul.f32 %f312, %f76, 0f3F000000; sub.f32 %f313, %f312, %f311; fma.rn.f32 %f468, %f1, 0f3FC00000, %f307; fma.rn.f32 %f467, %f2, 0f3FC00000, %f310; fma.rn.f32 %f466, %f3, 0f3FC00000, %f313; bra.uni $L__BB0_69; $L__BB0_64: mul.f32 %f314, %f92, 0f3F000000; add.f32 %f315, %f86, %f86; sub.f32 %f316, %f315, %f314; add.f32 %f317, %f87, %f87; mul.f32 %f318, %f93, 
0f3F000000; sub.f32 %f319, %f317, %f318; add.f32 %f320, %f88, %f88; mul.f32 %f321, %f94, 0f3F000000; sub.f32 %f322, %f320, %f321; mul.f32 %f323, %f1, 0f3FC00000; sub.f32 %f468, %f316, %f323; mul.f32 %f324, %f2, 0f3FC00000; sub.f32 %f467, %f319, %f324; mul.f32 %f325, %f3, 0f3FC00000; sub.f32 %f466, %f322, %f325; $L__BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p78, %rs3, 0; add.s32 %r40, %r3, -2; @%p78 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: max.s32 %r126, %r40, 0; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r126, %r102, %r59; $L__BB0_72: setp.lt.s32 %p80, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p81, %p80, %p78; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p81 bra $L__BB0_74; mad.lo.s32 %r103, %r126, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd49, %r104, 4; add.s64 %rd50, %rd3, %rd49; add.s64 %rd51, %rd2, %rd49; add.s64 %rd52, %rd1, %rd49; ld.global.nc.f32 %f143, [%rd52]; ld.global.nc.f32 %f142, [%rd51]; ld.global.nc.f32 %f141, [%rd50]; $L__BB0_74: add.s32 %r44, %r3, -1; @%p78 bra $L__BB0_76; bra.uni $L__BB0_75; $L__BB0_76: max.s32 %r127, %r44, 0; bra.uni $L__BB0_77; $L__BB0_75: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r127, %r106, %r59; $L__BB0_77: setp.lt.s32 %p83, %r3, 1; mov.f32 %f174, 0f00000000; and.pred %p85, %p83, %p78; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p85 bra $L__BB0_79; mad.lo.s32 %r107, %r127, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd53, %r108, 4; add.s64 %rd54, %rd3, %rd53; add.s64 %rd55, %rd2, %rd53; add.s64 %rd56, %rd1, %rd53; ld.global.nc.f32 %f172, [%rd56]; ld.global.nc.f32 %f173, [%rd55]; ld.global.nc.f32 %f174, [%rd54]; $L__BB0_79: add.s32 %r48, %r3, 1; @%p78 bra $L__BB0_81; bra.uni $L__BB0_80; $L__BB0_81: add.s32 %r111, %r59, -1; min.s32 %r128, %r48, %r111; bra.uni $L__BB0_82; $L__BB0_80: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r128, %r110, %r59; $L__BB0_82: setp.ge.s32 %p87, %r48, 
%r59; mov.f32 %f153, 0f00000000; and.pred %p89, %p87, %p78; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p89 bra $L__BB0_84; mad.lo.s32 %r112, %r128, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd57, %r113, 4; add.s64 %rd58, %rd3, %rd57; add.s64 %rd59, %rd2, %rd57; add.s64 %rd60, %rd1, %rd57; ld.global.nc.f32 %f155, [%rd60]; ld.global.nc.f32 %f154, [%rd59]; ld.global.nc.f32 %f153, [%rd58]; $L__BB0_84: add.s32 %r52, %r3, 2; @%p78 bra $L__BB0_86; bra.uni $L__BB0_85; $L__BB0_86: add.s32 %r116, %r59, -1; min.s32 %r129, %r52, %r116; bra.uni $L__BB0_87; $L__BB0_85: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r129, %r115, %r59; $L__BB0_87: mad.lo.s32 %r117, %r129, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p91, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p93, %p91, %p78; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p93 bra $L__BB0_89; mul.wide.s32 %rd61, %r56, 4; add.s64 %rd62, %rd3, %rd61; add.s64 %rd63, %rd2, %rd61; add.s64 %rd64, %rd1, %rd61; ld.global.nc.f32 %f161, [%rd64]; ld.global.nc.f32 %f160, [%rd63]; ld.global.nc.f32 %f159, [%rd62]; $L__BB0_89: mul.f32 %f350, %f154, %f154; fma.rn.f32 %f351, %f153, %f153, %f350; fma.rn.f32 %f165, %f155, %f155, %f351; setp.eq.f32 %p94, %f165, 0f00000000; @%p94 bra $L__BB0_90; bra.uni $L__BB0_91; $L__BB0_90: mul.f32 %f355, %f173, %f173; fma.rn.f32 %f356, %f174, %f174, %f355; fma.rn.f32 %f357, %f172, %f172, %f356; setp.eq.f32 %p95, %f357, 0f00000000; mov.f32 %f484, 0f00000000; mov.f32 %f485, %f484; mov.f32 %f486, %f484; @%p95 bra $L__BB0_103; $L__BB0_91: mul.f32 %f358, %f142, %f142; fma.rn.f32 %f359, %f141, %f141, %f358; fma.rn.f32 %f178, %f143, %f143, %f359; setp.neu.f32 %p96, %f178, 0f00000000; mul.f32 %f360, %f160, %f160; fma.rn.f32 %f361, %f159, %f159, %f360; fma.rn.f32 %f182, %f161, %f161, %f361; setp.neu.f32 %p97, %f182, 0f00000000; and.pred %p98, %p96, %p97; or.pred %p100, %p94, %p98; @%p100 bra $L__BB0_93; mul.f32 %f362, %f173, %f173; fma.rn.f32 %f363, %f174, %f174, 
%f362; fma.rn.f32 %f364, %f172, %f172, %f363; setp.neu.f32 %p101, %f364, 0f00000000; @%p101 bra $L__BB0_102; bra.uni $L__BB0_93; $L__BB0_102: sub.f32 %f397, %f153, %f174; mul.f32 %f486, %f397, 0f3F000000; sub.f32 %f398, %f154, %f173; mul.f32 %f485, %f398, 0f3F000000; sub.f32 %f399, %f155, %f172; mul.f32 %f484, %f399, 0f3F000000; bra.uni $L__BB0_103; $L__BB0_93: setp.eq.f32 %p102, %f178, 0f00000000; and.pred %p104, %p102, %p94; @%p104 bra $L__BB0_101; bra.uni $L__BB0_94; $L__BB0_101: sub.f32 %f486, %f1, %f174; sub.f32 %f485, %f2, %f173; sub.f32 %f484, %f3, %f172; bra.uni $L__BB0_103; $L__BB0_94: setp.eq.f32 %p105, %f182, 0f00000000; mul.f32 %f365, %f173, %f173; fma.rn.f32 %f366, %f174, %f174, %f365; fma.rn.f32 %f183, %f172, %f172, %f366; setp.eq.f32 %p106, %f183, 0f00000000; and.pred %p107, %p106, %p105; @%p107 bra $L__BB0_100; bra.uni $L__BB0_95; $L__BB0_100: sub.f32 %f486, %f153, %f1; sub.f32 %f485, %f154, %f2; sub.f32 %f484, %f155, %f3; bra.uni $L__BB0_103; $L__BB0_95: setp.neu.f32 %p109, %f165, 0f00000000; or.pred %p110, %p102, %p109; @%p110 bra $L__BB0_97; bra.uni $L__BB0_96; $L__BB0_97: setp.neu.f32 %p111, %f183, 0f00000000; or.pred %p113, %p105, %p111; @%p113 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: sub.f32 %f388, %f153, %f174; sub.f32 %f389, %f154, %f173; sub.f32 %f390, %f155, %f172; sub.f32 %f391, %f141, %f159; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f142, %f160; mul.f32 %f394, %f393, 0f3DAAAAAB; sub.f32 %f395, %f143, %f161; mul.f32 %f396, %f395, 0f3DAAAAAB; fma.rn.f32 %f486, %f388, 0f3F2AAAAB, %f392; fma.rn.f32 %f485, %f389, 0f3F2AAAAB, %f394; fma.rn.f32 %f484, %f390, 0f3F2AAAAB, %f396; bra.uni $L__BB0_103; $L__BB0_96: mul.f32 %f367, %f141, 0f3F000000; add.f32 %f368, %f174, %f174; sub.f32 %f369, %f367, %f368; add.f32 %f370, %f173, %f173; mul.f32 %f371, %f142, 0f3F000000; sub.f32 %f372, %f371, %f370; add.f32 %f373, %f172, %f172; mul.f32 %f374, %f143, 0f3F000000; sub.f32 %f375, %f374, %f373; fma.rn.f32 %f486, %f1, 0f3FC00000, %f369; 
fma.rn.f32 %f485, %f2, 0f3FC00000, %f372; fma.rn.f32 %f484, %f3, 0f3FC00000, %f375; bra.uni $L__BB0_103; $L__BB0_98: mul.f32 %f376, %f159, 0f3F000000; add.f32 %f377, %f153, %f153; sub.f32 %f378, %f377, %f376; add.f32 %f379, %f154, %f154; mul.f32 %f380, %f160, 0f3F000000; sub.f32 %f381, %f379, %f380; add.f32 %f382, %f155, %f155; mul.f32 %f383, %f161, 0f3F000000; sub.f32 %f384, %f382, %f383; mul.f32 %f385, %f1, 0f3FC00000; sub.f32 %f486, %f378, %f385; mul.f32 %f386, %f2, 0f3FC00000; sub.f32 %f485, %f381, %f386; mul.f32 %f387, %f3, 0f3FC00000; sub.f32 %f484, %f384, %f387; $L__BB0_103: setp.eq.s64 %p114, %rd8, 0; @%p114 bra $L__BB0_105; cvta.to.global.u64 %rd65, %rd8; shl.b64 %rd66, %rd4, 2; add.s64 %rd67, %rd65, %rd66; ld.global.nc.f32 %f400, [%rd67]; mul.f32 %f487, %f400, %f487; $L__BB0_105: setp.eq.s64 %p115, %rd9, 0; @%p115 bra $L__BB0_107; cvta.to.global.u64 %rd68, %rd9; shl.b64 %rd69, %rd4, 2; add.s64 %rd70, %rd68, %rd69; ld.global.nc.f32 %f401, [%rd70]; mul.f32 %f488, %f401, %f488; $L__BB0_107: mul.f32 %f402, %f450, %f211; mul.f32 %f403, %f467, %f212; mul.f32 %f404, %f484, %f213; add.f32 %f405, %f487, %f487; mul.f32 %f406, %f1, %f405; add.f32 %f407, %f403, %f404; mul.f32 %f408, %f1, %f407; mul.f32 %f409, %f468, %f212; fma.rn.f32 %f410, %f2, %f409, %f408; mul.f32 %f411, %f486, %f213; fma.rn.f32 %f412, %f3, %f411, %f410; mul.f32 %f413, %f412, %f488; fma.rn.f32 %f414, %f402, %f406, %f413; cvta.to.global.u64 %rd71, %rd5; shl.b64 %rd72, %rd4, 2; add.s64 %rd73, %rd71, %rd72; st.global.f32 [%rd73], %f414; mul.f32 %f415, %f2, %f405; add.f32 %f416, %f402, %f404; mul.f32 %f417, %f2, %f416; mul.f32 %f418, %f449, %f211; fma.rn.f32 %f419, %f1, %f418, %f417; mul.f32 %f420, %f485, %f213; fma.rn.f32 %f421, %f3, %f420, %f419; mul.f32 %f422, %f421, %f488; fma.rn.f32 %f423, %f403, %f415, %f422; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd75, %rd74, %rd72; st.global.f32 [%rd75], %f423; mul.f32 %f424, %f3, %f405; mul.f32 %f425, %f466, %f212; mul.f32 %f426, %f2, %f425; mul.f32 %f427, 
%f448, %f211; fma.rn.f32 %f428, %f1, %f427, %f426; add.f32 %f429, %f402, %f403; fma.rn.f32 %f430, %f3, %f429, %f428; mul.f32 %f431, %f430, %f488; fma.rn.f32 %f432, %f404, %f424, %f431; cvta.to.global.u64 %rd76, %rd7; add.s64 %rd77, %rd76, %rd72; st.global.f32 [%rd77], %f432; $L__BB0_108: ret; } ` getmagnetoelasticforce_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<116>; .reg .b16 %rs<5>; .reg .f32 %f<489>; .reg .b32 %r<130>; .reg .b64 %rd<78>; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd12, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_6]; ld.param.f32 %f487, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_8]; ld.param.f32 %f488, [getmagnetoelasticforce_param_9]; ld.param.f32 %f211, [getmagnetoelasticforce_param_10]; ld.param.f32 %f212, [getmagnetoelasticforce_param_11]; 
ld.param.f32 %f213, [getmagnetoelasticforce_param_12]; ld.param.u32 %r57, [getmagnetoelasticforce_param_13]; ld.param.u32 %r58, [getmagnetoelasticforce_param_14]; ld.param.u32 %r59, [getmagnetoelasticforce_param_15]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd10; mov.u32 %r60, %ntid.x; mov.u32 %r61, %ctaid.x; mov.u32 %r62, %tid.x; mad.lo.s32 %r1, %r61, %r60, %r62; mov.u32 %r63, %ntid.y; mov.u32 %r64, %ctaid.y; mov.u32 %r65, %tid.y; mad.lo.s32 %r2, %r64, %r63, %r65; mov.u32 %r66, %ntid.z; mov.u32 %r67, %ctaid.z; mov.u32 %r68, %tid.z; mad.lo.s32 %r3, %r67, %r66, %r68; setp.ge.s32 %p1, %r1, %r57; setp.ge.s32 %p2, %r2, %r58; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r59; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_108; mul.lo.s32 %r4, %r3, %r58; add.s32 %r69, %r4, %r2; mul.lo.s32 %r5, %r69, %r57; add.s32 %r70, %r5, %r1; cvt.s64.s32 %rd4, %r70; mul.wide.s32 %rd13, %r70, 4; add.s64 %rd14, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; add.s64 %rd15, %rd2, %rd13; ld.global.nc.f32 %f2, [%rd15]; add.s64 %rd16, %rd1, %rd13; ld.global.nc.f32 %f3, [%rd16]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p6, %rs1, 0; add.s32 %r6, %r1, -2; @%p6 bra $L__BB0_3; bra.uni $L__BB0_2; $L__BB0_3: max.s32 %r118, %r6, 0; bra.uni $L__BB0_4; $L__BB0_2: rem.s32 %r71, %r6, %r57; add.s32 %r72, %r71, %r57; rem.s32 %r118, %r72, %r57; $L__BB0_4: setp.lt.s32 %p8, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p9, %p8, %p6; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p9 bra $L__BB0_6; add.s32 %r73, %r118, %r5; mul.wide.s32 %rd17, %r73, 4; add.s64 %rd18, %rd3, %rd17; add.s64 %rd19, %rd2, %rd17; add.s64 %rd20, %rd1, %rd17; ld.global.nc.f32 %f9, [%rd20]; ld.global.nc.f32 %f8, [%rd19]; ld.global.nc.f32 %f7, [%rd18]; $L__BB0_6: add.s32 %r10, %r1, -1; @%p6 bra $L__BB0_8; bra.uni $L__BB0_7; $L__BB0_8: max.s32 %r119, %r10, 0; bra.uni $L__BB0_9; $L__BB0_7: rem.s32 %r74, %r10, %r57; add.s32 %r75, %r74, %r57; rem.s32 %r119, %r75, %r57; $L__BB0_9: setp.lt.s32 %p11, %r1, 1; mov.f32 %f40, 
0f00000000; and.pred %p13, %p11, %p6; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p13 bra $L__BB0_11; add.s32 %r76, %r119, %r5; mul.wide.s32 %rd21, %r76, 4; add.s64 %rd22, %rd3, %rd21; add.s64 %rd23, %rd2, %rd21; add.s64 %rd24, %rd1, %rd21; ld.global.nc.f32 %f38, [%rd24]; ld.global.nc.f32 %f39, [%rd23]; ld.global.nc.f32 %f40, [%rd22]; $L__BB0_11: add.s32 %r14, %r1, 1; @%p6 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: add.s32 %r79, %r57, -1; min.s32 %r120, %r14, %r79; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r77, %r14, %r57; add.s32 %r78, %r77, %r57; rem.s32 %r120, %r78, %r57; $L__BB0_14: setp.ge.s32 %p15, %r14, %r57; mov.f32 %f19, 0f00000000; and.pred %p17, %p15, %p6; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p17 bra $L__BB0_16; add.s32 %r80, %r120, %r5; mul.wide.s32 %rd25, %r80, 4; add.s64 %rd26, %rd3, %rd25; add.s64 %rd27, %rd2, %rd25; add.s64 %rd28, %rd1, %rd25; ld.global.nc.f32 %f21, [%rd28]; ld.global.nc.f32 %f20, [%rd27]; ld.global.nc.f32 %f19, [%rd26]; $L__BB0_16: add.s32 %r18, %r1, 2; @%p6 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r83, %r57, -1; min.s32 %r121, %r18, %r83; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r81, %r18, %r57; add.s32 %r82, %r81, %r57; rem.s32 %r121, %r82, %r57; $L__BB0_19: add.s32 %r22, %r121, %r5; setp.ge.s32 %p19, %r18, %r57; mov.f32 %f25, 0f00000000; and.pred %p21, %p19, %p6; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p21 bra $L__BB0_21; mul.wide.s32 %rd29, %r22, 4; add.s64 %rd30, %rd3, %rd29; add.s64 %rd31, %rd2, %rd29; add.s64 %rd32, %rd1, %rd29; ld.global.nc.f32 %f27, [%rd32]; ld.global.nc.f32 %f26, [%rd31]; ld.global.nc.f32 %f25, [%rd30]; $L__BB0_21: mul.f32 %f226, %f20, %f20; fma.rn.f32 %f227, %f19, %f19, %f226; fma.rn.f32 %f31, %f21, %f21, %f227; setp.eq.f32 %p22, %f31, 0f00000000; @%p22 bra $L__BB0_22; bra.uni $L__BB0_23; $L__BB0_22: mul.f32 %f231, %f39, %f39; fma.rn.f32 %f232, %f40, %f40, %f231; fma.rn.f32 %f233, %f38, %f38, %f232; setp.eq.f32 %p23, %f233, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f449, 
%f448; mov.f32 %f450, %f448; @%p23 bra $L__BB0_35; $L__BB0_23: mul.f32 %f234, %f8, %f8; fma.rn.f32 %f235, %f7, %f7, %f234; fma.rn.f32 %f44, %f9, %f9, %f235; setp.neu.f32 %p24, %f44, 0f00000000; mul.f32 %f236, %f26, %f26; fma.rn.f32 %f237, %f25, %f25, %f236; fma.rn.f32 %f48, %f27, %f27, %f237; setp.neu.f32 %p25, %f48, 0f00000000; and.pred %p26, %p24, %p25; or.pred %p28, %p22, %p26; @%p28 bra $L__BB0_25; mul.f32 %f238, %f39, %f39; fma.rn.f32 %f239, %f40, %f40, %f238; fma.rn.f32 %f240, %f38, %f38, %f239; setp.neu.f32 %p29, %f240, 0f00000000; @%p29 bra $L__BB0_34; bra.uni $L__BB0_25; $L__BB0_34: sub.f32 %f273, %f19, %f40; mul.f32 %f450, %f273, 0f3F000000; sub.f32 %f274, %f20, %f39; mul.f32 %f449, %f274, 0f3F000000; sub.f32 %f275, %f21, %f38; mul.f32 %f448, %f275, 0f3F000000; bra.uni $L__BB0_35; $L__BB0_25: setp.eq.f32 %p30, %f44, 0f00000000; and.pred %p32, %p30, %p22; @%p32 bra $L__BB0_33; bra.uni $L__BB0_26; $L__BB0_33: sub.f32 %f450, %f1, %f40; sub.f32 %f449, %f2, %f39; sub.f32 %f448, %f3, %f38; bra.uni $L__BB0_35; $L__BB0_26: setp.eq.f32 %p33, %f48, 0f00000000; mul.f32 %f241, %f39, %f39; fma.rn.f32 %f242, %f40, %f40, %f241; fma.rn.f32 %f49, %f38, %f38, %f242; setp.eq.f32 %p34, %f49, 0f00000000; and.pred %p35, %p34, %p33; @%p35 bra $L__BB0_32; bra.uni $L__BB0_27; $L__BB0_32: sub.f32 %f450, %f19, %f1; sub.f32 %f449, %f20, %f2; sub.f32 %f448, %f21, %f3; bra.uni $L__BB0_35; $L__BB0_27: setp.neu.f32 %p37, %f31, 0f00000000; or.pred %p38, %p30, %p37; @%p38 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: setp.neu.f32 %p39, %f49, 0f00000000; or.pred %p41, %p33, %p39; @%p41 bra $L__BB0_31; bra.uni $L__BB0_30; $L__BB0_31: sub.f32 %f264, %f19, %f40; sub.f32 %f265, %f20, %f39; sub.f32 %f266, %f21, %f38; sub.f32 %f267, %f7, %f25; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f8, %f26; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f9, %f27; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f450, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f449, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 
%f448, %f266, 0f3F2AAAAB, %f272; bra.uni $L__BB0_35; $L__BB0_28: mul.f32 %f243, %f7, 0f3F000000; add.f32 %f244, %f40, %f40; sub.f32 %f245, %f243, %f244; add.f32 %f246, %f39, %f39; mul.f32 %f247, %f8, 0f3F000000; sub.f32 %f248, %f247, %f246; add.f32 %f249, %f38, %f38; mul.f32 %f250, %f9, 0f3F000000; sub.f32 %f251, %f250, %f249; fma.rn.f32 %f450, %f1, 0f3FC00000, %f245; fma.rn.f32 %f449, %f2, 0f3FC00000, %f248; fma.rn.f32 %f448, %f3, 0f3FC00000, %f251; bra.uni $L__BB0_35; $L__BB0_30: mul.f32 %f252, %f25, 0f3F000000; add.f32 %f253, %f19, %f19; sub.f32 %f254, %f253, %f252; add.f32 %f255, %f20, %f20; mul.f32 %f256, %f26, 0f3F000000; sub.f32 %f257, %f255, %f256; add.f32 %f258, %f21, %f21; mul.f32 %f259, %f27, 0f3F000000; sub.f32 %f260, %f258, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f450, %f254, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f449, %f257, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f448, %f260, %f263; $L__BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p42, %rs2, 0; add.s32 %r23, %r2, -2; @%p42 bra $L__BB0_37; bra.uni $L__BB0_36; $L__BB0_37: max.s32 %r122, %r23, 0; bra.uni $L__BB0_38; $L__BB0_36: rem.s32 %r84, %r23, %r58; add.s32 %r85, %r84, %r58; rem.s32 %r122, %r85, %r58; $L__BB0_38: setp.lt.s32 %p44, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p45, %p44, %p42; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p45 bra $L__BB0_40; add.s32 %r86, %r122, %r4; mad.lo.s32 %r87, %r86, %r57, %r1; mul.wide.s32 %rd33, %r87, 4; add.s64 %rd34, %rd3, %rd33; add.s64 %rd35, %rd2, %rd33; add.s64 %rd36, %rd1, %rd33; ld.global.nc.f32 %f76, [%rd36]; ld.global.nc.f32 %f75, [%rd35]; ld.global.nc.f32 %f74, [%rd34]; $L__BB0_40: add.s32 %r27, %r2, -1; @%p42 bra $L__BB0_42; bra.uni $L__BB0_41; $L__BB0_42: max.s32 %r123, %r27, 0; bra.uni $L__BB0_43; $L__BB0_41: rem.s32 %r88, %r27, %r58; add.s32 %r89, %r88, %r58; rem.s32 %r123, %r89, %r58; $L__BB0_43: setp.lt.s32 %p47, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p49, %p47, %p42; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p49 
bra $L__BB0_45; add.s32 %r90, %r123, %r4; mad.lo.s32 %r91, %r90, %r57, %r1; mul.wide.s32 %rd37, %r91, 4; add.s64 %rd38, %rd3, %rd37; add.s64 %rd39, %rd2, %rd37; add.s64 %rd40, %rd1, %rd37; ld.global.nc.f32 %f105, [%rd40]; ld.global.nc.f32 %f106, [%rd39]; ld.global.nc.f32 %f107, [%rd38]; $L__BB0_45: add.s32 %r31, %r2, 1; @%p42 bra $L__BB0_47; bra.uni $L__BB0_46; $L__BB0_47: add.s32 %r94, %r58, -1; min.s32 %r124, %r31, %r94; bra.uni $L__BB0_48; $L__BB0_46: rem.s32 %r92, %r31, %r58; add.s32 %r93, %r92, %r58; rem.s32 %r124, %r93, %r58; $L__BB0_48: setp.ge.s32 %p51, %r31, %r58; mov.f32 %f86, 0f00000000; and.pred %p53, %p51, %p42; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p53 bra $L__BB0_50; add.s32 %r95, %r124, %r4; mad.lo.s32 %r96, %r95, %r57, %r1; mul.wide.s32 %rd41, %r96, 4; add.s64 %rd42, %rd3, %rd41; add.s64 %rd43, %rd2, %rd41; add.s64 %rd44, %rd1, %rd41; ld.global.nc.f32 %f88, [%rd44]; ld.global.nc.f32 %f87, [%rd43]; ld.global.nc.f32 %f86, [%rd42]; $L__BB0_50: add.s32 %r35, %r2, 2; @%p42 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r99, %r58, -1; min.s32 %r125, %r35, %r99; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r97, %r35, %r58; add.s32 %r98, %r97, %r58; rem.s32 %r125, %r98, %r58; $L__BB0_53: add.s32 %r39, %r125, %r4; setp.ge.s32 %p55, %r35, %r58; mov.f32 %f92, 0f00000000; and.pred %p57, %p55, %p42; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p57 bra $L__BB0_55; mad.lo.s32 %r100, %r39, %r57, %r1; mul.wide.s32 %rd45, %r100, 4; add.s64 %rd46, %rd3, %rd45; add.s64 %rd47, %rd2, %rd45; add.s64 %rd48, %rd1, %rd45; ld.global.nc.f32 %f94, [%rd48]; ld.global.nc.f32 %f93, [%rd47]; ld.global.nc.f32 %f92, [%rd46]; $L__BB0_55: mul.f32 %f288, %f87, %f87; fma.rn.f32 %f289, %f86, %f86, %f288; fma.rn.f32 %f98, %f88, %f88, %f289; setp.eq.f32 %p58, %f98, 0f00000000; @%p58 bra $L__BB0_56; bra.uni $L__BB0_57; $L__BB0_56: mul.f32 %f293, %f106, %f106; fma.rn.f32 %f294, %f107, %f107, %f293; fma.rn.f32 %f295, %f105, %f105, %f294; setp.eq.f32 %p59, %f295, 0f00000000; mov.f32 
%f466, 0f00000000; mov.f32 %f467, %f466; mov.f32 %f468, %f466; @%p59 bra $L__BB0_69; $L__BB0_57: mul.f32 %f296, %f75, %f75; fma.rn.f32 %f297, %f74, %f74, %f296; fma.rn.f32 %f111, %f76, %f76, %f297; setp.neu.f32 %p60, %f111, 0f00000000; mul.f32 %f298, %f93, %f93; fma.rn.f32 %f299, %f92, %f92, %f298; fma.rn.f32 %f115, %f94, %f94, %f299; setp.neu.f32 %p61, %f115, 0f00000000; and.pred %p62, %p60, %p61; or.pred %p64, %p58, %p62; @%p64 bra $L__BB0_59; mul.f32 %f300, %f106, %f106; fma.rn.f32 %f301, %f107, %f107, %f300; fma.rn.f32 %f302, %f105, %f105, %f301; setp.neu.f32 %p65, %f302, 0f00000000; @%p65 bra $L__BB0_68; bra.uni $L__BB0_59; $L__BB0_68: sub.f32 %f335, %f86, %f107; mul.f32 %f468, %f335, 0f3F000000; sub.f32 %f336, %f87, %f106; mul.f32 %f467, %f336, 0f3F000000; sub.f32 %f337, %f88, %f105; mul.f32 %f466, %f337, 0f3F000000; bra.uni $L__BB0_69; $L__BB0_59: setp.eq.f32 %p66, %f111, 0f00000000; and.pred %p68, %p66, %p58; @%p68 bra $L__BB0_67; bra.uni $L__BB0_60; $L__BB0_67: sub.f32 %f468, %f1, %f107; sub.f32 %f467, %f2, %f106; sub.f32 %f466, %f3, %f105; bra.uni $L__BB0_69; $L__BB0_60: setp.eq.f32 %p69, %f115, 0f00000000; mul.f32 %f303, %f106, %f106; fma.rn.f32 %f304, %f107, %f107, %f303; fma.rn.f32 %f116, %f105, %f105, %f304; setp.eq.f32 %p70, %f116, 0f00000000; and.pred %p71, %p70, %p69; @%p71 bra $L__BB0_66; bra.uni $L__BB0_61; $L__BB0_66: sub.f32 %f468, %f86, %f1; sub.f32 %f467, %f87, %f2; sub.f32 %f466, %f88, %f3; bra.uni $L__BB0_69; $L__BB0_61: setp.neu.f32 %p73, %f98, 0f00000000; or.pred %p74, %p66, %p73; @%p74 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: setp.neu.f32 %p75, %f116, 0f00000000; or.pred %p77, %p69, %p75; @%p77 bra $L__BB0_65; bra.uni $L__BB0_64; $L__BB0_65: sub.f32 %f326, %f86, %f107; sub.f32 %f327, %f87, %f106; sub.f32 %f328, %f88, %f105; sub.f32 %f329, %f74, %f92; mul.f32 %f330, %f329, 0f3DAAAAAB; sub.f32 %f331, %f75, %f93; mul.f32 %f332, %f331, 0f3DAAAAAB; sub.f32 %f333, %f76, %f94; mul.f32 %f334, %f333, 0f3DAAAAAB; fma.rn.f32 %f468, %f326, 
0f3F2AAAAB, %f330; fma.rn.f32 %f467, %f327, 0f3F2AAAAB, %f332; fma.rn.f32 %f466, %f328, 0f3F2AAAAB, %f334; bra.uni $L__BB0_69; $L__BB0_62: mul.f32 %f305, %f74, 0f3F000000; add.f32 %f306, %f107, %f107; sub.f32 %f307, %f305, %f306; add.f32 %f308, %f106, %f106; mul.f32 %f309, %f75, 0f3F000000; sub.f32 %f310, %f309, %f308; add.f32 %f311, %f105, %f105; mul.f32 %f312, %f76, 0f3F000000; sub.f32 %f313, %f312, %f311; fma.rn.f32 %f468, %f1, 0f3FC00000, %f307; fma.rn.f32 %f467, %f2, 0f3FC00000, %f310; fma.rn.f32 %f466, %f3, 0f3FC00000, %f313; bra.uni $L__BB0_69; $L__BB0_64: mul.f32 %f314, %f92, 0f3F000000; add.f32 %f315, %f86, %f86; sub.f32 %f316, %f315, %f314; add.f32 %f317, %f87, %f87; mul.f32 %f318, %f93, 0f3F000000; sub.f32 %f319, %f317, %f318; add.f32 %f320, %f88, %f88; mul.f32 %f321, %f94, 0f3F000000; sub.f32 %f322, %f320, %f321; mul.f32 %f323, %f1, 0f3FC00000; sub.f32 %f468, %f316, %f323; mul.f32 %f324, %f2, 0f3FC00000; sub.f32 %f467, %f319, %f324; mul.f32 %f325, %f3, 0f3FC00000; sub.f32 %f466, %f322, %f325; $L__BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p78, %rs3, 0; add.s32 %r40, %r3, -2; @%p78 bra $L__BB0_71; bra.uni $L__BB0_70; $L__BB0_71: max.s32 %r126, %r40, 0; bra.uni $L__BB0_72; $L__BB0_70: rem.s32 %r101, %r40, %r59; add.s32 %r102, %r101, %r59; rem.s32 %r126, %r102, %r59; $L__BB0_72: setp.lt.s32 %p80, %r3, 2; mov.f32 %f141, 0f00000000; and.pred %p81, %p80, %p78; mov.f32 %f142, %f141; mov.f32 %f143, %f141; @%p81 bra $L__BB0_74; mad.lo.s32 %r103, %r126, %r58, %r2; mad.lo.s32 %r104, %r103, %r57, %r1; mul.wide.s32 %rd49, %r104, 4; add.s64 %rd50, %rd3, %rd49; add.s64 %rd51, %rd2, %rd49; add.s64 %rd52, %rd1, %rd49; ld.global.nc.f32 %f143, [%rd52]; ld.global.nc.f32 %f142, [%rd51]; ld.global.nc.f32 %f141, [%rd50]; $L__BB0_74: add.s32 %r44, %r3, -1; @%p78 bra $L__BB0_76; bra.uni $L__BB0_75; $L__BB0_76: max.s32 %r127, %r44, 0; bra.uni $L__BB0_77; $L__BB0_75: rem.s32 %r105, %r44, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r127, %r106, %r59; $L__BB0_77: setp.lt.s32 %p83, 
%r3, 1; mov.f32 %f174, 0f00000000; and.pred %p85, %p83, %p78; mov.f32 %f173, %f174; mov.f32 %f172, %f174; @%p85 bra $L__BB0_79; mad.lo.s32 %r107, %r127, %r58, %r2; mad.lo.s32 %r108, %r107, %r57, %r1; mul.wide.s32 %rd53, %r108, 4; add.s64 %rd54, %rd3, %rd53; add.s64 %rd55, %rd2, %rd53; add.s64 %rd56, %rd1, %rd53; ld.global.nc.f32 %f172, [%rd56]; ld.global.nc.f32 %f173, [%rd55]; ld.global.nc.f32 %f174, [%rd54]; $L__BB0_79: add.s32 %r48, %r3, 1; @%p78 bra $L__BB0_81; bra.uni $L__BB0_80; $L__BB0_81: add.s32 %r111, %r59, -1; min.s32 %r128, %r48, %r111; bra.uni $L__BB0_82; $L__BB0_80: rem.s32 %r109, %r48, %r59; add.s32 %r110, %r109, %r59; rem.s32 %r128, %r110, %r59; $L__BB0_82: setp.ge.s32 %p87, %r48, %r59; mov.f32 %f153, 0f00000000; and.pred %p89, %p87, %p78; mov.f32 %f154, %f153; mov.f32 %f155, %f153; @%p89 bra $L__BB0_84; mad.lo.s32 %r112, %r128, %r58, %r2; mad.lo.s32 %r113, %r112, %r57, %r1; mul.wide.s32 %rd57, %r113, 4; add.s64 %rd58, %rd3, %rd57; add.s64 %rd59, %rd2, %rd57; add.s64 %rd60, %rd1, %rd57; ld.global.nc.f32 %f155, [%rd60]; ld.global.nc.f32 %f154, [%rd59]; ld.global.nc.f32 %f153, [%rd58]; $L__BB0_84: add.s32 %r52, %r3, 2; @%p78 bra $L__BB0_86; bra.uni $L__BB0_85; $L__BB0_86: add.s32 %r116, %r59, -1; min.s32 %r129, %r52, %r116; bra.uni $L__BB0_87; $L__BB0_85: rem.s32 %r114, %r52, %r59; add.s32 %r115, %r114, %r59; rem.s32 %r129, %r115, %r59; $L__BB0_87: mad.lo.s32 %r117, %r129, %r58, %r2; mad.lo.s32 %r56, %r117, %r57, %r1; setp.ge.s32 %p91, %r52, %r59; mov.f32 %f159, 0f00000000; and.pred %p93, %p91, %p78; mov.f32 %f160, %f159; mov.f32 %f161, %f159; @%p93 bra $L__BB0_89; mul.wide.s32 %rd61, %r56, 4; add.s64 %rd62, %rd3, %rd61; add.s64 %rd63, %rd2, %rd61; add.s64 %rd64, %rd1, %rd61; ld.global.nc.f32 %f161, [%rd64]; ld.global.nc.f32 %f160, [%rd63]; ld.global.nc.f32 %f159, [%rd62]; $L__BB0_89: mul.f32 %f350, %f154, %f154; fma.rn.f32 %f351, %f153, %f153, %f350; fma.rn.f32 %f165, %f155, %f155, %f351; setp.eq.f32 %p94, %f165, 0f00000000; @%p94 bra $L__BB0_90; 
bra.uni $L__BB0_91; $L__BB0_90: mul.f32 %f355, %f173, %f173; fma.rn.f32 %f356, %f174, %f174, %f355; fma.rn.f32 %f357, %f172, %f172, %f356; setp.eq.f32 %p95, %f357, 0f00000000; mov.f32 %f484, 0f00000000; mov.f32 %f485, %f484; mov.f32 %f486, %f484; @%p95 bra $L__BB0_103; $L__BB0_91: mul.f32 %f358, %f142, %f142; fma.rn.f32 %f359, %f141, %f141, %f358; fma.rn.f32 %f178, %f143, %f143, %f359; setp.neu.f32 %p96, %f178, 0f00000000; mul.f32 %f360, %f160, %f160; fma.rn.f32 %f361, %f159, %f159, %f360; fma.rn.f32 %f182, %f161, %f161, %f361; setp.neu.f32 %p97, %f182, 0f00000000; and.pred %p98, %p96, %p97; or.pred %p100, %p94, %p98; @%p100 bra $L__BB0_93; mul.f32 %f362, %f173, %f173; fma.rn.f32 %f363, %f174, %f174, %f362; fma.rn.f32 %f364, %f172, %f172, %f363; setp.neu.f32 %p101, %f364, 0f00000000; @%p101 bra $L__BB0_102; bra.uni $L__BB0_93; $L__BB0_102: sub.f32 %f397, %f153, %f174; mul.f32 %f486, %f397, 0f3F000000; sub.f32 %f398, %f154, %f173; mul.f32 %f485, %f398, 0f3F000000; sub.f32 %f399, %f155, %f172; mul.f32 %f484, %f399, 0f3F000000; bra.uni $L__BB0_103; $L__BB0_93: setp.eq.f32 %p102, %f178, 0f00000000; and.pred %p104, %p102, %p94; @%p104 bra $L__BB0_101; bra.uni $L__BB0_94; $L__BB0_101: sub.f32 %f486, %f1, %f174; sub.f32 %f485, %f2, %f173; sub.f32 %f484, %f3, %f172; bra.uni $L__BB0_103; $L__BB0_94: setp.eq.f32 %p105, %f182, 0f00000000; mul.f32 %f365, %f173, %f173; fma.rn.f32 %f366, %f174, %f174, %f365; fma.rn.f32 %f183, %f172, %f172, %f366; setp.eq.f32 %p106, %f183, 0f00000000; and.pred %p107, %p106, %p105; @%p107 bra $L__BB0_100; bra.uni $L__BB0_95; $L__BB0_100: sub.f32 %f486, %f153, %f1; sub.f32 %f485, %f154, %f2; sub.f32 %f484, %f155, %f3; bra.uni $L__BB0_103; $L__BB0_95: setp.neu.f32 %p109, %f165, 0f00000000; or.pred %p110, %p102, %p109; @%p110 bra $L__BB0_97; bra.uni $L__BB0_96; $L__BB0_97: setp.neu.f32 %p111, %f183, 0f00000000; or.pred %p113, %p105, %p111; @%p113 bra $L__BB0_99; bra.uni $L__BB0_98; $L__BB0_99: sub.f32 %f388, %f153, %f174; sub.f32 %f389, %f154, %f173; 
sub.f32 %f390, %f155, %f172; sub.f32 %f391, %f141, %f159; mul.f32 %f392, %f391, 0f3DAAAAAB; sub.f32 %f393, %f142, %f160; mul.f32 %f394, %f393, 0f3DAAAAAB; sub.f32 %f395, %f143, %f161; mul.f32 %f396, %f395, 0f3DAAAAAB; fma.rn.f32 %f486, %f388, 0f3F2AAAAB, %f392; fma.rn.f32 %f485, %f389, 0f3F2AAAAB, %f394; fma.rn.f32 %f484, %f390, 0f3F2AAAAB, %f396; bra.uni $L__BB0_103; $L__BB0_96: mul.f32 %f367, %f141, 0f3F000000; add.f32 %f368, %f174, %f174; sub.f32 %f369, %f367, %f368; add.f32 %f370, %f173, %f173; mul.f32 %f371, %f142, 0f3F000000; sub.f32 %f372, %f371, %f370; add.f32 %f373, %f172, %f172; mul.f32 %f374, %f143, 0f3F000000; sub.f32 %f375, %f374, %f373; fma.rn.f32 %f486, %f1, 0f3FC00000, %f369; fma.rn.f32 %f485, %f2, 0f3FC00000, %f372; fma.rn.f32 %f484, %f3, 0f3FC00000, %f375; bra.uni $L__BB0_103; $L__BB0_98: mul.f32 %f376, %f159, 0f3F000000; add.f32 %f377, %f153, %f153; sub.f32 %f378, %f377, %f376; add.f32 %f379, %f154, %f154; mul.f32 %f380, %f160, 0f3F000000; sub.f32 %f381, %f379, %f380; add.f32 %f382, %f155, %f155; mul.f32 %f383, %f161, 0f3F000000; sub.f32 %f384, %f382, %f383; mul.f32 %f385, %f1, 0f3FC00000; sub.f32 %f486, %f378, %f385; mul.f32 %f386, %f2, 0f3FC00000; sub.f32 %f485, %f381, %f386; mul.f32 %f387, %f3, 0f3FC00000; sub.f32 %f484, %f384, %f387; $L__BB0_103: setp.eq.s64 %p114, %rd8, 0; @%p114 bra $L__BB0_105; cvta.to.global.u64 %rd65, %rd8; shl.b64 %rd66, %rd4, 2; add.s64 %rd67, %rd65, %rd66; ld.global.nc.f32 %f400, [%rd67]; mul.f32 %f487, %f400, %f487; $L__BB0_105: setp.eq.s64 %p115, %rd9, 0; @%p115 bra $L__BB0_107; cvta.to.global.u64 %rd68, %rd9; shl.b64 %rd69, %rd4, 2; add.s64 %rd70, %rd68, %rd69; ld.global.nc.f32 %f401, [%rd70]; mul.f32 %f488, %f401, %f488; $L__BB0_107: mul.f32 %f402, %f450, %f211; mul.f32 %f403, %f467, %f212; mul.f32 %f404, %f484, %f213; add.f32 %f405, %f487, %f487; mul.f32 %f406, %f1, %f405; add.f32 %f407, %f403, %f404; mul.f32 %f408, %f1, %f407; mul.f32 %f409, %f468, %f212; fma.rn.f32 %f410, %f2, %f409, %f408; mul.f32 %f411, 
%f486, %f213; fma.rn.f32 %f412, %f3, %f411, %f410; mul.f32 %f413, %f412, %f488; fma.rn.f32 %f414, %f402, %f406, %f413; cvta.to.global.u64 %rd71, %rd5; shl.b64 %rd72, %rd4, 2; add.s64 %rd73, %rd71, %rd72; st.global.f32 [%rd73], %f414; mul.f32 %f415, %f2, %f405; add.f32 %f416, %f402, %f404; mul.f32 %f417, %f2, %f416; mul.f32 %f418, %f449, %f211; fma.rn.f32 %f419, %f1, %f418, %f417; mul.f32 %f420, %f485, %f213; fma.rn.f32 %f421, %f3, %f420, %f419; mul.f32 %f422, %f421, %f488; fma.rn.f32 %f423, %f403, %f415, %f422; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd75, %rd74, %rd72; st.global.f32 [%rd75], %f423; mul.f32 %f424, %f3, %f405; mul.f32 %f425, %f466, %f212; mul.f32 %f426, %f2, %f425; mul.f32 %f427, %f448, %f211; fma.rn.f32 %f428, %f1, %f427, %f426; add.f32 %f429, %f402, %f403; fma.rn.f32 %f430, %f3, %f429, %f428; mul.f32 %f431, %f430, %f488; fma.rn.f32 %f432, %f404, %f424, %f431; cvta.to.global.u64 %rd76, %rd7; add.s64 %rd77, %rd76, %rd72; st.global.f32 [%rd77], %f432; $L__BB0_108: ret; } ` ) 3-3.11.1/cuda/maxangle.cu000066400000000000000000000052561503346766200150240ustar00rootroot00000000000000#include #include "exchange.h" #include "float3.h" #include "stencil.h" // See maxangle.go for more details. 
// setmaxangle: one thread per cell (ix, iy, iz). For a cell whose magnetization
// m0 is not rejected by is0(), it stores in dst[I] the largest value of
// acosf(dot(m_neighbor, m0)) taken over the left/right (x), back/front (y) and,
// when Nz != 1, bottom/top (z) neighbors.
//
// A neighbor only contributes when the aLUT2d entry selected by
// symidx(region of this cell, region of the neighbor) is nonzero.
// A neighbor for which is0() holds is replaced by m0 before the angle is taken.
//
// NOTE(review): acosf(dot(a, b)) is the geometric angle only for unit vectors;
//   presumably mx/my/mz hold normalized magnetization -- confirm against callers.
// NOTE(review): the kernel returns before writing dst[I] when is0(m0) holds, so
//   dst keeps whatever it contained for those cells. Callers presumably have to
//   provide a dst whose content is acceptable there (e.g. zeroed) -- confirm.
// NOTE(review): idx, is0, dot, symidx and the lclamp*/hclamp* helpers are not
//   defined in this file (presumably exchange.h / float3.h / stencil.h). PBC is
//   not referenced directly in this body; presumably the clamp helpers read it.
extern "C" __global__ void
setmaxangle(float* __restrict__ dst,
            float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
            float* __restrict__ aLUT2d, uint8_t* __restrict__ regions,
            int Nx, int Ny, int Nz, uint8_t PBC) {

    // global cell coordinates of this thread
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    // threads that fall outside the Nx x Ny x Nz box have nothing to do
    if (ix >= Nx || iy >= Ny || iz >= Nz) {
        return;
    }

    // central cell
    int I = idx(ix, iy, iz);
    float3 m0 = make_float3(mx[I], my[I], mz[I]);

    // skip this cell entirely; note that dst[I] is NOT written on this path
    if (is0(m0)) {
        return;
    }

    uint8_t r0 = regions[I]; // region index of the central cell
    float angle = 0.0f;      // running maximum over all neighbors

    int i_;    // neighbor index
    float3 m_; // neighbor mag
    float a__; // inter-cell exchange stiffness

    // left neighbor
    i_ = idx(lclampx(ix-1), iy, iz);            // clamps or wraps index according to PBC
    m_ = make_float3(mx[i_], my[i_], mz[i_]);   // load m
    m_ = ( is0(m_)? m0: m_ );                   // replace missing non-boundary neighbor
    a__ = aLUT2d[symidx(r0, regions[i_])];
    if (a__ != 0) {                             // only neighbors with nonzero stiffness count
        angle = max(angle, acosf(dot(m_,m0)));
    }

    // right neighbor
    i_ = idx(hclampx(ix+1), iy, iz);
    m_ = make_float3(mx[i_], my[i_], mz[i_]);
    m_ = ( is0(m_)? m0: m_ );
    a__ = aLUT2d[symidx(r0, regions[i_])];
    if (a__ != 0) {
        angle = max(angle, acosf(dot(m_,m0)));
    }

    // back neighbor
    i_ = idx(ix, lclampy(iy-1), iz);
    m_ = make_float3(mx[i_], my[i_], mz[i_]);
    m_ = ( is0(m_)? m0: m_ );
    a__ = aLUT2d[symidx(r0, regions[i_])];
    if (a__ != 0) {
        angle = max(angle, acosf(dot(m_,m0)));
    }

    // front neighbor
    i_ = idx(ix, hclampy(iy+1), iz);
    m_ = make_float3(mx[i_], my[i_], mz[i_]);
    m_ = ( is0(m_)? m0: m_ );
    a__ = aLUT2d[symidx(r0, regions[i_])];
    if (a__ != 0) {
        angle = max(angle, acosf(dot(m_,m0)));
    }

    // only take vertical derivative for 3D sim
    if (Nz != 1) {
        // bottom neighbor
        i_ = idx(ix, iy, lclampz(iz-1));
        m_ = make_float3(mx[i_], my[i_], mz[i_]);
        m_ = ( is0(m_)? m0: m_ );
        a__ = aLUT2d[symidx(r0, regions[i_])];
        if (a__ != 0) {
            angle = max(angle, acosf(dot(m_,m0)));
        }

        // top neighbor
        i_ = idx(ix, iy, hclampz(iz+1));
        m_ = make_float3(mx[i_], my[i_], mz[i_]);
        m_ = ( is0(m_)? m0: m_ );
        a__ = aLUT2d[symidx(r0, regions[i_])];
        if (a__ != 0) {
            angle = max(angle, acosf(dot(m_,m0)));
        }
    }

    // the only write performed by this kernel
    dst[I] = angle;
}
3-3.11.1/cuda/maxangle.go000066400000000000000000000010241503346766200150070ustar00rootroot00000000000000
package cuda

import (
	"unsafe"

	"github.com/mumax/3/data"
)

// SetMaxAngle sets dst to the maximum angle of each cell's magnetization with all of its neighbors,
// provided the exchange stiffness with that neighbor is nonzero.
//
// It launches the setmaxangle kernel with:
//   - component 0 of dst as the output,
//   - the X, Y and Z components of m as the magnetization,
//   - Aex_red (passed as a raw pointer) as the kernel's aLUT2d table,
//   - regions.Ptr as the per-cell region indices,
//   - the grid size and PBC code taken from mesh.
//
// The launch goes through k_setmaxangle_async, so this function may return
// before the kernel has finished unless the package is in Synchronous mode.
func SetMaxAngle(dst, m *data.Slice, Aex_red SymmLUT, regions *Bytes, mesh *data.Mesh) {
	N := mesh.Size()
	pbc := mesh.PBC_code()
	cfg := make3DConf(N) // launch configuration derived from the grid size
	k_setmaxangle_async(dst.DevPtr(0), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z),
		unsafe.Pointer(Aex_red), regions.Ptr, N[X], N[Y], N[Z], pbc, cfg)
}
3-3.11.1/cuda/maxangle_wrapper.go000066400000000000000000010332611503346766200165600ustar00rootroot00000000000000
package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for setmaxangle kernel.
// It stays zero until the first call of k_setmaxangle_async, which loads it
// through fatbinLoad.
var setmaxangle_code cu.Function

// Stores the arguments for setmaxangle kernel invocation.
// The arg_ fields mirror the kernel parameters in declaration order, argptr
// holds the addresses of those fields (filled in once by init), and the
// embedded Mutex guards the single shared instance during a launch.
type setmaxangle_args_t struct {
	arg_dst     unsafe.Pointer
	arg_mx      unsafe.Pointer
	arg_my      unsafe.Pointer
	arg_mz      unsafe.Pointer
	arg_aLUT2d  unsafe.Pointer
	arg_regions unsafe.Pointer
	arg_Nx      int
	arg_Ny      int
	arg_Nz      int
	arg_PBC     byte
	argptr      [10]unsafe.Pointer
	sync.Mutex
}

// Stores the arguments for setmaxangle kernel invocation.
// This is the one package-level instance reused by every launch.
var setmaxangle_args setmaxangle_args_t

// init points each argptr slot at the matching arg_ field, so that later
// launches only need to overwrite the field values.
func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	setmaxangle_args.argptr[0] = unsafe.Pointer(&setmaxangle_args.arg_dst)
	setmaxangle_args.argptr[1] = unsafe.Pointer(&setmaxangle_args.arg_mx)
	setmaxangle_args.argptr[2] = unsafe.Pointer(&setmaxangle_args.arg_my)
	setmaxangle_args.argptr[3] = unsafe.Pointer(&setmaxangle_args.arg_mz)
	setmaxangle_args.argptr[4] = unsafe.Pointer(&setmaxangle_args.arg_aLUT2d)
	setmaxangle_args.argptr[5] = unsafe.Pointer(&setmaxangle_args.arg_regions)
	setmaxangle_args.argptr[6] = unsafe.Pointer(&setmaxangle_args.arg_Nx)
	setmaxangle_args.argptr[7] = unsafe.Pointer(&setmaxangle_args.arg_Ny)
	setmaxangle_args.argptr[8] = unsafe.Pointer(&setmaxangle_args.arg_Nz)
	setmaxangle_args.argptr[9] = unsafe.Pointer(&setmaxangle_args.arg_PBC)
}

// Wrapper for setmaxangle CUDA kernel, asynchronous.
//
// It takes the lock on the shared argument struct, loads the kernel on first
// use, copies the parameters into the struct and launches the kernel with the
// grid and block dimensions from cfg on stream0. The lock is released by the
// deferred Unlock when the function returns.
//
// When Synchronous is set, the launch is bracketed by Sync() calls and timed
// under the name "setmaxangle".
func k_setmaxangle_async(dst unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, aLUT2d unsafe.Pointer, regions unsafe.Pointer, Nx int, Ny int, Nz int, PBC byte, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("setmaxangle")
	}

	setmaxangle_args.Lock()
	defer setmaxangle_args.Unlock()

	// lazily load the kernel the first time it is needed
	if setmaxangle_code == 0 {
		setmaxangle_code = fatbinLoad(setmaxangle_map, "setmaxangle")
	}

	// argptr already points at these fields, so assigning them is all that is
	// needed to pass the new argument values
	setmaxangle_args.arg_dst = dst
	setmaxangle_args.arg_mx = mx
	setmaxangle_args.arg_my = my
	setmaxangle_args.arg_mz = mz
	setmaxangle_args.arg_aLUT2d = aLUT2d
	setmaxangle_args.arg_regions = regions
	setmaxangle_args.arg_Nx = Nx
	setmaxangle_args.arg_Ny = Ny
	setmaxangle_args.arg_Nz = Nz
	setmaxangle_args.arg_PBC = PBC

	args := setmaxangle_args.argptr[:]
	// NOTE(review): the literal 0 before stream0 is presumably the dynamic
	// shared-memory size of the launch -- confirm against cu.LaunchKernel.
	cu.LaunchKernel(setmaxangle_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("setmaxangle")
	}
}

// maps compute capability on PTX code for setmaxangle kernel.
var setmaxangle_map = map[int]string{0: "", 50: setmaxangle_ptx_50, 52: setmaxangle_ptx_52, 53: setmaxangle_ptx_53, 60: setmaxangle_ptx_60, 61: setmaxangle_ptx_61, 62: setmaxangle_ptx_62, 70: setmaxangle_ptx_70, 72: setmaxangle_ptx_72, 75: setmaxangle_ptx_75, 80: setmaxangle_ptx_80, 86: setmaxangle_ptx_86, 87: setmaxangle_ptx_87, 89: setmaxangle_ptx_89, 90: setmaxangle_ptx_90} // setmaxangle PTX code for various compute capabilities. const ( setmaxangle_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<36>; .reg .f32 %f<318>; .reg .b32 %r<135>; .reg .b64 %rd<69>; ld.param.u8 %rs5, [setmaxangle_param_9]; ld.param.u64 %rd7, [setmaxangle_param_0]; ld.param.u64 %rd8, [setmaxangle_param_1]; ld.param.u64 %rd9, [setmaxangle_param_2]; ld.param.u64 %rd10, [setmaxangle_param_3]; ld.param.u64 %rd11, [setmaxangle_param_4]; ld.param.u64 %rd12, [setmaxangle_param_5]; ld.param.u32 %r30, [setmaxangle_param_6]; ld.param.u32 %r31, [setmaxangle_param_7]; ld.param.u32 %r32, [setmaxangle_param_8]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd3, %rd10; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, 
%p3, %p4; @%p5 bra $L__BB0_34; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd13, %r43, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd15, %rd4, %rd13; add.s64 %rd16, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; ld.global.nc.f32 %f3, [%rd16]; mul.f32 %f37, %f2, %f2; fma.rn.f32 %f38, %f1, %f1, %f37; fma.rn.f32 %f39, %f3, %f3, %f38; setp.eq.f32 %p6, %f39, 0f00000000; @%p6 bra $L__BB0_34; add.s64 %rd17, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd17]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r129, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r129, %r45, %r30; $L__BB0_5: add.s32 %r46, %r129, %r5; cvt.s64.s32 %rd18, %r46; mul.wide.s32 %rd19, %r46, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f41, [%rd22]; ld.global.nc.f32 %f42, [%rd20]; ld.global.nc.f32 %f43, [%rd21]; mul.f32 %f44, %f43, %f43; fma.rn.f32 %f45, %f42, %f42, %f44; fma.rn.f32 %f46, %f41, %f41, %f45; setp.eq.f32 %p8, %f46, 0f00000000; mov.f32 %f313, 0f00000000; selp.f32 %f9, %f3, %f41, %p8; selp.f32 %f8, %f2, %f43, %p8; selp.f32 %f7, %f1, %f42, %p8; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd24, %r52, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f47, [%rd25]; setp.eq.f32 %p9, %f47, 0f00000000; @%p9 bra $L__BB0_7; mul.f32 %f48, %f2, %f8; fma.rn.f32 %f49, %f1, %f7, %f48; fma.rn.f32 %f50, %f3, %f9, %f49; abs.f32 %f51, %f50; neg.f32 %f52, %f51; mov.f32 %f53, 0f3F000000; fma.rn.f32 %f54, %f53, %f52, %f53; rsqrt.approx.ftz.f32 %f55, %f54; mul.f32 %f56, %f54, %f55; 
mul.f32 %f57, %f55, 0f3F000000; neg.f32 %f58, %f56; fma.rn.f32 %f59, %f58, %f57, %f53; fma.rn.f32 %f60, %f56, %f59, %f56; setp.eq.f32 %p10, %f51, 0f3F800000; selp.f32 %f61, 0f00000000, %f60, %p10; mov.f32 %f62, 0f00000000; setp.gt.f32 %p11, %f51, 0f3F0F5C29; selp.f32 %f63, %f61, %f51, %p11; mov.b32 %r53, %f63; mov.b32 %r54, %f50; and.b32 %r55, %r54, -2147483648; or.b32 %r56, %r55, %r53; mov.b32 %f64, %r56; mul.f32 %f65, %f64, %f64; mov.f32 %f66, 0f3C8B1ABB; mov.f32 %f67, 0f3D10ECEF; fma.rn.f32 %f68, %f67, %f65, %f66; mov.f32 %f69, 0f3CFC028C; fma.rn.f32 %f70, %f68, %f65, %f69; mov.f32 %f71, 0f3D372139; fma.rn.f32 %f72, %f70, %f65, %f71; mov.f32 %f73, 0f3D9993DB; fma.rn.f32 %f74, %f72, %f65, %f73; mov.f32 %f75, 0f3E2AAAC6; fma.rn.f32 %f76, %f74, %f65, %f75; mul.f32 %f77, %f76, %f65; fma.rn.f32 %f78, %f77, %f64, %f64; neg.f32 %f79, %f78; selp.f32 %f80, %f78, %f79, %p11; mov.f32 %f81, 0f3FD774EB; mov.f32 %f82, 0f3F6EE581; fma.rn.f32 %f83, %f82, %f81, %f80; setp.gt.f32 %p12, %f50, 0f3F0F5C29; selp.f32 %f84, %f78, %f83, %p12; add.f32 %f85, %f84, %f84; selp.f32 %f86, %f85, %f84, %p11; max.f32 %f313, %f62, %f86; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r59, %r30, -1; min.s32 %r130, %r10, %r59; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r30; add.s32 %r58, %r57, %r30; rem.s32 %r130, %r58, %r30; $L__BB0_10: add.s32 %r60, %r130, %r5; cvt.s64.s32 %rd26, %r60; mul.wide.s32 %rd27, %r60, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f87, [%rd30]; ld.global.nc.f32 %f88, [%rd28]; ld.global.nc.f32 %f89, [%rd29]; mul.f32 %f90, %f89, %f89; fma.rn.f32 %f91, %f88, %f88, %f90; fma.rn.f32 %f92, %f87, %f87, %f91; setp.eq.f32 %p14, %f92, 0f00000000; selp.f32 %f14, %f3, %f87, %p14; selp.f32 %f13, %f2, %f89, %p14; selp.f32 %f12, %f1, %f88, %p14; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs11, [%rd31]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 
%r61, %rs15; add.s32 %r62, %r61, 1; mul.lo.s32 %r63, %r62, %r61; shr.u32 %r64, %r63, 1; cvt.u32.u16 %r65, %rs14; add.s32 %r66, %r64, %r65; mul.wide.s32 %rd32, %r66, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f93, [%rd33]; setp.eq.f32 %p15, %f93, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f94, %f2, %f13; fma.rn.f32 %f95, %f1, %f12, %f94; fma.rn.f32 %f96, %f3, %f14, %f95; abs.f32 %f97, %f96; neg.f32 %f98, %f97; mov.f32 %f99, 0f3F000000; fma.rn.f32 %f100, %f99, %f98, %f99; rsqrt.approx.ftz.f32 %f101, %f100; mul.f32 %f102, %f100, %f101; mul.f32 %f103, %f101, 0f3F000000; neg.f32 %f104, %f102; fma.rn.f32 %f105, %f104, %f103, %f99; fma.rn.f32 %f106, %f102, %f105, %f102; setp.eq.f32 %p16, %f97, 0f3F800000; selp.f32 %f107, 0f00000000, %f106, %p16; setp.gt.f32 %p17, %f97, 0f3F0F5C29; selp.f32 %f108, %f107, %f97, %p17; mov.b32 %r67, %f108; mov.b32 %r68, %f96; and.b32 %r69, %r68, -2147483648; or.b32 %r70, %r69, %r67; mov.b32 %f109, %r70; mul.f32 %f110, %f109, %f109; mov.f32 %f111, 0f3C8B1ABB; mov.f32 %f112, 0f3D10ECEF; fma.rn.f32 %f113, %f112, %f110, %f111; mov.f32 %f114, 0f3CFC028C; fma.rn.f32 %f115, %f113, %f110, %f114; mov.f32 %f116, 0f3D372139; fma.rn.f32 %f117, %f115, %f110, %f116; mov.f32 %f118, 0f3D9993DB; fma.rn.f32 %f119, %f117, %f110, %f118; mov.f32 %f120, 0f3E2AAAC6; fma.rn.f32 %f121, %f119, %f110, %f120; mul.f32 %f122, %f121, %f110; fma.rn.f32 %f123, %f122, %f109, %f109; neg.f32 %f124, %f123; selp.f32 %f125, %f123, %f124, %p17; mov.f32 %f126, 0f3FD774EB; mov.f32 %f127, 0f3F6EE581; fma.rn.f32 %f128, %f127, %f126, %f125; setp.gt.f32 %p18, %f96, 0f3F0F5C29; selp.f32 %f129, %f123, %f128, %p18; add.f32 %f130, %f129, %f129; selp.f32 %f131, %f130, %f129, %p17; max.f32 %f313, %f313, %f131; $L__BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r14, %r2, -1; @%p19 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: max.s32 %r131, %r14, 0; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r71, %r14, %r31; add.s32 %r72, %r71, %r31; rem.s32 %r131, %r72, %r31; 
$L__BB0_15: add.s32 %r73, %r131, %r4; mad.lo.s32 %r74, %r73, %r30, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f132, [%rd38]; ld.global.nc.f32 %f133, [%rd36]; ld.global.nc.f32 %f134, [%rd37]; mul.f32 %f135, %f134, %f134; fma.rn.f32 %f136, %f133, %f133, %f135; fma.rn.f32 %f137, %f132, %f132, %f136; setp.eq.f32 %p20, %f137, 0f00000000; selp.f32 %f19, %f3, %f132, %p20; selp.f32 %f18, %f2, %f134, %p20; selp.f32 %f17, %f1, %f133, %p20; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs16, [%rd39]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r75, %rs20; add.s32 %r76, %r75, 1; mul.lo.s32 %r77, %r76, %r75; shr.u32 %r78, %r77, 1; cvt.u32.u16 %r79, %rs19; add.s32 %r80, %r78, %r79; mul.wide.s32 %rd40, %r80, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f138, [%rd41]; setp.eq.f32 %p21, %f138, 0f00000000; @%p21 bra $L__BB0_17; mul.f32 %f139, %f2, %f18; fma.rn.f32 %f140, %f1, %f17, %f139; fma.rn.f32 %f141, %f3, %f19, %f140; abs.f32 %f142, %f141; neg.f32 %f143, %f142; mov.f32 %f144, 0f3F000000; fma.rn.f32 %f145, %f144, %f143, %f144; rsqrt.approx.ftz.f32 %f146, %f145; mul.f32 %f147, %f145, %f146; mul.f32 %f148, %f146, 0f3F000000; neg.f32 %f149, %f147; fma.rn.f32 %f150, %f149, %f148, %f144; fma.rn.f32 %f151, %f147, %f150, %f147; setp.eq.f32 %p22, %f142, 0f3F800000; selp.f32 %f152, 0f00000000, %f151, %p22; setp.gt.f32 %p23, %f142, 0f3F0F5C29; selp.f32 %f153, %f152, %f142, %p23; mov.b32 %r81, %f153; mov.b32 %r82, %f141; and.b32 %r83, %r82, -2147483648; or.b32 %r84, %r83, %r81; mov.b32 %f154, %r84; mul.f32 %f155, %f154, %f154; mov.f32 %f156, 0f3C8B1ABB; mov.f32 %f157, 0f3D10ECEF; fma.rn.f32 %f158, %f157, %f155, %f156; mov.f32 %f159, 0f3CFC028C; fma.rn.f32 %f160, %f158, %f155, %f159; mov.f32 %f161, 0f3D372139; fma.rn.f32 %f162, %f160, %f155, %f161; mov.f32 %f163, 0f3D9993DB; fma.rn.f32 %f164, %f162, %f155, %f163; mov.f32 %f165, 0f3E2AAAC6; 
fma.rn.f32 %f166, %f164, %f155, %f165; mul.f32 %f167, %f166, %f155; fma.rn.f32 %f168, %f167, %f154, %f154; neg.f32 %f169, %f168; selp.f32 %f170, %f168, %f169, %p23; mov.f32 %f171, 0f3FD774EB; mov.f32 %f172, 0f3F6EE581; fma.rn.f32 %f173, %f172, %f171, %f170; setp.gt.f32 %p24, %f141, 0f3F0F5C29; selp.f32 %f174, %f168, %f173, %p24; add.f32 %f175, %f174, %f174; selp.f32 %f176, %f175, %f174, %p23; max.f32 %f313, %f313, %f176; $L__BB0_17: add.s32 %r18, %r2, 1; @%p19 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r87, %r31, -1; min.s32 %r132, %r18, %r87; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r85, %r18, %r31; add.s32 %r86, %r85, %r31; rem.s32 %r132, %r86, %r31; $L__BB0_20: add.s32 %r88, %r132, %r4; mad.lo.s32 %r89, %r88, %r30, %r1; cvt.s64.s32 %rd42, %r89; mul.wide.s32 %rd43, %r89, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f177, [%rd46]; ld.global.nc.f32 %f178, [%rd44]; ld.global.nc.f32 %f179, [%rd45]; mul.f32 %f180, %f179, %f179; fma.rn.f32 %f181, %f178, %f178, %f180; fma.rn.f32 %f182, %f177, %f177, %f181; setp.eq.f32 %p26, %f182, 0f00000000; selp.f32 %f24, %f3, %f177, %p26; selp.f32 %f23, %f2, %f179, %p26; selp.f32 %f22, %f1, %f178, %p26; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs21, [%rd47]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r90, %rs25; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; cvt.u32.u16 %r94, %rs24; add.s32 %r95, %r93, %r94; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f183, [%rd49]; setp.eq.f32 %p27, %f183, 0f00000000; @%p27 bra $L__BB0_22; mul.f32 %f184, %f2, %f23; fma.rn.f32 %f185, %f1, %f22, %f184; fma.rn.f32 %f186, %f3, %f24, %f185; abs.f32 %f187, %f186; neg.f32 %f188, %f187; mov.f32 %f189, 0f3F000000; fma.rn.f32 %f190, %f189, %f188, %f189; rsqrt.approx.ftz.f32 %f191, %f190; mul.f32 %f192, %f190, %f191; mul.f32 %f193, %f191, 0f3F000000; neg.f32 %f194, %f192; fma.rn.f32 %f195, %f194, 
%f193, %f189; fma.rn.f32 %f196, %f192, %f195, %f192; setp.eq.f32 %p28, %f187, 0f3F800000; selp.f32 %f197, 0f00000000, %f196, %p28; setp.gt.f32 %p29, %f187, 0f3F0F5C29; selp.f32 %f198, %f197, %f187, %p29; mov.b32 %r96, %f198; mov.b32 %r97, %f186; and.b32 %r98, %r97, -2147483648; or.b32 %r99, %r98, %r96; mov.b32 %f199, %r99; mul.f32 %f200, %f199, %f199; mov.f32 %f201, 0f3C8B1ABB; mov.f32 %f202, 0f3D10ECEF; fma.rn.f32 %f203, %f202, %f200, %f201; mov.f32 %f204, 0f3CFC028C; fma.rn.f32 %f205, %f203, %f200, %f204; mov.f32 %f206, 0f3D372139; fma.rn.f32 %f207, %f205, %f200, %f206; mov.f32 %f208, 0f3D9993DB; fma.rn.f32 %f209, %f207, %f200, %f208; mov.f32 %f210, 0f3E2AAAC6; fma.rn.f32 %f211, %f209, %f200, %f210; mul.f32 %f212, %f211, %f200; fma.rn.f32 %f213, %f212, %f199, %f199; neg.f32 %f214, %f213; selp.f32 %f215, %f213, %f214, %p29; mov.f32 %f216, 0f3FD774EB; mov.f32 %f217, 0f3F6EE581; fma.rn.f32 %f218, %f217, %f216, %f215; setp.gt.f32 %p30, %f186, 0f3F0F5C29; selp.f32 %f219, %f213, %f218, %p30; add.f32 %f220, %f219, %f219; selp.f32 %f221, %f220, %f219, %p29; max.f32 %f313, %f313, %f221; $L__BB0_22: setp.eq.s32 %p31, %r32, 1; @%p31 bra $L__BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r22, %r3, -1; @%p32 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: max.s32 %r133, %r22, 0; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r100, %r22, %r32; add.s32 %r101, %r100, %r32; rem.s32 %r133, %r101, %r32; $L__BB0_26: mad.lo.s32 %r102, %r133, %r31, %r2; mad.lo.s32 %r103, %r102, %r30, %r1; cvt.s64.s32 %rd50, %r103; mul.wide.s32 %rd51, %r103, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f222, [%rd54]; ld.global.nc.f32 %f223, [%rd52]; ld.global.nc.f32 %f224, [%rd53]; mul.f32 %f225, %f224, %f224; fma.rn.f32 %f226, %f223, %f223, %f225; fma.rn.f32 %f227, %f222, %f222, %f226; setp.eq.f32 %p33, %f227, 0f00000000; selp.f32 %f29, %f3, %f222, %p33; selp.f32 %f28, %f2, %f224, %p33; selp.f32 %f27, %f1, %f223, %p33; add.s64 
%rd55, %rd2, %rd50; ld.global.nc.u8 %rs26, [%rd55]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r104, %rs30; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; cvt.u32.u16 %r108, %rs29; add.s32 %r109, %r107, %r108; mul.wide.s32 %rd56, %r109, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f228, [%rd57]; setp.eq.f32 %p34, %f228, 0f00000000; @%p34 bra $L__BB0_28; mul.f32 %f229, %f2, %f28; fma.rn.f32 %f230, %f1, %f27, %f229; fma.rn.f32 %f231, %f3, %f29, %f230; abs.f32 %f232, %f231; neg.f32 %f233, %f232; mov.f32 %f234, 0f3F000000; fma.rn.f32 %f235, %f234, %f233, %f234; rsqrt.approx.ftz.f32 %f236, %f235; mul.f32 %f237, %f235, %f236; mul.f32 %f238, %f236, 0f3F000000; neg.f32 %f239, %f237; fma.rn.f32 %f240, %f239, %f238, %f234; fma.rn.f32 %f241, %f237, %f240, %f237; setp.eq.f32 %p35, %f232, 0f3F800000; selp.f32 %f242, 0f00000000, %f241, %p35; setp.gt.f32 %p36, %f232, 0f3F0F5C29; selp.f32 %f243, %f242, %f232, %p36; mov.b32 %r110, %f243; mov.b32 %r111, %f231; and.b32 %r112, %r111, -2147483648; or.b32 %r113, %r112, %r110; mov.b32 %f244, %r113; mul.f32 %f245, %f244, %f244; mov.f32 %f246, 0f3C8B1ABB; mov.f32 %f247, 0f3D10ECEF; fma.rn.f32 %f248, %f247, %f245, %f246; mov.f32 %f249, 0f3CFC028C; fma.rn.f32 %f250, %f248, %f245, %f249; mov.f32 %f251, 0f3D372139; fma.rn.f32 %f252, %f250, %f245, %f251; mov.f32 %f253, 0f3D9993DB; fma.rn.f32 %f254, %f252, %f245, %f253; mov.f32 %f255, 0f3E2AAAC6; fma.rn.f32 %f256, %f254, %f245, %f255; mul.f32 %f257, %f256, %f245; fma.rn.f32 %f258, %f257, %f244, %f244; neg.f32 %f259, %f258; selp.f32 %f260, %f258, %f259, %p36; mov.f32 %f261, 0f3FD774EB; mov.f32 %f262, 0f3F6EE581; fma.rn.f32 %f263, %f262, %f261, %f260; setp.gt.f32 %p37, %f231, 0f3F0F5C29; selp.f32 %f264, %f258, %f263, %p37; add.f32 %f265, %f264, %f264; selp.f32 %f266, %f265, %f264, %p36; max.f32 %f313, %f313, %f266; $L__BB0_28: add.s32 %r26, %r3, 1; @%p32 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: add.s32 %r116, %r32, -1; min.s32 
%r134, %r26, %r116; bra.uni $L__BB0_31; $L__BB0_29: rem.s32 %r114, %r26, %r32; add.s32 %r115, %r114, %r32; rem.s32 %r134, %r115, %r32; $L__BB0_31: mad.lo.s32 %r117, %r134, %r31, %r2; mad.lo.s32 %r118, %r117, %r30, %r1; cvt.s64.s32 %rd58, %r118; mul.wide.s32 %rd59, %r118, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f267, [%rd62]; ld.global.nc.f32 %f268, [%rd60]; ld.global.nc.f32 %f269, [%rd61]; mul.f32 %f270, %f269, %f269; fma.rn.f32 %f271, %f268, %f268, %f270; fma.rn.f32 %f272, %f267, %f267, %f271; setp.eq.f32 %p39, %f272, 0f00000000; selp.f32 %f34, %f3, %f267, %p39; selp.f32 %f33, %f2, %f269, %p39; selp.f32 %f32, %f1, %f268, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs31, [%rd63]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r119, %rs35; add.s32 %r120, %r119, 1; mul.lo.s32 %r121, %r120, %r119; shr.u32 %r122, %r121, 1; cvt.u32.u16 %r123, %rs34; add.s32 %r124, %r122, %r123; mul.wide.s32 %rd64, %r124, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f273, [%rd65]; setp.eq.f32 %p40, %f273, 0f00000000; @%p40 bra $L__BB0_33; mul.f32 %f274, %f2, %f33; fma.rn.f32 %f275, %f1, %f32, %f274; fma.rn.f32 %f276, %f3, %f34, %f275; abs.f32 %f277, %f276; neg.f32 %f278, %f277; mov.f32 %f279, 0f3F000000; fma.rn.f32 %f280, %f279, %f278, %f279; rsqrt.approx.ftz.f32 %f281, %f280; mul.f32 %f282, %f280, %f281; mul.f32 %f283, %f281, 0f3F000000; neg.f32 %f284, %f282; fma.rn.f32 %f285, %f284, %f283, %f279; fma.rn.f32 %f286, %f282, %f285, %f282; setp.eq.f32 %p41, %f277, 0f3F800000; selp.f32 %f287, 0f00000000, %f286, %p41; setp.gt.f32 %p42, %f277, 0f3F0F5C29; selp.f32 %f288, %f287, %f277, %p42; mov.b32 %r125, %f288; mov.b32 %r126, %f276; and.b32 %r127, %r126, -2147483648; or.b32 %r128, %r127, %r125; mov.b32 %f289, %r128; mul.f32 %f290, %f289, %f289; mov.f32 %f291, 0f3C8B1ABB; mov.f32 %f292, 0f3D10ECEF; fma.rn.f32 %f293, %f292, %f290, %f291; mov.f32 %f294, 0f3CFC028C; fma.rn.f32 %f295, %f293, %f290, 
%f294; mov.f32 %f296, 0f3D372139; fma.rn.f32 %f297, %f295, %f290, %f296; mov.f32 %f298, 0f3D9993DB; fma.rn.f32 %f299, %f297, %f290, %f298; mov.f32 %f300, 0f3E2AAAC6; fma.rn.f32 %f301, %f299, %f290, %f300; mul.f32 %f302, %f301, %f290; fma.rn.f32 %f303, %f302, %f289, %f289; neg.f32 %f304, %f303; selp.f32 %f305, %f303, %f304, %p42; mov.f32 %f306, 0f3FD774EB; mov.f32 %f307, 0f3F6EE581; fma.rn.f32 %f308, %f307, %f306, %f305; setp.gt.f32 %p43, %f276, 0f3F0F5C29; selp.f32 %f309, %f303, %f308, %p43; add.f32 %f310, %f309, %f309; selp.f32 %f311, %f310, %f309, %p42; max.f32 %f313, %f313, %f311; $L__BB0_33: cvta.to.global.u64 %rd66, %rd7; shl.b64 %rd67, %rd6, 2; add.s64 %rd68, %rd66, %rd67; st.global.f32 [%rd68], %f313; $L__BB0_34: ret; } ` setmaxangle_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<36>; .reg .f32 %f<318>; .reg .b32 %r<135>; .reg .b64 %rd<69>; ld.param.u8 %rs5, [setmaxangle_param_9]; ld.param.u64 %rd7, [setmaxangle_param_0]; ld.param.u64 %rd8, [setmaxangle_param_1]; ld.param.u64 %rd9, [setmaxangle_param_2]; ld.param.u64 %rd10, [setmaxangle_param_3]; ld.param.u64 %rd11, [setmaxangle_param_4]; ld.param.u64 %rd12, [setmaxangle_param_5]; ld.param.u32 %r30, [setmaxangle_param_6]; ld.param.u32 %r31, [setmaxangle_param_7]; ld.param.u32 %r32, [setmaxangle_param_8]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd3, %rd10; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 
%r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_34; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd13, %r43, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd15, %rd4, %rd13; add.s64 %rd16, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; ld.global.nc.f32 %f3, [%rd16]; mul.f32 %f37, %f2, %f2; fma.rn.f32 %f38, %f1, %f1, %f37; fma.rn.f32 %f39, %f3, %f3, %f38; setp.eq.f32 %p6, %f39, 0f00000000; @%p6 bra $L__BB0_34; add.s64 %rd17, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd17]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r129, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r129, %r45, %r30; $L__BB0_5: add.s32 %r46, %r129, %r5; cvt.s64.s32 %rd18, %r46; mul.wide.s32 %rd19, %r46, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f41, [%rd22]; ld.global.nc.f32 %f42, [%rd20]; ld.global.nc.f32 %f43, [%rd21]; mul.f32 %f44, %f43, %f43; fma.rn.f32 %f45, %f42, %f42, %f44; fma.rn.f32 %f46, %f41, %f41, %f45; setp.eq.f32 %p8, %f46, 0f00000000; mov.f32 %f313, 0f00000000; selp.f32 %f9, %f3, %f41, %p8; selp.f32 %f8, %f2, %f43, %p8; selp.f32 %f7, %f1, %f42, %p8; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd24, %r52, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f47, [%rd25]; setp.eq.f32 
%p9, %f47, 0f00000000; @%p9 bra $L__BB0_7; mul.f32 %f48, %f2, %f8; fma.rn.f32 %f49, %f1, %f7, %f48; fma.rn.f32 %f50, %f3, %f9, %f49; abs.f32 %f51, %f50; neg.f32 %f52, %f51; mov.f32 %f53, 0f3F000000; fma.rn.f32 %f54, %f53, %f52, %f53; rsqrt.approx.ftz.f32 %f55, %f54; mul.f32 %f56, %f54, %f55; mul.f32 %f57, %f55, 0f3F000000; neg.f32 %f58, %f56; fma.rn.f32 %f59, %f58, %f57, %f53; fma.rn.f32 %f60, %f56, %f59, %f56; setp.eq.f32 %p10, %f51, 0f3F800000; selp.f32 %f61, 0f00000000, %f60, %p10; mov.f32 %f62, 0f00000000; setp.gt.f32 %p11, %f51, 0f3F0F5C29; selp.f32 %f63, %f61, %f51, %p11; mov.b32 %r53, %f63; mov.b32 %r54, %f50; and.b32 %r55, %r54, -2147483648; or.b32 %r56, %r55, %r53; mov.b32 %f64, %r56; mul.f32 %f65, %f64, %f64; mov.f32 %f66, 0f3C8B1ABB; mov.f32 %f67, 0f3D10ECEF; fma.rn.f32 %f68, %f67, %f65, %f66; mov.f32 %f69, 0f3CFC028C; fma.rn.f32 %f70, %f68, %f65, %f69; mov.f32 %f71, 0f3D372139; fma.rn.f32 %f72, %f70, %f65, %f71; mov.f32 %f73, 0f3D9993DB; fma.rn.f32 %f74, %f72, %f65, %f73; mov.f32 %f75, 0f3E2AAAC6; fma.rn.f32 %f76, %f74, %f65, %f75; mul.f32 %f77, %f76, %f65; fma.rn.f32 %f78, %f77, %f64, %f64; neg.f32 %f79, %f78; selp.f32 %f80, %f78, %f79, %p11; mov.f32 %f81, 0f3FD774EB; mov.f32 %f82, 0f3F6EE581; fma.rn.f32 %f83, %f82, %f81, %f80; setp.gt.f32 %p12, %f50, 0f3F0F5C29; selp.f32 %f84, %f78, %f83, %p12; add.f32 %f85, %f84, %f84; selp.f32 %f86, %f85, %f84, %p11; max.f32 %f313, %f62, %f86; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r59, %r30, -1; min.s32 %r130, %r10, %r59; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r30; add.s32 %r58, %r57, %r30; rem.s32 %r130, %r58, %r30; $L__BB0_10: add.s32 %r60, %r130, %r5; cvt.s64.s32 %rd26, %r60; mul.wide.s32 %rd27, %r60, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f87, [%rd30]; ld.global.nc.f32 %f88, [%rd28]; ld.global.nc.f32 %f89, [%rd29]; mul.f32 %f90, %f89, %f89; fma.rn.f32 %f91, %f88, %f88, %f90; 
fma.rn.f32 %f92, %f87, %f87, %f91; setp.eq.f32 %p14, %f92, 0f00000000; selp.f32 %f14, %f3, %f87, %p14; selp.f32 %f13, %f2, %f89, %p14; selp.f32 %f12, %f1, %f88, %p14; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs11, [%rd31]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r61, %rs15; add.s32 %r62, %r61, 1; mul.lo.s32 %r63, %r62, %r61; shr.u32 %r64, %r63, 1; cvt.u32.u16 %r65, %rs14; add.s32 %r66, %r64, %r65; mul.wide.s32 %rd32, %r66, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f93, [%rd33]; setp.eq.f32 %p15, %f93, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f94, %f2, %f13; fma.rn.f32 %f95, %f1, %f12, %f94; fma.rn.f32 %f96, %f3, %f14, %f95; abs.f32 %f97, %f96; neg.f32 %f98, %f97; mov.f32 %f99, 0f3F000000; fma.rn.f32 %f100, %f99, %f98, %f99; rsqrt.approx.ftz.f32 %f101, %f100; mul.f32 %f102, %f100, %f101; mul.f32 %f103, %f101, 0f3F000000; neg.f32 %f104, %f102; fma.rn.f32 %f105, %f104, %f103, %f99; fma.rn.f32 %f106, %f102, %f105, %f102; setp.eq.f32 %p16, %f97, 0f3F800000; selp.f32 %f107, 0f00000000, %f106, %p16; setp.gt.f32 %p17, %f97, 0f3F0F5C29; selp.f32 %f108, %f107, %f97, %p17; mov.b32 %r67, %f108; mov.b32 %r68, %f96; and.b32 %r69, %r68, -2147483648; or.b32 %r70, %r69, %r67; mov.b32 %f109, %r70; mul.f32 %f110, %f109, %f109; mov.f32 %f111, 0f3C8B1ABB; mov.f32 %f112, 0f3D10ECEF; fma.rn.f32 %f113, %f112, %f110, %f111; mov.f32 %f114, 0f3CFC028C; fma.rn.f32 %f115, %f113, %f110, %f114; mov.f32 %f116, 0f3D372139; fma.rn.f32 %f117, %f115, %f110, %f116; mov.f32 %f118, 0f3D9993DB; fma.rn.f32 %f119, %f117, %f110, %f118; mov.f32 %f120, 0f3E2AAAC6; fma.rn.f32 %f121, %f119, %f110, %f120; mul.f32 %f122, %f121, %f110; fma.rn.f32 %f123, %f122, %f109, %f109; neg.f32 %f124, %f123; selp.f32 %f125, %f123, %f124, %p17; mov.f32 %f126, 0f3FD774EB; mov.f32 %f127, 0f3F6EE581; fma.rn.f32 %f128, %f127, %f126, %f125; setp.gt.f32 %p18, %f96, 0f3F0F5C29; selp.f32 %f129, %f123, %f128, %p18; add.f32 %f130, %f129, %f129; selp.f32 %f131, %f130, %f129, %p17; max.f32 %f313, 
%f313, %f131; $L__BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r14, %r2, -1; @%p19 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: max.s32 %r131, %r14, 0; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r71, %r14, %r31; add.s32 %r72, %r71, %r31; rem.s32 %r131, %r72, %r31; $L__BB0_15: add.s32 %r73, %r131, %r4; mad.lo.s32 %r74, %r73, %r30, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f132, [%rd38]; ld.global.nc.f32 %f133, [%rd36]; ld.global.nc.f32 %f134, [%rd37]; mul.f32 %f135, %f134, %f134; fma.rn.f32 %f136, %f133, %f133, %f135; fma.rn.f32 %f137, %f132, %f132, %f136; setp.eq.f32 %p20, %f137, 0f00000000; selp.f32 %f19, %f3, %f132, %p20; selp.f32 %f18, %f2, %f134, %p20; selp.f32 %f17, %f1, %f133, %p20; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs16, [%rd39]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r75, %rs20; add.s32 %r76, %r75, 1; mul.lo.s32 %r77, %r76, %r75; shr.u32 %r78, %r77, 1; cvt.u32.u16 %r79, %rs19; add.s32 %r80, %r78, %r79; mul.wide.s32 %rd40, %r80, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f138, [%rd41]; setp.eq.f32 %p21, %f138, 0f00000000; @%p21 bra $L__BB0_17; mul.f32 %f139, %f2, %f18; fma.rn.f32 %f140, %f1, %f17, %f139; fma.rn.f32 %f141, %f3, %f19, %f140; abs.f32 %f142, %f141; neg.f32 %f143, %f142; mov.f32 %f144, 0f3F000000; fma.rn.f32 %f145, %f144, %f143, %f144; rsqrt.approx.ftz.f32 %f146, %f145; mul.f32 %f147, %f145, %f146; mul.f32 %f148, %f146, 0f3F000000; neg.f32 %f149, %f147; fma.rn.f32 %f150, %f149, %f148, %f144; fma.rn.f32 %f151, %f147, %f150, %f147; setp.eq.f32 %p22, %f142, 0f3F800000; selp.f32 %f152, 0f00000000, %f151, %p22; setp.gt.f32 %p23, %f142, 0f3F0F5C29; selp.f32 %f153, %f152, %f142, %p23; mov.b32 %r81, %f153; mov.b32 %r82, %f141; and.b32 %r83, %r82, -2147483648; or.b32 %r84, %r83, %r81; mov.b32 %f154, %r84; mul.f32 %f155, %f154, %f154; mov.f32 %f156, 0f3C8B1ABB; mov.f32 
%f157, 0f3D10ECEF; fma.rn.f32 %f158, %f157, %f155, %f156; mov.f32 %f159, 0f3CFC028C; fma.rn.f32 %f160, %f158, %f155, %f159; mov.f32 %f161, 0f3D372139; fma.rn.f32 %f162, %f160, %f155, %f161; mov.f32 %f163, 0f3D9993DB; fma.rn.f32 %f164, %f162, %f155, %f163; mov.f32 %f165, 0f3E2AAAC6; fma.rn.f32 %f166, %f164, %f155, %f165; mul.f32 %f167, %f166, %f155; fma.rn.f32 %f168, %f167, %f154, %f154; neg.f32 %f169, %f168; selp.f32 %f170, %f168, %f169, %p23; mov.f32 %f171, 0f3FD774EB; mov.f32 %f172, 0f3F6EE581; fma.rn.f32 %f173, %f172, %f171, %f170; setp.gt.f32 %p24, %f141, 0f3F0F5C29; selp.f32 %f174, %f168, %f173, %p24; add.f32 %f175, %f174, %f174; selp.f32 %f176, %f175, %f174, %p23; max.f32 %f313, %f313, %f176; $L__BB0_17: add.s32 %r18, %r2, 1; @%p19 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r87, %r31, -1; min.s32 %r132, %r18, %r87; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r85, %r18, %r31; add.s32 %r86, %r85, %r31; rem.s32 %r132, %r86, %r31; $L__BB0_20: add.s32 %r88, %r132, %r4; mad.lo.s32 %r89, %r88, %r30, %r1; cvt.s64.s32 %rd42, %r89; mul.wide.s32 %rd43, %r89, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f177, [%rd46]; ld.global.nc.f32 %f178, [%rd44]; ld.global.nc.f32 %f179, [%rd45]; mul.f32 %f180, %f179, %f179; fma.rn.f32 %f181, %f178, %f178, %f180; fma.rn.f32 %f182, %f177, %f177, %f181; setp.eq.f32 %p26, %f182, 0f00000000; selp.f32 %f24, %f3, %f177, %p26; selp.f32 %f23, %f2, %f179, %p26; selp.f32 %f22, %f1, %f178, %p26; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs21, [%rd47]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r90, %rs25; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; cvt.u32.u16 %r94, %rs24; add.s32 %r95, %r93, %r94; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f183, [%rd49]; setp.eq.f32 %p27, %f183, 0f00000000; @%p27 bra $L__BB0_22; mul.f32 %f184, %f2, %f23; fma.rn.f32 %f185, %f1, %f22, %f184; fma.rn.f32 
%f186, %f3, %f24, %f185; abs.f32 %f187, %f186; neg.f32 %f188, %f187; mov.f32 %f189, 0f3F000000; fma.rn.f32 %f190, %f189, %f188, %f189; rsqrt.approx.ftz.f32 %f191, %f190; mul.f32 %f192, %f190, %f191; mul.f32 %f193, %f191, 0f3F000000; neg.f32 %f194, %f192; fma.rn.f32 %f195, %f194, %f193, %f189; fma.rn.f32 %f196, %f192, %f195, %f192; setp.eq.f32 %p28, %f187, 0f3F800000; selp.f32 %f197, 0f00000000, %f196, %p28; setp.gt.f32 %p29, %f187, 0f3F0F5C29; selp.f32 %f198, %f197, %f187, %p29; mov.b32 %r96, %f198; mov.b32 %r97, %f186; and.b32 %r98, %r97, -2147483648; or.b32 %r99, %r98, %r96; mov.b32 %f199, %r99; mul.f32 %f200, %f199, %f199; mov.f32 %f201, 0f3C8B1ABB; mov.f32 %f202, 0f3D10ECEF; fma.rn.f32 %f203, %f202, %f200, %f201; mov.f32 %f204, 0f3CFC028C; fma.rn.f32 %f205, %f203, %f200, %f204; mov.f32 %f206, 0f3D372139; fma.rn.f32 %f207, %f205, %f200, %f206; mov.f32 %f208, 0f3D9993DB; fma.rn.f32 %f209, %f207, %f200, %f208; mov.f32 %f210, 0f3E2AAAC6; fma.rn.f32 %f211, %f209, %f200, %f210; mul.f32 %f212, %f211, %f200; fma.rn.f32 %f213, %f212, %f199, %f199; neg.f32 %f214, %f213; selp.f32 %f215, %f213, %f214, %p29; mov.f32 %f216, 0f3FD774EB; mov.f32 %f217, 0f3F6EE581; fma.rn.f32 %f218, %f217, %f216, %f215; setp.gt.f32 %p30, %f186, 0f3F0F5C29; selp.f32 %f219, %f213, %f218, %p30; add.f32 %f220, %f219, %f219; selp.f32 %f221, %f220, %f219, %p29; max.f32 %f313, %f313, %f221; $L__BB0_22: setp.eq.s32 %p31, %r32, 1; @%p31 bra $L__BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r22, %r3, -1; @%p32 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: max.s32 %r133, %r22, 0; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r100, %r22, %r32; add.s32 %r101, %r100, %r32; rem.s32 %r133, %r101, %r32; $L__BB0_26: mad.lo.s32 %r102, %r133, %r31, %r2; mad.lo.s32 %r103, %r102, %r30, %r1; cvt.s64.s32 %rd50, %r103; mul.wide.s32 %rd51, %r103, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f222, [%rd54]; ld.global.nc.f32 %f223, [%rd52]; 
ld.global.nc.f32 %f224, [%rd53]; mul.f32 %f225, %f224, %f224; fma.rn.f32 %f226, %f223, %f223, %f225; fma.rn.f32 %f227, %f222, %f222, %f226; setp.eq.f32 %p33, %f227, 0f00000000; selp.f32 %f29, %f3, %f222, %p33; selp.f32 %f28, %f2, %f224, %p33; selp.f32 %f27, %f1, %f223, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs26, [%rd55]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r104, %rs30; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; cvt.u32.u16 %r108, %rs29; add.s32 %r109, %r107, %r108; mul.wide.s32 %rd56, %r109, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f228, [%rd57]; setp.eq.f32 %p34, %f228, 0f00000000; @%p34 bra $L__BB0_28; mul.f32 %f229, %f2, %f28; fma.rn.f32 %f230, %f1, %f27, %f229; fma.rn.f32 %f231, %f3, %f29, %f230; abs.f32 %f232, %f231; neg.f32 %f233, %f232; mov.f32 %f234, 0f3F000000; fma.rn.f32 %f235, %f234, %f233, %f234; rsqrt.approx.ftz.f32 %f236, %f235; mul.f32 %f237, %f235, %f236; mul.f32 %f238, %f236, 0f3F000000; neg.f32 %f239, %f237; fma.rn.f32 %f240, %f239, %f238, %f234; fma.rn.f32 %f241, %f237, %f240, %f237; setp.eq.f32 %p35, %f232, 0f3F800000; selp.f32 %f242, 0f00000000, %f241, %p35; setp.gt.f32 %p36, %f232, 0f3F0F5C29; selp.f32 %f243, %f242, %f232, %p36; mov.b32 %r110, %f243; mov.b32 %r111, %f231; and.b32 %r112, %r111, -2147483648; or.b32 %r113, %r112, %r110; mov.b32 %f244, %r113; mul.f32 %f245, %f244, %f244; mov.f32 %f246, 0f3C8B1ABB; mov.f32 %f247, 0f3D10ECEF; fma.rn.f32 %f248, %f247, %f245, %f246; mov.f32 %f249, 0f3CFC028C; fma.rn.f32 %f250, %f248, %f245, %f249; mov.f32 %f251, 0f3D372139; fma.rn.f32 %f252, %f250, %f245, %f251; mov.f32 %f253, 0f3D9993DB; fma.rn.f32 %f254, %f252, %f245, %f253; mov.f32 %f255, 0f3E2AAAC6; fma.rn.f32 %f256, %f254, %f245, %f255; mul.f32 %f257, %f256, %f245; fma.rn.f32 %f258, %f257, %f244, %f244; neg.f32 %f259, %f258; selp.f32 %f260, %f258, %f259, %p36; mov.f32 %f261, 0f3FD774EB; mov.f32 %f262, 0f3F6EE581; fma.rn.f32 %f263, %f262, %f261, %f260; 
setp.gt.f32 %p37, %f231, 0f3F0F5C29; selp.f32 %f264, %f258, %f263, %p37; add.f32 %f265, %f264, %f264; selp.f32 %f266, %f265, %f264, %p36; max.f32 %f313, %f313, %f266; $L__BB0_28: add.s32 %r26, %r3, 1; @%p32 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: add.s32 %r116, %r32, -1; min.s32 %r134, %r26, %r116; bra.uni $L__BB0_31; $L__BB0_29: rem.s32 %r114, %r26, %r32; add.s32 %r115, %r114, %r32; rem.s32 %r134, %r115, %r32; $L__BB0_31: mad.lo.s32 %r117, %r134, %r31, %r2; mad.lo.s32 %r118, %r117, %r30, %r1; cvt.s64.s32 %rd58, %r118; mul.wide.s32 %rd59, %r118, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f267, [%rd62]; ld.global.nc.f32 %f268, [%rd60]; ld.global.nc.f32 %f269, [%rd61]; mul.f32 %f270, %f269, %f269; fma.rn.f32 %f271, %f268, %f268, %f270; fma.rn.f32 %f272, %f267, %f267, %f271; setp.eq.f32 %p39, %f272, 0f00000000; selp.f32 %f34, %f3, %f267, %p39; selp.f32 %f33, %f2, %f269, %p39; selp.f32 %f32, %f1, %f268, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs31, [%rd63]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r119, %rs35; add.s32 %r120, %r119, 1; mul.lo.s32 %r121, %r120, %r119; shr.u32 %r122, %r121, 1; cvt.u32.u16 %r123, %rs34; add.s32 %r124, %r122, %r123; mul.wide.s32 %rd64, %r124, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f273, [%rd65]; setp.eq.f32 %p40, %f273, 0f00000000; @%p40 bra $L__BB0_33; mul.f32 %f274, %f2, %f33; fma.rn.f32 %f275, %f1, %f32, %f274; fma.rn.f32 %f276, %f3, %f34, %f275; abs.f32 %f277, %f276; neg.f32 %f278, %f277; mov.f32 %f279, 0f3F000000; fma.rn.f32 %f280, %f279, %f278, %f279; rsqrt.approx.ftz.f32 %f281, %f280; mul.f32 %f282, %f280, %f281; mul.f32 %f283, %f281, 0f3F000000; neg.f32 %f284, %f282; fma.rn.f32 %f285, %f284, %f283, %f279; fma.rn.f32 %f286, %f282, %f285, %f282; setp.eq.f32 %p41, %f277, 0f3F800000; selp.f32 %f287, 0f00000000, %f286, %p41; setp.gt.f32 %p42, %f277, 0f3F0F5C29; selp.f32 %f288, %f287, %f277, %p42; mov.b32 %r125, %f288; 
mov.b32 %r126, %f276; and.b32 %r127, %r126, -2147483648; or.b32 %r128, %r127, %r125; mov.b32 %f289, %r128; mul.f32 %f290, %f289, %f289; mov.f32 %f291, 0f3C8B1ABB; mov.f32 %f292, 0f3D10ECEF; fma.rn.f32 %f293, %f292, %f290, %f291; mov.f32 %f294, 0f3CFC028C; fma.rn.f32 %f295, %f293, %f290, %f294; mov.f32 %f296, 0f3D372139; fma.rn.f32 %f297, %f295, %f290, %f296; mov.f32 %f298, 0f3D9993DB; fma.rn.f32 %f299, %f297, %f290, %f298; mov.f32 %f300, 0f3E2AAAC6; fma.rn.f32 %f301, %f299, %f290, %f300; mul.f32 %f302, %f301, %f290; fma.rn.f32 %f303, %f302, %f289, %f289; neg.f32 %f304, %f303; selp.f32 %f305, %f303, %f304, %p42; mov.f32 %f306, 0f3FD774EB; mov.f32 %f307, 0f3F6EE581; fma.rn.f32 %f308, %f307, %f306, %f305; setp.gt.f32 %p43, %f276, 0f3F0F5C29; selp.f32 %f309, %f303, %f308, %p43; add.f32 %f310, %f309, %f309; selp.f32 %f311, %f310, %f309, %p42; max.f32 %f313, %f313, %f311; $L__BB0_33: cvta.to.global.u64 %rd66, %rd7; shl.b64 %rd67, %rd6, 2; add.s64 %rd68, %rd66, %rd67; st.global.f32 [%rd68], %f313; $L__BB0_34: ret; } ` setmaxangle_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<36>; .reg .f32 %f<318>; .reg .b32 %r<135>; .reg .b64 %rd<69>; ld.param.u8 %rs5, [setmaxangle_param_9]; ld.param.u64 %rd7, [setmaxangle_param_0]; ld.param.u64 %rd8, [setmaxangle_param_1]; ld.param.u64 %rd9, [setmaxangle_param_2]; ld.param.u64 %rd10, [setmaxangle_param_3]; ld.param.u64 %rd11, [setmaxangle_param_4]; ld.param.u64 %rd12, [setmaxangle_param_5]; ld.param.u32 %r30, [setmaxangle_param_6]; ld.param.u32 %r31, [setmaxangle_param_7]; ld.param.u32 %r32, [setmaxangle_param_8]; 
cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd3, %rd10; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_34; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd13, %r43, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd15, %rd4, %rd13; add.s64 %rd16, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; ld.global.nc.f32 %f3, [%rd16]; mul.f32 %f37, %f2, %f2; fma.rn.f32 %f38, %f1, %f1, %f37; fma.rn.f32 %f39, %f3, %f3, %f38; setp.eq.f32 %p6, %f39, 0f00000000; @%p6 bra $L__BB0_34; add.s64 %rd17, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd17]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r129, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r129, %r45, %r30; $L__BB0_5: add.s32 %r46, %r129, %r5; cvt.s64.s32 %rd18, %r46; mul.wide.s32 %rd19, %r46, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f41, [%rd22]; ld.global.nc.f32 %f42, [%rd20]; ld.global.nc.f32 %f43, [%rd21]; mul.f32 %f44, %f43, %f43; fma.rn.f32 %f45, %f42, %f42, %f44; fma.rn.f32 %f46, %f41, %f41, %f45; setp.eq.f32 %p8, %f46, 0f00000000; mov.f32 %f313, 0f00000000; selp.f32 %f9, %f3, %f41, %p8; selp.f32 %f8, %f2, %f43, %p8; selp.f32 %f7, %f1, %f42, %p8; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; min.u16 %rs9, 
%rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd24, %r52, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f47, [%rd25]; setp.eq.f32 %p9, %f47, 0f00000000; @%p9 bra $L__BB0_7; mul.f32 %f48, %f2, %f8; fma.rn.f32 %f49, %f1, %f7, %f48; fma.rn.f32 %f50, %f3, %f9, %f49; abs.f32 %f51, %f50; neg.f32 %f52, %f51; mov.f32 %f53, 0f3F000000; fma.rn.f32 %f54, %f53, %f52, %f53; rsqrt.approx.ftz.f32 %f55, %f54; mul.f32 %f56, %f54, %f55; mul.f32 %f57, %f55, 0f3F000000; neg.f32 %f58, %f56; fma.rn.f32 %f59, %f58, %f57, %f53; fma.rn.f32 %f60, %f56, %f59, %f56; setp.eq.f32 %p10, %f51, 0f3F800000; selp.f32 %f61, 0f00000000, %f60, %p10; mov.f32 %f62, 0f00000000; setp.gt.f32 %p11, %f51, 0f3F0F5C29; selp.f32 %f63, %f61, %f51, %p11; mov.b32 %r53, %f63; mov.b32 %r54, %f50; and.b32 %r55, %r54, -2147483648; or.b32 %r56, %r55, %r53; mov.b32 %f64, %r56; mul.f32 %f65, %f64, %f64; mov.f32 %f66, 0f3C8B1ABB; mov.f32 %f67, 0f3D10ECEF; fma.rn.f32 %f68, %f67, %f65, %f66; mov.f32 %f69, 0f3CFC028C; fma.rn.f32 %f70, %f68, %f65, %f69; mov.f32 %f71, 0f3D372139; fma.rn.f32 %f72, %f70, %f65, %f71; mov.f32 %f73, 0f3D9993DB; fma.rn.f32 %f74, %f72, %f65, %f73; mov.f32 %f75, 0f3E2AAAC6; fma.rn.f32 %f76, %f74, %f65, %f75; mul.f32 %f77, %f76, %f65; fma.rn.f32 %f78, %f77, %f64, %f64; neg.f32 %f79, %f78; selp.f32 %f80, %f78, %f79, %p11; mov.f32 %f81, 0f3FD774EB; mov.f32 %f82, 0f3F6EE581; fma.rn.f32 %f83, %f82, %f81, %f80; setp.gt.f32 %p12, %f50, 0f3F0F5C29; selp.f32 %f84, %f78, %f83, %p12; add.f32 %f85, %f84, %f84; selp.f32 %f86, %f85, %f84, %p11; max.f32 %f313, %f62, %f86; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r59, %r30, -1; min.s32 %r130, %r10, %r59; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r30; add.s32 %r58, %r57, %r30; rem.s32 %r130, %r58, %r30; $L__BB0_10: add.s32 %r60, %r130, %r5; cvt.s64.s32 
%rd26, %r60; mul.wide.s32 %rd27, %r60, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f87, [%rd30]; ld.global.nc.f32 %f88, [%rd28]; ld.global.nc.f32 %f89, [%rd29]; mul.f32 %f90, %f89, %f89; fma.rn.f32 %f91, %f88, %f88, %f90; fma.rn.f32 %f92, %f87, %f87, %f91; setp.eq.f32 %p14, %f92, 0f00000000; selp.f32 %f14, %f3, %f87, %p14; selp.f32 %f13, %f2, %f89, %p14; selp.f32 %f12, %f1, %f88, %p14; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs11, [%rd31]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r61, %rs15; add.s32 %r62, %r61, 1; mul.lo.s32 %r63, %r62, %r61; shr.u32 %r64, %r63, 1; cvt.u32.u16 %r65, %rs14; add.s32 %r66, %r64, %r65; mul.wide.s32 %rd32, %r66, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f93, [%rd33]; setp.eq.f32 %p15, %f93, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f94, %f2, %f13; fma.rn.f32 %f95, %f1, %f12, %f94; fma.rn.f32 %f96, %f3, %f14, %f95; abs.f32 %f97, %f96; neg.f32 %f98, %f97; mov.f32 %f99, 0f3F000000; fma.rn.f32 %f100, %f99, %f98, %f99; rsqrt.approx.ftz.f32 %f101, %f100; mul.f32 %f102, %f100, %f101; mul.f32 %f103, %f101, 0f3F000000; neg.f32 %f104, %f102; fma.rn.f32 %f105, %f104, %f103, %f99; fma.rn.f32 %f106, %f102, %f105, %f102; setp.eq.f32 %p16, %f97, 0f3F800000; selp.f32 %f107, 0f00000000, %f106, %p16; setp.gt.f32 %p17, %f97, 0f3F0F5C29; selp.f32 %f108, %f107, %f97, %p17; mov.b32 %r67, %f108; mov.b32 %r68, %f96; and.b32 %r69, %r68, -2147483648; or.b32 %r70, %r69, %r67; mov.b32 %f109, %r70; mul.f32 %f110, %f109, %f109; mov.f32 %f111, 0f3C8B1ABB; mov.f32 %f112, 0f3D10ECEF; fma.rn.f32 %f113, %f112, %f110, %f111; mov.f32 %f114, 0f3CFC028C; fma.rn.f32 %f115, %f113, %f110, %f114; mov.f32 %f116, 0f3D372139; fma.rn.f32 %f117, %f115, %f110, %f116; mov.f32 %f118, 0f3D9993DB; fma.rn.f32 %f119, %f117, %f110, %f118; mov.f32 %f120, 0f3E2AAAC6; fma.rn.f32 %f121, %f119, %f110, %f120; mul.f32 %f122, %f121, %f110; fma.rn.f32 %f123, %f122, %f109, %f109; neg.f32 %f124, %f123; 
selp.f32 %f125, %f123, %f124, %p17; mov.f32 %f126, 0f3FD774EB; mov.f32 %f127, 0f3F6EE581; fma.rn.f32 %f128, %f127, %f126, %f125; setp.gt.f32 %p18, %f96, 0f3F0F5C29; selp.f32 %f129, %f123, %f128, %p18; add.f32 %f130, %f129, %f129; selp.f32 %f131, %f130, %f129, %p17; max.f32 %f313, %f313, %f131; $L__BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r14, %r2, -1; @%p19 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: max.s32 %r131, %r14, 0; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r71, %r14, %r31; add.s32 %r72, %r71, %r31; rem.s32 %r131, %r72, %r31; $L__BB0_15: add.s32 %r73, %r131, %r4; mad.lo.s32 %r74, %r73, %r30, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f132, [%rd38]; ld.global.nc.f32 %f133, [%rd36]; ld.global.nc.f32 %f134, [%rd37]; mul.f32 %f135, %f134, %f134; fma.rn.f32 %f136, %f133, %f133, %f135; fma.rn.f32 %f137, %f132, %f132, %f136; setp.eq.f32 %p20, %f137, 0f00000000; selp.f32 %f19, %f3, %f132, %p20; selp.f32 %f18, %f2, %f134, %p20; selp.f32 %f17, %f1, %f133, %p20; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs16, [%rd39]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r75, %rs20; add.s32 %r76, %r75, 1; mul.lo.s32 %r77, %r76, %r75; shr.u32 %r78, %r77, 1; cvt.u32.u16 %r79, %rs19; add.s32 %r80, %r78, %r79; mul.wide.s32 %rd40, %r80, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f138, [%rd41]; setp.eq.f32 %p21, %f138, 0f00000000; @%p21 bra $L__BB0_17; mul.f32 %f139, %f2, %f18; fma.rn.f32 %f140, %f1, %f17, %f139; fma.rn.f32 %f141, %f3, %f19, %f140; abs.f32 %f142, %f141; neg.f32 %f143, %f142; mov.f32 %f144, 0f3F000000; fma.rn.f32 %f145, %f144, %f143, %f144; rsqrt.approx.ftz.f32 %f146, %f145; mul.f32 %f147, %f145, %f146; mul.f32 %f148, %f146, 0f3F000000; neg.f32 %f149, %f147; fma.rn.f32 %f150, %f149, %f148, %f144; fma.rn.f32 %f151, %f147, %f150, %f147; setp.eq.f32 %p22, %f142, 0f3F800000; selp.f32 %f152, 
0f00000000, %f151, %p22; setp.gt.f32 %p23, %f142, 0f3F0F5C29; selp.f32 %f153, %f152, %f142, %p23; mov.b32 %r81, %f153; mov.b32 %r82, %f141; and.b32 %r83, %r82, -2147483648; or.b32 %r84, %r83, %r81; mov.b32 %f154, %r84; mul.f32 %f155, %f154, %f154; mov.f32 %f156, 0f3C8B1ABB; mov.f32 %f157, 0f3D10ECEF; fma.rn.f32 %f158, %f157, %f155, %f156; mov.f32 %f159, 0f3CFC028C; fma.rn.f32 %f160, %f158, %f155, %f159; mov.f32 %f161, 0f3D372139; fma.rn.f32 %f162, %f160, %f155, %f161; mov.f32 %f163, 0f3D9993DB; fma.rn.f32 %f164, %f162, %f155, %f163; mov.f32 %f165, 0f3E2AAAC6; fma.rn.f32 %f166, %f164, %f155, %f165; mul.f32 %f167, %f166, %f155; fma.rn.f32 %f168, %f167, %f154, %f154; neg.f32 %f169, %f168; selp.f32 %f170, %f168, %f169, %p23; mov.f32 %f171, 0f3FD774EB; mov.f32 %f172, 0f3F6EE581; fma.rn.f32 %f173, %f172, %f171, %f170; setp.gt.f32 %p24, %f141, 0f3F0F5C29; selp.f32 %f174, %f168, %f173, %p24; add.f32 %f175, %f174, %f174; selp.f32 %f176, %f175, %f174, %p23; max.f32 %f313, %f313, %f176; $L__BB0_17: add.s32 %r18, %r2, 1; @%p19 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r87, %r31, -1; min.s32 %r132, %r18, %r87; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r85, %r18, %r31; add.s32 %r86, %r85, %r31; rem.s32 %r132, %r86, %r31; $L__BB0_20: add.s32 %r88, %r132, %r4; mad.lo.s32 %r89, %r88, %r30, %r1; cvt.s64.s32 %rd42, %r89; mul.wide.s32 %rd43, %r89, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f177, [%rd46]; ld.global.nc.f32 %f178, [%rd44]; ld.global.nc.f32 %f179, [%rd45]; mul.f32 %f180, %f179, %f179; fma.rn.f32 %f181, %f178, %f178, %f180; fma.rn.f32 %f182, %f177, %f177, %f181; setp.eq.f32 %p26, %f182, 0f00000000; selp.f32 %f24, %f3, %f177, %p26; selp.f32 %f23, %f2, %f179, %p26; selp.f32 %f22, %f1, %f178, %p26; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs21, [%rd47]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r90, %rs25; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, 
%r92, 1; cvt.u32.u16 %r94, %rs24; add.s32 %r95, %r93, %r94; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f183, [%rd49]; setp.eq.f32 %p27, %f183, 0f00000000; @%p27 bra $L__BB0_22; mul.f32 %f184, %f2, %f23; fma.rn.f32 %f185, %f1, %f22, %f184; fma.rn.f32 %f186, %f3, %f24, %f185; abs.f32 %f187, %f186; neg.f32 %f188, %f187; mov.f32 %f189, 0f3F000000; fma.rn.f32 %f190, %f189, %f188, %f189; rsqrt.approx.ftz.f32 %f191, %f190; mul.f32 %f192, %f190, %f191; mul.f32 %f193, %f191, 0f3F000000; neg.f32 %f194, %f192; fma.rn.f32 %f195, %f194, %f193, %f189; fma.rn.f32 %f196, %f192, %f195, %f192; setp.eq.f32 %p28, %f187, 0f3F800000; selp.f32 %f197, 0f00000000, %f196, %p28; setp.gt.f32 %p29, %f187, 0f3F0F5C29; selp.f32 %f198, %f197, %f187, %p29; mov.b32 %r96, %f198; mov.b32 %r97, %f186; and.b32 %r98, %r97, -2147483648; or.b32 %r99, %r98, %r96; mov.b32 %f199, %r99; mul.f32 %f200, %f199, %f199; mov.f32 %f201, 0f3C8B1ABB; mov.f32 %f202, 0f3D10ECEF; fma.rn.f32 %f203, %f202, %f200, %f201; mov.f32 %f204, 0f3CFC028C; fma.rn.f32 %f205, %f203, %f200, %f204; mov.f32 %f206, 0f3D372139; fma.rn.f32 %f207, %f205, %f200, %f206; mov.f32 %f208, 0f3D9993DB; fma.rn.f32 %f209, %f207, %f200, %f208; mov.f32 %f210, 0f3E2AAAC6; fma.rn.f32 %f211, %f209, %f200, %f210; mul.f32 %f212, %f211, %f200; fma.rn.f32 %f213, %f212, %f199, %f199; neg.f32 %f214, %f213; selp.f32 %f215, %f213, %f214, %p29; mov.f32 %f216, 0f3FD774EB; mov.f32 %f217, 0f3F6EE581; fma.rn.f32 %f218, %f217, %f216, %f215; setp.gt.f32 %p30, %f186, 0f3F0F5C29; selp.f32 %f219, %f213, %f218, %p30; add.f32 %f220, %f219, %f219; selp.f32 %f221, %f220, %f219, %p29; max.f32 %f313, %f313, %f221; $L__BB0_22: setp.eq.s32 %p31, %r32, 1; @%p31 bra $L__BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r22, %r3, -1; @%p32 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: max.s32 %r133, %r22, 0; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r100, %r22, %r32; add.s32 %r101, %r100, %r32; rem.s32 %r133, %r101, %r32; $L__BB0_26: 
mad.lo.s32 %r102, %r133, %r31, %r2; mad.lo.s32 %r103, %r102, %r30, %r1; cvt.s64.s32 %rd50, %r103; mul.wide.s32 %rd51, %r103, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f222, [%rd54]; ld.global.nc.f32 %f223, [%rd52]; ld.global.nc.f32 %f224, [%rd53]; mul.f32 %f225, %f224, %f224; fma.rn.f32 %f226, %f223, %f223, %f225; fma.rn.f32 %f227, %f222, %f222, %f226; setp.eq.f32 %p33, %f227, 0f00000000; selp.f32 %f29, %f3, %f222, %p33; selp.f32 %f28, %f2, %f224, %p33; selp.f32 %f27, %f1, %f223, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs26, [%rd55]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r104, %rs30; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; cvt.u32.u16 %r108, %rs29; add.s32 %r109, %r107, %r108; mul.wide.s32 %rd56, %r109, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f228, [%rd57]; setp.eq.f32 %p34, %f228, 0f00000000; @%p34 bra $L__BB0_28; mul.f32 %f229, %f2, %f28; fma.rn.f32 %f230, %f1, %f27, %f229; fma.rn.f32 %f231, %f3, %f29, %f230; abs.f32 %f232, %f231; neg.f32 %f233, %f232; mov.f32 %f234, 0f3F000000; fma.rn.f32 %f235, %f234, %f233, %f234; rsqrt.approx.ftz.f32 %f236, %f235; mul.f32 %f237, %f235, %f236; mul.f32 %f238, %f236, 0f3F000000; neg.f32 %f239, %f237; fma.rn.f32 %f240, %f239, %f238, %f234; fma.rn.f32 %f241, %f237, %f240, %f237; setp.eq.f32 %p35, %f232, 0f3F800000; selp.f32 %f242, 0f00000000, %f241, %p35; setp.gt.f32 %p36, %f232, 0f3F0F5C29; selp.f32 %f243, %f242, %f232, %p36; mov.b32 %r110, %f243; mov.b32 %r111, %f231; and.b32 %r112, %r111, -2147483648; or.b32 %r113, %r112, %r110; mov.b32 %f244, %r113; mul.f32 %f245, %f244, %f244; mov.f32 %f246, 0f3C8B1ABB; mov.f32 %f247, 0f3D10ECEF; fma.rn.f32 %f248, %f247, %f245, %f246; mov.f32 %f249, 0f3CFC028C; fma.rn.f32 %f250, %f248, %f245, %f249; mov.f32 %f251, 0f3D372139; fma.rn.f32 %f252, %f250, %f245, %f251; mov.f32 %f253, 0f3D9993DB; fma.rn.f32 %f254, %f252, %f245, %f253; mov.f32 
%f255, 0f3E2AAAC6; fma.rn.f32 %f256, %f254, %f245, %f255; mul.f32 %f257, %f256, %f245; fma.rn.f32 %f258, %f257, %f244, %f244; neg.f32 %f259, %f258; selp.f32 %f260, %f258, %f259, %p36; mov.f32 %f261, 0f3FD774EB; mov.f32 %f262, 0f3F6EE581; fma.rn.f32 %f263, %f262, %f261, %f260; setp.gt.f32 %p37, %f231, 0f3F0F5C29; selp.f32 %f264, %f258, %f263, %p37; add.f32 %f265, %f264, %f264; selp.f32 %f266, %f265, %f264, %p36; max.f32 %f313, %f313, %f266; $L__BB0_28: add.s32 %r26, %r3, 1; @%p32 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: add.s32 %r116, %r32, -1; min.s32 %r134, %r26, %r116; bra.uni $L__BB0_31; $L__BB0_29: rem.s32 %r114, %r26, %r32; add.s32 %r115, %r114, %r32; rem.s32 %r134, %r115, %r32; $L__BB0_31: mad.lo.s32 %r117, %r134, %r31, %r2; mad.lo.s32 %r118, %r117, %r30, %r1; cvt.s64.s32 %rd58, %r118; mul.wide.s32 %rd59, %r118, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f267, [%rd62]; ld.global.nc.f32 %f268, [%rd60]; ld.global.nc.f32 %f269, [%rd61]; mul.f32 %f270, %f269, %f269; fma.rn.f32 %f271, %f268, %f268, %f270; fma.rn.f32 %f272, %f267, %f267, %f271; setp.eq.f32 %p39, %f272, 0f00000000; selp.f32 %f34, %f3, %f267, %p39; selp.f32 %f33, %f2, %f269, %p39; selp.f32 %f32, %f1, %f268, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs31, [%rd63]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r119, %rs35; add.s32 %r120, %r119, 1; mul.lo.s32 %r121, %r120, %r119; shr.u32 %r122, %r121, 1; cvt.u32.u16 %r123, %rs34; add.s32 %r124, %r122, %r123; mul.wide.s32 %rd64, %r124, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f273, [%rd65]; setp.eq.f32 %p40, %f273, 0f00000000; @%p40 bra $L__BB0_33; mul.f32 %f274, %f2, %f33; fma.rn.f32 %f275, %f1, %f32, %f274; fma.rn.f32 %f276, %f3, %f34, %f275; abs.f32 %f277, %f276; neg.f32 %f278, %f277; mov.f32 %f279, 0f3F000000; fma.rn.f32 %f280, %f279, %f278, %f279; rsqrt.approx.ftz.f32 %f281, %f280; mul.f32 %f282, %f280, %f281; mul.f32 %f283, %f281, 
0f3F000000; neg.f32 %f284, %f282; fma.rn.f32 %f285, %f284, %f283, %f279; fma.rn.f32 %f286, %f282, %f285, %f282; setp.eq.f32 %p41, %f277, 0f3F800000; selp.f32 %f287, 0f00000000, %f286, %p41; setp.gt.f32 %p42, %f277, 0f3F0F5C29; selp.f32 %f288, %f287, %f277, %p42; mov.b32 %r125, %f288; mov.b32 %r126, %f276; and.b32 %r127, %r126, -2147483648; or.b32 %r128, %r127, %r125; mov.b32 %f289, %r128; mul.f32 %f290, %f289, %f289; mov.f32 %f291, 0f3C8B1ABB; mov.f32 %f292, 0f3D10ECEF; fma.rn.f32 %f293, %f292, %f290, %f291; mov.f32 %f294, 0f3CFC028C; fma.rn.f32 %f295, %f293, %f290, %f294; mov.f32 %f296, 0f3D372139; fma.rn.f32 %f297, %f295, %f290, %f296; mov.f32 %f298, 0f3D9993DB; fma.rn.f32 %f299, %f297, %f290, %f298; mov.f32 %f300, 0f3E2AAAC6; fma.rn.f32 %f301, %f299, %f290, %f300; mul.f32 %f302, %f301, %f290; fma.rn.f32 %f303, %f302, %f289, %f289; neg.f32 %f304, %f303; selp.f32 %f305, %f303, %f304, %p42; mov.f32 %f306, 0f3FD774EB; mov.f32 %f307, 0f3F6EE581; fma.rn.f32 %f308, %f307, %f306, %f305; setp.gt.f32 %p43, %f276, 0f3F0F5C29; selp.f32 %f309, %f303, %f308, %p43; add.f32 %f310, %f309, %f309; selp.f32 %f311, %f310, %f309, %p42; max.f32 %f313, %f313, %f311; $L__BB0_33: cvta.to.global.u64 %rd66, %rd7; shl.b64 %rd67, %rd6, 2; add.s64 %rd68, %rd66, %rd67; st.global.f32 [%rd68], %f313; $L__BB0_34: ret; } ` setmaxangle_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<36>; .reg .f32 %f<318>; .reg .b32 %r<135>; .reg .b64 %rd<69>; ld.param.u8 %rs5, [setmaxangle_param_9]; ld.param.u64 %rd7, [setmaxangle_param_0]; ld.param.u64 %rd8, [setmaxangle_param_1]; ld.param.u64 
%rd9, [setmaxangle_param_2]; ld.param.u64 %rd10, [setmaxangle_param_3]; ld.param.u64 %rd11, [setmaxangle_param_4]; ld.param.u64 %rd12, [setmaxangle_param_5]; ld.param.u32 %r30, [setmaxangle_param_6]; ld.param.u32 %r31, [setmaxangle_param_7]; ld.param.u32 %r32, [setmaxangle_param_8]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd3, %rd10; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_34; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd13, %r43, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd15, %rd4, %rd13; add.s64 %rd16, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; ld.global.nc.f32 %f3, [%rd16]; mul.f32 %f37, %f2, %f2; fma.rn.f32 %f38, %f1, %f1, %f37; fma.rn.f32 %f39, %f3, %f3, %f38; setp.eq.f32 %p6, %f39, 0f00000000; @%p6 bra $L__BB0_34; add.s64 %rd17, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd17]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r129, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r129, %r45, %r30; $L__BB0_5: add.s32 %r46, %r129, %r5; cvt.s64.s32 %rd18, %r46; mul.wide.s32 %rd19, %r46, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f41, [%rd22]; ld.global.nc.f32 %f42, [%rd20]; ld.global.nc.f32 %f43, [%rd21]; mul.f32 %f44, %f43, %f43; fma.rn.f32 
%f45, %f42, %f42, %f44; fma.rn.f32 %f46, %f41, %f41, %f45; setp.eq.f32 %p8, %f46, 0f00000000; mov.f32 %f313, 0f00000000; selp.f32 %f9, %f3, %f41, %p8; selp.f32 %f8, %f2, %f43, %p8; selp.f32 %f7, %f1, %f42, %p8; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd24, %r52, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f47, [%rd25]; setp.eq.f32 %p9, %f47, 0f00000000; @%p9 bra $L__BB0_7; mul.f32 %f48, %f2, %f8; fma.rn.f32 %f49, %f1, %f7, %f48; fma.rn.f32 %f50, %f3, %f9, %f49; abs.f32 %f51, %f50; neg.f32 %f52, %f51; mov.f32 %f53, 0f3F000000; fma.rn.f32 %f54, %f53, %f52, %f53; rsqrt.approx.ftz.f32 %f55, %f54; mul.f32 %f56, %f54, %f55; mul.f32 %f57, %f55, 0f3F000000; neg.f32 %f58, %f56; fma.rn.f32 %f59, %f58, %f57, %f53; fma.rn.f32 %f60, %f56, %f59, %f56; setp.eq.f32 %p10, %f51, 0f3F800000; selp.f32 %f61, 0f00000000, %f60, %p10; mov.f32 %f62, 0f00000000; setp.gt.f32 %p11, %f51, 0f3F0F5C29; selp.f32 %f63, %f61, %f51, %p11; mov.b32 %r53, %f63; mov.b32 %r54, %f50; and.b32 %r55, %r54, -2147483648; or.b32 %r56, %r55, %r53; mov.b32 %f64, %r56; mul.f32 %f65, %f64, %f64; mov.f32 %f66, 0f3C8B1ABB; mov.f32 %f67, 0f3D10ECEF; fma.rn.f32 %f68, %f67, %f65, %f66; mov.f32 %f69, 0f3CFC028C; fma.rn.f32 %f70, %f68, %f65, %f69; mov.f32 %f71, 0f3D372139; fma.rn.f32 %f72, %f70, %f65, %f71; mov.f32 %f73, 0f3D9993DB; fma.rn.f32 %f74, %f72, %f65, %f73; mov.f32 %f75, 0f3E2AAAC6; fma.rn.f32 %f76, %f74, %f65, %f75; mul.f32 %f77, %f76, %f65; fma.rn.f32 %f78, %f77, %f64, %f64; neg.f32 %f79, %f78; selp.f32 %f80, %f78, %f79, %p11; mov.f32 %f81, 0f3FD774EB; mov.f32 %f82, 0f3F6EE581; fma.rn.f32 %f83, %f82, %f81, %f80; setp.gt.f32 %p12, %f50, 0f3F0F5C29; selp.f32 %f84, %f78, %f83, %p12; add.f32 %f85, %f84, %f84; selp.f32 %f86, %f85, %f84, %p11; max.f32 %f313, %f62, %f86; 
$L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r59, %r30, -1; min.s32 %r130, %r10, %r59; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r30; add.s32 %r58, %r57, %r30; rem.s32 %r130, %r58, %r30; $L__BB0_10: add.s32 %r60, %r130, %r5; cvt.s64.s32 %rd26, %r60; mul.wide.s32 %rd27, %r60, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f87, [%rd30]; ld.global.nc.f32 %f88, [%rd28]; ld.global.nc.f32 %f89, [%rd29]; mul.f32 %f90, %f89, %f89; fma.rn.f32 %f91, %f88, %f88, %f90; fma.rn.f32 %f92, %f87, %f87, %f91; setp.eq.f32 %p14, %f92, 0f00000000; selp.f32 %f14, %f3, %f87, %p14; selp.f32 %f13, %f2, %f89, %p14; selp.f32 %f12, %f1, %f88, %p14; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs11, [%rd31]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r61, %rs15; add.s32 %r62, %r61, 1; mul.lo.s32 %r63, %r62, %r61; shr.u32 %r64, %r63, 1; cvt.u32.u16 %r65, %rs14; add.s32 %r66, %r64, %r65; mul.wide.s32 %rd32, %r66, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f93, [%rd33]; setp.eq.f32 %p15, %f93, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f94, %f2, %f13; fma.rn.f32 %f95, %f1, %f12, %f94; fma.rn.f32 %f96, %f3, %f14, %f95; abs.f32 %f97, %f96; neg.f32 %f98, %f97; mov.f32 %f99, 0f3F000000; fma.rn.f32 %f100, %f99, %f98, %f99; rsqrt.approx.ftz.f32 %f101, %f100; mul.f32 %f102, %f100, %f101; mul.f32 %f103, %f101, 0f3F000000; neg.f32 %f104, %f102; fma.rn.f32 %f105, %f104, %f103, %f99; fma.rn.f32 %f106, %f102, %f105, %f102; setp.eq.f32 %p16, %f97, 0f3F800000; selp.f32 %f107, 0f00000000, %f106, %p16; setp.gt.f32 %p17, %f97, 0f3F0F5C29; selp.f32 %f108, %f107, %f97, %p17; mov.b32 %r67, %f108; mov.b32 %r68, %f96; and.b32 %r69, %r68, -2147483648; or.b32 %r70, %r69, %r67; mov.b32 %f109, %r70; mul.f32 %f110, %f109, %f109; mov.f32 %f111, 0f3C8B1ABB; mov.f32 %f112, 0f3D10ECEF; fma.rn.f32 %f113, %f112, %f110, %f111; mov.f32 %f114, 0f3CFC028C; fma.rn.f32 %f115, %f113, %f110, 
%f114; mov.f32 %f116, 0f3D372139; fma.rn.f32 %f117, %f115, %f110, %f116; mov.f32 %f118, 0f3D9993DB; fma.rn.f32 %f119, %f117, %f110, %f118; mov.f32 %f120, 0f3E2AAAC6; fma.rn.f32 %f121, %f119, %f110, %f120; mul.f32 %f122, %f121, %f110; fma.rn.f32 %f123, %f122, %f109, %f109; neg.f32 %f124, %f123; selp.f32 %f125, %f123, %f124, %p17; mov.f32 %f126, 0f3FD774EB; mov.f32 %f127, 0f3F6EE581; fma.rn.f32 %f128, %f127, %f126, %f125; setp.gt.f32 %p18, %f96, 0f3F0F5C29; selp.f32 %f129, %f123, %f128, %p18; add.f32 %f130, %f129, %f129; selp.f32 %f131, %f130, %f129, %p17; max.f32 %f313, %f313, %f131; $L__BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r14, %r2, -1; @%p19 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: max.s32 %r131, %r14, 0; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r71, %r14, %r31; add.s32 %r72, %r71, %r31; rem.s32 %r131, %r72, %r31; $L__BB0_15: add.s32 %r73, %r131, %r4; mad.lo.s32 %r74, %r73, %r30, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f132, [%rd38]; ld.global.nc.f32 %f133, [%rd36]; ld.global.nc.f32 %f134, [%rd37]; mul.f32 %f135, %f134, %f134; fma.rn.f32 %f136, %f133, %f133, %f135; fma.rn.f32 %f137, %f132, %f132, %f136; setp.eq.f32 %p20, %f137, 0f00000000; selp.f32 %f19, %f3, %f132, %p20; selp.f32 %f18, %f2, %f134, %p20; selp.f32 %f17, %f1, %f133, %p20; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs16, [%rd39]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r75, %rs20; add.s32 %r76, %r75, 1; mul.lo.s32 %r77, %r76, %r75; shr.u32 %r78, %r77, 1; cvt.u32.u16 %r79, %rs19; add.s32 %r80, %r78, %r79; mul.wide.s32 %rd40, %r80, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f138, [%rd41]; setp.eq.f32 %p21, %f138, 0f00000000; @%p21 bra $L__BB0_17; mul.f32 %f139, %f2, %f18; fma.rn.f32 %f140, %f1, %f17, %f139; fma.rn.f32 %f141, %f3, %f19, %f140; abs.f32 %f142, %f141; neg.f32 %f143, %f142; mov.f32 %f144, 
0f3F000000; fma.rn.f32 %f145, %f144, %f143, %f144; rsqrt.approx.ftz.f32 %f146, %f145; mul.f32 %f147, %f145, %f146; mul.f32 %f148, %f146, 0f3F000000; neg.f32 %f149, %f147; fma.rn.f32 %f150, %f149, %f148, %f144; fma.rn.f32 %f151, %f147, %f150, %f147; setp.eq.f32 %p22, %f142, 0f3F800000; selp.f32 %f152, 0f00000000, %f151, %p22; setp.gt.f32 %p23, %f142, 0f3F0F5C29; selp.f32 %f153, %f152, %f142, %p23; mov.b32 %r81, %f153; mov.b32 %r82, %f141; and.b32 %r83, %r82, -2147483648; or.b32 %r84, %r83, %r81; mov.b32 %f154, %r84; mul.f32 %f155, %f154, %f154; mov.f32 %f156, 0f3C8B1ABB; mov.f32 %f157, 0f3D10ECEF; fma.rn.f32 %f158, %f157, %f155, %f156; mov.f32 %f159, 0f3CFC028C; fma.rn.f32 %f160, %f158, %f155, %f159; mov.f32 %f161, 0f3D372139; fma.rn.f32 %f162, %f160, %f155, %f161; mov.f32 %f163, 0f3D9993DB; fma.rn.f32 %f164, %f162, %f155, %f163; mov.f32 %f165, 0f3E2AAAC6; fma.rn.f32 %f166, %f164, %f155, %f165; mul.f32 %f167, %f166, %f155; fma.rn.f32 %f168, %f167, %f154, %f154; neg.f32 %f169, %f168; selp.f32 %f170, %f168, %f169, %p23; mov.f32 %f171, 0f3FD774EB; mov.f32 %f172, 0f3F6EE581; fma.rn.f32 %f173, %f172, %f171, %f170; setp.gt.f32 %p24, %f141, 0f3F0F5C29; selp.f32 %f174, %f168, %f173, %p24; add.f32 %f175, %f174, %f174; selp.f32 %f176, %f175, %f174, %p23; max.f32 %f313, %f313, %f176; $L__BB0_17: add.s32 %r18, %r2, 1; @%p19 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r87, %r31, -1; min.s32 %r132, %r18, %r87; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r85, %r18, %r31; add.s32 %r86, %r85, %r31; rem.s32 %r132, %r86, %r31; $L__BB0_20: add.s32 %r88, %r132, %r4; mad.lo.s32 %r89, %r88, %r30, %r1; cvt.s64.s32 %rd42, %r89; mul.wide.s32 %rd43, %r89, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f177, [%rd46]; ld.global.nc.f32 %f178, [%rd44]; ld.global.nc.f32 %f179, [%rd45]; mul.f32 %f180, %f179, %f179; fma.rn.f32 %f181, %f178, %f178, %f180; fma.rn.f32 %f182, %f177, %f177, %f181; setp.eq.f32 %p26, %f182, 0f00000000; 
selp.f32 %f24, %f3, %f177, %p26; selp.f32 %f23, %f2, %f179, %p26; selp.f32 %f22, %f1, %f178, %p26; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs21, [%rd47]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r90, %rs25; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; cvt.u32.u16 %r94, %rs24; add.s32 %r95, %r93, %r94; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f183, [%rd49]; setp.eq.f32 %p27, %f183, 0f00000000; @%p27 bra $L__BB0_22; mul.f32 %f184, %f2, %f23; fma.rn.f32 %f185, %f1, %f22, %f184; fma.rn.f32 %f186, %f3, %f24, %f185; abs.f32 %f187, %f186; neg.f32 %f188, %f187; mov.f32 %f189, 0f3F000000; fma.rn.f32 %f190, %f189, %f188, %f189; rsqrt.approx.ftz.f32 %f191, %f190; mul.f32 %f192, %f190, %f191; mul.f32 %f193, %f191, 0f3F000000; neg.f32 %f194, %f192; fma.rn.f32 %f195, %f194, %f193, %f189; fma.rn.f32 %f196, %f192, %f195, %f192; setp.eq.f32 %p28, %f187, 0f3F800000; selp.f32 %f197, 0f00000000, %f196, %p28; setp.gt.f32 %p29, %f187, 0f3F0F5C29; selp.f32 %f198, %f197, %f187, %p29; mov.b32 %r96, %f198; mov.b32 %r97, %f186; and.b32 %r98, %r97, -2147483648; or.b32 %r99, %r98, %r96; mov.b32 %f199, %r99; mul.f32 %f200, %f199, %f199; mov.f32 %f201, 0f3C8B1ABB; mov.f32 %f202, 0f3D10ECEF; fma.rn.f32 %f203, %f202, %f200, %f201; mov.f32 %f204, 0f3CFC028C; fma.rn.f32 %f205, %f203, %f200, %f204; mov.f32 %f206, 0f3D372139; fma.rn.f32 %f207, %f205, %f200, %f206; mov.f32 %f208, 0f3D9993DB; fma.rn.f32 %f209, %f207, %f200, %f208; mov.f32 %f210, 0f3E2AAAC6; fma.rn.f32 %f211, %f209, %f200, %f210; mul.f32 %f212, %f211, %f200; fma.rn.f32 %f213, %f212, %f199, %f199; neg.f32 %f214, %f213; selp.f32 %f215, %f213, %f214, %p29; mov.f32 %f216, 0f3FD774EB; mov.f32 %f217, 0f3F6EE581; fma.rn.f32 %f218, %f217, %f216, %f215; setp.gt.f32 %p30, %f186, 0f3F0F5C29; selp.f32 %f219, %f213, %f218, %p30; add.f32 %f220, %f219, %f219; selp.f32 %f221, %f220, %f219, %p29; max.f32 %f313, %f313, %f221; $L__BB0_22: setp.eq.s32 %p31, 
%r32, 1; @%p31 bra $L__BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r22, %r3, -1; @%p32 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: max.s32 %r133, %r22, 0; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r100, %r22, %r32; add.s32 %r101, %r100, %r32; rem.s32 %r133, %r101, %r32; $L__BB0_26: mad.lo.s32 %r102, %r133, %r31, %r2; mad.lo.s32 %r103, %r102, %r30, %r1; cvt.s64.s32 %rd50, %r103; mul.wide.s32 %rd51, %r103, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f222, [%rd54]; ld.global.nc.f32 %f223, [%rd52]; ld.global.nc.f32 %f224, [%rd53]; mul.f32 %f225, %f224, %f224; fma.rn.f32 %f226, %f223, %f223, %f225; fma.rn.f32 %f227, %f222, %f222, %f226; setp.eq.f32 %p33, %f227, 0f00000000; selp.f32 %f29, %f3, %f222, %p33; selp.f32 %f28, %f2, %f224, %p33; selp.f32 %f27, %f1, %f223, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs26, [%rd55]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r104, %rs30; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; cvt.u32.u16 %r108, %rs29; add.s32 %r109, %r107, %r108; mul.wide.s32 %rd56, %r109, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f228, [%rd57]; setp.eq.f32 %p34, %f228, 0f00000000; @%p34 bra $L__BB0_28; mul.f32 %f229, %f2, %f28; fma.rn.f32 %f230, %f1, %f27, %f229; fma.rn.f32 %f231, %f3, %f29, %f230; abs.f32 %f232, %f231; neg.f32 %f233, %f232; mov.f32 %f234, 0f3F000000; fma.rn.f32 %f235, %f234, %f233, %f234; rsqrt.approx.ftz.f32 %f236, %f235; mul.f32 %f237, %f235, %f236; mul.f32 %f238, %f236, 0f3F000000; neg.f32 %f239, %f237; fma.rn.f32 %f240, %f239, %f238, %f234; fma.rn.f32 %f241, %f237, %f240, %f237; setp.eq.f32 %p35, %f232, 0f3F800000; selp.f32 %f242, 0f00000000, %f241, %p35; setp.gt.f32 %p36, %f232, 0f3F0F5C29; selp.f32 %f243, %f242, %f232, %p36; mov.b32 %r110, %f243; mov.b32 %r111, %f231; and.b32 %r112, %r111, -2147483648; or.b32 %r113, %r112, %r110; mov.b32 %f244, %r113; mul.f32 %f245, %f244, 
%f244; mov.f32 %f246, 0f3C8B1ABB; mov.f32 %f247, 0f3D10ECEF; fma.rn.f32 %f248, %f247, %f245, %f246; mov.f32 %f249, 0f3CFC028C; fma.rn.f32 %f250, %f248, %f245, %f249; mov.f32 %f251, 0f3D372139; fma.rn.f32 %f252, %f250, %f245, %f251; mov.f32 %f253, 0f3D9993DB; fma.rn.f32 %f254, %f252, %f245, %f253; mov.f32 %f255, 0f3E2AAAC6; fma.rn.f32 %f256, %f254, %f245, %f255; mul.f32 %f257, %f256, %f245; fma.rn.f32 %f258, %f257, %f244, %f244; neg.f32 %f259, %f258; selp.f32 %f260, %f258, %f259, %p36; mov.f32 %f261, 0f3FD774EB; mov.f32 %f262, 0f3F6EE581; fma.rn.f32 %f263, %f262, %f261, %f260; setp.gt.f32 %p37, %f231, 0f3F0F5C29; selp.f32 %f264, %f258, %f263, %p37; add.f32 %f265, %f264, %f264; selp.f32 %f266, %f265, %f264, %p36; max.f32 %f313, %f313, %f266; $L__BB0_28: add.s32 %r26, %r3, 1; @%p32 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: add.s32 %r116, %r32, -1; min.s32 %r134, %r26, %r116; bra.uni $L__BB0_31; $L__BB0_29: rem.s32 %r114, %r26, %r32; add.s32 %r115, %r114, %r32; rem.s32 %r134, %r115, %r32; $L__BB0_31: mad.lo.s32 %r117, %r134, %r31, %r2; mad.lo.s32 %r118, %r117, %r30, %r1; cvt.s64.s32 %rd58, %r118; mul.wide.s32 %rd59, %r118, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f267, [%rd62]; ld.global.nc.f32 %f268, [%rd60]; ld.global.nc.f32 %f269, [%rd61]; mul.f32 %f270, %f269, %f269; fma.rn.f32 %f271, %f268, %f268, %f270; fma.rn.f32 %f272, %f267, %f267, %f271; setp.eq.f32 %p39, %f272, 0f00000000; selp.f32 %f34, %f3, %f267, %p39; selp.f32 %f33, %f2, %f269, %p39; selp.f32 %f32, %f1, %f268, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs31, [%rd63]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r119, %rs35; add.s32 %r120, %r119, 1; mul.lo.s32 %r121, %r120, %r119; shr.u32 %r122, %r121, 1; cvt.u32.u16 %r123, %rs34; add.s32 %r124, %r122, %r123; mul.wide.s32 %rd64, %r124, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f273, [%rd65]; setp.eq.f32 %p40, %f273, 0f00000000; @%p40 bra 
$L__BB0_33; mul.f32 %f274, %f2, %f33; fma.rn.f32 %f275, %f1, %f32, %f274; fma.rn.f32 %f276, %f3, %f34, %f275; abs.f32 %f277, %f276; neg.f32 %f278, %f277; mov.f32 %f279, 0f3F000000; fma.rn.f32 %f280, %f279, %f278, %f279; rsqrt.approx.ftz.f32 %f281, %f280; mul.f32 %f282, %f280, %f281; mul.f32 %f283, %f281, 0f3F000000; neg.f32 %f284, %f282; fma.rn.f32 %f285, %f284, %f283, %f279; fma.rn.f32 %f286, %f282, %f285, %f282; setp.eq.f32 %p41, %f277, 0f3F800000; selp.f32 %f287, 0f00000000, %f286, %p41; setp.gt.f32 %p42, %f277, 0f3F0F5C29; selp.f32 %f288, %f287, %f277, %p42; mov.b32 %r125, %f288; mov.b32 %r126, %f276; and.b32 %r127, %r126, -2147483648; or.b32 %r128, %r127, %r125; mov.b32 %f289, %r128; mul.f32 %f290, %f289, %f289; mov.f32 %f291, 0f3C8B1ABB; mov.f32 %f292, 0f3D10ECEF; fma.rn.f32 %f293, %f292, %f290, %f291; mov.f32 %f294, 0f3CFC028C; fma.rn.f32 %f295, %f293, %f290, %f294; mov.f32 %f296, 0f3D372139; fma.rn.f32 %f297, %f295, %f290, %f296; mov.f32 %f298, 0f3D9993DB; fma.rn.f32 %f299, %f297, %f290, %f298; mov.f32 %f300, 0f3E2AAAC6; fma.rn.f32 %f301, %f299, %f290, %f300; mul.f32 %f302, %f301, %f290; fma.rn.f32 %f303, %f302, %f289, %f289; neg.f32 %f304, %f303; selp.f32 %f305, %f303, %f304, %p42; mov.f32 %f306, 0f3FD774EB; mov.f32 %f307, 0f3F6EE581; fma.rn.f32 %f308, %f307, %f306, %f305; setp.gt.f32 %p43, %f276, 0f3F0F5C29; selp.f32 %f309, %f303, %f308, %p43; add.f32 %f310, %f309, %f309; selp.f32 %f311, %f310, %f309, %p42; max.f32 %f313, %f313, %f311; $L__BB0_33: cvta.to.global.u64 %rd66, %rd7; shl.b64 %rd67, %rd6, 2; add.s64 %rd68, %rd66, %rd67; st.global.f32 [%rd68], %f313; $L__BB0_34: ret; } ` setmaxangle_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, 
.param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<36>; .reg .f32 %f<318>; .reg .b32 %r<135>; .reg .b64 %rd<69>; ld.param.u8 %rs5, [setmaxangle_param_9]; ld.param.u64 %rd7, [setmaxangle_param_0]; ld.param.u64 %rd8, [setmaxangle_param_1]; ld.param.u64 %rd9, [setmaxangle_param_2]; ld.param.u64 %rd10, [setmaxangle_param_3]; ld.param.u64 %rd11, [setmaxangle_param_4]; ld.param.u64 %rd12, [setmaxangle_param_5]; ld.param.u32 %r30, [setmaxangle_param_6]; ld.param.u32 %r31, [setmaxangle_param_7]; ld.param.u32 %r32, [setmaxangle_param_8]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd3, %rd10; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_34; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd13, %r43, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd15, %rd4, %rd13; add.s64 %rd16, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; ld.global.nc.f32 %f3, [%rd16]; mul.f32 %f37, %f2, %f2; fma.rn.f32 %f38, %f1, %f1, %f37; fma.rn.f32 %f39, %f3, %f3, %f38; setp.eq.f32 %p6, %f39, 0f00000000; @%p6 bra $L__BB0_34; add.s64 %rd17, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd17]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r129, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r129, %r45, %r30; 
$L__BB0_5: add.s32 %r46, %r129, %r5; cvt.s64.s32 %rd18, %r46; mul.wide.s32 %rd19, %r46, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f41, [%rd22]; ld.global.nc.f32 %f42, [%rd20]; ld.global.nc.f32 %f43, [%rd21]; mul.f32 %f44, %f43, %f43; fma.rn.f32 %f45, %f42, %f42, %f44; fma.rn.f32 %f46, %f41, %f41, %f45; setp.eq.f32 %p8, %f46, 0f00000000; mov.f32 %f313, 0f00000000; selp.f32 %f9, %f3, %f41, %p8; selp.f32 %f8, %f2, %f43, %p8; selp.f32 %f7, %f1, %f42, %p8; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd24, %r52, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f47, [%rd25]; setp.eq.f32 %p9, %f47, 0f00000000; @%p9 bra $L__BB0_7; mul.f32 %f48, %f2, %f8; fma.rn.f32 %f49, %f1, %f7, %f48; fma.rn.f32 %f50, %f3, %f9, %f49; abs.f32 %f51, %f50; neg.f32 %f52, %f51; mov.f32 %f53, 0f3F000000; fma.rn.f32 %f54, %f53, %f52, %f53; rsqrt.approx.ftz.f32 %f55, %f54; mul.f32 %f56, %f54, %f55; mul.f32 %f57, %f55, 0f3F000000; neg.f32 %f58, %f56; fma.rn.f32 %f59, %f58, %f57, %f53; fma.rn.f32 %f60, %f56, %f59, %f56; setp.eq.f32 %p10, %f51, 0f3F800000; selp.f32 %f61, 0f00000000, %f60, %p10; mov.f32 %f62, 0f00000000; setp.gt.f32 %p11, %f51, 0f3F0F5C29; selp.f32 %f63, %f61, %f51, %p11; mov.b32 %r53, %f63; mov.b32 %r54, %f50; and.b32 %r55, %r54, -2147483648; or.b32 %r56, %r55, %r53; mov.b32 %f64, %r56; mul.f32 %f65, %f64, %f64; mov.f32 %f66, 0f3C8B1ABB; mov.f32 %f67, 0f3D10ECEF; fma.rn.f32 %f68, %f67, %f65, %f66; mov.f32 %f69, 0f3CFC028C; fma.rn.f32 %f70, %f68, %f65, %f69; mov.f32 %f71, 0f3D372139; fma.rn.f32 %f72, %f70, %f65, %f71; mov.f32 %f73, 0f3D9993DB; fma.rn.f32 %f74, %f72, %f65, %f73; mov.f32 %f75, 0f3E2AAAC6; fma.rn.f32 %f76, %f74, %f65, %f75; mul.f32 %f77, %f76, %f65; fma.rn.f32 %f78, %f77, %f64, 
%f64; neg.f32 %f79, %f78; selp.f32 %f80, %f78, %f79, %p11; mov.f32 %f81, 0f3FD774EB; mov.f32 %f82, 0f3F6EE581; fma.rn.f32 %f83, %f82, %f81, %f80; setp.gt.f32 %p12, %f50, 0f3F0F5C29; selp.f32 %f84, %f78, %f83, %p12; add.f32 %f85, %f84, %f84; selp.f32 %f86, %f85, %f84, %p11; max.f32 %f313, %f62, %f86; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r59, %r30, -1; min.s32 %r130, %r10, %r59; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r30; add.s32 %r58, %r57, %r30; rem.s32 %r130, %r58, %r30; $L__BB0_10: add.s32 %r60, %r130, %r5; cvt.s64.s32 %rd26, %r60; mul.wide.s32 %rd27, %r60, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f87, [%rd30]; ld.global.nc.f32 %f88, [%rd28]; ld.global.nc.f32 %f89, [%rd29]; mul.f32 %f90, %f89, %f89; fma.rn.f32 %f91, %f88, %f88, %f90; fma.rn.f32 %f92, %f87, %f87, %f91; setp.eq.f32 %p14, %f92, 0f00000000; selp.f32 %f14, %f3, %f87, %p14; selp.f32 %f13, %f2, %f89, %p14; selp.f32 %f12, %f1, %f88, %p14; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs11, [%rd31]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r61, %rs15; add.s32 %r62, %r61, 1; mul.lo.s32 %r63, %r62, %r61; shr.u32 %r64, %r63, 1; cvt.u32.u16 %r65, %rs14; add.s32 %r66, %r64, %r65; mul.wide.s32 %rd32, %r66, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f93, [%rd33]; setp.eq.f32 %p15, %f93, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f94, %f2, %f13; fma.rn.f32 %f95, %f1, %f12, %f94; fma.rn.f32 %f96, %f3, %f14, %f95; abs.f32 %f97, %f96; neg.f32 %f98, %f97; mov.f32 %f99, 0f3F000000; fma.rn.f32 %f100, %f99, %f98, %f99; rsqrt.approx.ftz.f32 %f101, %f100; mul.f32 %f102, %f100, %f101; mul.f32 %f103, %f101, 0f3F000000; neg.f32 %f104, %f102; fma.rn.f32 %f105, %f104, %f103, %f99; fma.rn.f32 %f106, %f102, %f105, %f102; setp.eq.f32 %p16, %f97, 0f3F800000; selp.f32 %f107, 0f00000000, %f106, %p16; setp.gt.f32 %p17, %f97, 0f3F0F5C29; selp.f32 %f108, %f107, %f97, %p17; 
mov.b32 %r67, %f108; mov.b32 %r68, %f96; and.b32 %r69, %r68, -2147483648; or.b32 %r70, %r69, %r67; mov.b32 %f109, %r70; mul.f32 %f110, %f109, %f109; mov.f32 %f111, 0f3C8B1ABB; mov.f32 %f112, 0f3D10ECEF; fma.rn.f32 %f113, %f112, %f110, %f111; mov.f32 %f114, 0f3CFC028C; fma.rn.f32 %f115, %f113, %f110, %f114; mov.f32 %f116, 0f3D372139; fma.rn.f32 %f117, %f115, %f110, %f116; mov.f32 %f118, 0f3D9993DB; fma.rn.f32 %f119, %f117, %f110, %f118; mov.f32 %f120, 0f3E2AAAC6; fma.rn.f32 %f121, %f119, %f110, %f120; mul.f32 %f122, %f121, %f110; fma.rn.f32 %f123, %f122, %f109, %f109; neg.f32 %f124, %f123; selp.f32 %f125, %f123, %f124, %p17; mov.f32 %f126, 0f3FD774EB; mov.f32 %f127, 0f3F6EE581; fma.rn.f32 %f128, %f127, %f126, %f125; setp.gt.f32 %p18, %f96, 0f3F0F5C29; selp.f32 %f129, %f123, %f128, %p18; add.f32 %f130, %f129, %f129; selp.f32 %f131, %f130, %f129, %p17; max.f32 %f313, %f313, %f131; $L__BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r14, %r2, -1; @%p19 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: max.s32 %r131, %r14, 0; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r71, %r14, %r31; add.s32 %r72, %r71, %r31; rem.s32 %r131, %r72, %r31; $L__BB0_15: add.s32 %r73, %r131, %r4; mad.lo.s32 %r74, %r73, %r30, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f132, [%rd38]; ld.global.nc.f32 %f133, [%rd36]; ld.global.nc.f32 %f134, [%rd37]; mul.f32 %f135, %f134, %f134; fma.rn.f32 %f136, %f133, %f133, %f135; fma.rn.f32 %f137, %f132, %f132, %f136; setp.eq.f32 %p20, %f137, 0f00000000; selp.f32 %f19, %f3, %f132, %p20; selp.f32 %f18, %f2, %f134, %p20; selp.f32 %f17, %f1, %f133, %p20; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs16, [%rd39]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r75, %rs20; add.s32 %r76, %r75, 1; mul.lo.s32 %r77, %r76, %r75; shr.u32 %r78, %r77, 1; cvt.u32.u16 %r79, %rs19; add.s32 %r80, %r78, %r79; mul.wide.s32 
%rd40, %r80, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f138, [%rd41]; setp.eq.f32 %p21, %f138, 0f00000000; @%p21 bra $L__BB0_17; mul.f32 %f139, %f2, %f18; fma.rn.f32 %f140, %f1, %f17, %f139; fma.rn.f32 %f141, %f3, %f19, %f140; abs.f32 %f142, %f141; neg.f32 %f143, %f142; mov.f32 %f144, 0f3F000000; fma.rn.f32 %f145, %f144, %f143, %f144; rsqrt.approx.ftz.f32 %f146, %f145; mul.f32 %f147, %f145, %f146; mul.f32 %f148, %f146, 0f3F000000; neg.f32 %f149, %f147; fma.rn.f32 %f150, %f149, %f148, %f144; fma.rn.f32 %f151, %f147, %f150, %f147; setp.eq.f32 %p22, %f142, 0f3F800000; selp.f32 %f152, 0f00000000, %f151, %p22; setp.gt.f32 %p23, %f142, 0f3F0F5C29; selp.f32 %f153, %f152, %f142, %p23; mov.b32 %r81, %f153; mov.b32 %r82, %f141; and.b32 %r83, %r82, -2147483648; or.b32 %r84, %r83, %r81; mov.b32 %f154, %r84; mul.f32 %f155, %f154, %f154; mov.f32 %f156, 0f3C8B1ABB; mov.f32 %f157, 0f3D10ECEF; fma.rn.f32 %f158, %f157, %f155, %f156; mov.f32 %f159, 0f3CFC028C; fma.rn.f32 %f160, %f158, %f155, %f159; mov.f32 %f161, 0f3D372139; fma.rn.f32 %f162, %f160, %f155, %f161; mov.f32 %f163, 0f3D9993DB; fma.rn.f32 %f164, %f162, %f155, %f163; mov.f32 %f165, 0f3E2AAAC6; fma.rn.f32 %f166, %f164, %f155, %f165; mul.f32 %f167, %f166, %f155; fma.rn.f32 %f168, %f167, %f154, %f154; neg.f32 %f169, %f168; selp.f32 %f170, %f168, %f169, %p23; mov.f32 %f171, 0f3FD774EB; mov.f32 %f172, 0f3F6EE581; fma.rn.f32 %f173, %f172, %f171, %f170; setp.gt.f32 %p24, %f141, 0f3F0F5C29; selp.f32 %f174, %f168, %f173, %p24; add.f32 %f175, %f174, %f174; selp.f32 %f176, %f175, %f174, %p23; max.f32 %f313, %f313, %f176; $L__BB0_17: add.s32 %r18, %r2, 1; @%p19 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r87, %r31, -1; min.s32 %r132, %r18, %r87; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r85, %r18, %r31; add.s32 %r86, %r85, %r31; rem.s32 %r132, %r86, %r31; $L__BB0_20: add.s32 %r88, %r132, %r4; mad.lo.s32 %r89, %r88, %r30, %r1; cvt.s64.s32 %rd42, %r89; mul.wide.s32 %rd43, %r89, 4; add.s64 %rd44, %rd5, %rd43; add.s64 
%rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f177, [%rd46]; ld.global.nc.f32 %f178, [%rd44]; ld.global.nc.f32 %f179, [%rd45]; mul.f32 %f180, %f179, %f179; fma.rn.f32 %f181, %f178, %f178, %f180; fma.rn.f32 %f182, %f177, %f177, %f181; setp.eq.f32 %p26, %f182, 0f00000000; selp.f32 %f24, %f3, %f177, %p26; selp.f32 %f23, %f2, %f179, %p26; selp.f32 %f22, %f1, %f178, %p26; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs21, [%rd47]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r90, %rs25; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; cvt.u32.u16 %r94, %rs24; add.s32 %r95, %r93, %r94; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f183, [%rd49]; setp.eq.f32 %p27, %f183, 0f00000000; @%p27 bra $L__BB0_22; mul.f32 %f184, %f2, %f23; fma.rn.f32 %f185, %f1, %f22, %f184; fma.rn.f32 %f186, %f3, %f24, %f185; abs.f32 %f187, %f186; neg.f32 %f188, %f187; mov.f32 %f189, 0f3F000000; fma.rn.f32 %f190, %f189, %f188, %f189; rsqrt.approx.ftz.f32 %f191, %f190; mul.f32 %f192, %f190, %f191; mul.f32 %f193, %f191, 0f3F000000; neg.f32 %f194, %f192; fma.rn.f32 %f195, %f194, %f193, %f189; fma.rn.f32 %f196, %f192, %f195, %f192; setp.eq.f32 %p28, %f187, 0f3F800000; selp.f32 %f197, 0f00000000, %f196, %p28; setp.gt.f32 %p29, %f187, 0f3F0F5C29; selp.f32 %f198, %f197, %f187, %p29; mov.b32 %r96, %f198; mov.b32 %r97, %f186; and.b32 %r98, %r97, -2147483648; or.b32 %r99, %r98, %r96; mov.b32 %f199, %r99; mul.f32 %f200, %f199, %f199; mov.f32 %f201, 0f3C8B1ABB; mov.f32 %f202, 0f3D10ECEF; fma.rn.f32 %f203, %f202, %f200, %f201; mov.f32 %f204, 0f3CFC028C; fma.rn.f32 %f205, %f203, %f200, %f204; mov.f32 %f206, 0f3D372139; fma.rn.f32 %f207, %f205, %f200, %f206; mov.f32 %f208, 0f3D9993DB; fma.rn.f32 %f209, %f207, %f200, %f208; mov.f32 %f210, 0f3E2AAAC6; fma.rn.f32 %f211, %f209, %f200, %f210; mul.f32 %f212, %f211, %f200; fma.rn.f32 %f213, %f212, %f199, %f199; neg.f32 %f214, %f213; selp.f32 %f215, %f213, %f214, %p29; 
mov.f32 %f216, 0f3FD774EB; mov.f32 %f217, 0f3F6EE581; fma.rn.f32 %f218, %f217, %f216, %f215; setp.gt.f32 %p30, %f186, 0f3F0F5C29; selp.f32 %f219, %f213, %f218, %p30; add.f32 %f220, %f219, %f219; selp.f32 %f221, %f220, %f219, %p29; max.f32 %f313, %f313, %f221; $L__BB0_22: setp.eq.s32 %p31, %r32, 1; @%p31 bra $L__BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r22, %r3, -1; @%p32 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: max.s32 %r133, %r22, 0; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r100, %r22, %r32; add.s32 %r101, %r100, %r32; rem.s32 %r133, %r101, %r32; $L__BB0_26: mad.lo.s32 %r102, %r133, %r31, %r2; mad.lo.s32 %r103, %r102, %r30, %r1; cvt.s64.s32 %rd50, %r103; mul.wide.s32 %rd51, %r103, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f222, [%rd54]; ld.global.nc.f32 %f223, [%rd52]; ld.global.nc.f32 %f224, [%rd53]; mul.f32 %f225, %f224, %f224; fma.rn.f32 %f226, %f223, %f223, %f225; fma.rn.f32 %f227, %f222, %f222, %f226; setp.eq.f32 %p33, %f227, 0f00000000; selp.f32 %f29, %f3, %f222, %p33; selp.f32 %f28, %f2, %f224, %p33; selp.f32 %f27, %f1, %f223, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs26, [%rd55]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r104, %rs30; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; cvt.u32.u16 %r108, %rs29; add.s32 %r109, %r107, %r108; mul.wide.s32 %rd56, %r109, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f228, [%rd57]; setp.eq.f32 %p34, %f228, 0f00000000; @%p34 bra $L__BB0_28; mul.f32 %f229, %f2, %f28; fma.rn.f32 %f230, %f1, %f27, %f229; fma.rn.f32 %f231, %f3, %f29, %f230; abs.f32 %f232, %f231; neg.f32 %f233, %f232; mov.f32 %f234, 0f3F000000; fma.rn.f32 %f235, %f234, %f233, %f234; rsqrt.approx.ftz.f32 %f236, %f235; mul.f32 %f237, %f235, %f236; mul.f32 %f238, %f236, 0f3F000000; neg.f32 %f239, %f237; fma.rn.f32 %f240, %f239, %f238, %f234; fma.rn.f32 %f241, %f237, %f240, %f237; setp.eq.f32 
%p35, %f232, 0f3F800000; selp.f32 %f242, 0f00000000, %f241, %p35; setp.gt.f32 %p36, %f232, 0f3F0F5C29; selp.f32 %f243, %f242, %f232, %p36; mov.b32 %r110, %f243; mov.b32 %r111, %f231; and.b32 %r112, %r111, -2147483648; or.b32 %r113, %r112, %r110; mov.b32 %f244, %r113; mul.f32 %f245, %f244, %f244; mov.f32 %f246, 0f3C8B1ABB; mov.f32 %f247, 0f3D10ECEF; fma.rn.f32 %f248, %f247, %f245, %f246; mov.f32 %f249, 0f3CFC028C; fma.rn.f32 %f250, %f248, %f245, %f249; mov.f32 %f251, 0f3D372139; fma.rn.f32 %f252, %f250, %f245, %f251; mov.f32 %f253, 0f3D9993DB; fma.rn.f32 %f254, %f252, %f245, %f253; mov.f32 %f255, 0f3E2AAAC6; fma.rn.f32 %f256, %f254, %f245, %f255; mul.f32 %f257, %f256, %f245; fma.rn.f32 %f258, %f257, %f244, %f244; neg.f32 %f259, %f258; selp.f32 %f260, %f258, %f259, %p36; mov.f32 %f261, 0f3FD774EB; mov.f32 %f262, 0f3F6EE581; fma.rn.f32 %f263, %f262, %f261, %f260; setp.gt.f32 %p37, %f231, 0f3F0F5C29; selp.f32 %f264, %f258, %f263, %p37; add.f32 %f265, %f264, %f264; selp.f32 %f266, %f265, %f264, %p36; max.f32 %f313, %f313, %f266; $L__BB0_28: add.s32 %r26, %r3, 1; @%p32 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: add.s32 %r116, %r32, -1; min.s32 %r134, %r26, %r116; bra.uni $L__BB0_31; $L__BB0_29: rem.s32 %r114, %r26, %r32; add.s32 %r115, %r114, %r32; rem.s32 %r134, %r115, %r32; $L__BB0_31: mad.lo.s32 %r117, %r134, %r31, %r2; mad.lo.s32 %r118, %r117, %r30, %r1; cvt.s64.s32 %rd58, %r118; mul.wide.s32 %rd59, %r118, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f267, [%rd62]; ld.global.nc.f32 %f268, [%rd60]; ld.global.nc.f32 %f269, [%rd61]; mul.f32 %f270, %f269, %f269; fma.rn.f32 %f271, %f268, %f268, %f270; fma.rn.f32 %f272, %f267, %f267, %f271; setp.eq.f32 %p39, %f272, 0f00000000; selp.f32 %f34, %f3, %f267, %p39; selp.f32 %f33, %f2, %f269, %p39; selp.f32 %f32, %f1, %f268, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs31, [%rd63]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r119, 
%rs35; add.s32 %r120, %r119, 1; mul.lo.s32 %r121, %r120, %r119; shr.u32 %r122, %r121, 1; cvt.u32.u16 %r123, %rs34; add.s32 %r124, %r122, %r123; mul.wide.s32 %rd64, %r124, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f273, [%rd65]; setp.eq.f32 %p40, %f273, 0f00000000; @%p40 bra $L__BB0_33; mul.f32 %f274, %f2, %f33; fma.rn.f32 %f275, %f1, %f32, %f274; fma.rn.f32 %f276, %f3, %f34, %f275; abs.f32 %f277, %f276; neg.f32 %f278, %f277; mov.f32 %f279, 0f3F000000; fma.rn.f32 %f280, %f279, %f278, %f279; rsqrt.approx.ftz.f32 %f281, %f280; mul.f32 %f282, %f280, %f281; mul.f32 %f283, %f281, 0f3F000000; neg.f32 %f284, %f282; fma.rn.f32 %f285, %f284, %f283, %f279; fma.rn.f32 %f286, %f282, %f285, %f282; setp.eq.f32 %p41, %f277, 0f3F800000; selp.f32 %f287, 0f00000000, %f286, %p41; setp.gt.f32 %p42, %f277, 0f3F0F5C29; selp.f32 %f288, %f287, %f277, %p42; mov.b32 %r125, %f288; mov.b32 %r126, %f276; and.b32 %r127, %r126, -2147483648; or.b32 %r128, %r127, %r125; mov.b32 %f289, %r128; mul.f32 %f290, %f289, %f289; mov.f32 %f291, 0f3C8B1ABB; mov.f32 %f292, 0f3D10ECEF; fma.rn.f32 %f293, %f292, %f290, %f291; mov.f32 %f294, 0f3CFC028C; fma.rn.f32 %f295, %f293, %f290, %f294; mov.f32 %f296, 0f3D372139; fma.rn.f32 %f297, %f295, %f290, %f296; mov.f32 %f298, 0f3D9993DB; fma.rn.f32 %f299, %f297, %f290, %f298; mov.f32 %f300, 0f3E2AAAC6; fma.rn.f32 %f301, %f299, %f290, %f300; mul.f32 %f302, %f301, %f290; fma.rn.f32 %f303, %f302, %f289, %f289; neg.f32 %f304, %f303; selp.f32 %f305, %f303, %f304, %p42; mov.f32 %f306, 0f3FD774EB; mov.f32 %f307, 0f3F6EE581; fma.rn.f32 %f308, %f307, %f306, %f305; setp.gt.f32 %p43, %f276, 0f3F0F5C29; selp.f32 %f309, %f303, %f308, %p43; add.f32 %f310, %f309, %f309; selp.f32 %f311, %f310, %f309, %p42; max.f32 %f313, %f313, %f311; $L__BB0_33: cvta.to.global.u64 %rd66, %rd7; shl.b64 %rd67, %rd6, 2; add.s64 %rd68, %rd66, %rd67; st.global.f32 [%rd68], %f313; $L__BB0_34: ret; } ` setmaxangle_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl setmaxangle .visible 
.entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<36>; .reg .f32 %f<318>; .reg .b32 %r<135>; .reg .b64 %rd<69>; ld.param.u8 %rs5, [setmaxangle_param_9]; ld.param.u64 %rd7, [setmaxangle_param_0]; ld.param.u64 %rd8, [setmaxangle_param_1]; ld.param.u64 %rd9, [setmaxangle_param_2]; ld.param.u64 %rd10, [setmaxangle_param_3]; ld.param.u64 %rd11, [setmaxangle_param_4]; ld.param.u64 %rd12, [setmaxangle_param_5]; ld.param.u32 %r30, [setmaxangle_param_6]; ld.param.u32 %r31, [setmaxangle_param_7]; ld.param.u32 %r32, [setmaxangle_param_8]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd3, %rd10; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_34; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd13, %r43, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd15, %rd4, %rd13; add.s64 %rd16, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; ld.global.nc.f32 %f3, [%rd16]; mul.f32 %f37, %f2, %f2; fma.rn.f32 %f38, %f1, %f1, %f37; fma.rn.f32 %f39, %f3, %f3, %f38; setp.eq.f32 %p6, %f39, 0f00000000; @%p6 bra $L__BB0_34; add.s64 %rd17, %rd2, %rd6; 
ld.global.nc.u8 %rs1, [%rd17]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r129, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r129, %r45, %r30; $L__BB0_5: add.s32 %r46, %r129, %r5; cvt.s64.s32 %rd18, %r46; mul.wide.s32 %rd19, %r46, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f41, [%rd22]; ld.global.nc.f32 %f42, [%rd20]; ld.global.nc.f32 %f43, [%rd21]; mul.f32 %f44, %f43, %f43; fma.rn.f32 %f45, %f42, %f42, %f44; fma.rn.f32 %f46, %f41, %f41, %f45; setp.eq.f32 %p8, %f46, 0f00000000; mov.f32 %f313, 0f00000000; selp.f32 %f9, %f3, %f41, %p8; selp.f32 %f8, %f2, %f43, %p8; selp.f32 %f7, %f1, %f42, %p8; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd24, %r52, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f47, [%rd25]; setp.eq.f32 %p9, %f47, 0f00000000; @%p9 bra $L__BB0_7; mul.f32 %f48, %f2, %f8; fma.rn.f32 %f49, %f1, %f7, %f48; fma.rn.f32 %f50, %f3, %f9, %f49; abs.f32 %f51, %f50; neg.f32 %f52, %f51; mov.f32 %f53, 0f3F000000; fma.rn.f32 %f54, %f53, %f52, %f53; rsqrt.approx.ftz.f32 %f55, %f54; mul.f32 %f56, %f54, %f55; mul.f32 %f57, %f55, 0f3F000000; neg.f32 %f58, %f56; fma.rn.f32 %f59, %f58, %f57, %f53; fma.rn.f32 %f60, %f56, %f59, %f56; setp.eq.f32 %p10, %f51, 0f3F800000; selp.f32 %f61, 0f00000000, %f60, %p10; mov.f32 %f62, 0f00000000; setp.gt.f32 %p11, %f51, 0f3F0F5C29; selp.f32 %f63, %f61, %f51, %p11; mov.b32 %r53, %f63; mov.b32 %r54, %f50; and.b32 %r55, %r54, -2147483648; or.b32 %r56, %r55, %r53; mov.b32 %f64, %r56; mul.f32 %f65, %f64, %f64; mov.f32 %f66, 0f3C8B1ABB; mov.f32 %f67, 0f3D10ECEF; fma.rn.f32 %f68, %f67, %f65, %f66; mov.f32 %f69, 
0f3CFC028C; fma.rn.f32 %f70, %f68, %f65, %f69; mov.f32 %f71, 0f3D372139; fma.rn.f32 %f72, %f70, %f65, %f71; mov.f32 %f73, 0f3D9993DB; fma.rn.f32 %f74, %f72, %f65, %f73; mov.f32 %f75, 0f3E2AAAC6; fma.rn.f32 %f76, %f74, %f65, %f75; mul.f32 %f77, %f76, %f65; fma.rn.f32 %f78, %f77, %f64, %f64; neg.f32 %f79, %f78; selp.f32 %f80, %f78, %f79, %p11; mov.f32 %f81, 0f3FD774EB; mov.f32 %f82, 0f3F6EE581; fma.rn.f32 %f83, %f82, %f81, %f80; setp.gt.f32 %p12, %f50, 0f3F0F5C29; selp.f32 %f84, %f78, %f83, %p12; add.f32 %f85, %f84, %f84; selp.f32 %f86, %f85, %f84, %p11; max.f32 %f313, %f62, %f86; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r59, %r30, -1; min.s32 %r130, %r10, %r59; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r30; add.s32 %r58, %r57, %r30; rem.s32 %r130, %r58, %r30; $L__BB0_10: add.s32 %r60, %r130, %r5; cvt.s64.s32 %rd26, %r60; mul.wide.s32 %rd27, %r60, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f87, [%rd30]; ld.global.nc.f32 %f88, [%rd28]; ld.global.nc.f32 %f89, [%rd29]; mul.f32 %f90, %f89, %f89; fma.rn.f32 %f91, %f88, %f88, %f90; fma.rn.f32 %f92, %f87, %f87, %f91; setp.eq.f32 %p14, %f92, 0f00000000; selp.f32 %f14, %f3, %f87, %p14; selp.f32 %f13, %f2, %f89, %p14; selp.f32 %f12, %f1, %f88, %p14; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs11, [%rd31]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r61, %rs15; add.s32 %r62, %r61, 1; mul.lo.s32 %r63, %r62, %r61; shr.u32 %r64, %r63, 1; cvt.u32.u16 %r65, %rs14; add.s32 %r66, %r64, %r65; mul.wide.s32 %rd32, %r66, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f93, [%rd33]; setp.eq.f32 %p15, %f93, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f94, %f2, %f13; fma.rn.f32 %f95, %f1, %f12, %f94; fma.rn.f32 %f96, %f3, %f14, %f95; abs.f32 %f97, %f96; neg.f32 %f98, %f97; mov.f32 %f99, 0f3F000000; fma.rn.f32 %f100, %f99, %f98, %f99; rsqrt.approx.ftz.f32 %f101, %f100; mul.f32 %f102, %f100, 
%f101; mul.f32 %f103, %f101, 0f3F000000; neg.f32 %f104, %f102; fma.rn.f32 %f105, %f104, %f103, %f99; fma.rn.f32 %f106, %f102, %f105, %f102; setp.eq.f32 %p16, %f97, 0f3F800000; selp.f32 %f107, 0f00000000, %f106, %p16; setp.gt.f32 %p17, %f97, 0f3F0F5C29; selp.f32 %f108, %f107, %f97, %p17; mov.b32 %r67, %f108; mov.b32 %r68, %f96; and.b32 %r69, %r68, -2147483648; or.b32 %r70, %r69, %r67; mov.b32 %f109, %r70; mul.f32 %f110, %f109, %f109; mov.f32 %f111, 0f3C8B1ABB; mov.f32 %f112, 0f3D10ECEF; fma.rn.f32 %f113, %f112, %f110, %f111; mov.f32 %f114, 0f3CFC028C; fma.rn.f32 %f115, %f113, %f110, %f114; mov.f32 %f116, 0f3D372139; fma.rn.f32 %f117, %f115, %f110, %f116; mov.f32 %f118, 0f3D9993DB; fma.rn.f32 %f119, %f117, %f110, %f118; mov.f32 %f120, 0f3E2AAAC6; fma.rn.f32 %f121, %f119, %f110, %f120; mul.f32 %f122, %f121, %f110; fma.rn.f32 %f123, %f122, %f109, %f109; neg.f32 %f124, %f123; selp.f32 %f125, %f123, %f124, %p17; mov.f32 %f126, 0f3FD774EB; mov.f32 %f127, 0f3F6EE581; fma.rn.f32 %f128, %f127, %f126, %f125; setp.gt.f32 %p18, %f96, 0f3F0F5C29; selp.f32 %f129, %f123, %f128, %p18; add.f32 %f130, %f129, %f129; selp.f32 %f131, %f130, %f129, %p17; max.f32 %f313, %f313, %f131; $L__BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r14, %r2, -1; @%p19 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: max.s32 %r131, %r14, 0; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r71, %r14, %r31; add.s32 %r72, %r71, %r31; rem.s32 %r131, %r72, %r31; $L__BB0_15: add.s32 %r73, %r131, %r4; mad.lo.s32 %r74, %r73, %r30, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f132, [%rd38]; ld.global.nc.f32 %f133, [%rd36]; ld.global.nc.f32 %f134, [%rd37]; mul.f32 %f135, %f134, %f134; fma.rn.f32 %f136, %f133, %f133, %f135; fma.rn.f32 %f137, %f132, %f132, %f136; setp.eq.f32 %p20, %f137, 0f00000000; selp.f32 %f19, %f3, %f132, %p20; selp.f32 %f18, %f2, %f134, %p20; selp.f32 %f17, %f1, %f133, 
%p20; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs16, [%rd39]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r75, %rs20; add.s32 %r76, %r75, 1; mul.lo.s32 %r77, %r76, %r75; shr.u32 %r78, %r77, 1; cvt.u32.u16 %r79, %rs19; add.s32 %r80, %r78, %r79; mul.wide.s32 %rd40, %r80, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f138, [%rd41]; setp.eq.f32 %p21, %f138, 0f00000000; @%p21 bra $L__BB0_17; mul.f32 %f139, %f2, %f18; fma.rn.f32 %f140, %f1, %f17, %f139; fma.rn.f32 %f141, %f3, %f19, %f140; abs.f32 %f142, %f141; neg.f32 %f143, %f142; mov.f32 %f144, 0f3F000000; fma.rn.f32 %f145, %f144, %f143, %f144; rsqrt.approx.ftz.f32 %f146, %f145; mul.f32 %f147, %f145, %f146; mul.f32 %f148, %f146, 0f3F000000; neg.f32 %f149, %f147; fma.rn.f32 %f150, %f149, %f148, %f144; fma.rn.f32 %f151, %f147, %f150, %f147; setp.eq.f32 %p22, %f142, 0f3F800000; selp.f32 %f152, 0f00000000, %f151, %p22; setp.gt.f32 %p23, %f142, 0f3F0F5C29; selp.f32 %f153, %f152, %f142, %p23; mov.b32 %r81, %f153; mov.b32 %r82, %f141; and.b32 %r83, %r82, -2147483648; or.b32 %r84, %r83, %r81; mov.b32 %f154, %r84; mul.f32 %f155, %f154, %f154; mov.f32 %f156, 0f3C8B1ABB; mov.f32 %f157, 0f3D10ECEF; fma.rn.f32 %f158, %f157, %f155, %f156; mov.f32 %f159, 0f3CFC028C; fma.rn.f32 %f160, %f158, %f155, %f159; mov.f32 %f161, 0f3D372139; fma.rn.f32 %f162, %f160, %f155, %f161; mov.f32 %f163, 0f3D9993DB; fma.rn.f32 %f164, %f162, %f155, %f163; mov.f32 %f165, 0f3E2AAAC6; fma.rn.f32 %f166, %f164, %f155, %f165; mul.f32 %f167, %f166, %f155; fma.rn.f32 %f168, %f167, %f154, %f154; neg.f32 %f169, %f168; selp.f32 %f170, %f168, %f169, %p23; mov.f32 %f171, 0f3FD774EB; mov.f32 %f172, 0f3F6EE581; fma.rn.f32 %f173, %f172, %f171, %f170; setp.gt.f32 %p24, %f141, 0f3F0F5C29; selp.f32 %f174, %f168, %f173, %p24; add.f32 %f175, %f174, %f174; selp.f32 %f176, %f175, %f174, %p23; max.f32 %f313, %f313, %f176; $L__BB0_17: add.s32 %r18, %r2, 1; @%p19 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r87, %r31, -1; min.s32 %r132, 
%r18, %r87; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r85, %r18, %r31; add.s32 %r86, %r85, %r31; rem.s32 %r132, %r86, %r31; $L__BB0_20: add.s32 %r88, %r132, %r4; mad.lo.s32 %r89, %r88, %r30, %r1; cvt.s64.s32 %rd42, %r89; mul.wide.s32 %rd43, %r89, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f177, [%rd46]; ld.global.nc.f32 %f178, [%rd44]; ld.global.nc.f32 %f179, [%rd45]; mul.f32 %f180, %f179, %f179; fma.rn.f32 %f181, %f178, %f178, %f180; fma.rn.f32 %f182, %f177, %f177, %f181; setp.eq.f32 %p26, %f182, 0f00000000; selp.f32 %f24, %f3, %f177, %p26; selp.f32 %f23, %f2, %f179, %p26; selp.f32 %f22, %f1, %f178, %p26; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs21, [%rd47]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r90, %rs25; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; cvt.u32.u16 %r94, %rs24; add.s32 %r95, %r93, %r94; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f183, [%rd49]; setp.eq.f32 %p27, %f183, 0f00000000; @%p27 bra $L__BB0_22; mul.f32 %f184, %f2, %f23; fma.rn.f32 %f185, %f1, %f22, %f184; fma.rn.f32 %f186, %f3, %f24, %f185; abs.f32 %f187, %f186; neg.f32 %f188, %f187; mov.f32 %f189, 0f3F000000; fma.rn.f32 %f190, %f189, %f188, %f189; rsqrt.approx.ftz.f32 %f191, %f190; mul.f32 %f192, %f190, %f191; mul.f32 %f193, %f191, 0f3F000000; neg.f32 %f194, %f192; fma.rn.f32 %f195, %f194, %f193, %f189; fma.rn.f32 %f196, %f192, %f195, %f192; setp.eq.f32 %p28, %f187, 0f3F800000; selp.f32 %f197, 0f00000000, %f196, %p28; setp.gt.f32 %p29, %f187, 0f3F0F5C29; selp.f32 %f198, %f197, %f187, %p29; mov.b32 %r96, %f198; mov.b32 %r97, %f186; and.b32 %r98, %r97, -2147483648; or.b32 %r99, %r98, %r96; mov.b32 %f199, %r99; mul.f32 %f200, %f199, %f199; mov.f32 %f201, 0f3C8B1ABB; mov.f32 %f202, 0f3D10ECEF; fma.rn.f32 %f203, %f202, %f200, %f201; mov.f32 %f204, 0f3CFC028C; fma.rn.f32 %f205, %f203, %f200, %f204; mov.f32 %f206, 0f3D372139; fma.rn.f32 
%f207, %f205, %f200, %f206; mov.f32 %f208, 0f3D9993DB; fma.rn.f32 %f209, %f207, %f200, %f208; mov.f32 %f210, 0f3E2AAAC6; fma.rn.f32 %f211, %f209, %f200, %f210; mul.f32 %f212, %f211, %f200; fma.rn.f32 %f213, %f212, %f199, %f199; neg.f32 %f214, %f213; selp.f32 %f215, %f213, %f214, %p29; mov.f32 %f216, 0f3FD774EB; mov.f32 %f217, 0f3F6EE581; fma.rn.f32 %f218, %f217, %f216, %f215; setp.gt.f32 %p30, %f186, 0f3F0F5C29; selp.f32 %f219, %f213, %f218, %p30; add.f32 %f220, %f219, %f219; selp.f32 %f221, %f220, %f219, %p29; max.f32 %f313, %f313, %f221; $L__BB0_22: setp.eq.s32 %p31, %r32, 1; @%p31 bra $L__BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r22, %r3, -1; @%p32 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: max.s32 %r133, %r22, 0; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r100, %r22, %r32; add.s32 %r101, %r100, %r32; rem.s32 %r133, %r101, %r32; $L__BB0_26: mad.lo.s32 %r102, %r133, %r31, %r2; mad.lo.s32 %r103, %r102, %r30, %r1; cvt.s64.s32 %rd50, %r103; mul.wide.s32 %rd51, %r103, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f222, [%rd54]; ld.global.nc.f32 %f223, [%rd52]; ld.global.nc.f32 %f224, [%rd53]; mul.f32 %f225, %f224, %f224; fma.rn.f32 %f226, %f223, %f223, %f225; fma.rn.f32 %f227, %f222, %f222, %f226; setp.eq.f32 %p33, %f227, 0f00000000; selp.f32 %f29, %f3, %f222, %p33; selp.f32 %f28, %f2, %f224, %p33; selp.f32 %f27, %f1, %f223, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs26, [%rd55]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r104, %rs30; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; cvt.u32.u16 %r108, %rs29; add.s32 %r109, %r107, %r108; mul.wide.s32 %rd56, %r109, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f228, [%rd57]; setp.eq.f32 %p34, %f228, 0f00000000; @%p34 bra $L__BB0_28; mul.f32 %f229, %f2, %f28; fma.rn.f32 %f230, %f1, %f27, %f229; fma.rn.f32 %f231, %f3, %f29, %f230; abs.f32 %f232, %f231; neg.f32 
%f233, %f232; mov.f32 %f234, 0f3F000000; fma.rn.f32 %f235, %f234, %f233, %f234; rsqrt.approx.ftz.f32 %f236, %f235; mul.f32 %f237, %f235, %f236; mul.f32 %f238, %f236, 0f3F000000; neg.f32 %f239, %f237; fma.rn.f32 %f240, %f239, %f238, %f234; fma.rn.f32 %f241, %f237, %f240, %f237; setp.eq.f32 %p35, %f232, 0f3F800000; selp.f32 %f242, 0f00000000, %f241, %p35; setp.gt.f32 %p36, %f232, 0f3F0F5C29; selp.f32 %f243, %f242, %f232, %p36; mov.b32 %r110, %f243; mov.b32 %r111, %f231; and.b32 %r112, %r111, -2147483648; or.b32 %r113, %r112, %r110; mov.b32 %f244, %r113; mul.f32 %f245, %f244, %f244; mov.f32 %f246, 0f3C8B1ABB; mov.f32 %f247, 0f3D10ECEF; fma.rn.f32 %f248, %f247, %f245, %f246; mov.f32 %f249, 0f3CFC028C; fma.rn.f32 %f250, %f248, %f245, %f249; mov.f32 %f251, 0f3D372139; fma.rn.f32 %f252, %f250, %f245, %f251; mov.f32 %f253, 0f3D9993DB; fma.rn.f32 %f254, %f252, %f245, %f253; mov.f32 %f255, 0f3E2AAAC6; fma.rn.f32 %f256, %f254, %f245, %f255; mul.f32 %f257, %f256, %f245; fma.rn.f32 %f258, %f257, %f244, %f244; neg.f32 %f259, %f258; selp.f32 %f260, %f258, %f259, %p36; mov.f32 %f261, 0f3FD774EB; mov.f32 %f262, 0f3F6EE581; fma.rn.f32 %f263, %f262, %f261, %f260; setp.gt.f32 %p37, %f231, 0f3F0F5C29; selp.f32 %f264, %f258, %f263, %p37; add.f32 %f265, %f264, %f264; selp.f32 %f266, %f265, %f264, %p36; max.f32 %f313, %f313, %f266; $L__BB0_28: add.s32 %r26, %r3, 1; @%p32 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: add.s32 %r116, %r32, -1; min.s32 %r134, %r26, %r116; bra.uni $L__BB0_31; $L__BB0_29: rem.s32 %r114, %r26, %r32; add.s32 %r115, %r114, %r32; rem.s32 %r134, %r115, %r32; $L__BB0_31: mad.lo.s32 %r117, %r134, %r31, %r2; mad.lo.s32 %r118, %r117, %r30, %r1; cvt.s64.s32 %rd58, %r118; mul.wide.s32 %rd59, %r118, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f267, [%rd62]; ld.global.nc.f32 %f268, [%rd60]; ld.global.nc.f32 %f269, [%rd61]; mul.f32 %f270, %f269, %f269; fma.rn.f32 %f271, %f268, %f268, %f270; fma.rn.f32 %f272, 
%f267, %f267, %f271; setp.eq.f32 %p39, %f272, 0f00000000; selp.f32 %f34, %f3, %f267, %p39; selp.f32 %f33, %f2, %f269, %p39; selp.f32 %f32, %f1, %f268, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs31, [%rd63]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r119, %rs35; add.s32 %r120, %r119, 1; mul.lo.s32 %r121, %r120, %r119; shr.u32 %r122, %r121, 1; cvt.u32.u16 %r123, %rs34; add.s32 %r124, %r122, %r123; mul.wide.s32 %rd64, %r124, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f273, [%rd65]; setp.eq.f32 %p40, %f273, 0f00000000; @%p40 bra $L__BB0_33; mul.f32 %f274, %f2, %f33; fma.rn.f32 %f275, %f1, %f32, %f274; fma.rn.f32 %f276, %f3, %f34, %f275; abs.f32 %f277, %f276; neg.f32 %f278, %f277; mov.f32 %f279, 0f3F000000; fma.rn.f32 %f280, %f279, %f278, %f279; rsqrt.approx.ftz.f32 %f281, %f280; mul.f32 %f282, %f280, %f281; mul.f32 %f283, %f281, 0f3F000000; neg.f32 %f284, %f282; fma.rn.f32 %f285, %f284, %f283, %f279; fma.rn.f32 %f286, %f282, %f285, %f282; setp.eq.f32 %p41, %f277, 0f3F800000; selp.f32 %f287, 0f00000000, %f286, %p41; setp.gt.f32 %p42, %f277, 0f3F0F5C29; selp.f32 %f288, %f287, %f277, %p42; mov.b32 %r125, %f288; mov.b32 %r126, %f276; and.b32 %r127, %r126, -2147483648; or.b32 %r128, %r127, %r125; mov.b32 %f289, %r128; mul.f32 %f290, %f289, %f289; mov.f32 %f291, 0f3C8B1ABB; mov.f32 %f292, 0f3D10ECEF; fma.rn.f32 %f293, %f292, %f290, %f291; mov.f32 %f294, 0f3CFC028C; fma.rn.f32 %f295, %f293, %f290, %f294; mov.f32 %f296, 0f3D372139; fma.rn.f32 %f297, %f295, %f290, %f296; mov.f32 %f298, 0f3D9993DB; fma.rn.f32 %f299, %f297, %f290, %f298; mov.f32 %f300, 0f3E2AAAC6; fma.rn.f32 %f301, %f299, %f290, %f300; mul.f32 %f302, %f301, %f290; fma.rn.f32 %f303, %f302, %f289, %f289; neg.f32 %f304, %f303; selp.f32 %f305, %f303, %f304, %p42; mov.f32 %f306, 0f3FD774EB; mov.f32 %f307, 0f3F6EE581; fma.rn.f32 %f308, %f307, %f306, %f305; setp.gt.f32 %p43, %f276, 0f3F0F5C29; selp.f32 %f309, %f303, %f308, %p43; add.f32 %f310, %f309, %f309; selp.f32 %f311, 
%f310, %f309, %p42; max.f32 %f313, %f313, %f311; $L__BB0_33: cvta.to.global.u64 %rd66, %rd7; shl.b64 %rd67, %rd6, 2; add.s64 %rd68, %rd66, %rd67; st.global.f32 [%rd68], %f313; $L__BB0_34: ret; } ` setmaxangle_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<36>; .reg .f32 %f<318>; .reg .b32 %r<135>; .reg .b64 %rd<69>; ld.param.u8 %rs5, [setmaxangle_param_9]; ld.param.u64 %rd7, [setmaxangle_param_0]; ld.param.u64 %rd8, [setmaxangle_param_1]; ld.param.u64 %rd9, [setmaxangle_param_2]; ld.param.u64 %rd10, [setmaxangle_param_3]; ld.param.u64 %rd11, [setmaxangle_param_4]; ld.param.u64 %rd12, [setmaxangle_param_5]; ld.param.u32 %r30, [setmaxangle_param_6]; ld.param.u32 %r31, [setmaxangle_param_7]; ld.param.u32 %r32, [setmaxangle_param_8]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd3, %rd10; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_34; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd13, %r43, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd15, %rd4, %rd13; 
add.s64 %rd16, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; ld.global.nc.f32 %f3, [%rd16]; mul.f32 %f37, %f2, %f2; fma.rn.f32 %f38, %f1, %f1, %f37; fma.rn.f32 %f39, %f3, %f3, %f38; setp.eq.f32 %p6, %f39, 0f00000000; @%p6 bra $L__BB0_34; add.s64 %rd17, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd17]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r129, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r129, %r45, %r30; $L__BB0_5: add.s32 %r46, %r129, %r5; cvt.s64.s32 %rd18, %r46; mul.wide.s32 %rd19, %r46, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f41, [%rd22]; ld.global.nc.f32 %f42, [%rd20]; ld.global.nc.f32 %f43, [%rd21]; mul.f32 %f44, %f43, %f43; fma.rn.f32 %f45, %f42, %f42, %f44; fma.rn.f32 %f46, %f41, %f41, %f45; setp.eq.f32 %p8, %f46, 0f00000000; mov.f32 %f313, 0f00000000; selp.f32 %f9, %f3, %f41, %p8; selp.f32 %f8, %f2, %f43, %p8; selp.f32 %f7, %f1, %f42, %p8; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd24, %r52, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f47, [%rd25]; setp.eq.f32 %p9, %f47, 0f00000000; @%p9 bra $L__BB0_7; mul.f32 %f48, %f2, %f8; fma.rn.f32 %f49, %f1, %f7, %f48; fma.rn.f32 %f50, %f3, %f9, %f49; abs.f32 %f51, %f50; neg.f32 %f52, %f51; mov.f32 %f53, 0f3F000000; fma.rn.f32 %f54, %f53, %f52, %f53; rsqrt.approx.ftz.f32 %f55, %f54; mul.f32 %f56, %f54, %f55; mul.f32 %f57, %f55, 0f3F000000; neg.f32 %f58, %f56; fma.rn.f32 %f59, %f58, %f57, %f53; fma.rn.f32 %f60, %f56, %f59, %f56; setp.eq.f32 %p10, %f51, 0f3F800000; selp.f32 %f61, 0f00000000, %f60, %p10; mov.f32 %f62, 0f00000000; setp.gt.f32 %p11, %f51, 
0f3F0F5C29; selp.f32 %f63, %f61, %f51, %p11; mov.b32 %r53, %f63; mov.b32 %r54, %f50; and.b32 %r55, %r54, -2147483648; or.b32 %r56, %r55, %r53; mov.b32 %f64, %r56; mul.f32 %f65, %f64, %f64; mov.f32 %f66, 0f3C8B1ABB; mov.f32 %f67, 0f3D10ECEF; fma.rn.f32 %f68, %f67, %f65, %f66; mov.f32 %f69, 0f3CFC028C; fma.rn.f32 %f70, %f68, %f65, %f69; mov.f32 %f71, 0f3D372139; fma.rn.f32 %f72, %f70, %f65, %f71; mov.f32 %f73, 0f3D9993DB; fma.rn.f32 %f74, %f72, %f65, %f73; mov.f32 %f75, 0f3E2AAAC6; fma.rn.f32 %f76, %f74, %f65, %f75; mul.f32 %f77, %f76, %f65; fma.rn.f32 %f78, %f77, %f64, %f64; neg.f32 %f79, %f78; selp.f32 %f80, %f78, %f79, %p11; mov.f32 %f81, 0f3FD774EB; mov.f32 %f82, 0f3F6EE581; fma.rn.f32 %f83, %f82, %f81, %f80; setp.gt.f32 %p12, %f50, 0f3F0F5C29; selp.f32 %f84, %f78, %f83, %p12; add.f32 %f85, %f84, %f84; selp.f32 %f86, %f85, %f84, %p11; max.f32 %f313, %f62, %f86; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r59, %r30, -1; min.s32 %r130, %r10, %r59; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r30; add.s32 %r58, %r57, %r30; rem.s32 %r130, %r58, %r30; $L__BB0_10: add.s32 %r60, %r130, %r5; cvt.s64.s32 %rd26, %r60; mul.wide.s32 %rd27, %r60, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f87, [%rd30]; ld.global.nc.f32 %f88, [%rd28]; ld.global.nc.f32 %f89, [%rd29]; mul.f32 %f90, %f89, %f89; fma.rn.f32 %f91, %f88, %f88, %f90; fma.rn.f32 %f92, %f87, %f87, %f91; setp.eq.f32 %p14, %f92, 0f00000000; selp.f32 %f14, %f3, %f87, %p14; selp.f32 %f13, %f2, %f89, %p14; selp.f32 %f12, %f1, %f88, %p14; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs11, [%rd31]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r61, %rs15; add.s32 %r62, %r61, 1; mul.lo.s32 %r63, %r62, %r61; shr.u32 %r64, %r63, 1; cvt.u32.u16 %r65, %rs14; add.s32 %r66, %r64, %r65; mul.wide.s32 %rd32, %r66, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f93, [%rd33]; setp.eq.f32 %p15, %f93, 
0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f94, %f2, %f13; fma.rn.f32 %f95, %f1, %f12, %f94; fma.rn.f32 %f96, %f3, %f14, %f95; abs.f32 %f97, %f96; neg.f32 %f98, %f97; mov.f32 %f99, 0f3F000000; fma.rn.f32 %f100, %f99, %f98, %f99; rsqrt.approx.ftz.f32 %f101, %f100; mul.f32 %f102, %f100, %f101; mul.f32 %f103, %f101, 0f3F000000; neg.f32 %f104, %f102; fma.rn.f32 %f105, %f104, %f103, %f99; fma.rn.f32 %f106, %f102, %f105, %f102; setp.eq.f32 %p16, %f97, 0f3F800000; selp.f32 %f107, 0f00000000, %f106, %p16; setp.gt.f32 %p17, %f97, 0f3F0F5C29; selp.f32 %f108, %f107, %f97, %p17; mov.b32 %r67, %f108; mov.b32 %r68, %f96; and.b32 %r69, %r68, -2147483648; or.b32 %r70, %r69, %r67; mov.b32 %f109, %r70; mul.f32 %f110, %f109, %f109; mov.f32 %f111, 0f3C8B1ABB; mov.f32 %f112, 0f3D10ECEF; fma.rn.f32 %f113, %f112, %f110, %f111; mov.f32 %f114, 0f3CFC028C; fma.rn.f32 %f115, %f113, %f110, %f114; mov.f32 %f116, 0f3D372139; fma.rn.f32 %f117, %f115, %f110, %f116; mov.f32 %f118, 0f3D9993DB; fma.rn.f32 %f119, %f117, %f110, %f118; mov.f32 %f120, 0f3E2AAAC6; fma.rn.f32 %f121, %f119, %f110, %f120; mul.f32 %f122, %f121, %f110; fma.rn.f32 %f123, %f122, %f109, %f109; neg.f32 %f124, %f123; selp.f32 %f125, %f123, %f124, %p17; mov.f32 %f126, 0f3FD774EB; mov.f32 %f127, 0f3F6EE581; fma.rn.f32 %f128, %f127, %f126, %f125; setp.gt.f32 %p18, %f96, 0f3F0F5C29; selp.f32 %f129, %f123, %f128, %p18; add.f32 %f130, %f129, %f129; selp.f32 %f131, %f130, %f129, %p17; max.f32 %f313, %f313, %f131; $L__BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r14, %r2, -1; @%p19 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: max.s32 %r131, %r14, 0; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r71, %r14, %r31; add.s32 %r72, %r71, %r31; rem.s32 %r131, %r72, %r31; $L__BB0_15: add.s32 %r73, %r131, %r4; mad.lo.s32 %r74, %r73, %r30, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f132, [%rd38]; ld.global.nc.f32 
%f133, [%rd36]; ld.global.nc.f32 %f134, [%rd37]; mul.f32 %f135, %f134, %f134; fma.rn.f32 %f136, %f133, %f133, %f135; fma.rn.f32 %f137, %f132, %f132, %f136; setp.eq.f32 %p20, %f137, 0f00000000; selp.f32 %f19, %f3, %f132, %p20; selp.f32 %f18, %f2, %f134, %p20; selp.f32 %f17, %f1, %f133, %p20; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs16, [%rd39]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r75, %rs20; add.s32 %r76, %r75, 1; mul.lo.s32 %r77, %r76, %r75; shr.u32 %r78, %r77, 1; cvt.u32.u16 %r79, %rs19; add.s32 %r80, %r78, %r79; mul.wide.s32 %rd40, %r80, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f138, [%rd41]; setp.eq.f32 %p21, %f138, 0f00000000; @%p21 bra $L__BB0_17; mul.f32 %f139, %f2, %f18; fma.rn.f32 %f140, %f1, %f17, %f139; fma.rn.f32 %f141, %f3, %f19, %f140; abs.f32 %f142, %f141; neg.f32 %f143, %f142; mov.f32 %f144, 0f3F000000; fma.rn.f32 %f145, %f144, %f143, %f144; rsqrt.approx.ftz.f32 %f146, %f145; mul.f32 %f147, %f145, %f146; mul.f32 %f148, %f146, 0f3F000000; neg.f32 %f149, %f147; fma.rn.f32 %f150, %f149, %f148, %f144; fma.rn.f32 %f151, %f147, %f150, %f147; setp.eq.f32 %p22, %f142, 0f3F800000; selp.f32 %f152, 0f00000000, %f151, %p22; setp.gt.f32 %p23, %f142, 0f3F0F5C29; selp.f32 %f153, %f152, %f142, %p23; mov.b32 %r81, %f153; mov.b32 %r82, %f141; and.b32 %r83, %r82, -2147483648; or.b32 %r84, %r83, %r81; mov.b32 %f154, %r84; mul.f32 %f155, %f154, %f154; mov.f32 %f156, 0f3C8B1ABB; mov.f32 %f157, 0f3D10ECEF; fma.rn.f32 %f158, %f157, %f155, %f156; mov.f32 %f159, 0f3CFC028C; fma.rn.f32 %f160, %f158, %f155, %f159; mov.f32 %f161, 0f3D372139; fma.rn.f32 %f162, %f160, %f155, %f161; mov.f32 %f163, 0f3D9993DB; fma.rn.f32 %f164, %f162, %f155, %f163; mov.f32 %f165, 0f3E2AAAC6; fma.rn.f32 %f166, %f164, %f155, %f165; mul.f32 %f167, %f166, %f155; fma.rn.f32 %f168, %f167, %f154, %f154; neg.f32 %f169, %f168; selp.f32 %f170, %f168, %f169, %p23; mov.f32 %f171, 0f3FD774EB; mov.f32 %f172, 0f3F6EE581; fma.rn.f32 %f173, %f172, %f171, %f170; 
setp.gt.f32 %p24, %f141, 0f3F0F5C29; selp.f32 %f174, %f168, %f173, %p24; add.f32 %f175, %f174, %f174; selp.f32 %f176, %f175, %f174, %p23; max.f32 %f313, %f313, %f176; $L__BB0_17: add.s32 %r18, %r2, 1; @%p19 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r87, %r31, -1; min.s32 %r132, %r18, %r87; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r85, %r18, %r31; add.s32 %r86, %r85, %r31; rem.s32 %r132, %r86, %r31; $L__BB0_20: add.s32 %r88, %r132, %r4; mad.lo.s32 %r89, %r88, %r30, %r1; cvt.s64.s32 %rd42, %r89; mul.wide.s32 %rd43, %r89, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f177, [%rd46]; ld.global.nc.f32 %f178, [%rd44]; ld.global.nc.f32 %f179, [%rd45]; mul.f32 %f180, %f179, %f179; fma.rn.f32 %f181, %f178, %f178, %f180; fma.rn.f32 %f182, %f177, %f177, %f181; setp.eq.f32 %p26, %f182, 0f00000000; selp.f32 %f24, %f3, %f177, %p26; selp.f32 %f23, %f2, %f179, %p26; selp.f32 %f22, %f1, %f178, %p26; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs21, [%rd47]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r90, %rs25; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; cvt.u32.u16 %r94, %rs24; add.s32 %r95, %r93, %r94; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f183, [%rd49]; setp.eq.f32 %p27, %f183, 0f00000000; @%p27 bra $L__BB0_22; mul.f32 %f184, %f2, %f23; fma.rn.f32 %f185, %f1, %f22, %f184; fma.rn.f32 %f186, %f3, %f24, %f185; abs.f32 %f187, %f186; neg.f32 %f188, %f187; mov.f32 %f189, 0f3F000000; fma.rn.f32 %f190, %f189, %f188, %f189; rsqrt.approx.ftz.f32 %f191, %f190; mul.f32 %f192, %f190, %f191; mul.f32 %f193, %f191, 0f3F000000; neg.f32 %f194, %f192; fma.rn.f32 %f195, %f194, %f193, %f189; fma.rn.f32 %f196, %f192, %f195, %f192; setp.eq.f32 %p28, %f187, 0f3F800000; selp.f32 %f197, 0f00000000, %f196, %p28; setp.gt.f32 %p29, %f187, 0f3F0F5C29; selp.f32 %f198, %f197, %f187, %p29; mov.b32 %r96, %f198; mov.b32 %r97, %f186; and.b32 %r98, 
%r97, -2147483648; or.b32 %r99, %r98, %r96; mov.b32 %f199, %r99; mul.f32 %f200, %f199, %f199; mov.f32 %f201, 0f3C8B1ABB; mov.f32 %f202, 0f3D10ECEF; fma.rn.f32 %f203, %f202, %f200, %f201; mov.f32 %f204, 0f3CFC028C; fma.rn.f32 %f205, %f203, %f200, %f204; mov.f32 %f206, 0f3D372139; fma.rn.f32 %f207, %f205, %f200, %f206; mov.f32 %f208, 0f3D9993DB; fma.rn.f32 %f209, %f207, %f200, %f208; mov.f32 %f210, 0f3E2AAAC6; fma.rn.f32 %f211, %f209, %f200, %f210; mul.f32 %f212, %f211, %f200; fma.rn.f32 %f213, %f212, %f199, %f199; neg.f32 %f214, %f213; selp.f32 %f215, %f213, %f214, %p29; mov.f32 %f216, 0f3FD774EB; mov.f32 %f217, 0f3F6EE581; fma.rn.f32 %f218, %f217, %f216, %f215; setp.gt.f32 %p30, %f186, 0f3F0F5C29; selp.f32 %f219, %f213, %f218, %p30; add.f32 %f220, %f219, %f219; selp.f32 %f221, %f220, %f219, %p29; max.f32 %f313, %f313, %f221; $L__BB0_22: setp.eq.s32 %p31, %r32, 1; @%p31 bra $L__BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r22, %r3, -1; @%p32 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: max.s32 %r133, %r22, 0; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r100, %r22, %r32; add.s32 %r101, %r100, %r32; rem.s32 %r133, %r101, %r32; $L__BB0_26: mad.lo.s32 %r102, %r133, %r31, %r2; mad.lo.s32 %r103, %r102, %r30, %r1; cvt.s64.s32 %rd50, %r103; mul.wide.s32 %rd51, %r103, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f222, [%rd54]; ld.global.nc.f32 %f223, [%rd52]; ld.global.nc.f32 %f224, [%rd53]; mul.f32 %f225, %f224, %f224; fma.rn.f32 %f226, %f223, %f223, %f225; fma.rn.f32 %f227, %f222, %f222, %f226; setp.eq.f32 %p33, %f227, 0f00000000; selp.f32 %f29, %f3, %f222, %p33; selp.f32 %f28, %f2, %f224, %p33; selp.f32 %f27, %f1, %f223, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs26, [%rd55]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r104, %rs30; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; cvt.u32.u16 %r108, %rs29; add.s32 %r109, 
%r107, %r108; mul.wide.s32 %rd56, %r109, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f228, [%rd57]; setp.eq.f32 %p34, %f228, 0f00000000; @%p34 bra $L__BB0_28; mul.f32 %f229, %f2, %f28; fma.rn.f32 %f230, %f1, %f27, %f229; fma.rn.f32 %f231, %f3, %f29, %f230; abs.f32 %f232, %f231; neg.f32 %f233, %f232; mov.f32 %f234, 0f3F000000; fma.rn.f32 %f235, %f234, %f233, %f234; rsqrt.approx.ftz.f32 %f236, %f235; mul.f32 %f237, %f235, %f236; mul.f32 %f238, %f236, 0f3F000000; neg.f32 %f239, %f237; fma.rn.f32 %f240, %f239, %f238, %f234; fma.rn.f32 %f241, %f237, %f240, %f237; setp.eq.f32 %p35, %f232, 0f3F800000; selp.f32 %f242, 0f00000000, %f241, %p35; setp.gt.f32 %p36, %f232, 0f3F0F5C29; selp.f32 %f243, %f242, %f232, %p36; mov.b32 %r110, %f243; mov.b32 %r111, %f231; and.b32 %r112, %r111, -2147483648; or.b32 %r113, %r112, %r110; mov.b32 %f244, %r113; mul.f32 %f245, %f244, %f244; mov.f32 %f246, 0f3C8B1ABB; mov.f32 %f247, 0f3D10ECEF; fma.rn.f32 %f248, %f247, %f245, %f246; mov.f32 %f249, 0f3CFC028C; fma.rn.f32 %f250, %f248, %f245, %f249; mov.f32 %f251, 0f3D372139; fma.rn.f32 %f252, %f250, %f245, %f251; mov.f32 %f253, 0f3D9993DB; fma.rn.f32 %f254, %f252, %f245, %f253; mov.f32 %f255, 0f3E2AAAC6; fma.rn.f32 %f256, %f254, %f245, %f255; mul.f32 %f257, %f256, %f245; fma.rn.f32 %f258, %f257, %f244, %f244; neg.f32 %f259, %f258; selp.f32 %f260, %f258, %f259, %p36; mov.f32 %f261, 0f3FD774EB; mov.f32 %f262, 0f3F6EE581; fma.rn.f32 %f263, %f262, %f261, %f260; setp.gt.f32 %p37, %f231, 0f3F0F5C29; selp.f32 %f264, %f258, %f263, %p37; add.f32 %f265, %f264, %f264; selp.f32 %f266, %f265, %f264, %p36; max.f32 %f313, %f313, %f266; $L__BB0_28: add.s32 %r26, %r3, 1; @%p32 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: add.s32 %r116, %r32, -1; min.s32 %r134, %r26, %r116; bra.uni $L__BB0_31; $L__BB0_29: rem.s32 %r114, %r26, %r32; add.s32 %r115, %r114, %r32; rem.s32 %r134, %r115, %r32; $L__BB0_31: mad.lo.s32 %r117, %r134, %r31, %r2; mad.lo.s32 %r118, %r117, %r30, %r1; cvt.s64.s32 %rd58, %r118; 
mul.wide.s32 %rd59, %r118, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f267, [%rd62]; ld.global.nc.f32 %f268, [%rd60]; ld.global.nc.f32 %f269, [%rd61]; mul.f32 %f270, %f269, %f269; fma.rn.f32 %f271, %f268, %f268, %f270; fma.rn.f32 %f272, %f267, %f267, %f271; setp.eq.f32 %p39, %f272, 0f00000000; selp.f32 %f34, %f3, %f267, %p39; selp.f32 %f33, %f2, %f269, %p39; selp.f32 %f32, %f1, %f268, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs31, [%rd63]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r119, %rs35; add.s32 %r120, %r119, 1; mul.lo.s32 %r121, %r120, %r119; shr.u32 %r122, %r121, 1; cvt.u32.u16 %r123, %rs34; add.s32 %r124, %r122, %r123; mul.wide.s32 %rd64, %r124, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f273, [%rd65]; setp.eq.f32 %p40, %f273, 0f00000000; @%p40 bra $L__BB0_33; mul.f32 %f274, %f2, %f33; fma.rn.f32 %f275, %f1, %f32, %f274; fma.rn.f32 %f276, %f3, %f34, %f275; abs.f32 %f277, %f276; neg.f32 %f278, %f277; mov.f32 %f279, 0f3F000000; fma.rn.f32 %f280, %f279, %f278, %f279; rsqrt.approx.ftz.f32 %f281, %f280; mul.f32 %f282, %f280, %f281; mul.f32 %f283, %f281, 0f3F000000; neg.f32 %f284, %f282; fma.rn.f32 %f285, %f284, %f283, %f279; fma.rn.f32 %f286, %f282, %f285, %f282; setp.eq.f32 %p41, %f277, 0f3F800000; selp.f32 %f287, 0f00000000, %f286, %p41; setp.gt.f32 %p42, %f277, 0f3F0F5C29; selp.f32 %f288, %f287, %f277, %p42; mov.b32 %r125, %f288; mov.b32 %r126, %f276; and.b32 %r127, %r126, -2147483648; or.b32 %r128, %r127, %r125; mov.b32 %f289, %r128; mul.f32 %f290, %f289, %f289; mov.f32 %f291, 0f3C8B1ABB; mov.f32 %f292, 0f3D10ECEF; fma.rn.f32 %f293, %f292, %f290, %f291; mov.f32 %f294, 0f3CFC028C; fma.rn.f32 %f295, %f293, %f290, %f294; mov.f32 %f296, 0f3D372139; fma.rn.f32 %f297, %f295, %f290, %f296; mov.f32 %f298, 0f3D9993DB; fma.rn.f32 %f299, %f297, %f290, %f298; mov.f32 %f300, 0f3E2AAAC6; fma.rn.f32 %f301, %f299, %f290, %f300; mul.f32 %f302, %f301, %f290; fma.rn.f32 
%f303, %f302, %f289, %f289; neg.f32 %f304, %f303; selp.f32 %f305, %f303, %f304, %p42; mov.f32 %f306, 0f3FD774EB; mov.f32 %f307, 0f3F6EE581; fma.rn.f32 %f308, %f307, %f306, %f305; setp.gt.f32 %p43, %f276, 0f3F0F5C29; selp.f32 %f309, %f303, %f308, %p43; add.f32 %f310, %f309, %f309; selp.f32 %f311, %f310, %f309, %p42; max.f32 %f313, %f313, %f311; $L__BB0_33: cvta.to.global.u64 %rd66, %rd7; shl.b64 %rd67, %rd6, 2; add.s64 %rd68, %rd66, %rd67; st.global.f32 [%rd68], %f313; $L__BB0_34: ret; } ` setmaxangle_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<36>; .reg .f32 %f<318>; .reg .b32 %r<135>; .reg .b64 %rd<69>; ld.param.u8 %rs5, [setmaxangle_param_9]; ld.param.u64 %rd7, [setmaxangle_param_0]; ld.param.u64 %rd8, [setmaxangle_param_1]; ld.param.u64 %rd9, [setmaxangle_param_2]; ld.param.u64 %rd10, [setmaxangle_param_3]; ld.param.u64 %rd11, [setmaxangle_param_4]; ld.param.u64 %rd12, [setmaxangle_param_5]; ld.param.u32 %r30, [setmaxangle_param_6]; ld.param.u32 %r31, [setmaxangle_param_7]; ld.param.u32 %r32, [setmaxangle_param_8]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd3, %rd10; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, 
%p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_34; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd13, %r43, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd15, %rd4, %rd13; add.s64 %rd16, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; ld.global.nc.f32 %f3, [%rd16]; mul.f32 %f37, %f2, %f2; fma.rn.f32 %f38, %f1, %f1, %f37; fma.rn.f32 %f39, %f3, %f3, %f38; setp.eq.f32 %p6, %f39, 0f00000000; @%p6 bra $L__BB0_34; add.s64 %rd17, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd17]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r129, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r129, %r45, %r30; $L__BB0_5: add.s32 %r46, %r129, %r5; cvt.s64.s32 %rd18, %r46; mul.wide.s32 %rd19, %r46, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f41, [%rd22]; ld.global.nc.f32 %f42, [%rd20]; ld.global.nc.f32 %f43, [%rd21]; mul.f32 %f44, %f43, %f43; fma.rn.f32 %f45, %f42, %f42, %f44; fma.rn.f32 %f46, %f41, %f41, %f45; setp.eq.f32 %p8, %f46, 0f00000000; mov.f32 %f313, 0f00000000; selp.f32 %f9, %f3, %f41, %p8; selp.f32 %f8, %f2, %f43, %p8; selp.f32 %f7, %f1, %f42, %p8; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd24, %r52, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f47, [%rd25]; setp.eq.f32 %p9, %f47, 0f00000000; @%p9 bra $L__BB0_7; mul.f32 %f48, %f2, %f8; fma.rn.f32 %f49, %f1, %f7, %f48; fma.rn.f32 %f50, %f3, %f9, %f49; abs.f32 %f51, %f50; neg.f32 %f52, %f51; mov.f32 %f53, 0f3F000000; fma.rn.f32 %f54, %f53, %f52, %f53; 
rsqrt.approx.ftz.f32 %f55, %f54; mul.f32 %f56, %f54, %f55; mul.f32 %f57, %f55, 0f3F000000; neg.f32 %f58, %f56; fma.rn.f32 %f59, %f58, %f57, %f53; fma.rn.f32 %f60, %f56, %f59, %f56; setp.eq.f32 %p10, %f51, 0f3F800000; selp.f32 %f61, 0f00000000, %f60, %p10; mov.f32 %f62, 0f00000000; setp.gt.f32 %p11, %f51, 0f3F0F5C29; selp.f32 %f63, %f61, %f51, %p11; mov.b32 %r53, %f63; mov.b32 %r54, %f50; and.b32 %r55, %r54, -2147483648; or.b32 %r56, %r55, %r53; mov.b32 %f64, %r56; mul.f32 %f65, %f64, %f64; mov.f32 %f66, 0f3C8B1ABB; mov.f32 %f67, 0f3D10ECEF; fma.rn.f32 %f68, %f67, %f65, %f66; mov.f32 %f69, 0f3CFC028C; fma.rn.f32 %f70, %f68, %f65, %f69; mov.f32 %f71, 0f3D372139; fma.rn.f32 %f72, %f70, %f65, %f71; mov.f32 %f73, 0f3D9993DB; fma.rn.f32 %f74, %f72, %f65, %f73; mov.f32 %f75, 0f3E2AAAC6; fma.rn.f32 %f76, %f74, %f65, %f75; mul.f32 %f77, %f76, %f65; fma.rn.f32 %f78, %f77, %f64, %f64; neg.f32 %f79, %f78; selp.f32 %f80, %f78, %f79, %p11; mov.f32 %f81, 0f3FD774EB; mov.f32 %f82, 0f3F6EE581; fma.rn.f32 %f83, %f82, %f81, %f80; setp.gt.f32 %p12, %f50, 0f3F0F5C29; selp.f32 %f84, %f78, %f83, %p12; add.f32 %f85, %f84, %f84; selp.f32 %f86, %f85, %f84, %p11; max.f32 %f313, %f62, %f86; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r59, %r30, -1; min.s32 %r130, %r10, %r59; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r30; add.s32 %r58, %r57, %r30; rem.s32 %r130, %r58, %r30; $L__BB0_10: add.s32 %r60, %r130, %r5; cvt.s64.s32 %rd26, %r60; mul.wide.s32 %rd27, %r60, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f87, [%rd30]; ld.global.nc.f32 %f88, [%rd28]; ld.global.nc.f32 %f89, [%rd29]; mul.f32 %f90, %f89, %f89; fma.rn.f32 %f91, %f88, %f88, %f90; fma.rn.f32 %f92, %f87, %f87, %f91; setp.eq.f32 %p14, %f92, 0f00000000; selp.f32 %f14, %f3, %f87, %p14; selp.f32 %f13, %f2, %f89, %p14; selp.f32 %f12, %f1, %f88, %p14; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs11, [%rd31]; min.u16 
%rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r61, %rs15; add.s32 %r62, %r61, 1; mul.lo.s32 %r63, %r62, %r61; shr.u32 %r64, %r63, 1; cvt.u32.u16 %r65, %rs14; add.s32 %r66, %r64, %r65; mul.wide.s32 %rd32, %r66, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f93, [%rd33]; setp.eq.f32 %p15, %f93, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f94, %f2, %f13; fma.rn.f32 %f95, %f1, %f12, %f94; fma.rn.f32 %f96, %f3, %f14, %f95; abs.f32 %f97, %f96; neg.f32 %f98, %f97; mov.f32 %f99, 0f3F000000; fma.rn.f32 %f100, %f99, %f98, %f99; rsqrt.approx.ftz.f32 %f101, %f100; mul.f32 %f102, %f100, %f101; mul.f32 %f103, %f101, 0f3F000000; neg.f32 %f104, %f102; fma.rn.f32 %f105, %f104, %f103, %f99; fma.rn.f32 %f106, %f102, %f105, %f102; setp.eq.f32 %p16, %f97, 0f3F800000; selp.f32 %f107, 0f00000000, %f106, %p16; setp.gt.f32 %p17, %f97, 0f3F0F5C29; selp.f32 %f108, %f107, %f97, %p17; mov.b32 %r67, %f108; mov.b32 %r68, %f96; and.b32 %r69, %r68, -2147483648; or.b32 %r70, %r69, %r67; mov.b32 %f109, %r70; mul.f32 %f110, %f109, %f109; mov.f32 %f111, 0f3C8B1ABB; mov.f32 %f112, 0f3D10ECEF; fma.rn.f32 %f113, %f112, %f110, %f111; mov.f32 %f114, 0f3CFC028C; fma.rn.f32 %f115, %f113, %f110, %f114; mov.f32 %f116, 0f3D372139; fma.rn.f32 %f117, %f115, %f110, %f116; mov.f32 %f118, 0f3D9993DB; fma.rn.f32 %f119, %f117, %f110, %f118; mov.f32 %f120, 0f3E2AAAC6; fma.rn.f32 %f121, %f119, %f110, %f120; mul.f32 %f122, %f121, %f110; fma.rn.f32 %f123, %f122, %f109, %f109; neg.f32 %f124, %f123; selp.f32 %f125, %f123, %f124, %p17; mov.f32 %f126, 0f3FD774EB; mov.f32 %f127, 0f3F6EE581; fma.rn.f32 %f128, %f127, %f126, %f125; setp.gt.f32 %p18, %f96, 0f3F0F5C29; selp.f32 %f129, %f123, %f128, %p18; add.f32 %f130, %f129, %f129; selp.f32 %f131, %f130, %f129, %p17; max.f32 %f313, %f313, %f131; $L__BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r14, %r2, -1; @%p19 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: max.s32 %r131, %r14, 0; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r71, %r14, %r31; 
add.s32 %r72, %r71, %r31; rem.s32 %r131, %r72, %r31; $L__BB0_15: add.s32 %r73, %r131, %r4; mad.lo.s32 %r74, %r73, %r30, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f132, [%rd38]; ld.global.nc.f32 %f133, [%rd36]; ld.global.nc.f32 %f134, [%rd37]; mul.f32 %f135, %f134, %f134; fma.rn.f32 %f136, %f133, %f133, %f135; fma.rn.f32 %f137, %f132, %f132, %f136; setp.eq.f32 %p20, %f137, 0f00000000; selp.f32 %f19, %f3, %f132, %p20; selp.f32 %f18, %f2, %f134, %p20; selp.f32 %f17, %f1, %f133, %p20; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs16, [%rd39]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r75, %rs20; add.s32 %r76, %r75, 1; mul.lo.s32 %r77, %r76, %r75; shr.u32 %r78, %r77, 1; cvt.u32.u16 %r79, %rs19; add.s32 %r80, %r78, %r79; mul.wide.s32 %rd40, %r80, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f138, [%rd41]; setp.eq.f32 %p21, %f138, 0f00000000; @%p21 bra $L__BB0_17; mul.f32 %f139, %f2, %f18; fma.rn.f32 %f140, %f1, %f17, %f139; fma.rn.f32 %f141, %f3, %f19, %f140; abs.f32 %f142, %f141; neg.f32 %f143, %f142; mov.f32 %f144, 0f3F000000; fma.rn.f32 %f145, %f144, %f143, %f144; rsqrt.approx.ftz.f32 %f146, %f145; mul.f32 %f147, %f145, %f146; mul.f32 %f148, %f146, 0f3F000000; neg.f32 %f149, %f147; fma.rn.f32 %f150, %f149, %f148, %f144; fma.rn.f32 %f151, %f147, %f150, %f147; setp.eq.f32 %p22, %f142, 0f3F800000; selp.f32 %f152, 0f00000000, %f151, %p22; setp.gt.f32 %p23, %f142, 0f3F0F5C29; selp.f32 %f153, %f152, %f142, %p23; mov.b32 %r81, %f153; mov.b32 %r82, %f141; and.b32 %r83, %r82, -2147483648; or.b32 %r84, %r83, %r81; mov.b32 %f154, %r84; mul.f32 %f155, %f154, %f154; mov.f32 %f156, 0f3C8B1ABB; mov.f32 %f157, 0f3D10ECEF; fma.rn.f32 %f158, %f157, %f155, %f156; mov.f32 %f159, 0f3CFC028C; fma.rn.f32 %f160, %f158, %f155, %f159; mov.f32 %f161, 0f3D372139; fma.rn.f32 %f162, %f160, %f155, %f161; mov.f32 %f163, 0f3D9993DB; fma.rn.f32 %f164, 
%f162, %f155, %f163; mov.f32 %f165, 0f3E2AAAC6; fma.rn.f32 %f166, %f164, %f155, %f165; mul.f32 %f167, %f166, %f155; fma.rn.f32 %f168, %f167, %f154, %f154; neg.f32 %f169, %f168; selp.f32 %f170, %f168, %f169, %p23; mov.f32 %f171, 0f3FD774EB; mov.f32 %f172, 0f3F6EE581; fma.rn.f32 %f173, %f172, %f171, %f170; setp.gt.f32 %p24, %f141, 0f3F0F5C29; selp.f32 %f174, %f168, %f173, %p24; add.f32 %f175, %f174, %f174; selp.f32 %f176, %f175, %f174, %p23; max.f32 %f313, %f313, %f176; $L__BB0_17: add.s32 %r18, %r2, 1; @%p19 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r87, %r31, -1; min.s32 %r132, %r18, %r87; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r85, %r18, %r31; add.s32 %r86, %r85, %r31; rem.s32 %r132, %r86, %r31; $L__BB0_20: add.s32 %r88, %r132, %r4; mad.lo.s32 %r89, %r88, %r30, %r1; cvt.s64.s32 %rd42, %r89; mul.wide.s32 %rd43, %r89, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f177, [%rd46]; ld.global.nc.f32 %f178, [%rd44]; ld.global.nc.f32 %f179, [%rd45]; mul.f32 %f180, %f179, %f179; fma.rn.f32 %f181, %f178, %f178, %f180; fma.rn.f32 %f182, %f177, %f177, %f181; setp.eq.f32 %p26, %f182, 0f00000000; selp.f32 %f24, %f3, %f177, %p26; selp.f32 %f23, %f2, %f179, %p26; selp.f32 %f22, %f1, %f178, %p26; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs21, [%rd47]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r90, %rs25; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; cvt.u32.u16 %r94, %rs24; add.s32 %r95, %r93, %r94; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f183, [%rd49]; setp.eq.f32 %p27, %f183, 0f00000000; @%p27 bra $L__BB0_22; mul.f32 %f184, %f2, %f23; fma.rn.f32 %f185, %f1, %f22, %f184; fma.rn.f32 %f186, %f3, %f24, %f185; abs.f32 %f187, %f186; neg.f32 %f188, %f187; mov.f32 %f189, 0f3F000000; fma.rn.f32 %f190, %f189, %f188, %f189; rsqrt.approx.ftz.f32 %f191, %f190; mul.f32 %f192, %f190, %f191; mul.f32 %f193, %f191, 0f3F000000; 
neg.f32 %f194, %f192; fma.rn.f32 %f195, %f194, %f193, %f189; fma.rn.f32 %f196, %f192, %f195, %f192; setp.eq.f32 %p28, %f187, 0f3F800000; selp.f32 %f197, 0f00000000, %f196, %p28; setp.gt.f32 %p29, %f187, 0f3F0F5C29; selp.f32 %f198, %f197, %f187, %p29; mov.b32 %r96, %f198; mov.b32 %r97, %f186; and.b32 %r98, %r97, -2147483648; or.b32 %r99, %r98, %r96; mov.b32 %f199, %r99; mul.f32 %f200, %f199, %f199; mov.f32 %f201, 0f3C8B1ABB; mov.f32 %f202, 0f3D10ECEF; fma.rn.f32 %f203, %f202, %f200, %f201; mov.f32 %f204, 0f3CFC028C; fma.rn.f32 %f205, %f203, %f200, %f204; mov.f32 %f206, 0f3D372139; fma.rn.f32 %f207, %f205, %f200, %f206; mov.f32 %f208, 0f3D9993DB; fma.rn.f32 %f209, %f207, %f200, %f208; mov.f32 %f210, 0f3E2AAAC6; fma.rn.f32 %f211, %f209, %f200, %f210; mul.f32 %f212, %f211, %f200; fma.rn.f32 %f213, %f212, %f199, %f199; neg.f32 %f214, %f213; selp.f32 %f215, %f213, %f214, %p29; mov.f32 %f216, 0f3FD774EB; mov.f32 %f217, 0f3F6EE581; fma.rn.f32 %f218, %f217, %f216, %f215; setp.gt.f32 %p30, %f186, 0f3F0F5C29; selp.f32 %f219, %f213, %f218, %p30; add.f32 %f220, %f219, %f219; selp.f32 %f221, %f220, %f219, %p29; max.f32 %f313, %f313, %f221; $L__BB0_22: setp.eq.s32 %p31, %r32, 1; @%p31 bra $L__BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r22, %r3, -1; @%p32 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: max.s32 %r133, %r22, 0; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r100, %r22, %r32; add.s32 %r101, %r100, %r32; rem.s32 %r133, %r101, %r32; $L__BB0_26: mad.lo.s32 %r102, %r133, %r31, %r2; mad.lo.s32 %r103, %r102, %r30, %r1; cvt.s64.s32 %rd50, %r103; mul.wide.s32 %rd51, %r103, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f222, [%rd54]; ld.global.nc.f32 %f223, [%rd52]; ld.global.nc.f32 %f224, [%rd53]; mul.f32 %f225, %f224, %f224; fma.rn.f32 %f226, %f223, %f223, %f225; fma.rn.f32 %f227, %f222, %f222, %f226; setp.eq.f32 %p33, %f227, 0f00000000; selp.f32 %f29, %f3, %f222, %p33; selp.f32 %f28, %f2, %f224, 
%p33; selp.f32 %f27, %f1, %f223, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs26, [%rd55]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r104, %rs30; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; cvt.u32.u16 %r108, %rs29; add.s32 %r109, %r107, %r108; mul.wide.s32 %rd56, %r109, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f228, [%rd57]; setp.eq.f32 %p34, %f228, 0f00000000; @%p34 bra $L__BB0_28; mul.f32 %f229, %f2, %f28; fma.rn.f32 %f230, %f1, %f27, %f229; fma.rn.f32 %f231, %f3, %f29, %f230; abs.f32 %f232, %f231; neg.f32 %f233, %f232; mov.f32 %f234, 0f3F000000; fma.rn.f32 %f235, %f234, %f233, %f234; rsqrt.approx.ftz.f32 %f236, %f235; mul.f32 %f237, %f235, %f236; mul.f32 %f238, %f236, 0f3F000000; neg.f32 %f239, %f237; fma.rn.f32 %f240, %f239, %f238, %f234; fma.rn.f32 %f241, %f237, %f240, %f237; setp.eq.f32 %p35, %f232, 0f3F800000; selp.f32 %f242, 0f00000000, %f241, %p35; setp.gt.f32 %p36, %f232, 0f3F0F5C29; selp.f32 %f243, %f242, %f232, %p36; mov.b32 %r110, %f243; mov.b32 %r111, %f231; and.b32 %r112, %r111, -2147483648; or.b32 %r113, %r112, %r110; mov.b32 %f244, %r113; mul.f32 %f245, %f244, %f244; mov.f32 %f246, 0f3C8B1ABB; mov.f32 %f247, 0f3D10ECEF; fma.rn.f32 %f248, %f247, %f245, %f246; mov.f32 %f249, 0f3CFC028C; fma.rn.f32 %f250, %f248, %f245, %f249; mov.f32 %f251, 0f3D372139; fma.rn.f32 %f252, %f250, %f245, %f251; mov.f32 %f253, 0f3D9993DB; fma.rn.f32 %f254, %f252, %f245, %f253; mov.f32 %f255, 0f3E2AAAC6; fma.rn.f32 %f256, %f254, %f245, %f255; mul.f32 %f257, %f256, %f245; fma.rn.f32 %f258, %f257, %f244, %f244; neg.f32 %f259, %f258; selp.f32 %f260, %f258, %f259, %p36; mov.f32 %f261, 0f3FD774EB; mov.f32 %f262, 0f3F6EE581; fma.rn.f32 %f263, %f262, %f261, %f260; setp.gt.f32 %p37, %f231, 0f3F0F5C29; selp.f32 %f264, %f258, %f263, %p37; add.f32 %f265, %f264, %f264; selp.f32 %f266, %f265, %f264, %p36; max.f32 %f313, %f313, %f266; $L__BB0_28: add.s32 %r26, %r3, 1; @%p32 bra $L__BB0_30; bra.uni 
$L__BB0_29; $L__BB0_30: add.s32 %r116, %r32, -1; min.s32 %r134, %r26, %r116; bra.uni $L__BB0_31; $L__BB0_29: rem.s32 %r114, %r26, %r32; add.s32 %r115, %r114, %r32; rem.s32 %r134, %r115, %r32; $L__BB0_31: mad.lo.s32 %r117, %r134, %r31, %r2; mad.lo.s32 %r118, %r117, %r30, %r1; cvt.s64.s32 %rd58, %r118; mul.wide.s32 %rd59, %r118, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f267, [%rd62]; ld.global.nc.f32 %f268, [%rd60]; ld.global.nc.f32 %f269, [%rd61]; mul.f32 %f270, %f269, %f269; fma.rn.f32 %f271, %f268, %f268, %f270; fma.rn.f32 %f272, %f267, %f267, %f271; setp.eq.f32 %p39, %f272, 0f00000000; selp.f32 %f34, %f3, %f267, %p39; selp.f32 %f33, %f2, %f269, %p39; selp.f32 %f32, %f1, %f268, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs31, [%rd63]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r119, %rs35; add.s32 %r120, %r119, 1; mul.lo.s32 %r121, %r120, %r119; shr.u32 %r122, %r121, 1; cvt.u32.u16 %r123, %rs34; add.s32 %r124, %r122, %r123; mul.wide.s32 %rd64, %r124, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f273, [%rd65]; setp.eq.f32 %p40, %f273, 0f00000000; @%p40 bra $L__BB0_33; mul.f32 %f274, %f2, %f33; fma.rn.f32 %f275, %f1, %f32, %f274; fma.rn.f32 %f276, %f3, %f34, %f275; abs.f32 %f277, %f276; neg.f32 %f278, %f277; mov.f32 %f279, 0f3F000000; fma.rn.f32 %f280, %f279, %f278, %f279; rsqrt.approx.ftz.f32 %f281, %f280; mul.f32 %f282, %f280, %f281; mul.f32 %f283, %f281, 0f3F000000; neg.f32 %f284, %f282; fma.rn.f32 %f285, %f284, %f283, %f279; fma.rn.f32 %f286, %f282, %f285, %f282; setp.eq.f32 %p41, %f277, 0f3F800000; selp.f32 %f287, 0f00000000, %f286, %p41; setp.gt.f32 %p42, %f277, 0f3F0F5C29; selp.f32 %f288, %f287, %f277, %p42; mov.b32 %r125, %f288; mov.b32 %r126, %f276; and.b32 %r127, %r126, -2147483648; or.b32 %r128, %r127, %r125; mov.b32 %f289, %r128; mul.f32 %f290, %f289, %f289; mov.f32 %f291, 0f3C8B1ABB; mov.f32 %f292, 0f3D10ECEF; fma.rn.f32 %f293, %f292, %f290, %f291; 
mov.f32 %f294, 0f3CFC028C; fma.rn.f32 %f295, %f293, %f290, %f294; mov.f32 %f296, 0f3D372139; fma.rn.f32 %f297, %f295, %f290, %f296; mov.f32 %f298, 0f3D9993DB; fma.rn.f32 %f299, %f297, %f290, %f298; mov.f32 %f300, 0f3E2AAAC6; fma.rn.f32 %f301, %f299, %f290, %f300; mul.f32 %f302, %f301, %f290; fma.rn.f32 %f303, %f302, %f289, %f289; neg.f32 %f304, %f303; selp.f32 %f305, %f303, %f304, %p42; mov.f32 %f306, 0f3FD774EB; mov.f32 %f307, 0f3F6EE581; fma.rn.f32 %f308, %f307, %f306, %f305; setp.gt.f32 %p43, %f276, 0f3F0F5C29; selp.f32 %f309, %f303, %f308, %p43; add.f32 %f310, %f309, %f309; selp.f32 %f311, %f310, %f309, %p42; max.f32 %f313, %f313, %f311; $L__BB0_33: cvta.to.global.u64 %rd66, %rd7; shl.b64 %rd67, %rd6, 2; add.s64 %rd68, %rd66, %rd67; st.global.f32 [%rd68], %f313; $L__BB0_34: ret; } ` setmaxangle_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<36>; .reg .f32 %f<318>; .reg .b32 %r<135>; .reg .b64 %rd<69>; ld.param.u8 %rs5, [setmaxangle_param_9]; ld.param.u64 %rd7, [setmaxangle_param_0]; ld.param.u64 %rd8, [setmaxangle_param_1]; ld.param.u64 %rd9, [setmaxangle_param_2]; ld.param.u64 %rd10, [setmaxangle_param_3]; ld.param.u64 %rd11, [setmaxangle_param_4]; ld.param.u64 %rd12, [setmaxangle_param_5]; ld.param.u32 %r30, [setmaxangle_param_6]; ld.param.u32 %r31, [setmaxangle_param_7]; ld.param.u32 %r32, [setmaxangle_param_8]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd3, %rd10; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; 
mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_34; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd13, %r43, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd15, %rd4, %rd13; add.s64 %rd16, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; ld.global.nc.f32 %f3, [%rd16]; mul.f32 %f37, %f2, %f2; fma.rn.f32 %f38, %f1, %f1, %f37; fma.rn.f32 %f39, %f3, %f3, %f38; setp.eq.f32 %p6, %f39, 0f00000000; @%p6 bra $L__BB0_34; add.s64 %rd17, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd17]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r129, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r129, %r45, %r30; $L__BB0_5: add.s32 %r46, %r129, %r5; cvt.s64.s32 %rd18, %r46; mul.wide.s32 %rd19, %r46, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f41, [%rd22]; ld.global.nc.f32 %f42, [%rd20]; ld.global.nc.f32 %f43, [%rd21]; mul.f32 %f44, %f43, %f43; fma.rn.f32 %f45, %f42, %f42, %f44; fma.rn.f32 %f46, %f41, %f41, %f45; setp.eq.f32 %p8, %f46, 0f00000000; mov.f32 %f313, 0f00000000; selp.f32 %f9, %f3, %f41, %p8; selp.f32 %f8, %f2, %f43, %p8; selp.f32 %f7, %f1, %f42, %p8; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd24, %r52, 4; add.s64 
%rd25, %rd1, %rd24; ld.global.nc.f32 %f47, [%rd25]; setp.eq.f32 %p9, %f47, 0f00000000; @%p9 bra $L__BB0_7; mul.f32 %f48, %f2, %f8; fma.rn.f32 %f49, %f1, %f7, %f48; fma.rn.f32 %f50, %f3, %f9, %f49; abs.f32 %f51, %f50; neg.f32 %f52, %f51; mov.f32 %f53, 0f3F000000; fma.rn.f32 %f54, %f53, %f52, %f53; rsqrt.approx.ftz.f32 %f55, %f54; mul.f32 %f56, %f54, %f55; mul.f32 %f57, %f55, 0f3F000000; neg.f32 %f58, %f56; fma.rn.f32 %f59, %f58, %f57, %f53; fma.rn.f32 %f60, %f56, %f59, %f56; setp.eq.f32 %p10, %f51, 0f3F800000; selp.f32 %f61, 0f00000000, %f60, %p10; mov.f32 %f62, 0f00000000; setp.gt.f32 %p11, %f51, 0f3F0F5C29; selp.f32 %f63, %f61, %f51, %p11; mov.b32 %r53, %f63; mov.b32 %r54, %f50; and.b32 %r55, %r54, -2147483648; or.b32 %r56, %r55, %r53; mov.b32 %f64, %r56; mul.f32 %f65, %f64, %f64; mov.f32 %f66, 0f3C8B1ABB; mov.f32 %f67, 0f3D10ECEF; fma.rn.f32 %f68, %f67, %f65, %f66; mov.f32 %f69, 0f3CFC028C; fma.rn.f32 %f70, %f68, %f65, %f69; mov.f32 %f71, 0f3D372139; fma.rn.f32 %f72, %f70, %f65, %f71; mov.f32 %f73, 0f3D9993DB; fma.rn.f32 %f74, %f72, %f65, %f73; mov.f32 %f75, 0f3E2AAAC6; fma.rn.f32 %f76, %f74, %f65, %f75; mul.f32 %f77, %f76, %f65; fma.rn.f32 %f78, %f77, %f64, %f64; neg.f32 %f79, %f78; selp.f32 %f80, %f78, %f79, %p11; mov.f32 %f81, 0f3FD774EB; mov.f32 %f82, 0f3F6EE581; fma.rn.f32 %f83, %f82, %f81, %f80; setp.gt.f32 %p12, %f50, 0f3F0F5C29; selp.f32 %f84, %f78, %f83, %p12; add.f32 %f85, %f84, %f84; selp.f32 %f86, %f85, %f84, %p11; max.f32 %f313, %f62, %f86; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r59, %r30, -1; min.s32 %r130, %r10, %r59; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r30; add.s32 %r58, %r57, %r30; rem.s32 %r130, %r58, %r30; $L__BB0_10: add.s32 %r60, %r130, %r5; cvt.s64.s32 %rd26, %r60; mul.wide.s32 %rd27, %r60, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f87, [%rd30]; ld.global.nc.f32 %f88, [%rd28]; ld.global.nc.f32 %f89, [%rd29]; 
mul.f32 %f90, %f89, %f89; fma.rn.f32 %f91, %f88, %f88, %f90; fma.rn.f32 %f92, %f87, %f87, %f91; setp.eq.f32 %p14, %f92, 0f00000000; selp.f32 %f14, %f3, %f87, %p14; selp.f32 %f13, %f2, %f89, %p14; selp.f32 %f12, %f1, %f88, %p14; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs11, [%rd31]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r61, %rs15; add.s32 %r62, %r61, 1; mul.lo.s32 %r63, %r62, %r61; shr.u32 %r64, %r63, 1; cvt.u32.u16 %r65, %rs14; add.s32 %r66, %r64, %r65; mul.wide.s32 %rd32, %r66, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f93, [%rd33]; setp.eq.f32 %p15, %f93, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f94, %f2, %f13; fma.rn.f32 %f95, %f1, %f12, %f94; fma.rn.f32 %f96, %f3, %f14, %f95; abs.f32 %f97, %f96; neg.f32 %f98, %f97; mov.f32 %f99, 0f3F000000; fma.rn.f32 %f100, %f99, %f98, %f99; rsqrt.approx.ftz.f32 %f101, %f100; mul.f32 %f102, %f100, %f101; mul.f32 %f103, %f101, 0f3F000000; neg.f32 %f104, %f102; fma.rn.f32 %f105, %f104, %f103, %f99; fma.rn.f32 %f106, %f102, %f105, %f102; setp.eq.f32 %p16, %f97, 0f3F800000; selp.f32 %f107, 0f00000000, %f106, %p16; setp.gt.f32 %p17, %f97, 0f3F0F5C29; selp.f32 %f108, %f107, %f97, %p17; mov.b32 %r67, %f108; mov.b32 %r68, %f96; and.b32 %r69, %r68, -2147483648; or.b32 %r70, %r69, %r67; mov.b32 %f109, %r70; mul.f32 %f110, %f109, %f109; mov.f32 %f111, 0f3C8B1ABB; mov.f32 %f112, 0f3D10ECEF; fma.rn.f32 %f113, %f112, %f110, %f111; mov.f32 %f114, 0f3CFC028C; fma.rn.f32 %f115, %f113, %f110, %f114; mov.f32 %f116, 0f3D372139; fma.rn.f32 %f117, %f115, %f110, %f116; mov.f32 %f118, 0f3D9993DB; fma.rn.f32 %f119, %f117, %f110, %f118; mov.f32 %f120, 0f3E2AAAC6; fma.rn.f32 %f121, %f119, %f110, %f120; mul.f32 %f122, %f121, %f110; fma.rn.f32 %f123, %f122, %f109, %f109; neg.f32 %f124, %f123; selp.f32 %f125, %f123, %f124, %p17; mov.f32 %f126, 0f3FD774EB; mov.f32 %f127, 0f3F6EE581; fma.rn.f32 %f128, %f127, %f126, %f125; setp.gt.f32 %p18, %f96, 0f3F0F5C29; selp.f32 %f129, %f123, %f128, %p18; add.f32 %f130, 
%f129, %f129; selp.f32 %f131, %f130, %f129, %p17; max.f32 %f313, %f313, %f131; $L__BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r14, %r2, -1; @%p19 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: max.s32 %r131, %r14, 0; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r71, %r14, %r31; add.s32 %r72, %r71, %r31; rem.s32 %r131, %r72, %r31; $L__BB0_15: add.s32 %r73, %r131, %r4; mad.lo.s32 %r74, %r73, %r30, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f132, [%rd38]; ld.global.nc.f32 %f133, [%rd36]; ld.global.nc.f32 %f134, [%rd37]; mul.f32 %f135, %f134, %f134; fma.rn.f32 %f136, %f133, %f133, %f135; fma.rn.f32 %f137, %f132, %f132, %f136; setp.eq.f32 %p20, %f137, 0f00000000; selp.f32 %f19, %f3, %f132, %p20; selp.f32 %f18, %f2, %f134, %p20; selp.f32 %f17, %f1, %f133, %p20; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs16, [%rd39]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r75, %rs20; add.s32 %r76, %r75, 1; mul.lo.s32 %r77, %r76, %r75; shr.u32 %r78, %r77, 1; cvt.u32.u16 %r79, %rs19; add.s32 %r80, %r78, %r79; mul.wide.s32 %rd40, %r80, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f138, [%rd41]; setp.eq.f32 %p21, %f138, 0f00000000; @%p21 bra $L__BB0_17; mul.f32 %f139, %f2, %f18; fma.rn.f32 %f140, %f1, %f17, %f139; fma.rn.f32 %f141, %f3, %f19, %f140; abs.f32 %f142, %f141; neg.f32 %f143, %f142; mov.f32 %f144, 0f3F000000; fma.rn.f32 %f145, %f144, %f143, %f144; rsqrt.approx.ftz.f32 %f146, %f145; mul.f32 %f147, %f145, %f146; mul.f32 %f148, %f146, 0f3F000000; neg.f32 %f149, %f147; fma.rn.f32 %f150, %f149, %f148, %f144; fma.rn.f32 %f151, %f147, %f150, %f147; setp.eq.f32 %p22, %f142, 0f3F800000; selp.f32 %f152, 0f00000000, %f151, %p22; setp.gt.f32 %p23, %f142, 0f3F0F5C29; selp.f32 %f153, %f152, %f142, %p23; mov.b32 %r81, %f153; mov.b32 %r82, %f141; and.b32 %r83, %r82, -2147483648; or.b32 %r84, %r83, %r81; mov.b32 %f154, %r84; 
mul.f32 %f155, %f154, %f154; mov.f32 %f156, 0f3C8B1ABB; mov.f32 %f157, 0f3D10ECEF; fma.rn.f32 %f158, %f157, %f155, %f156; mov.f32 %f159, 0f3CFC028C; fma.rn.f32 %f160, %f158, %f155, %f159; mov.f32 %f161, 0f3D372139; fma.rn.f32 %f162, %f160, %f155, %f161; mov.f32 %f163, 0f3D9993DB; fma.rn.f32 %f164, %f162, %f155, %f163; mov.f32 %f165, 0f3E2AAAC6; fma.rn.f32 %f166, %f164, %f155, %f165; mul.f32 %f167, %f166, %f155; fma.rn.f32 %f168, %f167, %f154, %f154; neg.f32 %f169, %f168; selp.f32 %f170, %f168, %f169, %p23; mov.f32 %f171, 0f3FD774EB; mov.f32 %f172, 0f3F6EE581; fma.rn.f32 %f173, %f172, %f171, %f170; setp.gt.f32 %p24, %f141, 0f3F0F5C29; selp.f32 %f174, %f168, %f173, %p24; add.f32 %f175, %f174, %f174; selp.f32 %f176, %f175, %f174, %p23; max.f32 %f313, %f313, %f176; $L__BB0_17: add.s32 %r18, %r2, 1; @%p19 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r87, %r31, -1; min.s32 %r132, %r18, %r87; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r85, %r18, %r31; add.s32 %r86, %r85, %r31; rem.s32 %r132, %r86, %r31; $L__BB0_20: add.s32 %r88, %r132, %r4; mad.lo.s32 %r89, %r88, %r30, %r1; cvt.s64.s32 %rd42, %r89; mul.wide.s32 %rd43, %r89, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f177, [%rd46]; ld.global.nc.f32 %f178, [%rd44]; ld.global.nc.f32 %f179, [%rd45]; mul.f32 %f180, %f179, %f179; fma.rn.f32 %f181, %f178, %f178, %f180; fma.rn.f32 %f182, %f177, %f177, %f181; setp.eq.f32 %p26, %f182, 0f00000000; selp.f32 %f24, %f3, %f177, %p26; selp.f32 %f23, %f2, %f179, %p26; selp.f32 %f22, %f1, %f178, %p26; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs21, [%rd47]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r90, %rs25; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; cvt.u32.u16 %r94, %rs24; add.s32 %r95, %r93, %r94; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f183, [%rd49]; setp.eq.f32 %p27, %f183, 0f00000000; @%p27 bra $L__BB0_22; mul.f32 
%f184, %f2, %f23; fma.rn.f32 %f185, %f1, %f22, %f184; fma.rn.f32 %f186, %f3, %f24, %f185; abs.f32 %f187, %f186; neg.f32 %f188, %f187; mov.f32 %f189, 0f3F000000; fma.rn.f32 %f190, %f189, %f188, %f189; rsqrt.approx.ftz.f32 %f191, %f190; mul.f32 %f192, %f190, %f191; mul.f32 %f193, %f191, 0f3F000000; neg.f32 %f194, %f192; fma.rn.f32 %f195, %f194, %f193, %f189; fma.rn.f32 %f196, %f192, %f195, %f192; setp.eq.f32 %p28, %f187, 0f3F800000; selp.f32 %f197, 0f00000000, %f196, %p28; setp.gt.f32 %p29, %f187, 0f3F0F5C29; selp.f32 %f198, %f197, %f187, %p29; mov.b32 %r96, %f198; mov.b32 %r97, %f186; and.b32 %r98, %r97, -2147483648; or.b32 %r99, %r98, %r96; mov.b32 %f199, %r99; mul.f32 %f200, %f199, %f199; mov.f32 %f201, 0f3C8B1ABB; mov.f32 %f202, 0f3D10ECEF; fma.rn.f32 %f203, %f202, %f200, %f201; mov.f32 %f204, 0f3CFC028C; fma.rn.f32 %f205, %f203, %f200, %f204; mov.f32 %f206, 0f3D372139; fma.rn.f32 %f207, %f205, %f200, %f206; mov.f32 %f208, 0f3D9993DB; fma.rn.f32 %f209, %f207, %f200, %f208; mov.f32 %f210, 0f3E2AAAC6; fma.rn.f32 %f211, %f209, %f200, %f210; mul.f32 %f212, %f211, %f200; fma.rn.f32 %f213, %f212, %f199, %f199; neg.f32 %f214, %f213; selp.f32 %f215, %f213, %f214, %p29; mov.f32 %f216, 0f3FD774EB; mov.f32 %f217, 0f3F6EE581; fma.rn.f32 %f218, %f217, %f216, %f215; setp.gt.f32 %p30, %f186, 0f3F0F5C29; selp.f32 %f219, %f213, %f218, %p30; add.f32 %f220, %f219, %f219; selp.f32 %f221, %f220, %f219, %p29; max.f32 %f313, %f313, %f221; $L__BB0_22: setp.eq.s32 %p31, %r32, 1; @%p31 bra $L__BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r22, %r3, -1; @%p32 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: max.s32 %r133, %r22, 0; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r100, %r22, %r32; add.s32 %r101, %r100, %r32; rem.s32 %r133, %r101, %r32; $L__BB0_26: mad.lo.s32 %r102, %r133, %r31, %r2; mad.lo.s32 %r103, %r102, %r30, %r1; cvt.s64.s32 %rd50, %r103; mul.wide.s32 %rd51, %r103, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; 
ld.global.nc.f32 %f222, [%rd54]; ld.global.nc.f32 %f223, [%rd52]; ld.global.nc.f32 %f224, [%rd53]; mul.f32 %f225, %f224, %f224; fma.rn.f32 %f226, %f223, %f223, %f225; fma.rn.f32 %f227, %f222, %f222, %f226; setp.eq.f32 %p33, %f227, 0f00000000; selp.f32 %f29, %f3, %f222, %p33; selp.f32 %f28, %f2, %f224, %p33; selp.f32 %f27, %f1, %f223, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs26, [%rd55]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r104, %rs30; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; cvt.u32.u16 %r108, %rs29; add.s32 %r109, %r107, %r108; mul.wide.s32 %rd56, %r109, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f228, [%rd57]; setp.eq.f32 %p34, %f228, 0f00000000; @%p34 bra $L__BB0_28; mul.f32 %f229, %f2, %f28; fma.rn.f32 %f230, %f1, %f27, %f229; fma.rn.f32 %f231, %f3, %f29, %f230; abs.f32 %f232, %f231; neg.f32 %f233, %f232; mov.f32 %f234, 0f3F000000; fma.rn.f32 %f235, %f234, %f233, %f234; rsqrt.approx.ftz.f32 %f236, %f235; mul.f32 %f237, %f235, %f236; mul.f32 %f238, %f236, 0f3F000000; neg.f32 %f239, %f237; fma.rn.f32 %f240, %f239, %f238, %f234; fma.rn.f32 %f241, %f237, %f240, %f237; setp.eq.f32 %p35, %f232, 0f3F800000; selp.f32 %f242, 0f00000000, %f241, %p35; setp.gt.f32 %p36, %f232, 0f3F0F5C29; selp.f32 %f243, %f242, %f232, %p36; mov.b32 %r110, %f243; mov.b32 %r111, %f231; and.b32 %r112, %r111, -2147483648; or.b32 %r113, %r112, %r110; mov.b32 %f244, %r113; mul.f32 %f245, %f244, %f244; mov.f32 %f246, 0f3C8B1ABB; mov.f32 %f247, 0f3D10ECEF; fma.rn.f32 %f248, %f247, %f245, %f246; mov.f32 %f249, 0f3CFC028C; fma.rn.f32 %f250, %f248, %f245, %f249; mov.f32 %f251, 0f3D372139; fma.rn.f32 %f252, %f250, %f245, %f251; mov.f32 %f253, 0f3D9993DB; fma.rn.f32 %f254, %f252, %f245, %f253; mov.f32 %f255, 0f3E2AAAC6; fma.rn.f32 %f256, %f254, %f245, %f255; mul.f32 %f257, %f256, %f245; fma.rn.f32 %f258, %f257, %f244, %f244; neg.f32 %f259, %f258; selp.f32 %f260, %f258, %f259, %p36; mov.f32 %f261, 0f3FD774EB; 
mov.f32 %f262, 0f3F6EE581; fma.rn.f32 %f263, %f262, %f261, %f260; setp.gt.f32 %p37, %f231, 0f3F0F5C29; selp.f32 %f264, %f258, %f263, %p37; add.f32 %f265, %f264, %f264; selp.f32 %f266, %f265, %f264, %p36; max.f32 %f313, %f313, %f266; $L__BB0_28: add.s32 %r26, %r3, 1; @%p32 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: add.s32 %r116, %r32, -1; min.s32 %r134, %r26, %r116; bra.uni $L__BB0_31; $L__BB0_29: rem.s32 %r114, %r26, %r32; add.s32 %r115, %r114, %r32; rem.s32 %r134, %r115, %r32; $L__BB0_31: mad.lo.s32 %r117, %r134, %r31, %r2; mad.lo.s32 %r118, %r117, %r30, %r1; cvt.s64.s32 %rd58, %r118; mul.wide.s32 %rd59, %r118, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f267, [%rd62]; ld.global.nc.f32 %f268, [%rd60]; ld.global.nc.f32 %f269, [%rd61]; mul.f32 %f270, %f269, %f269; fma.rn.f32 %f271, %f268, %f268, %f270; fma.rn.f32 %f272, %f267, %f267, %f271; setp.eq.f32 %p39, %f272, 0f00000000; selp.f32 %f34, %f3, %f267, %p39; selp.f32 %f33, %f2, %f269, %p39; selp.f32 %f32, %f1, %f268, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs31, [%rd63]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r119, %rs35; add.s32 %r120, %r119, 1; mul.lo.s32 %r121, %r120, %r119; shr.u32 %r122, %r121, 1; cvt.u32.u16 %r123, %rs34; add.s32 %r124, %r122, %r123; mul.wide.s32 %rd64, %r124, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f273, [%rd65]; setp.eq.f32 %p40, %f273, 0f00000000; @%p40 bra $L__BB0_33; mul.f32 %f274, %f2, %f33; fma.rn.f32 %f275, %f1, %f32, %f274; fma.rn.f32 %f276, %f3, %f34, %f275; abs.f32 %f277, %f276; neg.f32 %f278, %f277; mov.f32 %f279, 0f3F000000; fma.rn.f32 %f280, %f279, %f278, %f279; rsqrt.approx.ftz.f32 %f281, %f280; mul.f32 %f282, %f280, %f281; mul.f32 %f283, %f281, 0f3F000000; neg.f32 %f284, %f282; fma.rn.f32 %f285, %f284, %f283, %f279; fma.rn.f32 %f286, %f282, %f285, %f282; setp.eq.f32 %p41, %f277, 0f3F800000; selp.f32 %f287, 0f00000000, %f286, %p41; setp.gt.f32 %p42, %f277, 
0f3F0F5C29; selp.f32 %f288, %f287, %f277, %p42; mov.b32 %r125, %f288; mov.b32 %r126, %f276; and.b32 %r127, %r126, -2147483648; or.b32 %r128, %r127, %r125; mov.b32 %f289, %r128; mul.f32 %f290, %f289, %f289; mov.f32 %f291, 0f3C8B1ABB; mov.f32 %f292, 0f3D10ECEF; fma.rn.f32 %f293, %f292, %f290, %f291; mov.f32 %f294, 0f3CFC028C; fma.rn.f32 %f295, %f293, %f290, %f294; mov.f32 %f296, 0f3D372139; fma.rn.f32 %f297, %f295, %f290, %f296; mov.f32 %f298, 0f3D9993DB; fma.rn.f32 %f299, %f297, %f290, %f298; mov.f32 %f300, 0f3E2AAAC6; fma.rn.f32 %f301, %f299, %f290, %f300; mul.f32 %f302, %f301, %f290; fma.rn.f32 %f303, %f302, %f289, %f289; neg.f32 %f304, %f303; selp.f32 %f305, %f303, %f304, %p42; mov.f32 %f306, 0f3FD774EB; mov.f32 %f307, 0f3F6EE581; fma.rn.f32 %f308, %f307, %f306, %f305; setp.gt.f32 %p43, %f276, 0f3F0F5C29; selp.f32 %f309, %f303, %f308, %p43; add.f32 %f310, %f309, %f309; selp.f32 %f311, %f310, %f309, %p42; max.f32 %f313, %f313, %f311; $L__BB0_33: cvta.to.global.u64 %rd66, %rd7; shl.b64 %rd67, %rd6, 2; add.s64 %rd68, %rd66, %rd67; st.global.f32 [%rd68], %f313; $L__BB0_34: ret; } ` setmaxangle_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<36>; .reg .f32 %f<318>; .reg .b32 %r<135>; .reg .b64 %rd<69>; ld.param.u8 %rs5, [setmaxangle_param_9]; ld.param.u64 %rd7, [setmaxangle_param_0]; ld.param.u64 %rd8, [setmaxangle_param_1]; ld.param.u64 %rd9, [setmaxangle_param_2]; ld.param.u64 %rd10, [setmaxangle_param_3]; ld.param.u64 %rd11, [setmaxangle_param_4]; ld.param.u64 %rd12, [setmaxangle_param_5]; ld.param.u32 %r30, [setmaxangle_param_6]; ld.param.u32 %r31, 
[setmaxangle_param_7]; ld.param.u32 %r32, [setmaxangle_param_8]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd3, %rd10; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_34; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd13, %r43, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd15, %rd4, %rd13; add.s64 %rd16, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; ld.global.nc.f32 %f3, [%rd16]; mul.f32 %f37, %f2, %f2; fma.rn.f32 %f38, %f1, %f1, %f37; fma.rn.f32 %f39, %f3, %f3, %f38; setp.eq.f32 %p6, %f39, 0f00000000; @%p6 bra $L__BB0_34; add.s64 %rd17, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd17]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r129, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r129, %r45, %r30; $L__BB0_5: add.s32 %r46, %r129, %r5; cvt.s64.s32 %rd18, %r46; mul.wide.s32 %rd19, %r46, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f41, [%rd22]; ld.global.nc.f32 %f42, [%rd20]; ld.global.nc.f32 %f43, [%rd21]; mul.f32 %f44, %f43, %f43; fma.rn.f32 %f45, %f42, %f42, %f44; fma.rn.f32 %f46, %f41, %f41, %f45; setp.eq.f32 %p8, %f46, 0f00000000; mov.f32 %f313, 0f00000000; selp.f32 %f9, %f3, %f41, %p8; selp.f32 %f8, %f2, %f43, %p8; selp.f32 %f7, %f1, %f42, %p8; add.s64 
%rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd24, %r52, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f47, [%rd25]; setp.eq.f32 %p9, %f47, 0f00000000; @%p9 bra $L__BB0_7; mul.f32 %f48, %f2, %f8; fma.rn.f32 %f49, %f1, %f7, %f48; fma.rn.f32 %f50, %f3, %f9, %f49; abs.f32 %f51, %f50; neg.f32 %f52, %f51; mov.f32 %f53, 0f3F000000; fma.rn.f32 %f54, %f53, %f52, %f53; rsqrt.approx.ftz.f32 %f55, %f54; mul.f32 %f56, %f54, %f55; mul.f32 %f57, %f55, 0f3F000000; neg.f32 %f58, %f56; fma.rn.f32 %f59, %f58, %f57, %f53; fma.rn.f32 %f60, %f56, %f59, %f56; setp.eq.f32 %p10, %f51, 0f3F800000; selp.f32 %f61, 0f00000000, %f60, %p10; mov.f32 %f62, 0f00000000; setp.gt.f32 %p11, %f51, 0f3F0F5C29; selp.f32 %f63, %f61, %f51, %p11; mov.b32 %r53, %f63; mov.b32 %r54, %f50; and.b32 %r55, %r54, -2147483648; or.b32 %r56, %r55, %r53; mov.b32 %f64, %r56; mul.f32 %f65, %f64, %f64; mov.f32 %f66, 0f3C8B1ABB; mov.f32 %f67, 0f3D10ECEF; fma.rn.f32 %f68, %f67, %f65, %f66; mov.f32 %f69, 0f3CFC028C; fma.rn.f32 %f70, %f68, %f65, %f69; mov.f32 %f71, 0f3D372139; fma.rn.f32 %f72, %f70, %f65, %f71; mov.f32 %f73, 0f3D9993DB; fma.rn.f32 %f74, %f72, %f65, %f73; mov.f32 %f75, 0f3E2AAAC6; fma.rn.f32 %f76, %f74, %f65, %f75; mul.f32 %f77, %f76, %f65; fma.rn.f32 %f78, %f77, %f64, %f64; neg.f32 %f79, %f78; selp.f32 %f80, %f78, %f79, %p11; mov.f32 %f81, 0f3FD774EB; mov.f32 %f82, 0f3F6EE581; fma.rn.f32 %f83, %f82, %f81, %f80; setp.gt.f32 %p12, %f50, 0f3F0F5C29; selp.f32 %f84, %f78, %f83, %p12; add.f32 %f85, %f84, %f84; selp.f32 %f86, %f85, %f84, %p11; max.f32 %f313, %f62, %f86; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r59, %r30, -1; min.s32 %r130, %r10, %r59; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r30; add.s32 %r58, %r57, %r30; rem.s32 
%r130, %r58, %r30; $L__BB0_10: add.s32 %r60, %r130, %r5; cvt.s64.s32 %rd26, %r60; mul.wide.s32 %rd27, %r60, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f87, [%rd30]; ld.global.nc.f32 %f88, [%rd28]; ld.global.nc.f32 %f89, [%rd29]; mul.f32 %f90, %f89, %f89; fma.rn.f32 %f91, %f88, %f88, %f90; fma.rn.f32 %f92, %f87, %f87, %f91; setp.eq.f32 %p14, %f92, 0f00000000; selp.f32 %f14, %f3, %f87, %p14; selp.f32 %f13, %f2, %f89, %p14; selp.f32 %f12, %f1, %f88, %p14; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs11, [%rd31]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r61, %rs15; add.s32 %r62, %r61, 1; mul.lo.s32 %r63, %r62, %r61; shr.u32 %r64, %r63, 1; cvt.u32.u16 %r65, %rs14; add.s32 %r66, %r64, %r65; mul.wide.s32 %rd32, %r66, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f93, [%rd33]; setp.eq.f32 %p15, %f93, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f94, %f2, %f13; fma.rn.f32 %f95, %f1, %f12, %f94; fma.rn.f32 %f96, %f3, %f14, %f95; abs.f32 %f97, %f96; neg.f32 %f98, %f97; mov.f32 %f99, 0f3F000000; fma.rn.f32 %f100, %f99, %f98, %f99; rsqrt.approx.ftz.f32 %f101, %f100; mul.f32 %f102, %f100, %f101; mul.f32 %f103, %f101, 0f3F000000; neg.f32 %f104, %f102; fma.rn.f32 %f105, %f104, %f103, %f99; fma.rn.f32 %f106, %f102, %f105, %f102; setp.eq.f32 %p16, %f97, 0f3F800000; selp.f32 %f107, 0f00000000, %f106, %p16; setp.gt.f32 %p17, %f97, 0f3F0F5C29; selp.f32 %f108, %f107, %f97, %p17; mov.b32 %r67, %f108; mov.b32 %r68, %f96; and.b32 %r69, %r68, -2147483648; or.b32 %r70, %r69, %r67; mov.b32 %f109, %r70; mul.f32 %f110, %f109, %f109; mov.f32 %f111, 0f3C8B1ABB; mov.f32 %f112, 0f3D10ECEF; fma.rn.f32 %f113, %f112, %f110, %f111; mov.f32 %f114, 0f3CFC028C; fma.rn.f32 %f115, %f113, %f110, %f114; mov.f32 %f116, 0f3D372139; fma.rn.f32 %f117, %f115, %f110, %f116; mov.f32 %f118, 0f3D9993DB; fma.rn.f32 %f119, %f117, %f110, %f118; mov.f32 %f120, 0f3E2AAAC6; fma.rn.f32 %f121, %f119, %f110, %f120; mul.f32 %f122, 
%f121, %f110; fma.rn.f32 %f123, %f122, %f109, %f109; neg.f32 %f124, %f123; selp.f32 %f125, %f123, %f124, %p17; mov.f32 %f126, 0f3FD774EB; mov.f32 %f127, 0f3F6EE581; fma.rn.f32 %f128, %f127, %f126, %f125; setp.gt.f32 %p18, %f96, 0f3F0F5C29; selp.f32 %f129, %f123, %f128, %p18; add.f32 %f130, %f129, %f129; selp.f32 %f131, %f130, %f129, %p17; max.f32 %f313, %f313, %f131; $L__BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r14, %r2, -1; @%p19 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: max.s32 %r131, %r14, 0; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r71, %r14, %r31; add.s32 %r72, %r71, %r31; rem.s32 %r131, %r72, %r31; $L__BB0_15: add.s32 %r73, %r131, %r4; mad.lo.s32 %r74, %r73, %r30, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f132, [%rd38]; ld.global.nc.f32 %f133, [%rd36]; ld.global.nc.f32 %f134, [%rd37]; mul.f32 %f135, %f134, %f134; fma.rn.f32 %f136, %f133, %f133, %f135; fma.rn.f32 %f137, %f132, %f132, %f136; setp.eq.f32 %p20, %f137, 0f00000000; selp.f32 %f19, %f3, %f132, %p20; selp.f32 %f18, %f2, %f134, %p20; selp.f32 %f17, %f1, %f133, %p20; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs16, [%rd39]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r75, %rs20; add.s32 %r76, %r75, 1; mul.lo.s32 %r77, %r76, %r75; shr.u32 %r78, %r77, 1; cvt.u32.u16 %r79, %rs19; add.s32 %r80, %r78, %r79; mul.wide.s32 %rd40, %r80, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f138, [%rd41]; setp.eq.f32 %p21, %f138, 0f00000000; @%p21 bra $L__BB0_17; mul.f32 %f139, %f2, %f18; fma.rn.f32 %f140, %f1, %f17, %f139; fma.rn.f32 %f141, %f3, %f19, %f140; abs.f32 %f142, %f141; neg.f32 %f143, %f142; mov.f32 %f144, 0f3F000000; fma.rn.f32 %f145, %f144, %f143, %f144; rsqrt.approx.ftz.f32 %f146, %f145; mul.f32 %f147, %f145, %f146; mul.f32 %f148, %f146, 0f3F000000; neg.f32 %f149, %f147; fma.rn.f32 %f150, %f149, %f148, %f144; fma.rn.f32 %f151, 
%f147, %f150, %f147; setp.eq.f32 %p22, %f142, 0f3F800000; selp.f32 %f152, 0f00000000, %f151, %p22; setp.gt.f32 %p23, %f142, 0f3F0F5C29; selp.f32 %f153, %f152, %f142, %p23; mov.b32 %r81, %f153; mov.b32 %r82, %f141; and.b32 %r83, %r82, -2147483648; or.b32 %r84, %r83, %r81; mov.b32 %f154, %r84; mul.f32 %f155, %f154, %f154; mov.f32 %f156, 0f3C8B1ABB; mov.f32 %f157, 0f3D10ECEF; fma.rn.f32 %f158, %f157, %f155, %f156; mov.f32 %f159, 0f3CFC028C; fma.rn.f32 %f160, %f158, %f155, %f159; mov.f32 %f161, 0f3D372139; fma.rn.f32 %f162, %f160, %f155, %f161; mov.f32 %f163, 0f3D9993DB; fma.rn.f32 %f164, %f162, %f155, %f163; mov.f32 %f165, 0f3E2AAAC6; fma.rn.f32 %f166, %f164, %f155, %f165; mul.f32 %f167, %f166, %f155; fma.rn.f32 %f168, %f167, %f154, %f154; neg.f32 %f169, %f168; selp.f32 %f170, %f168, %f169, %p23; mov.f32 %f171, 0f3FD774EB; mov.f32 %f172, 0f3F6EE581; fma.rn.f32 %f173, %f172, %f171, %f170; setp.gt.f32 %p24, %f141, 0f3F0F5C29; selp.f32 %f174, %f168, %f173, %p24; add.f32 %f175, %f174, %f174; selp.f32 %f176, %f175, %f174, %p23; max.f32 %f313, %f313, %f176; $L__BB0_17: add.s32 %r18, %r2, 1; @%p19 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r87, %r31, -1; min.s32 %r132, %r18, %r87; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r85, %r18, %r31; add.s32 %r86, %r85, %r31; rem.s32 %r132, %r86, %r31; $L__BB0_20: add.s32 %r88, %r132, %r4; mad.lo.s32 %r89, %r88, %r30, %r1; cvt.s64.s32 %rd42, %r89; mul.wide.s32 %rd43, %r89, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f177, [%rd46]; ld.global.nc.f32 %f178, [%rd44]; ld.global.nc.f32 %f179, [%rd45]; mul.f32 %f180, %f179, %f179; fma.rn.f32 %f181, %f178, %f178, %f180; fma.rn.f32 %f182, %f177, %f177, %f181; setp.eq.f32 %p26, %f182, 0f00000000; selp.f32 %f24, %f3, %f177, %p26; selp.f32 %f23, %f2, %f179, %p26; selp.f32 %f22, %f1, %f178, %p26; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs21, [%rd47]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 
%r90, %rs25; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; cvt.u32.u16 %r94, %rs24; add.s32 %r95, %r93, %r94; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f183, [%rd49]; setp.eq.f32 %p27, %f183, 0f00000000; @%p27 bra $L__BB0_22; mul.f32 %f184, %f2, %f23; fma.rn.f32 %f185, %f1, %f22, %f184; fma.rn.f32 %f186, %f3, %f24, %f185; abs.f32 %f187, %f186; neg.f32 %f188, %f187; mov.f32 %f189, 0f3F000000; fma.rn.f32 %f190, %f189, %f188, %f189; rsqrt.approx.ftz.f32 %f191, %f190; mul.f32 %f192, %f190, %f191; mul.f32 %f193, %f191, 0f3F000000; neg.f32 %f194, %f192; fma.rn.f32 %f195, %f194, %f193, %f189; fma.rn.f32 %f196, %f192, %f195, %f192; setp.eq.f32 %p28, %f187, 0f3F800000; selp.f32 %f197, 0f00000000, %f196, %p28; setp.gt.f32 %p29, %f187, 0f3F0F5C29; selp.f32 %f198, %f197, %f187, %p29; mov.b32 %r96, %f198; mov.b32 %r97, %f186; and.b32 %r98, %r97, -2147483648; or.b32 %r99, %r98, %r96; mov.b32 %f199, %r99; mul.f32 %f200, %f199, %f199; mov.f32 %f201, 0f3C8B1ABB; mov.f32 %f202, 0f3D10ECEF; fma.rn.f32 %f203, %f202, %f200, %f201; mov.f32 %f204, 0f3CFC028C; fma.rn.f32 %f205, %f203, %f200, %f204; mov.f32 %f206, 0f3D372139; fma.rn.f32 %f207, %f205, %f200, %f206; mov.f32 %f208, 0f3D9993DB; fma.rn.f32 %f209, %f207, %f200, %f208; mov.f32 %f210, 0f3E2AAAC6; fma.rn.f32 %f211, %f209, %f200, %f210; mul.f32 %f212, %f211, %f200; fma.rn.f32 %f213, %f212, %f199, %f199; neg.f32 %f214, %f213; selp.f32 %f215, %f213, %f214, %p29; mov.f32 %f216, 0f3FD774EB; mov.f32 %f217, 0f3F6EE581; fma.rn.f32 %f218, %f217, %f216, %f215; setp.gt.f32 %p30, %f186, 0f3F0F5C29; selp.f32 %f219, %f213, %f218, %p30; add.f32 %f220, %f219, %f219; selp.f32 %f221, %f220, %f219, %p29; max.f32 %f313, %f313, %f221; $L__BB0_22: setp.eq.s32 %p31, %r32, 1; @%p31 bra $L__BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r22, %r3, -1; @%p32 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: max.s32 %r133, %r22, 0; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r100, 
%r22, %r32; add.s32 %r101, %r100, %r32; rem.s32 %r133, %r101, %r32; $L__BB0_26: mad.lo.s32 %r102, %r133, %r31, %r2; mad.lo.s32 %r103, %r102, %r30, %r1; cvt.s64.s32 %rd50, %r103; mul.wide.s32 %rd51, %r103, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f222, [%rd54]; ld.global.nc.f32 %f223, [%rd52]; ld.global.nc.f32 %f224, [%rd53]; mul.f32 %f225, %f224, %f224; fma.rn.f32 %f226, %f223, %f223, %f225; fma.rn.f32 %f227, %f222, %f222, %f226; setp.eq.f32 %p33, %f227, 0f00000000; selp.f32 %f29, %f3, %f222, %p33; selp.f32 %f28, %f2, %f224, %p33; selp.f32 %f27, %f1, %f223, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs26, [%rd55]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r104, %rs30; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; cvt.u32.u16 %r108, %rs29; add.s32 %r109, %r107, %r108; mul.wide.s32 %rd56, %r109, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f228, [%rd57]; setp.eq.f32 %p34, %f228, 0f00000000; @%p34 bra $L__BB0_28; mul.f32 %f229, %f2, %f28; fma.rn.f32 %f230, %f1, %f27, %f229; fma.rn.f32 %f231, %f3, %f29, %f230; abs.f32 %f232, %f231; neg.f32 %f233, %f232; mov.f32 %f234, 0f3F000000; fma.rn.f32 %f235, %f234, %f233, %f234; rsqrt.approx.ftz.f32 %f236, %f235; mul.f32 %f237, %f235, %f236; mul.f32 %f238, %f236, 0f3F000000; neg.f32 %f239, %f237; fma.rn.f32 %f240, %f239, %f238, %f234; fma.rn.f32 %f241, %f237, %f240, %f237; setp.eq.f32 %p35, %f232, 0f3F800000; selp.f32 %f242, 0f00000000, %f241, %p35; setp.gt.f32 %p36, %f232, 0f3F0F5C29; selp.f32 %f243, %f242, %f232, %p36; mov.b32 %r110, %f243; mov.b32 %r111, %f231; and.b32 %r112, %r111, -2147483648; or.b32 %r113, %r112, %r110; mov.b32 %f244, %r113; mul.f32 %f245, %f244, %f244; mov.f32 %f246, 0f3C8B1ABB; mov.f32 %f247, 0f3D10ECEF; fma.rn.f32 %f248, %f247, %f245, %f246; mov.f32 %f249, 0f3CFC028C; fma.rn.f32 %f250, %f248, %f245, %f249; mov.f32 %f251, 0f3D372139; fma.rn.f32 %f252, %f250, %f245, 
%f251; mov.f32 %f253, 0f3D9993DB; fma.rn.f32 %f254, %f252, %f245, %f253; mov.f32 %f255, 0f3E2AAAC6; fma.rn.f32 %f256, %f254, %f245, %f255; mul.f32 %f257, %f256, %f245; fma.rn.f32 %f258, %f257, %f244, %f244; neg.f32 %f259, %f258; selp.f32 %f260, %f258, %f259, %p36; mov.f32 %f261, 0f3FD774EB; mov.f32 %f262, 0f3F6EE581; fma.rn.f32 %f263, %f262, %f261, %f260; setp.gt.f32 %p37, %f231, 0f3F0F5C29; selp.f32 %f264, %f258, %f263, %p37; add.f32 %f265, %f264, %f264; selp.f32 %f266, %f265, %f264, %p36; max.f32 %f313, %f313, %f266; $L__BB0_28: add.s32 %r26, %r3, 1; @%p32 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: add.s32 %r116, %r32, -1; min.s32 %r134, %r26, %r116; bra.uni $L__BB0_31; $L__BB0_29: rem.s32 %r114, %r26, %r32; add.s32 %r115, %r114, %r32; rem.s32 %r134, %r115, %r32; $L__BB0_31: mad.lo.s32 %r117, %r134, %r31, %r2; mad.lo.s32 %r118, %r117, %r30, %r1; cvt.s64.s32 %rd58, %r118; mul.wide.s32 %rd59, %r118, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f267, [%rd62]; ld.global.nc.f32 %f268, [%rd60]; ld.global.nc.f32 %f269, [%rd61]; mul.f32 %f270, %f269, %f269; fma.rn.f32 %f271, %f268, %f268, %f270; fma.rn.f32 %f272, %f267, %f267, %f271; setp.eq.f32 %p39, %f272, 0f00000000; selp.f32 %f34, %f3, %f267, %p39; selp.f32 %f33, %f2, %f269, %p39; selp.f32 %f32, %f1, %f268, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs31, [%rd63]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r119, %rs35; add.s32 %r120, %r119, 1; mul.lo.s32 %r121, %r120, %r119; shr.u32 %r122, %r121, 1; cvt.u32.u16 %r123, %rs34; add.s32 %r124, %r122, %r123; mul.wide.s32 %rd64, %r124, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f273, [%rd65]; setp.eq.f32 %p40, %f273, 0f00000000; @%p40 bra $L__BB0_33; mul.f32 %f274, %f2, %f33; fma.rn.f32 %f275, %f1, %f32, %f274; fma.rn.f32 %f276, %f3, %f34, %f275; abs.f32 %f277, %f276; neg.f32 %f278, %f277; mov.f32 %f279, 0f3F000000; fma.rn.f32 %f280, %f279, %f278, %f279; 
rsqrt.approx.ftz.f32 %f281, %f280; mul.f32 %f282, %f280, %f281; mul.f32 %f283, %f281, 0f3F000000; neg.f32 %f284, %f282; fma.rn.f32 %f285, %f284, %f283, %f279; fma.rn.f32 %f286, %f282, %f285, %f282; setp.eq.f32 %p41, %f277, 0f3F800000; selp.f32 %f287, 0f00000000, %f286, %p41; setp.gt.f32 %p42, %f277, 0f3F0F5C29; selp.f32 %f288, %f287, %f277, %p42; mov.b32 %r125, %f288; mov.b32 %r126, %f276; and.b32 %r127, %r126, -2147483648; or.b32 %r128, %r127, %r125; mov.b32 %f289, %r128; mul.f32 %f290, %f289, %f289; mov.f32 %f291, 0f3C8B1ABB; mov.f32 %f292, 0f3D10ECEF; fma.rn.f32 %f293, %f292, %f290, %f291; mov.f32 %f294, 0f3CFC028C; fma.rn.f32 %f295, %f293, %f290, %f294; mov.f32 %f296, 0f3D372139; fma.rn.f32 %f297, %f295, %f290, %f296; mov.f32 %f298, 0f3D9993DB; fma.rn.f32 %f299, %f297, %f290, %f298; mov.f32 %f300, 0f3E2AAAC6; fma.rn.f32 %f301, %f299, %f290, %f300; mul.f32 %f302, %f301, %f290; fma.rn.f32 %f303, %f302, %f289, %f289; neg.f32 %f304, %f303; selp.f32 %f305, %f303, %f304, %p42; mov.f32 %f306, 0f3FD774EB; mov.f32 %f307, 0f3F6EE581; fma.rn.f32 %f308, %f307, %f306, %f305; setp.gt.f32 %p43, %f276, 0f3F0F5C29; selp.f32 %f309, %f303, %f308, %p43; add.f32 %f310, %f309, %f309; selp.f32 %f311, %f310, %f309, %p42; max.f32 %f313, %f313, %f311; $L__BB0_33: cvta.to.global.u64 %rd66, %rd7; shl.b64 %rd67, %rd6, 2; add.s64 %rd68, %rd66, %rd67; st.global.f32 [%rd68], %f313; $L__BB0_34: ret; } ` setmaxangle_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<36>; .reg .f32 %f<318>; .reg .b32 %r<135>; .reg .b64 %rd<69>; ld.param.u8 %rs5, [setmaxangle_param_9]; ld.param.u64 
%rd7, [setmaxangle_param_0]; ld.param.u64 %rd8, [setmaxangle_param_1]; ld.param.u64 %rd9, [setmaxangle_param_2]; ld.param.u64 %rd10, [setmaxangle_param_3]; ld.param.u64 %rd11, [setmaxangle_param_4]; ld.param.u64 %rd12, [setmaxangle_param_5]; ld.param.u32 %r30, [setmaxangle_param_6]; ld.param.u32 %r31, [setmaxangle_param_7]; ld.param.u32 %r32, [setmaxangle_param_8]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd3, %rd10; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_34; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd13, %r43, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd15, %rd4, %rd13; add.s64 %rd16, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; ld.global.nc.f32 %f3, [%rd16]; mul.f32 %f37, %f2, %f2; fma.rn.f32 %f38, %f1, %f1, %f37; fma.rn.f32 %f39, %f3, %f3, %f38; setp.eq.f32 %p6, %f39, 0f00000000; @%p6 bra $L__BB0_34; add.s64 %rd17, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd17]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r129, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r129, %r45, %r30; $L__BB0_5: add.s32 %r46, %r129, %r5; cvt.s64.s32 %rd18, %r46; mul.wide.s32 %rd19, %r46, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f41, [%rd22]; ld.global.nc.f32 
%f42, [%rd20]; ld.global.nc.f32 %f43, [%rd21]; mul.f32 %f44, %f43, %f43; fma.rn.f32 %f45, %f42, %f42, %f44; fma.rn.f32 %f46, %f41, %f41, %f45; setp.eq.f32 %p8, %f46, 0f00000000; mov.f32 %f313, 0f00000000; selp.f32 %f9, %f3, %f41, %p8; selp.f32 %f8, %f2, %f43, %p8; selp.f32 %f7, %f1, %f42, %p8; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd24, %r52, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f47, [%rd25]; setp.eq.f32 %p9, %f47, 0f00000000; @%p9 bra $L__BB0_7; mul.f32 %f48, %f2, %f8; fma.rn.f32 %f49, %f1, %f7, %f48; fma.rn.f32 %f50, %f3, %f9, %f49; abs.f32 %f51, %f50; neg.f32 %f52, %f51; mov.f32 %f53, 0f3F000000; fma.rn.f32 %f54, %f53, %f52, %f53; rsqrt.approx.ftz.f32 %f55, %f54; mul.f32 %f56, %f54, %f55; mul.f32 %f57, %f55, 0f3F000000; neg.f32 %f58, %f56; fma.rn.f32 %f59, %f58, %f57, %f53; fma.rn.f32 %f60, %f56, %f59, %f56; setp.eq.f32 %p10, %f51, 0f3F800000; selp.f32 %f61, 0f00000000, %f60, %p10; mov.f32 %f62, 0f00000000; setp.gt.f32 %p11, %f51, 0f3F0F5C29; selp.f32 %f63, %f61, %f51, %p11; mov.b32 %r53, %f63; mov.b32 %r54, %f50; and.b32 %r55, %r54, -2147483648; or.b32 %r56, %r55, %r53; mov.b32 %f64, %r56; mul.f32 %f65, %f64, %f64; mov.f32 %f66, 0f3C8B1ABB; mov.f32 %f67, 0f3D10ECEF; fma.rn.f32 %f68, %f67, %f65, %f66; mov.f32 %f69, 0f3CFC028C; fma.rn.f32 %f70, %f68, %f65, %f69; mov.f32 %f71, 0f3D372139; fma.rn.f32 %f72, %f70, %f65, %f71; mov.f32 %f73, 0f3D9993DB; fma.rn.f32 %f74, %f72, %f65, %f73; mov.f32 %f75, 0f3E2AAAC6; fma.rn.f32 %f76, %f74, %f65, %f75; mul.f32 %f77, %f76, %f65; fma.rn.f32 %f78, %f77, %f64, %f64; neg.f32 %f79, %f78; selp.f32 %f80, %f78, %f79, %p11; mov.f32 %f81, 0f3FD774EB; mov.f32 %f82, 0f3F6EE581; fma.rn.f32 %f83, %f82, %f81, %f80; setp.gt.f32 %p12, %f50, 0f3F0F5C29; selp.f32 %f84, %f78, %f83, %p12; add.f32 
%f85, %f84, %f84; selp.f32 %f86, %f85, %f84, %p11; max.f32 %f313, %f62, %f86; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r59, %r30, -1; min.s32 %r130, %r10, %r59; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r30; add.s32 %r58, %r57, %r30; rem.s32 %r130, %r58, %r30; $L__BB0_10: add.s32 %r60, %r130, %r5; cvt.s64.s32 %rd26, %r60; mul.wide.s32 %rd27, %r60, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f87, [%rd30]; ld.global.nc.f32 %f88, [%rd28]; ld.global.nc.f32 %f89, [%rd29]; mul.f32 %f90, %f89, %f89; fma.rn.f32 %f91, %f88, %f88, %f90; fma.rn.f32 %f92, %f87, %f87, %f91; setp.eq.f32 %p14, %f92, 0f00000000; selp.f32 %f14, %f3, %f87, %p14; selp.f32 %f13, %f2, %f89, %p14; selp.f32 %f12, %f1, %f88, %p14; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs11, [%rd31]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r61, %rs15; add.s32 %r62, %r61, 1; mul.lo.s32 %r63, %r62, %r61; shr.u32 %r64, %r63, 1; cvt.u32.u16 %r65, %rs14; add.s32 %r66, %r64, %r65; mul.wide.s32 %rd32, %r66, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f93, [%rd33]; setp.eq.f32 %p15, %f93, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f94, %f2, %f13; fma.rn.f32 %f95, %f1, %f12, %f94; fma.rn.f32 %f96, %f3, %f14, %f95; abs.f32 %f97, %f96; neg.f32 %f98, %f97; mov.f32 %f99, 0f3F000000; fma.rn.f32 %f100, %f99, %f98, %f99; rsqrt.approx.ftz.f32 %f101, %f100; mul.f32 %f102, %f100, %f101; mul.f32 %f103, %f101, 0f3F000000; neg.f32 %f104, %f102; fma.rn.f32 %f105, %f104, %f103, %f99; fma.rn.f32 %f106, %f102, %f105, %f102; setp.eq.f32 %p16, %f97, 0f3F800000; selp.f32 %f107, 0f00000000, %f106, %p16; setp.gt.f32 %p17, %f97, 0f3F0F5C29; selp.f32 %f108, %f107, %f97, %p17; mov.b32 %r67, %f108; mov.b32 %r68, %f96; and.b32 %r69, %r68, -2147483648; or.b32 %r70, %r69, %r67; mov.b32 %f109, %r70; mul.f32 %f110, %f109, %f109; mov.f32 %f111, 0f3C8B1ABB; mov.f32 %f112, 0f3D10ECEF; fma.rn.f32 %f113, 
%f112, %f110, %f111; mov.f32 %f114, 0f3CFC028C; fma.rn.f32 %f115, %f113, %f110, %f114; mov.f32 %f116, 0f3D372139; fma.rn.f32 %f117, %f115, %f110, %f116; mov.f32 %f118, 0f3D9993DB; fma.rn.f32 %f119, %f117, %f110, %f118; mov.f32 %f120, 0f3E2AAAC6; fma.rn.f32 %f121, %f119, %f110, %f120; mul.f32 %f122, %f121, %f110; fma.rn.f32 %f123, %f122, %f109, %f109; neg.f32 %f124, %f123; selp.f32 %f125, %f123, %f124, %p17; mov.f32 %f126, 0f3FD774EB; mov.f32 %f127, 0f3F6EE581; fma.rn.f32 %f128, %f127, %f126, %f125; setp.gt.f32 %p18, %f96, 0f3F0F5C29; selp.f32 %f129, %f123, %f128, %p18; add.f32 %f130, %f129, %f129; selp.f32 %f131, %f130, %f129, %p17; max.f32 %f313, %f313, %f131; $L__BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r14, %r2, -1; @%p19 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: max.s32 %r131, %r14, 0; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r71, %r14, %r31; add.s32 %r72, %r71, %r31; rem.s32 %r131, %r72, %r31; $L__BB0_15: add.s32 %r73, %r131, %r4; mad.lo.s32 %r74, %r73, %r30, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f132, [%rd38]; ld.global.nc.f32 %f133, [%rd36]; ld.global.nc.f32 %f134, [%rd37]; mul.f32 %f135, %f134, %f134; fma.rn.f32 %f136, %f133, %f133, %f135; fma.rn.f32 %f137, %f132, %f132, %f136; setp.eq.f32 %p20, %f137, 0f00000000; selp.f32 %f19, %f3, %f132, %p20; selp.f32 %f18, %f2, %f134, %p20; selp.f32 %f17, %f1, %f133, %p20; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs16, [%rd39]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r75, %rs20; add.s32 %r76, %r75, 1; mul.lo.s32 %r77, %r76, %r75; shr.u32 %r78, %r77, 1; cvt.u32.u16 %r79, %rs19; add.s32 %r80, %r78, %r79; mul.wide.s32 %rd40, %r80, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f138, [%rd41]; setp.eq.f32 %p21, %f138, 0f00000000; @%p21 bra $L__BB0_17; mul.f32 %f139, %f2, %f18; fma.rn.f32 %f140, %f1, %f17, %f139; fma.rn.f32 %f141, %f3, 
%f19, %f140; abs.f32 %f142, %f141; neg.f32 %f143, %f142; mov.f32 %f144, 0f3F000000; fma.rn.f32 %f145, %f144, %f143, %f144; rsqrt.approx.ftz.f32 %f146, %f145; mul.f32 %f147, %f145, %f146; mul.f32 %f148, %f146, 0f3F000000; neg.f32 %f149, %f147; fma.rn.f32 %f150, %f149, %f148, %f144; fma.rn.f32 %f151, %f147, %f150, %f147; setp.eq.f32 %p22, %f142, 0f3F800000; selp.f32 %f152, 0f00000000, %f151, %p22; setp.gt.f32 %p23, %f142, 0f3F0F5C29; selp.f32 %f153, %f152, %f142, %p23; mov.b32 %r81, %f153; mov.b32 %r82, %f141; and.b32 %r83, %r82, -2147483648; or.b32 %r84, %r83, %r81; mov.b32 %f154, %r84; mul.f32 %f155, %f154, %f154; mov.f32 %f156, 0f3C8B1ABB; mov.f32 %f157, 0f3D10ECEF; fma.rn.f32 %f158, %f157, %f155, %f156; mov.f32 %f159, 0f3CFC028C; fma.rn.f32 %f160, %f158, %f155, %f159; mov.f32 %f161, 0f3D372139; fma.rn.f32 %f162, %f160, %f155, %f161; mov.f32 %f163, 0f3D9993DB; fma.rn.f32 %f164, %f162, %f155, %f163; mov.f32 %f165, 0f3E2AAAC6; fma.rn.f32 %f166, %f164, %f155, %f165; mul.f32 %f167, %f166, %f155; fma.rn.f32 %f168, %f167, %f154, %f154; neg.f32 %f169, %f168; selp.f32 %f170, %f168, %f169, %p23; mov.f32 %f171, 0f3FD774EB; mov.f32 %f172, 0f3F6EE581; fma.rn.f32 %f173, %f172, %f171, %f170; setp.gt.f32 %p24, %f141, 0f3F0F5C29; selp.f32 %f174, %f168, %f173, %p24; add.f32 %f175, %f174, %f174; selp.f32 %f176, %f175, %f174, %p23; max.f32 %f313, %f313, %f176; $L__BB0_17: add.s32 %r18, %r2, 1; @%p19 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r87, %r31, -1; min.s32 %r132, %r18, %r87; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r85, %r18, %r31; add.s32 %r86, %r85, %r31; rem.s32 %r132, %r86, %r31; $L__BB0_20: add.s32 %r88, %r132, %r4; mad.lo.s32 %r89, %r88, %r30, %r1; cvt.s64.s32 %rd42, %r89; mul.wide.s32 %rd43, %r89, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f177, [%rd46]; ld.global.nc.f32 %f178, [%rd44]; ld.global.nc.f32 %f179, [%rd45]; mul.f32 %f180, %f179, %f179; fma.rn.f32 %f181, %f178, %f178, %f180; 
fma.rn.f32 %f182, %f177, %f177, %f181; setp.eq.f32 %p26, %f182, 0f00000000; selp.f32 %f24, %f3, %f177, %p26; selp.f32 %f23, %f2, %f179, %p26; selp.f32 %f22, %f1, %f178, %p26; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs21, [%rd47]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r90, %rs25; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; cvt.u32.u16 %r94, %rs24; add.s32 %r95, %r93, %r94; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f183, [%rd49]; setp.eq.f32 %p27, %f183, 0f00000000; @%p27 bra $L__BB0_22; mul.f32 %f184, %f2, %f23; fma.rn.f32 %f185, %f1, %f22, %f184; fma.rn.f32 %f186, %f3, %f24, %f185; abs.f32 %f187, %f186; neg.f32 %f188, %f187; mov.f32 %f189, 0f3F000000; fma.rn.f32 %f190, %f189, %f188, %f189; rsqrt.approx.ftz.f32 %f191, %f190; mul.f32 %f192, %f190, %f191; mul.f32 %f193, %f191, 0f3F000000; neg.f32 %f194, %f192; fma.rn.f32 %f195, %f194, %f193, %f189; fma.rn.f32 %f196, %f192, %f195, %f192; setp.eq.f32 %p28, %f187, 0f3F800000; selp.f32 %f197, 0f00000000, %f196, %p28; setp.gt.f32 %p29, %f187, 0f3F0F5C29; selp.f32 %f198, %f197, %f187, %p29; mov.b32 %r96, %f198; mov.b32 %r97, %f186; and.b32 %r98, %r97, -2147483648; or.b32 %r99, %r98, %r96; mov.b32 %f199, %r99; mul.f32 %f200, %f199, %f199; mov.f32 %f201, 0f3C8B1ABB; mov.f32 %f202, 0f3D10ECEF; fma.rn.f32 %f203, %f202, %f200, %f201; mov.f32 %f204, 0f3CFC028C; fma.rn.f32 %f205, %f203, %f200, %f204; mov.f32 %f206, 0f3D372139; fma.rn.f32 %f207, %f205, %f200, %f206; mov.f32 %f208, 0f3D9993DB; fma.rn.f32 %f209, %f207, %f200, %f208; mov.f32 %f210, 0f3E2AAAC6; fma.rn.f32 %f211, %f209, %f200, %f210; mul.f32 %f212, %f211, %f200; fma.rn.f32 %f213, %f212, %f199, %f199; neg.f32 %f214, %f213; selp.f32 %f215, %f213, %f214, %p29; mov.f32 %f216, 0f3FD774EB; mov.f32 %f217, 0f3F6EE581; fma.rn.f32 %f218, %f217, %f216, %f215; setp.gt.f32 %p30, %f186, 0f3F0F5C29; selp.f32 %f219, %f213, %f218, %p30; add.f32 %f220, %f219, %f219; selp.f32 %f221, %f220, 
%f219, %p29; max.f32 %f313, %f313, %f221; $L__BB0_22: setp.eq.s32 %p31, %r32, 1; @%p31 bra $L__BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r22, %r3, -1; @%p32 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: max.s32 %r133, %r22, 0; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r100, %r22, %r32; add.s32 %r101, %r100, %r32; rem.s32 %r133, %r101, %r32; $L__BB0_26: mad.lo.s32 %r102, %r133, %r31, %r2; mad.lo.s32 %r103, %r102, %r30, %r1; cvt.s64.s32 %rd50, %r103; mul.wide.s32 %rd51, %r103, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f222, [%rd54]; ld.global.nc.f32 %f223, [%rd52]; ld.global.nc.f32 %f224, [%rd53]; mul.f32 %f225, %f224, %f224; fma.rn.f32 %f226, %f223, %f223, %f225; fma.rn.f32 %f227, %f222, %f222, %f226; setp.eq.f32 %p33, %f227, 0f00000000; selp.f32 %f29, %f3, %f222, %p33; selp.f32 %f28, %f2, %f224, %p33; selp.f32 %f27, %f1, %f223, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs26, [%rd55]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r104, %rs30; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; cvt.u32.u16 %r108, %rs29; add.s32 %r109, %r107, %r108; mul.wide.s32 %rd56, %r109, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f228, [%rd57]; setp.eq.f32 %p34, %f228, 0f00000000; @%p34 bra $L__BB0_28; mul.f32 %f229, %f2, %f28; fma.rn.f32 %f230, %f1, %f27, %f229; fma.rn.f32 %f231, %f3, %f29, %f230; abs.f32 %f232, %f231; neg.f32 %f233, %f232; mov.f32 %f234, 0f3F000000; fma.rn.f32 %f235, %f234, %f233, %f234; rsqrt.approx.ftz.f32 %f236, %f235; mul.f32 %f237, %f235, %f236; mul.f32 %f238, %f236, 0f3F000000; neg.f32 %f239, %f237; fma.rn.f32 %f240, %f239, %f238, %f234; fma.rn.f32 %f241, %f237, %f240, %f237; setp.eq.f32 %p35, %f232, 0f3F800000; selp.f32 %f242, 0f00000000, %f241, %p35; setp.gt.f32 %p36, %f232, 0f3F0F5C29; selp.f32 %f243, %f242, %f232, %p36; mov.b32 %r110, %f243; mov.b32 %r111, %f231; and.b32 %r112, %r111, -2147483648; 
or.b32 %r113, %r112, %r110; mov.b32 %f244, %r113; mul.f32 %f245, %f244, %f244; mov.f32 %f246, 0f3C8B1ABB; mov.f32 %f247, 0f3D10ECEF; fma.rn.f32 %f248, %f247, %f245, %f246; mov.f32 %f249, 0f3CFC028C; fma.rn.f32 %f250, %f248, %f245, %f249; mov.f32 %f251, 0f3D372139; fma.rn.f32 %f252, %f250, %f245, %f251; mov.f32 %f253, 0f3D9993DB; fma.rn.f32 %f254, %f252, %f245, %f253; mov.f32 %f255, 0f3E2AAAC6; fma.rn.f32 %f256, %f254, %f245, %f255; mul.f32 %f257, %f256, %f245; fma.rn.f32 %f258, %f257, %f244, %f244; neg.f32 %f259, %f258; selp.f32 %f260, %f258, %f259, %p36; mov.f32 %f261, 0f3FD774EB; mov.f32 %f262, 0f3F6EE581; fma.rn.f32 %f263, %f262, %f261, %f260; setp.gt.f32 %p37, %f231, 0f3F0F5C29; selp.f32 %f264, %f258, %f263, %p37; add.f32 %f265, %f264, %f264; selp.f32 %f266, %f265, %f264, %p36; max.f32 %f313, %f313, %f266; $L__BB0_28: add.s32 %r26, %r3, 1; @%p32 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: add.s32 %r116, %r32, -1; min.s32 %r134, %r26, %r116; bra.uni $L__BB0_31; $L__BB0_29: rem.s32 %r114, %r26, %r32; add.s32 %r115, %r114, %r32; rem.s32 %r134, %r115, %r32; $L__BB0_31: mad.lo.s32 %r117, %r134, %r31, %r2; mad.lo.s32 %r118, %r117, %r30, %r1; cvt.s64.s32 %rd58, %r118; mul.wide.s32 %rd59, %r118, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f267, [%rd62]; ld.global.nc.f32 %f268, [%rd60]; ld.global.nc.f32 %f269, [%rd61]; mul.f32 %f270, %f269, %f269; fma.rn.f32 %f271, %f268, %f268, %f270; fma.rn.f32 %f272, %f267, %f267, %f271; setp.eq.f32 %p39, %f272, 0f00000000; selp.f32 %f34, %f3, %f267, %p39; selp.f32 %f33, %f2, %f269, %p39; selp.f32 %f32, %f1, %f268, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs31, [%rd63]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r119, %rs35; add.s32 %r120, %r119, 1; mul.lo.s32 %r121, %r120, %r119; shr.u32 %r122, %r121, 1; cvt.u32.u16 %r123, %rs34; add.s32 %r124, %r122, %r123; mul.wide.s32 %rd64, %r124, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 
%f273, [%rd65]; setp.eq.f32 %p40, %f273, 0f00000000; @%p40 bra $L__BB0_33; mul.f32 %f274, %f2, %f33; fma.rn.f32 %f275, %f1, %f32, %f274; fma.rn.f32 %f276, %f3, %f34, %f275; abs.f32 %f277, %f276; neg.f32 %f278, %f277; mov.f32 %f279, 0f3F000000; fma.rn.f32 %f280, %f279, %f278, %f279; rsqrt.approx.ftz.f32 %f281, %f280; mul.f32 %f282, %f280, %f281; mul.f32 %f283, %f281, 0f3F000000; neg.f32 %f284, %f282; fma.rn.f32 %f285, %f284, %f283, %f279; fma.rn.f32 %f286, %f282, %f285, %f282; setp.eq.f32 %p41, %f277, 0f3F800000; selp.f32 %f287, 0f00000000, %f286, %p41; setp.gt.f32 %p42, %f277, 0f3F0F5C29; selp.f32 %f288, %f287, %f277, %p42; mov.b32 %r125, %f288; mov.b32 %r126, %f276; and.b32 %r127, %r126, -2147483648; or.b32 %r128, %r127, %r125; mov.b32 %f289, %r128; mul.f32 %f290, %f289, %f289; mov.f32 %f291, 0f3C8B1ABB; mov.f32 %f292, 0f3D10ECEF; fma.rn.f32 %f293, %f292, %f290, %f291; mov.f32 %f294, 0f3CFC028C; fma.rn.f32 %f295, %f293, %f290, %f294; mov.f32 %f296, 0f3D372139; fma.rn.f32 %f297, %f295, %f290, %f296; mov.f32 %f298, 0f3D9993DB; fma.rn.f32 %f299, %f297, %f290, %f298; mov.f32 %f300, 0f3E2AAAC6; fma.rn.f32 %f301, %f299, %f290, %f300; mul.f32 %f302, %f301, %f290; fma.rn.f32 %f303, %f302, %f289, %f289; neg.f32 %f304, %f303; selp.f32 %f305, %f303, %f304, %p42; mov.f32 %f306, 0f3FD774EB; mov.f32 %f307, 0f3F6EE581; fma.rn.f32 %f308, %f307, %f306, %f305; setp.gt.f32 %p43, %f276, 0f3F0F5C29; selp.f32 %f309, %f303, %f308, %p43; add.f32 %f310, %f309, %f309; selp.f32 %f311, %f310, %f309, %p42; max.f32 %f313, %f313, %f311; $L__BB0_33: cvta.to.global.u64 %rd66, %rd7; shl.b64 %rd67, %rd6, 2; add.s64 %rd68, %rd66, %rd67; st.global.f32 [%rd68], %f313; $L__BB0_34: ret; } ` setmaxangle_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, 
.param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<36>; .reg .f32 %f<318>; .reg .b32 %r<135>; .reg .b64 %rd<69>; ld.param.u8 %rs5, [setmaxangle_param_9]; ld.param.u64 %rd7, [setmaxangle_param_0]; ld.param.u64 %rd8, [setmaxangle_param_1]; ld.param.u64 %rd9, [setmaxangle_param_2]; ld.param.u64 %rd10, [setmaxangle_param_3]; ld.param.u64 %rd11, [setmaxangle_param_4]; ld.param.u64 %rd12, [setmaxangle_param_5]; ld.param.u32 %r30, [setmaxangle_param_6]; ld.param.u32 %r31, [setmaxangle_param_7]; ld.param.u32 %r32, [setmaxangle_param_8]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd3, %rd10; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_34; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd13, %r43, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd15, %rd4, %rd13; add.s64 %rd16, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; ld.global.nc.f32 %f3, [%rd16]; mul.f32 %f37, %f2, %f2; fma.rn.f32 %f38, %f1, %f1, %f37; fma.rn.f32 %f39, %f3, %f3, %f38; setp.eq.f32 %p6, %f39, 0f00000000; @%p6 bra $L__BB0_34; add.s64 %rd17, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd17]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r129, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, 
%r30; add.s32 %r45, %r44, %r30; rem.s32 %r129, %r45, %r30; $L__BB0_5: add.s32 %r46, %r129, %r5; cvt.s64.s32 %rd18, %r46; mul.wide.s32 %rd19, %r46, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f41, [%rd22]; ld.global.nc.f32 %f42, [%rd20]; ld.global.nc.f32 %f43, [%rd21]; mul.f32 %f44, %f43, %f43; fma.rn.f32 %f45, %f42, %f42, %f44; fma.rn.f32 %f46, %f41, %f41, %f45; setp.eq.f32 %p8, %f46, 0f00000000; mov.f32 %f313, 0f00000000; selp.f32 %f9, %f3, %f41, %p8; selp.f32 %f8, %f2, %f43, %p8; selp.f32 %f7, %f1, %f42, %p8; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd24, %r52, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f47, [%rd25]; setp.eq.f32 %p9, %f47, 0f00000000; @%p9 bra $L__BB0_7; mul.f32 %f48, %f2, %f8; fma.rn.f32 %f49, %f1, %f7, %f48; fma.rn.f32 %f50, %f3, %f9, %f49; abs.f32 %f51, %f50; neg.f32 %f52, %f51; mov.f32 %f53, 0f3F000000; fma.rn.f32 %f54, %f53, %f52, %f53; rsqrt.approx.ftz.f32 %f55, %f54; mul.f32 %f56, %f54, %f55; mul.f32 %f57, %f55, 0f3F000000; neg.f32 %f58, %f56; fma.rn.f32 %f59, %f58, %f57, %f53; fma.rn.f32 %f60, %f56, %f59, %f56; setp.eq.f32 %p10, %f51, 0f3F800000; selp.f32 %f61, 0f00000000, %f60, %p10; mov.f32 %f62, 0f00000000; setp.gt.f32 %p11, %f51, 0f3F0F5C29; selp.f32 %f63, %f61, %f51, %p11; mov.b32 %r53, %f63; mov.b32 %r54, %f50; and.b32 %r55, %r54, -2147483648; or.b32 %r56, %r55, %r53; mov.b32 %f64, %r56; mul.f32 %f65, %f64, %f64; mov.f32 %f66, 0f3C8B1ABB; mov.f32 %f67, 0f3D10ECEF; fma.rn.f32 %f68, %f67, %f65, %f66; mov.f32 %f69, 0f3CFC028C; fma.rn.f32 %f70, %f68, %f65, %f69; mov.f32 %f71, 0f3D372139; fma.rn.f32 %f72, %f70, %f65, %f71; mov.f32 %f73, 0f3D9993DB; fma.rn.f32 %f74, %f72, %f65, %f73; mov.f32 %f75, 0f3E2AAAC6; fma.rn.f32 %f76, %f74, %f65, 
%f75; mul.f32 %f77, %f76, %f65; fma.rn.f32 %f78, %f77, %f64, %f64; neg.f32 %f79, %f78; selp.f32 %f80, %f78, %f79, %p11; mov.f32 %f81, 0f3FD774EB; mov.f32 %f82, 0f3F6EE581; fma.rn.f32 %f83, %f82, %f81, %f80; setp.gt.f32 %p12, %f50, 0f3F0F5C29; selp.f32 %f84, %f78, %f83, %p12; add.f32 %f85, %f84, %f84; selp.f32 %f86, %f85, %f84, %p11; max.f32 %f313, %f62, %f86; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r59, %r30, -1; min.s32 %r130, %r10, %r59; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r30; add.s32 %r58, %r57, %r30; rem.s32 %r130, %r58, %r30; $L__BB0_10: add.s32 %r60, %r130, %r5; cvt.s64.s32 %rd26, %r60; mul.wide.s32 %rd27, %r60, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f87, [%rd30]; ld.global.nc.f32 %f88, [%rd28]; ld.global.nc.f32 %f89, [%rd29]; mul.f32 %f90, %f89, %f89; fma.rn.f32 %f91, %f88, %f88, %f90; fma.rn.f32 %f92, %f87, %f87, %f91; setp.eq.f32 %p14, %f92, 0f00000000; selp.f32 %f14, %f3, %f87, %p14; selp.f32 %f13, %f2, %f89, %p14; selp.f32 %f12, %f1, %f88, %p14; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs11, [%rd31]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r61, %rs15; add.s32 %r62, %r61, 1; mul.lo.s32 %r63, %r62, %r61; shr.u32 %r64, %r63, 1; cvt.u32.u16 %r65, %rs14; add.s32 %r66, %r64, %r65; mul.wide.s32 %rd32, %r66, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f93, [%rd33]; setp.eq.f32 %p15, %f93, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f94, %f2, %f13; fma.rn.f32 %f95, %f1, %f12, %f94; fma.rn.f32 %f96, %f3, %f14, %f95; abs.f32 %f97, %f96; neg.f32 %f98, %f97; mov.f32 %f99, 0f3F000000; fma.rn.f32 %f100, %f99, %f98, %f99; rsqrt.approx.ftz.f32 %f101, %f100; mul.f32 %f102, %f100, %f101; mul.f32 %f103, %f101, 0f3F000000; neg.f32 %f104, %f102; fma.rn.f32 %f105, %f104, %f103, %f99; fma.rn.f32 %f106, %f102, %f105, %f102; setp.eq.f32 %p16, %f97, 0f3F800000; selp.f32 %f107, 0f00000000, %f106, %p16; 
setp.gt.f32 %p17, %f97, 0f3F0F5C29; selp.f32 %f108, %f107, %f97, %p17; mov.b32 %r67, %f108; mov.b32 %r68, %f96; and.b32 %r69, %r68, -2147483648; or.b32 %r70, %r69, %r67; mov.b32 %f109, %r70; mul.f32 %f110, %f109, %f109; mov.f32 %f111, 0f3C8B1ABB; mov.f32 %f112, 0f3D10ECEF; fma.rn.f32 %f113, %f112, %f110, %f111; mov.f32 %f114, 0f3CFC028C; fma.rn.f32 %f115, %f113, %f110, %f114; mov.f32 %f116, 0f3D372139; fma.rn.f32 %f117, %f115, %f110, %f116; mov.f32 %f118, 0f3D9993DB; fma.rn.f32 %f119, %f117, %f110, %f118; mov.f32 %f120, 0f3E2AAAC6; fma.rn.f32 %f121, %f119, %f110, %f120; mul.f32 %f122, %f121, %f110; fma.rn.f32 %f123, %f122, %f109, %f109; neg.f32 %f124, %f123; selp.f32 %f125, %f123, %f124, %p17; mov.f32 %f126, 0f3FD774EB; mov.f32 %f127, 0f3F6EE581; fma.rn.f32 %f128, %f127, %f126, %f125; setp.gt.f32 %p18, %f96, 0f3F0F5C29; selp.f32 %f129, %f123, %f128, %p18; add.f32 %f130, %f129, %f129; selp.f32 %f131, %f130, %f129, %p17; max.f32 %f313, %f313, %f131; $L__BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r14, %r2, -1; @%p19 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: max.s32 %r131, %r14, 0; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r71, %r14, %r31; add.s32 %r72, %r71, %r31; rem.s32 %r131, %r72, %r31; $L__BB0_15: add.s32 %r73, %r131, %r4; mad.lo.s32 %r74, %r73, %r30, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f132, [%rd38]; ld.global.nc.f32 %f133, [%rd36]; ld.global.nc.f32 %f134, [%rd37]; mul.f32 %f135, %f134, %f134; fma.rn.f32 %f136, %f133, %f133, %f135; fma.rn.f32 %f137, %f132, %f132, %f136; setp.eq.f32 %p20, %f137, 0f00000000; selp.f32 %f19, %f3, %f132, %p20; selp.f32 %f18, %f2, %f134, %p20; selp.f32 %f17, %f1, %f133, %p20; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs16, [%rd39]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r75, %rs20; add.s32 %r76, %r75, 1; mul.lo.s32 %r77, %r76, %r75; shr.u32 %r78, 
%r77, 1; cvt.u32.u16 %r79, %rs19; add.s32 %r80, %r78, %r79; mul.wide.s32 %rd40, %r80, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f138, [%rd41]; setp.eq.f32 %p21, %f138, 0f00000000; @%p21 bra $L__BB0_17; mul.f32 %f139, %f2, %f18; fma.rn.f32 %f140, %f1, %f17, %f139; fma.rn.f32 %f141, %f3, %f19, %f140; abs.f32 %f142, %f141; neg.f32 %f143, %f142; mov.f32 %f144, 0f3F000000; fma.rn.f32 %f145, %f144, %f143, %f144; rsqrt.approx.ftz.f32 %f146, %f145; mul.f32 %f147, %f145, %f146; mul.f32 %f148, %f146, 0f3F000000; neg.f32 %f149, %f147; fma.rn.f32 %f150, %f149, %f148, %f144; fma.rn.f32 %f151, %f147, %f150, %f147; setp.eq.f32 %p22, %f142, 0f3F800000; selp.f32 %f152, 0f00000000, %f151, %p22; setp.gt.f32 %p23, %f142, 0f3F0F5C29; selp.f32 %f153, %f152, %f142, %p23; mov.b32 %r81, %f153; mov.b32 %r82, %f141; and.b32 %r83, %r82, -2147483648; or.b32 %r84, %r83, %r81; mov.b32 %f154, %r84; mul.f32 %f155, %f154, %f154; mov.f32 %f156, 0f3C8B1ABB; mov.f32 %f157, 0f3D10ECEF; fma.rn.f32 %f158, %f157, %f155, %f156; mov.f32 %f159, 0f3CFC028C; fma.rn.f32 %f160, %f158, %f155, %f159; mov.f32 %f161, 0f3D372139; fma.rn.f32 %f162, %f160, %f155, %f161; mov.f32 %f163, 0f3D9993DB; fma.rn.f32 %f164, %f162, %f155, %f163; mov.f32 %f165, 0f3E2AAAC6; fma.rn.f32 %f166, %f164, %f155, %f165; mul.f32 %f167, %f166, %f155; fma.rn.f32 %f168, %f167, %f154, %f154; neg.f32 %f169, %f168; selp.f32 %f170, %f168, %f169, %p23; mov.f32 %f171, 0f3FD774EB; mov.f32 %f172, 0f3F6EE581; fma.rn.f32 %f173, %f172, %f171, %f170; setp.gt.f32 %p24, %f141, 0f3F0F5C29; selp.f32 %f174, %f168, %f173, %p24; add.f32 %f175, %f174, %f174; selp.f32 %f176, %f175, %f174, %p23; max.f32 %f313, %f313, %f176; $L__BB0_17: add.s32 %r18, %r2, 1; @%p19 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r87, %r31, -1; min.s32 %r132, %r18, %r87; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r85, %r18, %r31; add.s32 %r86, %r85, %r31; rem.s32 %r132, %r86, %r31; $L__BB0_20: add.s32 %r88, %r132, %r4; mad.lo.s32 %r89, %r88, %r30, %r1; cvt.s64.s32 %rd42, 
%r89; mul.wide.s32 %rd43, %r89, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f177, [%rd46]; ld.global.nc.f32 %f178, [%rd44]; ld.global.nc.f32 %f179, [%rd45]; mul.f32 %f180, %f179, %f179; fma.rn.f32 %f181, %f178, %f178, %f180; fma.rn.f32 %f182, %f177, %f177, %f181; setp.eq.f32 %p26, %f182, 0f00000000; selp.f32 %f24, %f3, %f177, %p26; selp.f32 %f23, %f2, %f179, %p26; selp.f32 %f22, %f1, %f178, %p26; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs21, [%rd47]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r90, %rs25; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; cvt.u32.u16 %r94, %rs24; add.s32 %r95, %r93, %r94; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f183, [%rd49]; setp.eq.f32 %p27, %f183, 0f00000000; @%p27 bra $L__BB0_22; mul.f32 %f184, %f2, %f23; fma.rn.f32 %f185, %f1, %f22, %f184; fma.rn.f32 %f186, %f3, %f24, %f185; abs.f32 %f187, %f186; neg.f32 %f188, %f187; mov.f32 %f189, 0f3F000000; fma.rn.f32 %f190, %f189, %f188, %f189; rsqrt.approx.ftz.f32 %f191, %f190; mul.f32 %f192, %f190, %f191; mul.f32 %f193, %f191, 0f3F000000; neg.f32 %f194, %f192; fma.rn.f32 %f195, %f194, %f193, %f189; fma.rn.f32 %f196, %f192, %f195, %f192; setp.eq.f32 %p28, %f187, 0f3F800000; selp.f32 %f197, 0f00000000, %f196, %p28; setp.gt.f32 %p29, %f187, 0f3F0F5C29; selp.f32 %f198, %f197, %f187, %p29; mov.b32 %r96, %f198; mov.b32 %r97, %f186; and.b32 %r98, %r97, -2147483648; or.b32 %r99, %r98, %r96; mov.b32 %f199, %r99; mul.f32 %f200, %f199, %f199; mov.f32 %f201, 0f3C8B1ABB; mov.f32 %f202, 0f3D10ECEF; fma.rn.f32 %f203, %f202, %f200, %f201; mov.f32 %f204, 0f3CFC028C; fma.rn.f32 %f205, %f203, %f200, %f204; mov.f32 %f206, 0f3D372139; fma.rn.f32 %f207, %f205, %f200, %f206; mov.f32 %f208, 0f3D9993DB; fma.rn.f32 %f209, %f207, %f200, %f208; mov.f32 %f210, 0f3E2AAAC6; fma.rn.f32 %f211, %f209, %f200, %f210; mul.f32 %f212, %f211, %f200; fma.rn.f32 %f213, %f212, 
%f199, %f199; neg.f32 %f214, %f213; selp.f32 %f215, %f213, %f214, %p29; mov.f32 %f216, 0f3FD774EB; mov.f32 %f217, 0f3F6EE581; fma.rn.f32 %f218, %f217, %f216, %f215; setp.gt.f32 %p30, %f186, 0f3F0F5C29; selp.f32 %f219, %f213, %f218, %p30; add.f32 %f220, %f219, %f219; selp.f32 %f221, %f220, %f219, %p29; max.f32 %f313, %f313, %f221; $L__BB0_22: setp.eq.s32 %p31, %r32, 1; @%p31 bra $L__BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r22, %r3, -1; @%p32 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: max.s32 %r133, %r22, 0; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r100, %r22, %r32; add.s32 %r101, %r100, %r32; rem.s32 %r133, %r101, %r32; $L__BB0_26: mad.lo.s32 %r102, %r133, %r31, %r2; mad.lo.s32 %r103, %r102, %r30, %r1; cvt.s64.s32 %rd50, %r103; mul.wide.s32 %rd51, %r103, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f222, [%rd54]; ld.global.nc.f32 %f223, [%rd52]; ld.global.nc.f32 %f224, [%rd53]; mul.f32 %f225, %f224, %f224; fma.rn.f32 %f226, %f223, %f223, %f225; fma.rn.f32 %f227, %f222, %f222, %f226; setp.eq.f32 %p33, %f227, 0f00000000; selp.f32 %f29, %f3, %f222, %p33; selp.f32 %f28, %f2, %f224, %p33; selp.f32 %f27, %f1, %f223, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs26, [%rd55]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r104, %rs30; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; cvt.u32.u16 %r108, %rs29; add.s32 %r109, %r107, %r108; mul.wide.s32 %rd56, %r109, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f228, [%rd57]; setp.eq.f32 %p34, %f228, 0f00000000; @%p34 bra $L__BB0_28; mul.f32 %f229, %f2, %f28; fma.rn.f32 %f230, %f1, %f27, %f229; fma.rn.f32 %f231, %f3, %f29, %f230; abs.f32 %f232, %f231; neg.f32 %f233, %f232; mov.f32 %f234, 0f3F000000; fma.rn.f32 %f235, %f234, %f233, %f234; rsqrt.approx.ftz.f32 %f236, %f235; mul.f32 %f237, %f235, %f236; mul.f32 %f238, %f236, 0f3F000000; neg.f32 %f239, %f237; fma.rn.f32 %f240, 
%f239, %f238, %f234; fma.rn.f32 %f241, %f237, %f240, %f237; setp.eq.f32 %p35, %f232, 0f3F800000; selp.f32 %f242, 0f00000000, %f241, %p35; setp.gt.f32 %p36, %f232, 0f3F0F5C29; selp.f32 %f243, %f242, %f232, %p36; mov.b32 %r110, %f243; mov.b32 %r111, %f231; and.b32 %r112, %r111, -2147483648; or.b32 %r113, %r112, %r110; mov.b32 %f244, %r113; mul.f32 %f245, %f244, %f244; mov.f32 %f246, 0f3C8B1ABB; mov.f32 %f247, 0f3D10ECEF; fma.rn.f32 %f248, %f247, %f245, %f246; mov.f32 %f249, 0f3CFC028C; fma.rn.f32 %f250, %f248, %f245, %f249; mov.f32 %f251, 0f3D372139; fma.rn.f32 %f252, %f250, %f245, %f251; mov.f32 %f253, 0f3D9993DB; fma.rn.f32 %f254, %f252, %f245, %f253; mov.f32 %f255, 0f3E2AAAC6; fma.rn.f32 %f256, %f254, %f245, %f255; mul.f32 %f257, %f256, %f245; fma.rn.f32 %f258, %f257, %f244, %f244; neg.f32 %f259, %f258; selp.f32 %f260, %f258, %f259, %p36; mov.f32 %f261, 0f3FD774EB; mov.f32 %f262, 0f3F6EE581; fma.rn.f32 %f263, %f262, %f261, %f260; setp.gt.f32 %p37, %f231, 0f3F0F5C29; selp.f32 %f264, %f258, %f263, %p37; add.f32 %f265, %f264, %f264; selp.f32 %f266, %f265, %f264, %p36; max.f32 %f313, %f313, %f266; $L__BB0_28: add.s32 %r26, %r3, 1; @%p32 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: add.s32 %r116, %r32, -1; min.s32 %r134, %r26, %r116; bra.uni $L__BB0_31; $L__BB0_29: rem.s32 %r114, %r26, %r32; add.s32 %r115, %r114, %r32; rem.s32 %r134, %r115, %r32; $L__BB0_31: mad.lo.s32 %r117, %r134, %r31, %r2; mad.lo.s32 %r118, %r117, %r30, %r1; cvt.s64.s32 %rd58, %r118; mul.wide.s32 %rd59, %r118, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f267, [%rd62]; ld.global.nc.f32 %f268, [%rd60]; ld.global.nc.f32 %f269, [%rd61]; mul.f32 %f270, %f269, %f269; fma.rn.f32 %f271, %f268, %f268, %f270; fma.rn.f32 %f272, %f267, %f267, %f271; setp.eq.f32 %p39, %f272, 0f00000000; selp.f32 %f34, %f3, %f267, %p39; selp.f32 %f33, %f2, %f269, %p39; selp.f32 %f32, %f1, %f268, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs31, [%rd63]; 
min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r119, %rs35; add.s32 %r120, %r119, 1; mul.lo.s32 %r121, %r120, %r119; shr.u32 %r122, %r121, 1; cvt.u32.u16 %r123, %rs34; add.s32 %r124, %r122, %r123; mul.wide.s32 %rd64, %r124, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f273, [%rd65]; setp.eq.f32 %p40, %f273, 0f00000000; @%p40 bra $L__BB0_33; mul.f32 %f274, %f2, %f33; fma.rn.f32 %f275, %f1, %f32, %f274; fma.rn.f32 %f276, %f3, %f34, %f275; abs.f32 %f277, %f276; neg.f32 %f278, %f277; mov.f32 %f279, 0f3F000000; fma.rn.f32 %f280, %f279, %f278, %f279; rsqrt.approx.ftz.f32 %f281, %f280; mul.f32 %f282, %f280, %f281; mul.f32 %f283, %f281, 0f3F000000; neg.f32 %f284, %f282; fma.rn.f32 %f285, %f284, %f283, %f279; fma.rn.f32 %f286, %f282, %f285, %f282; setp.eq.f32 %p41, %f277, 0f3F800000; selp.f32 %f287, 0f00000000, %f286, %p41; setp.gt.f32 %p42, %f277, 0f3F0F5C29; selp.f32 %f288, %f287, %f277, %p42; mov.b32 %r125, %f288; mov.b32 %r126, %f276; and.b32 %r127, %r126, -2147483648; or.b32 %r128, %r127, %r125; mov.b32 %f289, %r128; mul.f32 %f290, %f289, %f289; mov.f32 %f291, 0f3C8B1ABB; mov.f32 %f292, 0f3D10ECEF; fma.rn.f32 %f293, %f292, %f290, %f291; mov.f32 %f294, 0f3CFC028C; fma.rn.f32 %f295, %f293, %f290, %f294; mov.f32 %f296, 0f3D372139; fma.rn.f32 %f297, %f295, %f290, %f296; mov.f32 %f298, 0f3D9993DB; fma.rn.f32 %f299, %f297, %f290, %f298; mov.f32 %f300, 0f3E2AAAC6; fma.rn.f32 %f301, %f299, %f290, %f300; mul.f32 %f302, %f301, %f290; fma.rn.f32 %f303, %f302, %f289, %f289; neg.f32 %f304, %f303; selp.f32 %f305, %f303, %f304, %p42; mov.f32 %f306, 0f3FD774EB; mov.f32 %f307, 0f3F6EE581; fma.rn.f32 %f308, %f307, %f306, %f305; setp.gt.f32 %p43, %f276, 0f3F0F5C29; selp.f32 %f309, %f303, %f308, %p43; add.f32 %f310, %f309, %f309; selp.f32 %f311, %f310, %f309, %p42; max.f32 %f313, %f313, %f311; $L__BB0_33: cvta.to.global.u64 %rd66, %rd7; shl.b64 %rd67, %rd6, 2; add.s64 %rd68, %rd66, %rd67; st.global.f32 [%rd68], %f313; $L__BB0_34: ret; } ` setmaxangle_ptx_89 = ` 
.version 8.5 .target sm_89 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<36>; .reg .f32 %f<318>; .reg .b32 %r<135>; .reg .b64 %rd<69>; ld.param.u8 %rs5, [setmaxangle_param_9]; ld.param.u64 %rd7, [setmaxangle_param_0]; ld.param.u64 %rd8, [setmaxangle_param_1]; ld.param.u64 %rd9, [setmaxangle_param_2]; ld.param.u64 %rd10, [setmaxangle_param_3]; ld.param.u64 %rd11, [setmaxangle_param_4]; ld.param.u64 %rd12, [setmaxangle_param_5]; ld.param.u32 %r30, [setmaxangle_param_6]; ld.param.u32 %r31, [setmaxangle_param_7]; ld.param.u32 %r32, [setmaxangle_param_8]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd3, %rd10; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_34; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd13, %r43, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd15, %rd4, %rd13; add.s64 %rd16, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; ld.global.nc.f32 %f3, [%rd16]; mul.f32 %f37, %f2, %f2; fma.rn.f32 %f38, %f1, %f1, %f37; fma.rn.f32 %f39, %f3, %f3, %f38; setp.eq.f32 
%p6, %f39, 0f00000000; @%p6 bra $L__BB0_34; add.s64 %rd17, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd17]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r129, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r129, %r45, %r30; $L__BB0_5: add.s32 %r46, %r129, %r5; cvt.s64.s32 %rd18, %r46; mul.wide.s32 %rd19, %r46, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f41, [%rd22]; ld.global.nc.f32 %f42, [%rd20]; ld.global.nc.f32 %f43, [%rd21]; mul.f32 %f44, %f43, %f43; fma.rn.f32 %f45, %f42, %f42, %f44; fma.rn.f32 %f46, %f41, %f41, %f45; setp.eq.f32 %p8, %f46, 0f00000000; mov.f32 %f313, 0f00000000; selp.f32 %f9, %f3, %f41, %p8; selp.f32 %f8, %f2, %f43, %p8; selp.f32 %f7, %f1, %f42, %p8; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd24, %r52, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f47, [%rd25]; setp.eq.f32 %p9, %f47, 0f00000000; @%p9 bra $L__BB0_7; mul.f32 %f48, %f2, %f8; fma.rn.f32 %f49, %f1, %f7, %f48; fma.rn.f32 %f50, %f3, %f9, %f49; abs.f32 %f51, %f50; neg.f32 %f52, %f51; mov.f32 %f53, 0f3F000000; fma.rn.f32 %f54, %f53, %f52, %f53; rsqrt.approx.ftz.f32 %f55, %f54; mul.f32 %f56, %f54, %f55; mul.f32 %f57, %f55, 0f3F000000; neg.f32 %f58, %f56; fma.rn.f32 %f59, %f58, %f57, %f53; fma.rn.f32 %f60, %f56, %f59, %f56; setp.eq.f32 %p10, %f51, 0f3F800000; selp.f32 %f61, 0f00000000, %f60, %p10; mov.f32 %f62, 0f00000000; setp.gt.f32 %p11, %f51, 0f3F0F5C29; selp.f32 %f63, %f61, %f51, %p11; mov.b32 %r53, %f63; mov.b32 %r54, %f50; and.b32 %r55, %r54, -2147483648; or.b32 %r56, %r55, %r53; mov.b32 %f64, %r56; mul.f32 %f65, %f64, %f64; mov.f32 %f66, 0f3C8B1ABB; mov.f32 
%f67, 0f3D10ECEF; fma.rn.f32 %f68, %f67, %f65, %f66; mov.f32 %f69, 0f3CFC028C; fma.rn.f32 %f70, %f68, %f65, %f69; mov.f32 %f71, 0f3D372139; fma.rn.f32 %f72, %f70, %f65, %f71; mov.f32 %f73, 0f3D9993DB; fma.rn.f32 %f74, %f72, %f65, %f73; mov.f32 %f75, 0f3E2AAAC6; fma.rn.f32 %f76, %f74, %f65, %f75; mul.f32 %f77, %f76, %f65; fma.rn.f32 %f78, %f77, %f64, %f64; neg.f32 %f79, %f78; selp.f32 %f80, %f78, %f79, %p11; mov.f32 %f81, 0f3FD774EB; mov.f32 %f82, 0f3F6EE581; fma.rn.f32 %f83, %f82, %f81, %f80; setp.gt.f32 %p12, %f50, 0f3F0F5C29; selp.f32 %f84, %f78, %f83, %p12; add.f32 %f85, %f84, %f84; selp.f32 %f86, %f85, %f84, %p11; max.f32 %f313, %f62, %f86; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r59, %r30, -1; min.s32 %r130, %r10, %r59; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r30; add.s32 %r58, %r57, %r30; rem.s32 %r130, %r58, %r30; $L__BB0_10: add.s32 %r60, %r130, %r5; cvt.s64.s32 %rd26, %r60; mul.wide.s32 %rd27, %r60, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f87, [%rd30]; ld.global.nc.f32 %f88, [%rd28]; ld.global.nc.f32 %f89, [%rd29]; mul.f32 %f90, %f89, %f89; fma.rn.f32 %f91, %f88, %f88, %f90; fma.rn.f32 %f92, %f87, %f87, %f91; setp.eq.f32 %p14, %f92, 0f00000000; selp.f32 %f14, %f3, %f87, %p14; selp.f32 %f13, %f2, %f89, %p14; selp.f32 %f12, %f1, %f88, %p14; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs11, [%rd31]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r61, %rs15; add.s32 %r62, %r61, 1; mul.lo.s32 %r63, %r62, %r61; shr.u32 %r64, %r63, 1; cvt.u32.u16 %r65, %rs14; add.s32 %r66, %r64, %r65; mul.wide.s32 %rd32, %r66, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f93, [%rd33]; setp.eq.f32 %p15, %f93, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f94, %f2, %f13; fma.rn.f32 %f95, %f1, %f12, %f94; fma.rn.f32 %f96, %f3, %f14, %f95; abs.f32 %f97, %f96; neg.f32 %f98, %f97; mov.f32 %f99, 0f3F000000; fma.rn.f32 %f100, %f99, %f98, 
%f99; rsqrt.approx.ftz.f32 %f101, %f100; mul.f32 %f102, %f100, %f101; mul.f32 %f103, %f101, 0f3F000000; neg.f32 %f104, %f102; fma.rn.f32 %f105, %f104, %f103, %f99; fma.rn.f32 %f106, %f102, %f105, %f102; setp.eq.f32 %p16, %f97, 0f3F800000; selp.f32 %f107, 0f00000000, %f106, %p16; setp.gt.f32 %p17, %f97, 0f3F0F5C29; selp.f32 %f108, %f107, %f97, %p17; mov.b32 %r67, %f108; mov.b32 %r68, %f96; and.b32 %r69, %r68, -2147483648; or.b32 %r70, %r69, %r67; mov.b32 %f109, %r70; mul.f32 %f110, %f109, %f109; mov.f32 %f111, 0f3C8B1ABB; mov.f32 %f112, 0f3D10ECEF; fma.rn.f32 %f113, %f112, %f110, %f111; mov.f32 %f114, 0f3CFC028C; fma.rn.f32 %f115, %f113, %f110, %f114; mov.f32 %f116, 0f3D372139; fma.rn.f32 %f117, %f115, %f110, %f116; mov.f32 %f118, 0f3D9993DB; fma.rn.f32 %f119, %f117, %f110, %f118; mov.f32 %f120, 0f3E2AAAC6; fma.rn.f32 %f121, %f119, %f110, %f120; mul.f32 %f122, %f121, %f110; fma.rn.f32 %f123, %f122, %f109, %f109; neg.f32 %f124, %f123; selp.f32 %f125, %f123, %f124, %p17; mov.f32 %f126, 0f3FD774EB; mov.f32 %f127, 0f3F6EE581; fma.rn.f32 %f128, %f127, %f126, %f125; setp.gt.f32 %p18, %f96, 0f3F0F5C29; selp.f32 %f129, %f123, %f128, %p18; add.f32 %f130, %f129, %f129; selp.f32 %f131, %f130, %f129, %p17; max.f32 %f313, %f313, %f131; $L__BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r14, %r2, -1; @%p19 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: max.s32 %r131, %r14, 0; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r71, %r14, %r31; add.s32 %r72, %r71, %r31; rem.s32 %r131, %r72, %r31; $L__BB0_15: add.s32 %r73, %r131, %r4; mad.lo.s32 %r74, %r73, %r30, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f132, [%rd38]; ld.global.nc.f32 %f133, [%rd36]; ld.global.nc.f32 %f134, [%rd37]; mul.f32 %f135, %f134, %f134; fma.rn.f32 %f136, %f133, %f133, %f135; fma.rn.f32 %f137, %f132, %f132, %f136; setp.eq.f32 %p20, %f137, 0f00000000; selp.f32 %f19, %f3, %f132, 
%p20; selp.f32 %f18, %f2, %f134, %p20; selp.f32 %f17, %f1, %f133, %p20; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs16, [%rd39]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r75, %rs20; add.s32 %r76, %r75, 1; mul.lo.s32 %r77, %r76, %r75; shr.u32 %r78, %r77, 1; cvt.u32.u16 %r79, %rs19; add.s32 %r80, %r78, %r79; mul.wide.s32 %rd40, %r80, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f138, [%rd41]; setp.eq.f32 %p21, %f138, 0f00000000; @%p21 bra $L__BB0_17; mul.f32 %f139, %f2, %f18; fma.rn.f32 %f140, %f1, %f17, %f139; fma.rn.f32 %f141, %f3, %f19, %f140; abs.f32 %f142, %f141; neg.f32 %f143, %f142; mov.f32 %f144, 0f3F000000; fma.rn.f32 %f145, %f144, %f143, %f144; rsqrt.approx.ftz.f32 %f146, %f145; mul.f32 %f147, %f145, %f146; mul.f32 %f148, %f146, 0f3F000000; neg.f32 %f149, %f147; fma.rn.f32 %f150, %f149, %f148, %f144; fma.rn.f32 %f151, %f147, %f150, %f147; setp.eq.f32 %p22, %f142, 0f3F800000; selp.f32 %f152, 0f00000000, %f151, %p22; setp.gt.f32 %p23, %f142, 0f3F0F5C29; selp.f32 %f153, %f152, %f142, %p23; mov.b32 %r81, %f153; mov.b32 %r82, %f141; and.b32 %r83, %r82, -2147483648; or.b32 %r84, %r83, %r81; mov.b32 %f154, %r84; mul.f32 %f155, %f154, %f154; mov.f32 %f156, 0f3C8B1ABB; mov.f32 %f157, 0f3D10ECEF; fma.rn.f32 %f158, %f157, %f155, %f156; mov.f32 %f159, 0f3CFC028C; fma.rn.f32 %f160, %f158, %f155, %f159; mov.f32 %f161, 0f3D372139; fma.rn.f32 %f162, %f160, %f155, %f161; mov.f32 %f163, 0f3D9993DB; fma.rn.f32 %f164, %f162, %f155, %f163; mov.f32 %f165, 0f3E2AAAC6; fma.rn.f32 %f166, %f164, %f155, %f165; mul.f32 %f167, %f166, %f155; fma.rn.f32 %f168, %f167, %f154, %f154; neg.f32 %f169, %f168; selp.f32 %f170, %f168, %f169, %p23; mov.f32 %f171, 0f3FD774EB; mov.f32 %f172, 0f3F6EE581; fma.rn.f32 %f173, %f172, %f171, %f170; setp.gt.f32 %p24, %f141, 0f3F0F5C29; selp.f32 %f174, %f168, %f173, %p24; add.f32 %f175, %f174, %f174; selp.f32 %f176, %f175, %f174, %p23; max.f32 %f313, %f313, %f176; $L__BB0_17: add.s32 %r18, %r2, 1; @%p19 bra $L__BB0_19; 
bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r87, %r31, -1; min.s32 %r132, %r18, %r87; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r85, %r18, %r31; add.s32 %r86, %r85, %r31; rem.s32 %r132, %r86, %r31; $L__BB0_20: add.s32 %r88, %r132, %r4; mad.lo.s32 %r89, %r88, %r30, %r1; cvt.s64.s32 %rd42, %r89; mul.wide.s32 %rd43, %r89, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f177, [%rd46]; ld.global.nc.f32 %f178, [%rd44]; ld.global.nc.f32 %f179, [%rd45]; mul.f32 %f180, %f179, %f179; fma.rn.f32 %f181, %f178, %f178, %f180; fma.rn.f32 %f182, %f177, %f177, %f181; setp.eq.f32 %p26, %f182, 0f00000000; selp.f32 %f24, %f3, %f177, %p26; selp.f32 %f23, %f2, %f179, %p26; selp.f32 %f22, %f1, %f178, %p26; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs21, [%rd47]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r90, %rs25; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; cvt.u32.u16 %r94, %rs24; add.s32 %r95, %r93, %r94; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f183, [%rd49]; setp.eq.f32 %p27, %f183, 0f00000000; @%p27 bra $L__BB0_22; mul.f32 %f184, %f2, %f23; fma.rn.f32 %f185, %f1, %f22, %f184; fma.rn.f32 %f186, %f3, %f24, %f185; abs.f32 %f187, %f186; neg.f32 %f188, %f187; mov.f32 %f189, 0f3F000000; fma.rn.f32 %f190, %f189, %f188, %f189; rsqrt.approx.ftz.f32 %f191, %f190; mul.f32 %f192, %f190, %f191; mul.f32 %f193, %f191, 0f3F000000; neg.f32 %f194, %f192; fma.rn.f32 %f195, %f194, %f193, %f189; fma.rn.f32 %f196, %f192, %f195, %f192; setp.eq.f32 %p28, %f187, 0f3F800000; selp.f32 %f197, 0f00000000, %f196, %p28; setp.gt.f32 %p29, %f187, 0f3F0F5C29; selp.f32 %f198, %f197, %f187, %p29; mov.b32 %r96, %f198; mov.b32 %r97, %f186; and.b32 %r98, %r97, -2147483648; or.b32 %r99, %r98, %r96; mov.b32 %f199, %r99; mul.f32 %f200, %f199, %f199; mov.f32 %f201, 0f3C8B1ABB; mov.f32 %f202, 0f3D10ECEF; fma.rn.f32 %f203, %f202, %f200, %f201; mov.f32 %f204, 0f3CFC028C; fma.rn.f32 
%f205, %f203, %f200, %f204; mov.f32 %f206, 0f3D372139; fma.rn.f32 %f207, %f205, %f200, %f206; mov.f32 %f208, 0f3D9993DB; fma.rn.f32 %f209, %f207, %f200, %f208; mov.f32 %f210, 0f3E2AAAC6; fma.rn.f32 %f211, %f209, %f200, %f210; mul.f32 %f212, %f211, %f200; fma.rn.f32 %f213, %f212, %f199, %f199; neg.f32 %f214, %f213; selp.f32 %f215, %f213, %f214, %p29; mov.f32 %f216, 0f3FD774EB; mov.f32 %f217, 0f3F6EE581; fma.rn.f32 %f218, %f217, %f216, %f215; setp.gt.f32 %p30, %f186, 0f3F0F5C29; selp.f32 %f219, %f213, %f218, %p30; add.f32 %f220, %f219, %f219; selp.f32 %f221, %f220, %f219, %p29; max.f32 %f313, %f313, %f221; $L__BB0_22: setp.eq.s32 %p31, %r32, 1; @%p31 bra $L__BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r22, %r3, -1; @%p32 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: max.s32 %r133, %r22, 0; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r100, %r22, %r32; add.s32 %r101, %r100, %r32; rem.s32 %r133, %r101, %r32; $L__BB0_26: mad.lo.s32 %r102, %r133, %r31, %r2; mad.lo.s32 %r103, %r102, %r30, %r1; cvt.s64.s32 %rd50, %r103; mul.wide.s32 %rd51, %r103, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f222, [%rd54]; ld.global.nc.f32 %f223, [%rd52]; ld.global.nc.f32 %f224, [%rd53]; mul.f32 %f225, %f224, %f224; fma.rn.f32 %f226, %f223, %f223, %f225; fma.rn.f32 %f227, %f222, %f222, %f226; setp.eq.f32 %p33, %f227, 0f00000000; selp.f32 %f29, %f3, %f222, %p33; selp.f32 %f28, %f2, %f224, %p33; selp.f32 %f27, %f1, %f223, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs26, [%rd55]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r104, %rs30; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; cvt.u32.u16 %r108, %rs29; add.s32 %r109, %r107, %r108; mul.wide.s32 %rd56, %r109, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f228, [%rd57]; setp.eq.f32 %p34, %f228, 0f00000000; @%p34 bra $L__BB0_28; mul.f32 %f229, %f2, %f28; fma.rn.f32 %f230, %f1, %f27, %f229; 
fma.rn.f32 %f231, %f3, %f29, %f230; abs.f32 %f232, %f231; neg.f32 %f233, %f232; mov.f32 %f234, 0f3F000000; fma.rn.f32 %f235, %f234, %f233, %f234; rsqrt.approx.ftz.f32 %f236, %f235; mul.f32 %f237, %f235, %f236; mul.f32 %f238, %f236, 0f3F000000; neg.f32 %f239, %f237; fma.rn.f32 %f240, %f239, %f238, %f234; fma.rn.f32 %f241, %f237, %f240, %f237; setp.eq.f32 %p35, %f232, 0f3F800000; selp.f32 %f242, 0f00000000, %f241, %p35; setp.gt.f32 %p36, %f232, 0f3F0F5C29; selp.f32 %f243, %f242, %f232, %p36; mov.b32 %r110, %f243; mov.b32 %r111, %f231; and.b32 %r112, %r111, -2147483648; or.b32 %r113, %r112, %r110; mov.b32 %f244, %r113; mul.f32 %f245, %f244, %f244; mov.f32 %f246, 0f3C8B1ABB; mov.f32 %f247, 0f3D10ECEF; fma.rn.f32 %f248, %f247, %f245, %f246; mov.f32 %f249, 0f3CFC028C; fma.rn.f32 %f250, %f248, %f245, %f249; mov.f32 %f251, 0f3D372139; fma.rn.f32 %f252, %f250, %f245, %f251; mov.f32 %f253, 0f3D9993DB; fma.rn.f32 %f254, %f252, %f245, %f253; mov.f32 %f255, 0f3E2AAAC6; fma.rn.f32 %f256, %f254, %f245, %f255; mul.f32 %f257, %f256, %f245; fma.rn.f32 %f258, %f257, %f244, %f244; neg.f32 %f259, %f258; selp.f32 %f260, %f258, %f259, %p36; mov.f32 %f261, 0f3FD774EB; mov.f32 %f262, 0f3F6EE581; fma.rn.f32 %f263, %f262, %f261, %f260; setp.gt.f32 %p37, %f231, 0f3F0F5C29; selp.f32 %f264, %f258, %f263, %p37; add.f32 %f265, %f264, %f264; selp.f32 %f266, %f265, %f264, %p36; max.f32 %f313, %f313, %f266; $L__BB0_28: add.s32 %r26, %r3, 1; @%p32 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: add.s32 %r116, %r32, -1; min.s32 %r134, %r26, %r116; bra.uni $L__BB0_31; $L__BB0_29: rem.s32 %r114, %r26, %r32; add.s32 %r115, %r114, %r32; rem.s32 %r134, %r115, %r32; $L__BB0_31: mad.lo.s32 %r117, %r134, %r31, %r2; mad.lo.s32 %r118, %r117, %r30, %r1; cvt.s64.s32 %rd58, %r118; mul.wide.s32 %rd59, %r118, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f267, [%rd62]; ld.global.nc.f32 %f268, [%rd60]; ld.global.nc.f32 %f269, [%rd61]; mul.f32 %f270, %f269, 
%f269; fma.rn.f32 %f271, %f268, %f268, %f270; fma.rn.f32 %f272, %f267, %f267, %f271; setp.eq.f32 %p39, %f272, 0f00000000; selp.f32 %f34, %f3, %f267, %p39; selp.f32 %f33, %f2, %f269, %p39; selp.f32 %f32, %f1, %f268, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs31, [%rd63]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r119, %rs35; add.s32 %r120, %r119, 1; mul.lo.s32 %r121, %r120, %r119; shr.u32 %r122, %r121, 1; cvt.u32.u16 %r123, %rs34; add.s32 %r124, %r122, %r123; mul.wide.s32 %rd64, %r124, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f273, [%rd65]; setp.eq.f32 %p40, %f273, 0f00000000; @%p40 bra $L__BB0_33; mul.f32 %f274, %f2, %f33; fma.rn.f32 %f275, %f1, %f32, %f274; fma.rn.f32 %f276, %f3, %f34, %f275; abs.f32 %f277, %f276; neg.f32 %f278, %f277; mov.f32 %f279, 0f3F000000; fma.rn.f32 %f280, %f279, %f278, %f279; rsqrt.approx.ftz.f32 %f281, %f280; mul.f32 %f282, %f280, %f281; mul.f32 %f283, %f281, 0f3F000000; neg.f32 %f284, %f282; fma.rn.f32 %f285, %f284, %f283, %f279; fma.rn.f32 %f286, %f282, %f285, %f282; setp.eq.f32 %p41, %f277, 0f3F800000; selp.f32 %f287, 0f00000000, %f286, %p41; setp.gt.f32 %p42, %f277, 0f3F0F5C29; selp.f32 %f288, %f287, %f277, %p42; mov.b32 %r125, %f288; mov.b32 %r126, %f276; and.b32 %r127, %r126, -2147483648; or.b32 %r128, %r127, %r125; mov.b32 %f289, %r128; mul.f32 %f290, %f289, %f289; mov.f32 %f291, 0f3C8B1ABB; mov.f32 %f292, 0f3D10ECEF; fma.rn.f32 %f293, %f292, %f290, %f291; mov.f32 %f294, 0f3CFC028C; fma.rn.f32 %f295, %f293, %f290, %f294; mov.f32 %f296, 0f3D372139; fma.rn.f32 %f297, %f295, %f290, %f296; mov.f32 %f298, 0f3D9993DB; fma.rn.f32 %f299, %f297, %f290, %f298; mov.f32 %f300, 0f3E2AAAC6; fma.rn.f32 %f301, %f299, %f290, %f300; mul.f32 %f302, %f301, %f290; fma.rn.f32 %f303, %f302, %f289, %f289; neg.f32 %f304, %f303; selp.f32 %f305, %f303, %f304, %p42; mov.f32 %f306, 0f3FD774EB; mov.f32 %f307, 0f3F6EE581; fma.rn.f32 %f308, %f307, %f306, %f305; setp.gt.f32 %p43, %f276, 0f3F0F5C29; selp.f32 %f309, 
%f303, %f308, %p43; add.f32 %f310, %f309, %f309; selp.f32 %f311, %f310, %f309, %p42; max.f32 %f313, %f313, %f311; $L__BB0_33: cvta.to.global.u64 %rd66, %rd7; shl.b64 %rd67, %rd6, 2; add.s64 %rd68, %rd66, %rd67; st.global.f32 [%rd68], %f313; $L__BB0_34: ret; } ` setmaxangle_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<36>; .reg .f32 %f<318>; .reg .b32 %r<135>; .reg .b64 %rd<69>; ld.param.u8 %rs5, [setmaxangle_param_9]; ld.param.u64 %rd7, [setmaxangle_param_0]; ld.param.u64 %rd8, [setmaxangle_param_1]; ld.param.u64 %rd9, [setmaxangle_param_2]; ld.param.u64 %rd10, [setmaxangle_param_3]; ld.param.u64 %rd11, [setmaxangle_param_4]; ld.param.u64 %rd12, [setmaxangle_param_5]; ld.param.u32 %r30, [setmaxangle_param_6]; ld.param.u32 %r31, [setmaxangle_param_7]; ld.param.u32 %r32, [setmaxangle_param_8]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd3, %rd10; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r33, %ntid.x; mov.u32 %r34, %ctaid.x; mov.u32 %r35, %tid.x; mad.lo.s32 %r1, %r34, %r33, %r35; mov.u32 %r36, %ntid.y; mov.u32 %r37, %ctaid.y; mov.u32 %r38, %tid.y; mad.lo.s32 %r2, %r37, %r36, %r38; mov.u32 %r39, %ntid.z; mov.u32 %r40, %ctaid.z; mov.u32 %r41, %tid.z; mad.lo.s32 %r3, %r40, %r39, %r41; setp.ge.s32 %p1, %r1, %r30; setp.ge.s32 %p2, %r2, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r32; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_34; mul.lo.s32 %r4, %r3, %r31; add.s32 %r42, %r4, %r2; mul.lo.s32 %r5, %r42, %r30; add.s32 %r43, %r5, %r1; cvt.s64.s32 %rd6, %r43; mul.wide.s32 %rd13, %r43, 
4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd15, %rd4, %rd13; add.s64 %rd16, %rd3, %rd13; ld.global.nc.f32 %f1, [%rd14]; ld.global.nc.f32 %f2, [%rd15]; ld.global.nc.f32 %f3, [%rd16]; mul.f32 %f37, %f2, %f2; fma.rn.f32 %f38, %f1, %f1, %f37; fma.rn.f32 %f39, %f3, %f3, %f38; setp.eq.f32 %p6, %f39, 0f00000000; @%p6 bra $L__BB0_34; add.s64 %rd17, %rd2, %rd6; ld.global.nc.u8 %rs1, [%rd17]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r6, %r1, -1; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r129, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r44, %r6, %r30; add.s32 %r45, %r44, %r30; rem.s32 %r129, %r45, %r30; $L__BB0_5: add.s32 %r46, %r129, %r5; cvt.s64.s32 %rd18, %r46; mul.wide.s32 %rd19, %r46, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f41, [%rd22]; ld.global.nc.f32 %f42, [%rd20]; ld.global.nc.f32 %f43, [%rd21]; mul.f32 %f44, %f43, %f43; fma.rn.f32 %f45, %f42, %f42, %f44; fma.rn.f32 %f46, %f41, %f41, %f45; setp.eq.f32 %p8, %f46, 0f00000000; mov.f32 %f313, 0f00000000; selp.f32 %f9, %f3, %f41, %p8; selp.f32 %f8, %f2, %f43, %p8; selp.f32 %f7, %f1, %f42, %p8; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; min.u16 %rs9, %rs6, %rs1; max.u16 %rs10, %rs6, %rs1; cvt.u32.u16 %r47, %rs10; add.s32 %r48, %r47, 1; mul.lo.s32 %r49, %r48, %r47; shr.u32 %r50, %r49, 1; cvt.u32.u16 %r51, %rs9; add.s32 %r52, %r50, %r51; mul.wide.s32 %rd24, %r52, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f47, [%rd25]; setp.eq.f32 %p9, %f47, 0f00000000; @%p9 bra $L__BB0_7; mul.f32 %f48, %f2, %f8; fma.rn.f32 %f49, %f1, %f7, %f48; fma.rn.f32 %f50, %f3, %f9, %f49; abs.f32 %f51, %f50; neg.f32 %f52, %f51; mov.f32 %f53, 0f3F000000; fma.rn.f32 %f54, %f53, %f52, %f53; rsqrt.approx.ftz.f32 %f55, %f54; mul.f32 %f56, %f54, %f55; mul.f32 %f57, %f55, 0f3F000000; neg.f32 %f58, %f56; fma.rn.f32 %f59, %f58, %f57, %f53; fma.rn.f32 %f60, %f56, %f59, %f56; setp.eq.f32 %p10, %f51, 0f3F800000; selp.f32 %f61, 0f00000000, 
%f60, %p10; mov.f32 %f62, 0f00000000; setp.gt.f32 %p11, %f51, 0f3F0F5C29; selp.f32 %f63, %f61, %f51, %p11; mov.b32 %r53, %f63; mov.b32 %r54, %f50; and.b32 %r55, %r54, -2147483648; or.b32 %r56, %r55, %r53; mov.b32 %f64, %r56; mul.f32 %f65, %f64, %f64; mov.f32 %f66, 0f3C8B1ABB; mov.f32 %f67, 0f3D10ECEF; fma.rn.f32 %f68, %f67, %f65, %f66; mov.f32 %f69, 0f3CFC028C; fma.rn.f32 %f70, %f68, %f65, %f69; mov.f32 %f71, 0f3D372139; fma.rn.f32 %f72, %f70, %f65, %f71; mov.f32 %f73, 0f3D9993DB; fma.rn.f32 %f74, %f72, %f65, %f73; mov.f32 %f75, 0f3E2AAAC6; fma.rn.f32 %f76, %f74, %f65, %f75; mul.f32 %f77, %f76, %f65; fma.rn.f32 %f78, %f77, %f64, %f64; neg.f32 %f79, %f78; selp.f32 %f80, %f78, %f79, %p11; mov.f32 %f81, 0f3FD774EB; mov.f32 %f82, 0f3F6EE581; fma.rn.f32 %f83, %f82, %f81, %f80; setp.gt.f32 %p12, %f50, 0f3F0F5C29; selp.f32 %f84, %f78, %f83, %p12; add.f32 %f85, %f84, %f84; selp.f32 %f86, %f85, %f84, %p11; max.f32 %f313, %f62, %f86; $L__BB0_7: add.s32 %r10, %r1, 1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: add.s32 %r59, %r30, -1; min.s32 %r130, %r10, %r59; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r30; add.s32 %r58, %r57, %r30; rem.s32 %r130, %r58, %r30; $L__BB0_10: add.s32 %r60, %r130, %r5; cvt.s64.s32 %rd26, %r60; mul.wide.s32 %rd27, %r60, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f87, [%rd30]; ld.global.nc.f32 %f88, [%rd28]; ld.global.nc.f32 %f89, [%rd29]; mul.f32 %f90, %f89, %f89; fma.rn.f32 %f91, %f88, %f88, %f90; fma.rn.f32 %f92, %f87, %f87, %f91; setp.eq.f32 %p14, %f92, 0f00000000; selp.f32 %f14, %f3, %f87, %p14; selp.f32 %f13, %f2, %f89, %p14; selp.f32 %f12, %f1, %f88, %p14; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs11, [%rd31]; min.u16 %rs14, %rs11, %rs1; max.u16 %rs15, %rs11, %rs1; cvt.u32.u16 %r61, %rs15; add.s32 %r62, %r61, 1; mul.lo.s32 %r63, %r62, %r61; shr.u32 %r64, %r63, 1; cvt.u32.u16 %r65, %rs14; add.s32 %r66, %r64, %r65; mul.wide.s32 %rd32, %r66, 4; add.s64 %rd33, %rd1, 
%rd32; ld.global.nc.f32 %f93, [%rd33]; setp.eq.f32 %p15, %f93, 0f00000000; @%p15 bra $L__BB0_12; mul.f32 %f94, %f2, %f13; fma.rn.f32 %f95, %f1, %f12, %f94; fma.rn.f32 %f96, %f3, %f14, %f95; abs.f32 %f97, %f96; neg.f32 %f98, %f97; mov.f32 %f99, 0f3F000000; fma.rn.f32 %f100, %f99, %f98, %f99; rsqrt.approx.ftz.f32 %f101, %f100; mul.f32 %f102, %f100, %f101; mul.f32 %f103, %f101, 0f3F000000; neg.f32 %f104, %f102; fma.rn.f32 %f105, %f104, %f103, %f99; fma.rn.f32 %f106, %f102, %f105, %f102; setp.eq.f32 %p16, %f97, 0f3F800000; selp.f32 %f107, 0f00000000, %f106, %p16; setp.gt.f32 %p17, %f97, 0f3F0F5C29; selp.f32 %f108, %f107, %f97, %p17; mov.b32 %r67, %f108; mov.b32 %r68, %f96; and.b32 %r69, %r68, -2147483648; or.b32 %r70, %r69, %r67; mov.b32 %f109, %r70; mul.f32 %f110, %f109, %f109; mov.f32 %f111, 0f3C8B1ABB; mov.f32 %f112, 0f3D10ECEF; fma.rn.f32 %f113, %f112, %f110, %f111; mov.f32 %f114, 0f3CFC028C; fma.rn.f32 %f115, %f113, %f110, %f114; mov.f32 %f116, 0f3D372139; fma.rn.f32 %f117, %f115, %f110, %f116; mov.f32 %f118, 0f3D9993DB; fma.rn.f32 %f119, %f117, %f110, %f118; mov.f32 %f120, 0f3E2AAAC6; fma.rn.f32 %f121, %f119, %f110, %f120; mul.f32 %f122, %f121, %f110; fma.rn.f32 %f123, %f122, %f109, %f109; neg.f32 %f124, %f123; selp.f32 %f125, %f123, %f124, %p17; mov.f32 %f126, 0f3FD774EB; mov.f32 %f127, 0f3F6EE581; fma.rn.f32 %f128, %f127, %f126, %f125; setp.gt.f32 %p18, %f96, 0f3F0F5C29; selp.f32 %f129, %f123, %f128, %p18; add.f32 %f130, %f129, %f129; selp.f32 %f131, %f130, %f129, %p17; max.f32 %f313, %f313, %f131; $L__BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r14, %r2, -1; @%p19 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: max.s32 %r131, %r14, 0; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r71, %r14, %r31; add.s32 %r72, %r71, %r31; rem.s32 %r131, %r72, %r31; $L__BB0_15: add.s32 %r73, %r131, %r4; mad.lo.s32 %r74, %r73, %r30, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, 
%rd3, %rd35; ld.global.nc.f32 %f132, [%rd38]; ld.global.nc.f32 %f133, [%rd36]; ld.global.nc.f32 %f134, [%rd37]; mul.f32 %f135, %f134, %f134; fma.rn.f32 %f136, %f133, %f133, %f135; fma.rn.f32 %f137, %f132, %f132, %f136; setp.eq.f32 %p20, %f137, 0f00000000; selp.f32 %f19, %f3, %f132, %p20; selp.f32 %f18, %f2, %f134, %p20; selp.f32 %f17, %f1, %f133, %p20; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs16, [%rd39]; min.u16 %rs19, %rs16, %rs1; max.u16 %rs20, %rs16, %rs1; cvt.u32.u16 %r75, %rs20; add.s32 %r76, %r75, 1; mul.lo.s32 %r77, %r76, %r75; shr.u32 %r78, %r77, 1; cvt.u32.u16 %r79, %rs19; add.s32 %r80, %r78, %r79; mul.wide.s32 %rd40, %r80, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f138, [%rd41]; setp.eq.f32 %p21, %f138, 0f00000000; @%p21 bra $L__BB0_17; mul.f32 %f139, %f2, %f18; fma.rn.f32 %f140, %f1, %f17, %f139; fma.rn.f32 %f141, %f3, %f19, %f140; abs.f32 %f142, %f141; neg.f32 %f143, %f142; mov.f32 %f144, 0f3F000000; fma.rn.f32 %f145, %f144, %f143, %f144; rsqrt.approx.ftz.f32 %f146, %f145; mul.f32 %f147, %f145, %f146; mul.f32 %f148, %f146, 0f3F000000; neg.f32 %f149, %f147; fma.rn.f32 %f150, %f149, %f148, %f144; fma.rn.f32 %f151, %f147, %f150, %f147; setp.eq.f32 %p22, %f142, 0f3F800000; selp.f32 %f152, 0f00000000, %f151, %p22; setp.gt.f32 %p23, %f142, 0f3F0F5C29; selp.f32 %f153, %f152, %f142, %p23; mov.b32 %r81, %f153; mov.b32 %r82, %f141; and.b32 %r83, %r82, -2147483648; or.b32 %r84, %r83, %r81; mov.b32 %f154, %r84; mul.f32 %f155, %f154, %f154; mov.f32 %f156, 0f3C8B1ABB; mov.f32 %f157, 0f3D10ECEF; fma.rn.f32 %f158, %f157, %f155, %f156; mov.f32 %f159, 0f3CFC028C; fma.rn.f32 %f160, %f158, %f155, %f159; mov.f32 %f161, 0f3D372139; fma.rn.f32 %f162, %f160, %f155, %f161; mov.f32 %f163, 0f3D9993DB; fma.rn.f32 %f164, %f162, %f155, %f163; mov.f32 %f165, 0f3E2AAAC6; fma.rn.f32 %f166, %f164, %f155, %f165; mul.f32 %f167, %f166, %f155; fma.rn.f32 %f168, %f167, %f154, %f154; neg.f32 %f169, %f168; selp.f32 %f170, %f168, %f169, %p23; mov.f32 %f171, 0f3FD774EB; mov.f32 
%f172, 0f3F6EE581; fma.rn.f32 %f173, %f172, %f171, %f170; setp.gt.f32 %p24, %f141, 0f3F0F5C29; selp.f32 %f174, %f168, %f173, %p24; add.f32 %f175, %f174, %f174; selp.f32 %f176, %f175, %f174, %p23; max.f32 %f313, %f313, %f176; $L__BB0_17: add.s32 %r18, %r2, 1; @%p19 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r87, %r31, -1; min.s32 %r132, %r18, %r87; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r85, %r18, %r31; add.s32 %r86, %r85, %r31; rem.s32 %r132, %r86, %r31; $L__BB0_20: add.s32 %r88, %r132, %r4; mad.lo.s32 %r89, %r88, %r30, %r1; cvt.s64.s32 %rd42, %r89; mul.wide.s32 %rd43, %r89, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f177, [%rd46]; ld.global.nc.f32 %f178, [%rd44]; ld.global.nc.f32 %f179, [%rd45]; mul.f32 %f180, %f179, %f179; fma.rn.f32 %f181, %f178, %f178, %f180; fma.rn.f32 %f182, %f177, %f177, %f181; setp.eq.f32 %p26, %f182, 0f00000000; selp.f32 %f24, %f3, %f177, %p26; selp.f32 %f23, %f2, %f179, %p26; selp.f32 %f22, %f1, %f178, %p26; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs21, [%rd47]; min.u16 %rs24, %rs21, %rs1; max.u16 %rs25, %rs21, %rs1; cvt.u32.u16 %r90, %rs25; add.s32 %r91, %r90, 1; mul.lo.s32 %r92, %r91, %r90; shr.u32 %r93, %r92, 1; cvt.u32.u16 %r94, %rs24; add.s32 %r95, %r93, %r94; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f183, [%rd49]; setp.eq.f32 %p27, %f183, 0f00000000; @%p27 bra $L__BB0_22; mul.f32 %f184, %f2, %f23; fma.rn.f32 %f185, %f1, %f22, %f184; fma.rn.f32 %f186, %f3, %f24, %f185; abs.f32 %f187, %f186; neg.f32 %f188, %f187; mov.f32 %f189, 0f3F000000; fma.rn.f32 %f190, %f189, %f188, %f189; rsqrt.approx.ftz.f32 %f191, %f190; mul.f32 %f192, %f190, %f191; mul.f32 %f193, %f191, 0f3F000000; neg.f32 %f194, %f192; fma.rn.f32 %f195, %f194, %f193, %f189; fma.rn.f32 %f196, %f192, %f195, %f192; setp.eq.f32 %p28, %f187, 0f3F800000; selp.f32 %f197, 0f00000000, %f196, %p28; setp.gt.f32 %p29, %f187, 0f3F0F5C29; selp.f32 %f198, %f197, %f187, %p29; 
mov.b32 %r96, %f198; mov.b32 %r97, %f186; and.b32 %r98, %r97, -2147483648; or.b32 %r99, %r98, %r96; mov.b32 %f199, %r99; mul.f32 %f200, %f199, %f199; mov.f32 %f201, 0f3C8B1ABB; mov.f32 %f202, 0f3D10ECEF; fma.rn.f32 %f203, %f202, %f200, %f201; mov.f32 %f204, 0f3CFC028C; fma.rn.f32 %f205, %f203, %f200, %f204; mov.f32 %f206, 0f3D372139; fma.rn.f32 %f207, %f205, %f200, %f206; mov.f32 %f208, 0f3D9993DB; fma.rn.f32 %f209, %f207, %f200, %f208; mov.f32 %f210, 0f3E2AAAC6; fma.rn.f32 %f211, %f209, %f200, %f210; mul.f32 %f212, %f211, %f200; fma.rn.f32 %f213, %f212, %f199, %f199; neg.f32 %f214, %f213; selp.f32 %f215, %f213, %f214, %p29; mov.f32 %f216, 0f3FD774EB; mov.f32 %f217, 0f3F6EE581; fma.rn.f32 %f218, %f217, %f216, %f215; setp.gt.f32 %p30, %f186, 0f3F0F5C29; selp.f32 %f219, %f213, %f218, %p30; add.f32 %f220, %f219, %f219; selp.f32 %f221, %f220, %f219, %p29; max.f32 %f313, %f313, %f221; $L__BB0_22: setp.eq.s32 %p31, %r32, 1; @%p31 bra $L__BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r22, %r3, -1; @%p32 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: max.s32 %r133, %r22, 0; bra.uni $L__BB0_26; $L__BB0_24: rem.s32 %r100, %r22, %r32; add.s32 %r101, %r100, %r32; rem.s32 %r133, %r101, %r32; $L__BB0_26: mad.lo.s32 %r102, %r133, %r31, %r2; mad.lo.s32 %r103, %r102, %r30, %r1; cvt.s64.s32 %rd50, %r103; mul.wide.s32 %rd51, %r103, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f222, [%rd54]; ld.global.nc.f32 %f223, [%rd52]; ld.global.nc.f32 %f224, [%rd53]; mul.f32 %f225, %f224, %f224; fma.rn.f32 %f226, %f223, %f223, %f225; fma.rn.f32 %f227, %f222, %f222, %f226; setp.eq.f32 %p33, %f227, 0f00000000; selp.f32 %f29, %f3, %f222, %p33; selp.f32 %f28, %f2, %f224, %p33; selp.f32 %f27, %f1, %f223, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs26, [%rd55]; min.u16 %rs29, %rs26, %rs1; max.u16 %rs30, %rs26, %rs1; cvt.u32.u16 %r104, %rs30; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, 
%r106, 1; cvt.u32.u16 %r108, %rs29; add.s32 %r109, %r107, %r108; mul.wide.s32 %rd56, %r109, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f228, [%rd57]; setp.eq.f32 %p34, %f228, 0f00000000; @%p34 bra $L__BB0_28; mul.f32 %f229, %f2, %f28; fma.rn.f32 %f230, %f1, %f27, %f229; fma.rn.f32 %f231, %f3, %f29, %f230; abs.f32 %f232, %f231; neg.f32 %f233, %f232; mov.f32 %f234, 0f3F000000; fma.rn.f32 %f235, %f234, %f233, %f234; rsqrt.approx.ftz.f32 %f236, %f235; mul.f32 %f237, %f235, %f236; mul.f32 %f238, %f236, 0f3F000000; neg.f32 %f239, %f237; fma.rn.f32 %f240, %f239, %f238, %f234; fma.rn.f32 %f241, %f237, %f240, %f237; setp.eq.f32 %p35, %f232, 0f3F800000; selp.f32 %f242, 0f00000000, %f241, %p35; setp.gt.f32 %p36, %f232, 0f3F0F5C29; selp.f32 %f243, %f242, %f232, %p36; mov.b32 %r110, %f243; mov.b32 %r111, %f231; and.b32 %r112, %r111, -2147483648; or.b32 %r113, %r112, %r110; mov.b32 %f244, %r113; mul.f32 %f245, %f244, %f244; mov.f32 %f246, 0f3C8B1ABB; mov.f32 %f247, 0f3D10ECEF; fma.rn.f32 %f248, %f247, %f245, %f246; mov.f32 %f249, 0f3CFC028C; fma.rn.f32 %f250, %f248, %f245, %f249; mov.f32 %f251, 0f3D372139; fma.rn.f32 %f252, %f250, %f245, %f251; mov.f32 %f253, 0f3D9993DB; fma.rn.f32 %f254, %f252, %f245, %f253; mov.f32 %f255, 0f3E2AAAC6; fma.rn.f32 %f256, %f254, %f245, %f255; mul.f32 %f257, %f256, %f245; fma.rn.f32 %f258, %f257, %f244, %f244; neg.f32 %f259, %f258; selp.f32 %f260, %f258, %f259, %p36; mov.f32 %f261, 0f3FD774EB; mov.f32 %f262, 0f3F6EE581; fma.rn.f32 %f263, %f262, %f261, %f260; setp.gt.f32 %p37, %f231, 0f3F0F5C29; selp.f32 %f264, %f258, %f263, %p37; add.f32 %f265, %f264, %f264; selp.f32 %f266, %f265, %f264, %p36; max.f32 %f313, %f313, %f266; $L__BB0_28: add.s32 %r26, %r3, 1; @%p32 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: add.s32 %r116, %r32, -1; min.s32 %r134, %r26, %r116; bra.uni $L__BB0_31; $L__BB0_29: rem.s32 %r114, %r26, %r32; add.s32 %r115, %r114, %r32; rem.s32 %r134, %r115, %r32; $L__BB0_31: mad.lo.s32 %r117, %r134, %r31, %r2; mad.lo.s32 %r118, 
%r117, %r30, %r1; cvt.s64.s32 %rd58, %r118; mul.wide.s32 %rd59, %r118, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f267, [%rd62]; ld.global.nc.f32 %f268, [%rd60]; ld.global.nc.f32 %f269, [%rd61]; mul.f32 %f270, %f269, %f269; fma.rn.f32 %f271, %f268, %f268, %f270; fma.rn.f32 %f272, %f267, %f267, %f271; setp.eq.f32 %p39, %f272, 0f00000000; selp.f32 %f34, %f3, %f267, %p39; selp.f32 %f33, %f2, %f269, %p39; selp.f32 %f32, %f1, %f268, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs31, [%rd63]; min.u16 %rs34, %rs31, %rs1; max.u16 %rs35, %rs31, %rs1; cvt.u32.u16 %r119, %rs35; add.s32 %r120, %r119, 1; mul.lo.s32 %r121, %r120, %r119; shr.u32 %r122, %r121, 1; cvt.u32.u16 %r123, %rs34; add.s32 %r124, %r122, %r123; mul.wide.s32 %rd64, %r124, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f273, [%rd65]; setp.eq.f32 %p40, %f273, 0f00000000; @%p40 bra $L__BB0_33; mul.f32 %f274, %f2, %f33; fma.rn.f32 %f275, %f1, %f32, %f274; fma.rn.f32 %f276, %f3, %f34, %f275; abs.f32 %f277, %f276; neg.f32 %f278, %f277; mov.f32 %f279, 0f3F000000; fma.rn.f32 %f280, %f279, %f278, %f279; rsqrt.approx.ftz.f32 %f281, %f280; mul.f32 %f282, %f280, %f281; mul.f32 %f283, %f281, 0f3F000000; neg.f32 %f284, %f282; fma.rn.f32 %f285, %f284, %f283, %f279; fma.rn.f32 %f286, %f282, %f285, %f282; setp.eq.f32 %p41, %f277, 0f3F800000; selp.f32 %f287, 0f00000000, %f286, %p41; setp.gt.f32 %p42, %f277, 0f3F0F5C29; selp.f32 %f288, %f287, %f277, %p42; mov.b32 %r125, %f288; mov.b32 %r126, %f276; and.b32 %r127, %r126, -2147483648; or.b32 %r128, %r127, %r125; mov.b32 %f289, %r128; mul.f32 %f290, %f289, %f289; mov.f32 %f291, 0f3C8B1ABB; mov.f32 %f292, 0f3D10ECEF; fma.rn.f32 %f293, %f292, %f290, %f291; mov.f32 %f294, 0f3CFC028C; fma.rn.f32 %f295, %f293, %f290, %f294; mov.f32 %f296, 0f3D372139; fma.rn.f32 %f297, %f295, %f290, %f296; mov.f32 %f298, 0f3D9993DB; fma.rn.f32 %f299, %f297, %f290, %f298; mov.f32 %f300, 0f3E2AAAC6; fma.rn.f32 %f301, %f299, %f290, 
%f300; mul.f32 %f302, %f301, %f290; fma.rn.f32 %f303, %f302, %f289, %f289; neg.f32 %f304, %f303; selp.f32 %f305, %f303, %f304, %p42; mov.f32 %f306, 0f3FD774EB; mov.f32 %f307, 0f3F6EE581; fma.rn.f32 %f308, %f307, %f306, %f305; setp.gt.f32 %p43, %f276, 0f3F0F5C29; selp.f32 %f309, %f303, %f308, %p43; add.f32 %f310, %f309, %f309; selp.f32 %f311, %f310, %f309, %p42; max.f32 %f313, %f313, %f311; $L__BB0_33: cvta.to.global.u64 %rd66, %rd7; shl.b64 %rd67, %rd6, 2; add.s64 %rd68, %rd66, %rd67; st.global.f32 [%rd68], %f313; $L__BB0_34: ret; } ` ) 3-3.11.1/cuda/minimize.cu000066400000000000000000000014531503346766200150440ustar00rootroot00000000000000#include #include "float3.h" // Steepest descent energy minimizer extern "C" __global__ void minimize(float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float* __restrict__ m0x, float* __restrict__ m0y, float* __restrict__ m0z, float* __restrict__ tx, float* __restrict__ ty, float* __restrict__ tz, float dt, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { float3 m0 = {m0x[i], m0y[i], m0z[i]}; float3 t = {tx[i], ty[i], tz[i]}; float t2 = dt*dt*dot(t, t); float3 result = (4 - t2) * m0 + 4 * dt * t; float divisor = 4 + t2; mx[i] = result.x / divisor; my[i] = result.y / divisor; mz[i] = result.z / divisor; } } 3-3.11.1/cuda/minimize.go000066400000000000000000000006711503346766200150430ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" ) // m = 1 / (4 + τ²(m x H)²) [{4 - τ²(m x H)²} m - 4τ(m x m x H)] // note: torque from LLNoPrecess has negative sign func Minimize(m, m0, torque *data.Slice, dt float32) { N := m.Len() cfg := make1DConf(N) k_minimize_async(m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), m0.DevPtr(X), m0.DevPtr(Y), m0.DevPtr(Z), torque.DevPtr(X), torque.DevPtr(Y), torque.DevPtr(Z), dt, N, cfg) } 3-3.11.1/cuda/minimize_wrapper.go000066400000000000000000001177671503346766200166220ustar00rootroot00000000000000package cuda /* THIS FILE IS 
AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for minimize kernel var minimize_code cu.Function // Stores the arguments for minimize kernel invocation type minimize_args_t struct { arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_m0x unsafe.Pointer arg_m0y unsafe.Pointer arg_m0z unsafe.Pointer arg_tx unsafe.Pointer arg_ty unsafe.Pointer arg_tz unsafe.Pointer arg_dt float32 arg_N int argptr [11]unsafe.Pointer sync.Mutex } // Stores the arguments for minimize kernel invocation var minimize_args minimize_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. minimize_args.argptr[0] = unsafe.Pointer(&minimize_args.arg_mx) minimize_args.argptr[1] = unsafe.Pointer(&minimize_args.arg_my) minimize_args.argptr[2] = unsafe.Pointer(&minimize_args.arg_mz) minimize_args.argptr[3] = unsafe.Pointer(&minimize_args.arg_m0x) minimize_args.argptr[4] = unsafe.Pointer(&minimize_args.arg_m0y) minimize_args.argptr[5] = unsafe.Pointer(&minimize_args.arg_m0z) minimize_args.argptr[6] = unsafe.Pointer(&minimize_args.arg_tx) minimize_args.argptr[7] = unsafe.Pointer(&minimize_args.arg_ty) minimize_args.argptr[8] = unsafe.Pointer(&minimize_args.arg_tz) minimize_args.argptr[9] = unsafe.Pointer(&minimize_args.arg_dt) minimize_args.argptr[10] = unsafe.Pointer(&minimize_args.arg_N) } // Wrapper for minimize CUDA kernel, asynchronous. 
func k_minimize_async(mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, m0x unsafe.Pointer, m0y unsafe.Pointer, m0z unsafe.Pointer, tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, dt float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("minimize") } minimize_args.Lock() defer minimize_args.Unlock() if minimize_code == 0 { minimize_code = fatbinLoad(minimize_map, "minimize") } minimize_args.arg_mx = mx minimize_args.arg_my = my minimize_args.arg_mz = mz minimize_args.arg_m0x = m0x minimize_args.arg_m0y = m0y minimize_args.arg_m0z = m0z minimize_args.arg_tx = tx minimize_args.arg_ty = ty minimize_args.arg_tz = tz minimize_args.arg_dt = dt minimize_args.arg_N = N args := minimize_args.argptr[:] cu.LaunchKernel(minimize_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("minimize") } } // maps compute capability on PTX code for minimize kernel. var minimize_map = map[int]string{0: "", 50: minimize_ptx_50, 52: minimize_ptx_52, 53: minimize_ptx_53, 60: minimize_ptx_60, 61: minimize_ptx_61, 62: minimize_ptx_62, 70: minimize_ptx_70, 72: minimize_ptx_72, 75: minimize_ptx_75, 80: minimize_ptx_80, 86: minimize_ptx_86, 87: minimize_ptx_87, 89: minimize_ptx_89, 90: minimize_ptx_90} // minimize PTX code for various compute capabilities. 
const ( minimize_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 
%f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; $L__BB0_2: ret; } ` minimize_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; 
cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; $L__BB0_2: ret; } ` minimize_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, 
[minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; $L__BB0_2: ret; } ` minimize_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 
minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; 
cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; $L__BB0_2: ret; } ` minimize_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 
%f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; $L__BB0_2: ret; } ` minimize_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, 
%r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; $L__BB0_2: ret; } ` minimize_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, 
[minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; $L__BB0_2: ret; } ` minimize_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl minimize .visible .entry 
minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, 
%f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; $L__BB0_2: ret; } ` minimize_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, 
%rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; $L__BB0_2: ret; } ` minimize_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; 
mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; $L__BB0_2: ret; } ` minimize_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) 
{ .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; 
cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; $L__BB0_2: ret; } ` minimize_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, 
%f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; $L__BB0_2: ret; } ` minimize_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 
%rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; $L__BB0_2: ret; } ` minimize_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, 
[minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/mslice.go000066400000000000000000000014621503346766200144750ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "unsafe" ) // Slice + scalar multiplier. 
type MSlice struct { arr *data.Slice mul []float64 } func ToMSlice(s *data.Slice) MSlice { return MSlice{ arr: s, mul: ones(s.NComp()), } } func MakeMSlice(arr *data.Slice, mul []float64) MSlice { return MSlice{arr, mul} } func (m MSlice) Size() [3]int { return m.arr.Size() } func (m MSlice) Len() int { return m.arr.Len() } func (m MSlice) DevPtr(c int) unsafe.Pointer { return m.arr.DevPtr(c) } func (m MSlice) Mul(c int) float32 { return float32(m.mul[c]) } func (m MSlice) SetMul(c int, mul float32) { m.mul[c] = float64(mul) } func (m MSlice) Recycle() { if m.arr != nil { Recycle(m.arr) m.arr = nil } } var _ones = [4]float64{1, 1, 1, 1} func ones(n int) []float64 { return _ones[:n] } 3-3.11.1/cuda/mul.cu000066400000000000000000000004211503346766200140120ustar00rootroot00000000000000// dst[i] = a[i] * b[i] extern "C" __global__ void mul(float* __restrict__ dst, float* __restrict__ a, float* __restrict__ b, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if(i < N) { dst[i] = a[i] * b[i]; } } 3-3.11.1/cuda/mul_wrapper.go000066400000000000000000000374541503346766200155700ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for mul kernel var mul_code cu.Function // Stores the arguments for mul kernel invocation type mul_args_t struct { arg_dst unsafe.Pointer arg_a unsafe.Pointer arg_b unsafe.Pointer arg_N int argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for mul kernel invocation var mul_args mul_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. mul_args.argptr[0] = unsafe.Pointer(&mul_args.arg_dst) mul_args.argptr[1] = unsafe.Pointer(&mul_args.arg_a) mul_args.argptr[2] = unsafe.Pointer(&mul_args.arg_b) mul_args.argptr[3] = unsafe.Pointer(&mul_args.arg_N) } // Wrapper for mul CUDA kernel, asynchronous. 
// The call holds the mul_args mutex from before the arguments are stored until
// the function returns, loads the kernel on first use, and launches it on
// stream0 with the grid and block dimensions taken from cfg.
// When Synchronous is set, the launch is bracketed by Sync() calls and timed
// under the "mul" timer.
func k_mul_async(dst unsafe.Pointer, a unsafe.Pointer, b unsafe.Pointer, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("mul")
	}

	// mul_args is a single package-level struct, so it is locked for the whole call.
	mul_args.Lock()
	defer mul_args.Unlock()

	// Lazy load: a zero handle means the kernel has not been loaded yet.
	if mul_code == 0 {
		mul_code = fatbinLoad(mul_map, "mul")
	}

	// Store the arguments in the fields that mul_args.argptr points at.
	mul_args.arg_dst = dst
	mul_args.arg_a = a
	mul_args.arg_b = b
	mul_args.arg_N = N

	args := mul_args.argptr[:]
	// NOTE(review): the literal 0 is presumably the dynamic shared-memory size — confirm against cu.LaunchKernel.
	// NOTE(review): arg_N is a Go int while the PTX below declares a .u32 parameter — presumably fine on the supported platforms, confirm.
	cu.LaunchKernel(mul_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("mul")
	}
}

// maps compute capability on PTX code for mul kernel.
// Key 0 maps to the empty string; every other key maps to the PTX constant
// with the same number.
var mul_map = map[int]string{0: "", 50: mul_ptx_50, 52: mul_ptx_52, 53: mul_ptx_53, 60: mul_ptx_60, 61: mul_ptx_61, 62: mul_ptx_62, 70: mul_ptx_70, 72: mul_ptx_72, 75: mul_ptx_75, 80: mul_ptx_80, 86: mul_ptx_86, 87: mul_ptx_87, 89: mul_ptx_89, 90: mul_ptx_90}

// mul PTX code for various compute capabilities.
// The constants differ only in their ".target sm_NN" directive.
const (
	mul_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; $L__BB0_2: ret; } `
	mul_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64
mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; $L__BB0_2: ret; } `
	mul_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; $L__BB0_2: ret; } `
	mul_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg
.f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; $L__BB0_2: ret; } `
	mul_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; $L__BB0_2: ret; } `
	mul_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0];
ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; $L__BB0_2: ret; } `
	mul_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; $L__BB0_2: ret; } `
	mul_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2,
[mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; $L__BB0_2: ret; } `
	mul_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; $L__BB0_2: ret; } `
	mul_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x;
mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; $L__BB0_2: ret; } `
	mul_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; $L__BB0_2: ret; } `
	mul_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1,
%r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; $L__BB0_2: ret; } `
	mul_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; $L__BB0_2: ret; } `
	mul_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4,
%rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; $L__BB0_2: ret; } `
)
3-3.11.1/cuda/normalize.cu000066400000000000000000000010301503346766200152120ustar00rootroot00000000000000
#include "float3.h"

// normalize vector {vx, vy, vz} to unit length, unless length or vol are zero.
// vx, vy and vz are updated in place; vol may be NULL.
extern "C" __global__ void
normalize(float* __restrict__ vx, float* __restrict__ vy, float* __restrict__ vz, float* __restrict__ vol, int N) {

    // flat element index from a 2D grid of 1D blocks
    int i =  ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x;
    if (i < N) {
        // a NULL vol is treated as a weight of 1 for every element
        float v = (vol == NULL? 1.0f: vol[i]);
        // scale by v first, so v == 0 gives the zero vector as input to normalized()
        float3 V = {v*vx[i], v*vy[i], v*vz[i]};
        // normalized() comes from float3.h (not shown here)
        V = normalized(V);
        vx[i] = V.x;
        vy[i] = V.y;
        vz[i] = V.z;
    }
}
3-3.11.1/cuda/normalize.go000066400000000000000000000005451503346766200152220ustar00rootroot00000000000000
package cuda

import (
	"github.com/mumax/3/data"
	"github.com/mumax/3/util"
)

// Normalize vec to unit length, unless length or vol are zero.
// vol must be nil or have exactly one component; this is checked with
// util.Argument. The X, Y and Z components of vec are passed to the
// normalize kernel and overwritten by it.
func Normalize(vec, vol *data.Slice) {
	util.Argument(vol == nil || vol.NComp() == 1)
	N := vec.Len()
	cfg := make1DConf(N)
	// NOTE(review): vol may be nil here; this relies on (*data.Slice).DevPtr
	// accepting a nil receiver and yielding a NULL pointer — confirm in package data.
	k_normalize_async(vec.DevPtr(X), vec.DevPtr(Y), vec.DevPtr(Z), vol.DevPtr(0), N, cfg)
}
3-3.11.1/cuda/normalize_wrapper.go000066400000000000000000000700721503346766200167640ustar00rootroot00000000000000
package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for normalize kernel
// It stays zero until k_normalize_async loads the kernel on first use.
var normalize_code cu.Function

// Stores the arguments for normalize kernel invocation
type normalize_args_t struct {
	arg_vx  unsafe.Pointer
	arg_vy  unsafe.Pointer
	arg_vz  unsafe.Pointer
	arg_vol unsafe.Pointer
	arg_N   int
	// argptr holds the addresses of the five arg_* fields above (set in init);
	// a slice of it is what gets handed to cu.LaunchKernel.
	argptr [5]unsafe.Pointer
	// The embedded mutex is held by k_normalize_async for the whole call.
	sync.Mutex
}

// Stores the arguments for normalize kernel invocation
var normalize_args normalize_args_t

func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	normalize_args.argptr[0] = unsafe.Pointer(&normalize_args.arg_vx)
	normalize_args.argptr[1] = unsafe.Pointer(&normalize_args.arg_vy)
	normalize_args.argptr[2] = unsafe.Pointer(&normalize_args.arg_vz)
	normalize_args.argptr[3] = unsafe.Pointer(&normalize_args.arg_vol)
	normalize_args.argptr[4] = unsafe.Pointer(&normalize_args.arg_N)
}

// Wrapper for normalize CUDA kernel, asynchronous.
// The call holds the normalize_args mutex from before the arguments are
// stored until the function returns, loads the kernel on first use, and
// launches it on stream0 with the grid and block dimensions taken from cfg.
// When Synchronous is set, the launch is bracketed by Sync() calls and timed
// under the "normalize" timer.
func k_normalize_async(vx unsafe.Pointer, vy unsafe.Pointer, vz unsafe.Pointer, vol unsafe.Pointer, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("normalize")
	}

	// normalize_args is a single package-level struct, so it is locked for the whole call.
	normalize_args.Lock()
	defer normalize_args.Unlock()

	// Lazy load: a zero handle means the kernel has not been loaded yet.
	if normalize_code == 0 {
		normalize_code = fatbinLoad(normalize_map, "normalize")
	}

	// Store the arguments in the fields that normalize_args.argptr points at.
	normalize_args.arg_vx = vx
	normalize_args.arg_vy = vy
	normalize_args.arg_vz = vz
	normalize_args.arg_vol = vol
	normalize_args.arg_N = N

	args := normalize_args.argptr[:]
	// NOTE(review): the literal 0 is presumably the dynamic shared-memory size — confirm against cu.LaunchKernel.
	cu.LaunchKernel(normalize_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("normalize")
	}
}

// maps compute capability on PTX code for normalize kernel.
var normalize_map = map[int]string{0: "", 50: normalize_ptx_50, 52: normalize_ptx_52, 53: normalize_ptx_53, 60: normalize_ptx_60, 61: normalize_ptx_61, 62: normalize_ptx_62, 70: normalize_ptx_70, 72: normalize_ptx_72, 75: normalize_ptx_75, 80: normalize_ptx_80, 86: normalize_ptx_86, 87: normalize_ptx_87, 89: normalize_ptx_89, 90: normalize_ptx_90} // normalize PTX code for various compute capabilities. const ( normalize_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_7; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; bra.uni $L__BB0_4; $L__BB0_3: mov.f32 %f20, 0f3F800000; $L__BB0_4: cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd1, %rd11, %rd12; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd2, %rd13, %rd12; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; cvta.to.global.u64 %rd14, %rd6; add.s64 %rd3, %rd14, %rd12; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; setp.eq.f32 %p3, %f6, 0f00000000; mov.f32 %f21, 0f00000000; @%p3 bra $L__BB0_6; rcp.rn.f32 %f21, %f6; $L__BB0_6: 
mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; $L__BB0_7: ret; } ` normalize_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_7; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; bra.uni $L__BB0_4; $L__BB0_3: mov.f32 %f20, 0f3F800000; $L__BB0_4: cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd1, %rd11, %rd12; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd2, %rd13, %rd12; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; cvta.to.global.u64 %rd14, %rd6; add.s64 %rd3, %rd14, %rd12; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; setp.eq.f32 %p3, %f6, 0f00000000; mov.f32 %f21, 0f00000000; @%p3 bra $L__BB0_6; rcp.rn.f32 %f21, %f6; $L__BB0_6: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; $L__BB0_7: ret; } ` normalize_ptx_53 = ` .version 8.5 .target sm_53 
.address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_7; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; bra.uni $L__BB0_4; $L__BB0_3: mov.f32 %f20, 0f3F800000; $L__BB0_4: cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd1, %rd11, %rd12; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd2, %rd13, %rd12; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; cvta.to.global.u64 %rd14, %rd6; add.s64 %rd3, %rd14, %rd12; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; setp.eq.f32 %p3, %f6, 0f00000000; mov.f32 %f21, 0f00000000; @%p3 bra $L__BB0_6; rcp.rn.f32 %f21, %f6; $L__BB0_6: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; $L__BB0_7: ret; } ` normalize_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred 
%p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_7; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; bra.uni $L__BB0_4; $L__BB0_3: mov.f32 %f20, 0f3F800000; $L__BB0_4: cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd1, %rd11, %rd12; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd2, %rd13, %rd12; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; cvta.to.global.u64 %rd14, %rd6; add.s64 %rd3, %rd14, %rd12; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; setp.eq.f32 %p3, %f6, 0f00000000; mov.f32 %f21, 0f00000000; @%p3 bra $L__BB0_6; rcp.rn.f32 %f21, %f6; $L__BB0_6: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; $L__BB0_7: ret; } ` normalize_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 
%r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_7; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; bra.uni $L__BB0_4; $L__BB0_3: mov.f32 %f20, 0f3F800000; $L__BB0_4: cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd1, %rd11, %rd12; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd2, %rd13, %rd12; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; cvta.to.global.u64 %rd14, %rd6; add.s64 %rd3, %rd14, %rd12; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; setp.eq.f32 %p3, %f6, 0f00000000; mov.f32 %f21, 0f00000000; @%p3 bra $L__BB0_6; rcp.rn.f32 %f21, %f6; $L__BB0_6: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; $L__BB0_7: ret; } ` normalize_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 
bra $L__BB0_7; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; bra.uni $L__BB0_4; $L__BB0_3: mov.f32 %f20, 0f3F800000; $L__BB0_4: cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd1, %rd11, %rd12; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd2, %rd13, %rd12; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; cvta.to.global.u64 %rd14, %rd6; add.s64 %rd3, %rd14, %rd12; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; setp.eq.f32 %p3, %f6, 0f00000000; mov.f32 %f21, 0f00000000; @%p3 bra $L__BB0_6; rcp.rn.f32 %f21, %f6; $L__BB0_6: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; $L__BB0_7: ret; } ` normalize_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_7; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; bra.uni $L__BB0_4; $L__BB0_3: mov.f32 %f20, 0f3F800000; 
$L__BB0_4: cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd1, %rd11, %rd12; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd2, %rd13, %rd12; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; cvta.to.global.u64 %rd14, %rd6; add.s64 %rd3, %rd14, %rd12; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; setp.eq.f32 %p3, %f6, 0f00000000; mov.f32 %f21, 0f00000000; @%p3 bra $L__BB0_6; rcp.rn.f32 %f21, %f6; $L__BB0_6: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; $L__BB0_7: ret; } ` normalize_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_7; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; bra.uni $L__BB0_4; $L__BB0_3: mov.f32 %f20, 0f3F800000; $L__BB0_4: cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd1, %rd11, %rd12; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd2, %rd13, %rd12; ld.global.f32 %f12, 
[%rd2]; mul.f32 %f4, %f20, %f12; cvta.to.global.u64 %rd14, %rd6; add.s64 %rd3, %rd14, %rd12; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; setp.eq.f32 %p3, %f6, 0f00000000; mov.f32 %f21, 0f00000000; @%p3 bra $L__BB0_6; rcp.rn.f32 %f21, %f6; $L__BB0_6: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; $L__BB0_7: ret; } ` normalize_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_7; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; bra.uni $L__BB0_4; $L__BB0_3: mov.f32 %f20, 0f3F800000; $L__BB0_4: cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd1, %rd11, %rd12; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd2, %rd13, %rd12; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; cvta.to.global.u64 %rd14, %rd6; add.s64 %rd3, %rd14, %rd12; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, 
%f15; sqrt.rn.f32 %f6, %f16; setp.eq.f32 %p3, %f6, 0f00000000; mov.f32 %f21, 0f00000000; @%p3 bra $L__BB0_6; rcp.rn.f32 %f21, %f6; $L__BB0_6: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; $L__BB0_7: ret; } ` normalize_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_7; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; bra.uni $L__BB0_4; $L__BB0_3: mov.f32 %f20, 0f3F800000; $L__BB0_4: cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd1, %rd11, %rd12; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd2, %rd13, %rd12; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; cvta.to.global.u64 %rd14, %rd6; add.s64 %rd3, %rd14, %rd12; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; setp.eq.f32 %p3, %f6, 0f00000000; mov.f32 %f21, 0f00000000; @%p3 bra $L__BB0_6; rcp.rn.f32 %f21, %f6; $L__BB0_6: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 
[%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; $L__BB0_7: ret; } ` normalize_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_7; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; bra.uni $L__BB0_4; $L__BB0_3: mov.f32 %f20, 0f3F800000; $L__BB0_4: cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd1, %rd11, %rd12; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd2, %rd13, %rd12; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; cvta.to.global.u64 %rd14, %rd6; add.s64 %rd3, %rd14, %rd12; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; setp.eq.f32 %p3, %f6, 0f00000000; mov.f32 %f21, 0f00000000; @%p3 bra $L__BB0_6; rcp.rn.f32 %f21, %f6; $L__BB0_6: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; $L__BB0_7: ret; } ` normalize_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, 
.param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_7; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; bra.uni $L__BB0_4; $L__BB0_3: mov.f32 %f20, 0f3F800000; $L__BB0_4: cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd1, %rd11, %rd12; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd2, %rd13, %rd12; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; cvta.to.global.u64 %rd14, %rd6; add.s64 %rd3, %rd14, %rd12; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; setp.eq.f32 %p3, %f6, 0f00000000; mov.f32 %f21, 0f00000000; @%p3 bra $L__BB0_6; rcp.rn.f32 %f21, %f6; $L__BB0_6: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; $L__BB0_7: ret; } ` normalize_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, 
[normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_7; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; bra.uni $L__BB0_4; $L__BB0_3: mov.f32 %f20, 0f3F800000; $L__BB0_4: cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd1, %rd11, %rd12; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd2, %rd13, %rd12; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; cvta.to.global.u64 %rd14, %rd6; add.s64 %rd3, %rd14, %rd12; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; setp.eq.f32 %p3, %f6, 0f00000000; mov.f32 %f21, 0f00000000; @%p3 bra $L__BB0_6; rcp.rn.f32 %f21, %f6; $L__BB0_6: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; $L__BB0_7: ret; } ` normalize_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 
%r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_7; setp.eq.s64 %p2, %rd7, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; bra.uni $L__BB0_4; $L__BB0_3: mov.f32 %f20, 0f3F800000; $L__BB0_4: cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd1, %rd11, %rd12; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd2, %rd13, %rd12; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; cvta.to.global.u64 %rd14, %rd6; add.s64 %rd3, %rd14, %rd12; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; setp.eq.f32 %p3, %f6, 0f00000000; mov.f32 %f21, 0f00000000; @%p3 bra $L__BB0_6; rcp.rn.f32 %f21, %f6; $L__BB0_6: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; $L__BB0_7: ret; } ` ) 3-3.11.1/cuda/phi.cu000066400000000000000000000007541503346766200140060ustar00rootroot00000000000000#include "stencil.h" extern "C" __global__ void setPhi(float* __restrict__ phi, float* __restrict__ mx, float* __restrict__ my, int Nx, int Ny, int Nz) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix >= Nx || iy >= Ny || iz >= Nz) { return; } int I = idx(ix, iy, iz); // central cell index phi[I] = atan2f(my[I], mx[I]); }3-3.11.1/cuda/phi_wrapper.go000066400000000000000000001413761503346766200155520ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for setPhi kernel var setPhi_code cu.Function // Stores the arguments for setPhi kernel invocation type setPhi_args_t struct { arg_phi unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_Nx int arg_Ny int arg_Nz int argptr [6]unsafe.Pointer sync.Mutex } // Stores the arguments for setPhi kernel invocation var setPhi_args setPhi_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. setPhi_args.argptr[0] = unsafe.Pointer(&setPhi_args.arg_phi) setPhi_args.argptr[1] = unsafe.Pointer(&setPhi_args.arg_mx) setPhi_args.argptr[2] = unsafe.Pointer(&setPhi_args.arg_my) setPhi_args.argptr[3] = unsafe.Pointer(&setPhi_args.arg_Nx) setPhi_args.argptr[4] = unsafe.Pointer(&setPhi_args.arg_Ny) setPhi_args.argptr[5] = unsafe.Pointer(&setPhi_args.arg_Nz) } // Wrapper for setPhi CUDA kernel, asynchronous. func k_setPhi_async(phi unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, Nx int, Ny int, Nz int, cfg *config) { if Synchronous { // debug Sync() timer.Start("setPhi") } setPhi_args.Lock() defer setPhi_args.Unlock() if setPhi_code == 0 { setPhi_code = fatbinLoad(setPhi_map, "setPhi") } setPhi_args.arg_phi = phi setPhi_args.arg_mx = mx setPhi_args.arg_my = my setPhi_args.arg_Nx = Nx setPhi_args.arg_Ny = Ny setPhi_args.arg_Nz = Nz args := setPhi_args.argptr[:] cu.LaunchKernel(setPhi_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("setPhi") } } // maps compute capability on PTX code for setPhi kernel. 
var setPhi_map = map[int]string{0: "", 50: setPhi_ptx_50, 52: setPhi_ptx_52, 53: setPhi_ptx_53, 60: setPhi_ptx_60, 61: setPhi_ptx_61, 62: setPhi_ptx_62, 70: setPhi_ptx_70, 72: setPhi_ptx_72, 75: setPhi_ptx_75, 80: setPhi_ptx_80, 86: setPhi_ptx_86, 87: setPhi_ptx_87, 89: setPhi_ptx_89, 90: setPhi_ptx_90} // setPhi PTX code for various compute capabilities. const ( setPhi_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<16>; .reg .f32 %f<36>; .reg .b32 %r<34>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [setPhi_param_0]; ld.param.u64 %rd3, [setPhi_param_1]; ld.param.u64 %rd4, [setPhi_param_2]; ld.param.u32 %r4, [setPhi_param_3]; ld.param.u32 %r5, [setPhi_param_4]; ld.param.u32 %r6, [setPhi_param_5]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; cvta.to.global.u64 %rd5, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; cvt.s64.s32 %rd1, %r17; mul.wide.s32 %rd6, %r17, 4; add.s64 %rd7, %rd5, %rd6; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f1, [%rd9]; abs.f32 %f2, %f1; ld.global.nc.f32 %f3, [%rd7]; abs.f32 %f4, %f3; setp.eq.f32 %p6, %f2, 0f00000000; setp.eq.f32 %p7, %f4, 0f00000000; and.pred %p8, %p6, %p7; @%p8 bra $L__BB0_5; bra.uni $L__BB0_2; $L__BB0_5: mov.b32 %r28, %f1; shr.s32 %r29, %r28, 31; and.b32 %r30, %r29, 1078530011; mov.b32 %r31, %f3; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r30, 
%r32; mov.b32 %f35, %r33; bra.uni $L__BB0_6; $L__BB0_2: setp.eq.f32 %p9, %f2, 0f7F800000; setp.eq.f32 %p10, %f4, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: mov.b32 %r23, %f1; setp.lt.s32 %p15, %r23, 0; selp.b32 %r24, 1075235812, 1061752795, %p15; mov.b32 %r25, %f3; and.b32 %r26, %r25, -2147483648; or.b32 %r27, %r24, %r26; mov.b32 %f35, %r27; bra.uni $L__BB0_6; $L__BB0_3: max.f32 %f9, %f4, %f2; min.f32 %f10, %f4, %f2; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f4, %f2; selp.f32 %f29, %f28, %f26, %p12; mov.b32 %r18, %f1; setp.lt.s32 %p13, %r18, 0; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r19, %f32; mov.b32 %r20, %f3; and.b32 %r21, %r20, -2147483648; or.b32 %r22, %r21, %r19; mov.b32 %f33, %r22; add.f32 %f34, %f2, %f4; setp.le.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f33, %f34, %p14; $L__BB0_6: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f35; $L__BB0_7: ret; } ` setPhi_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<16>; .reg .f32 %f<36>; .reg .b32 %r<34>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [setPhi_param_0]; ld.param.u64 %rd3, [setPhi_param_1]; ld.param.u64 %rd4, [setPhi_param_2]; 
ld.param.u32 %r4, [setPhi_param_3]; ld.param.u32 %r5, [setPhi_param_4]; ld.param.u32 %r6, [setPhi_param_5]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; cvta.to.global.u64 %rd5, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; cvt.s64.s32 %rd1, %r17; mul.wide.s32 %rd6, %r17, 4; add.s64 %rd7, %rd5, %rd6; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f1, [%rd9]; abs.f32 %f2, %f1; ld.global.nc.f32 %f3, [%rd7]; abs.f32 %f4, %f3; setp.eq.f32 %p6, %f2, 0f00000000; setp.eq.f32 %p7, %f4, 0f00000000; and.pred %p8, %p6, %p7; @%p8 bra $L__BB0_5; bra.uni $L__BB0_2; $L__BB0_5: mov.b32 %r28, %f1; shr.s32 %r29, %r28, 31; and.b32 %r30, %r29, 1078530011; mov.b32 %r31, %f3; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r30, %r32; mov.b32 %f35, %r33; bra.uni $L__BB0_6; $L__BB0_2: setp.eq.f32 %p9, %f2, 0f7F800000; setp.eq.f32 %p10, %f4, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: mov.b32 %r23, %f1; setp.lt.s32 %p15, %r23, 0; selp.b32 %r24, 1075235812, 1061752795, %p15; mov.b32 %r25, %f3; and.b32 %r26, %r25, -2147483648; or.b32 %r27, %r24, %r26; mov.b32 %f35, %r27; bra.uni $L__BB0_6; $L__BB0_3: max.f32 %f9, %f4, %f2; min.f32 %f10, %f4, %f2; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 
%f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f4, %f2; selp.f32 %f29, %f28, %f26, %p12; mov.b32 %r18, %f1; setp.lt.s32 %p13, %r18, 0; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r19, %f32; mov.b32 %r20, %f3; and.b32 %r21, %r20, -2147483648; or.b32 %r22, %r21, %r19; mov.b32 %f33, %r22; add.f32 %f34, %f2, %f4; setp.le.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f33, %f34, %p14; $L__BB0_6: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f35; $L__BB0_7: ret; } ` setPhi_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<16>; .reg .f32 %f<36>; .reg .b32 %r<34>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [setPhi_param_0]; ld.param.u64 %rd3, [setPhi_param_1]; ld.param.u64 %rd4, [setPhi_param_2]; ld.param.u32 %r4, [setPhi_param_3]; ld.param.u32 %r5, [setPhi_param_4]; ld.param.u32 %r6, [setPhi_param_5]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; cvta.to.global.u64 %rd5, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; cvt.s64.s32 %rd1, %r17; mul.wide.s32 %rd6, %r17, 4; add.s64 %rd7, %rd5, %rd6; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f1, [%rd9]; abs.f32 
%f2, %f1; ld.global.nc.f32 %f3, [%rd7]; abs.f32 %f4, %f3; setp.eq.f32 %p6, %f2, 0f00000000; setp.eq.f32 %p7, %f4, 0f00000000; and.pred %p8, %p6, %p7; @%p8 bra $L__BB0_5; bra.uni $L__BB0_2; $L__BB0_5: mov.b32 %r28, %f1; shr.s32 %r29, %r28, 31; and.b32 %r30, %r29, 1078530011; mov.b32 %r31, %f3; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r30, %r32; mov.b32 %f35, %r33; bra.uni $L__BB0_6; $L__BB0_2: setp.eq.f32 %p9, %f2, 0f7F800000; setp.eq.f32 %p10, %f4, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: mov.b32 %r23, %f1; setp.lt.s32 %p15, %r23, 0; selp.b32 %r24, 1075235812, 1061752795, %p15; mov.b32 %r25, %f3; and.b32 %r26, %r25, -2147483648; or.b32 %r27, %r24, %r26; mov.b32 %f35, %r27; bra.uni $L__BB0_6; $L__BB0_3: max.f32 %f9, %f4, %f2; min.f32 %f10, %f4, %f2; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f4, %f2; selp.f32 %f29, %f28, %f26, %p12; mov.b32 %r18, %f1; setp.lt.s32 %p13, %r18, 0; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r19, %f32; mov.b32 %r20, %f3; and.b32 %r21, %r20, -2147483648; or.b32 %r22, %r21, %r19; mov.b32 %f33, %r22; add.f32 %f34, %f2, %f4; setp.le.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f33, %f34, %p14; $L__BB0_6: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f35; $L__BB0_7: ret; } ` setPhi_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 
setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<16>; .reg .f32 %f<36>; .reg .b32 %r<34>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [setPhi_param_0]; ld.param.u64 %rd3, [setPhi_param_1]; ld.param.u64 %rd4, [setPhi_param_2]; ld.param.u32 %r4, [setPhi_param_3]; ld.param.u32 %r5, [setPhi_param_4]; ld.param.u32 %r6, [setPhi_param_5]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; cvta.to.global.u64 %rd5, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; cvt.s64.s32 %rd1, %r17; mul.wide.s32 %rd6, %r17, 4; add.s64 %rd7, %rd5, %rd6; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f1, [%rd9]; abs.f32 %f2, %f1; ld.global.nc.f32 %f3, [%rd7]; abs.f32 %f4, %f3; setp.eq.f32 %p6, %f2, 0f00000000; setp.eq.f32 %p7, %f4, 0f00000000; and.pred %p8, %p6, %p7; @%p8 bra $L__BB0_5; bra.uni $L__BB0_2; $L__BB0_5: mov.b32 %r28, %f1; shr.s32 %r29, %r28, 31; and.b32 %r30, %r29, 1078530011; mov.b32 %r31, %f3; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r30, %r32; mov.b32 %f35, %r33; bra.uni $L__BB0_6; $L__BB0_2: setp.eq.f32 %p9, %f2, 0f7F800000; setp.eq.f32 %p10, %f4, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: mov.b32 %r23, %f1; setp.lt.s32 %p15, %r23, 0; selp.b32 %r24, 1075235812, 1061752795, %p15; mov.b32 %r25, %f3; and.b32 %r26, %r25, -2147483648; or.b32 %r27, %r24, %r26; mov.b32 %f35, %r27; bra.uni $L__BB0_6; $L__BB0_3: max.f32 %f9, %f4, %f2; min.f32 %f10, %f4, %f2; div.rn.f32 
%f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f4, %f2; selp.f32 %f29, %f28, %f26, %p12; mov.b32 %r18, %f1; setp.lt.s32 %p13, %r18, 0; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r19, %f32; mov.b32 %r20, %f3; and.b32 %r21, %r20, -2147483648; or.b32 %r22, %r21, %r19; mov.b32 %f33, %r22; add.f32 %f34, %f2, %f4; setp.le.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f33, %f34, %p14; $L__BB0_6: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f35; $L__BB0_7: ret; } ` setPhi_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<16>; .reg .f32 %f<36>; .reg .b32 %r<34>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [setPhi_param_0]; ld.param.u64 %rd3, [setPhi_param_1]; ld.param.u64 %rd4, [setPhi_param_2]; ld.param.u32 %r4, [setPhi_param_3]; ld.param.u32 %r5, [setPhi_param_4]; ld.param.u32 %r6, [setPhi_param_5]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, 
%p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; cvta.to.global.u64 %rd5, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; cvt.s64.s32 %rd1, %r17; mul.wide.s32 %rd6, %r17, 4; add.s64 %rd7, %rd5, %rd6; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f1, [%rd9]; abs.f32 %f2, %f1; ld.global.nc.f32 %f3, [%rd7]; abs.f32 %f4, %f3; setp.eq.f32 %p6, %f2, 0f00000000; setp.eq.f32 %p7, %f4, 0f00000000; and.pred %p8, %p6, %p7; @%p8 bra $L__BB0_5; bra.uni $L__BB0_2; $L__BB0_5: mov.b32 %r28, %f1; shr.s32 %r29, %r28, 31; and.b32 %r30, %r29, 1078530011; mov.b32 %r31, %f3; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r30, %r32; mov.b32 %f35, %r33; bra.uni $L__BB0_6; $L__BB0_2: setp.eq.f32 %p9, %f2, 0f7F800000; setp.eq.f32 %p10, %f4, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: mov.b32 %r23, %f1; setp.lt.s32 %p15, %r23, 0; selp.b32 %r24, 1075235812, 1061752795, %p15; mov.b32 %r25, %f3; and.b32 %r26, %r25, -2147483648; or.b32 %r27, %r24, %r26; mov.b32 %f35, %r27; bra.uni $L__BB0_6; $L__BB0_3: max.f32 %f9, %f4, %f2; min.f32 %f10, %f4, %f2; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f4, %f2; selp.f32 %f29, %f28, %f26, %p12; mov.b32 %r18, %f1; setp.lt.s32 %p13, %r18, 0; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r19, %f32; mov.b32 %r20, %f3; and.b32 %r21, %r20, -2147483648; or.b32 %r22, %r21, %r19; mov.b32 %f33, %r22; add.f32 
%f34, %f2, %f4; setp.le.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f33, %f34, %p14; $L__BB0_6: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f35; $L__BB0_7: ret; } ` setPhi_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<16>; .reg .f32 %f<36>; .reg .b32 %r<34>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [setPhi_param_0]; ld.param.u64 %rd3, [setPhi_param_1]; ld.param.u64 %rd4, [setPhi_param_2]; ld.param.u32 %r4, [setPhi_param_3]; ld.param.u32 %r5, [setPhi_param_4]; ld.param.u32 %r6, [setPhi_param_5]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; cvta.to.global.u64 %rd5, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; cvt.s64.s32 %rd1, %r17; mul.wide.s32 %rd6, %r17, 4; add.s64 %rd7, %rd5, %rd6; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f1, [%rd9]; abs.f32 %f2, %f1; ld.global.nc.f32 %f3, [%rd7]; abs.f32 %f4, %f3; setp.eq.f32 %p6, %f2, 0f00000000; setp.eq.f32 %p7, %f4, 0f00000000; and.pred %p8, %p6, %p7; @%p8 bra $L__BB0_5; bra.uni $L__BB0_2; $L__BB0_5: mov.b32 %r28, %f1; shr.s32 %r29, %r28, 31; and.b32 %r30, %r29, 1078530011; mov.b32 %r31, %f3; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r30, %r32; mov.b32 %f35, %r33; bra.uni $L__BB0_6; $L__BB0_2: setp.eq.f32 %p9, %f2, 0f7F800000; setp.eq.f32 %p10, %f4, 0f7F800000; and.pred 
%p11, %p9, %p10; @%p11 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: mov.b32 %r23, %f1; setp.lt.s32 %p15, %r23, 0; selp.b32 %r24, 1075235812, 1061752795, %p15; mov.b32 %r25, %f3; and.b32 %r26, %r25, -2147483648; or.b32 %r27, %r24, %r26; mov.b32 %f35, %r27; bra.uni $L__BB0_6; $L__BB0_3: max.f32 %f9, %f4, %f2; min.f32 %f10, %f4, %f2; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f4, %f2; selp.f32 %f29, %f28, %f26, %p12; mov.b32 %r18, %f1; setp.lt.s32 %p13, %r18, 0; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r19, %f32; mov.b32 %r20, %f3; and.b32 %r21, %r20, -2147483648; or.b32 %r22, %r21, %r19; mov.b32 %f33, %r22; add.f32 %f34, %f2, %f4; setp.le.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f33, %f34, %p14; $L__BB0_6: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f35; $L__BB0_7: ret; } ` setPhi_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<16>; .reg .f32 %f<36>; .reg .b32 %r<34>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [setPhi_param_0]; ld.param.u64 %rd3, [setPhi_param_1]; ld.param.u64 %rd4, [setPhi_param_2]; ld.param.u32 %r4, [setPhi_param_3]; ld.param.u32 %r5, [setPhi_param_4]; ld.param.u32 %r6, [setPhi_param_5]; mov.u32 %r7, %ctaid.x; mov.u32 
%r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; cvta.to.global.u64 %rd5, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; cvt.s64.s32 %rd1, %r17; mul.wide.s32 %rd6, %r17, 4; add.s64 %rd7, %rd5, %rd6; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f1, [%rd9]; abs.f32 %f2, %f1; ld.global.nc.f32 %f3, [%rd7]; abs.f32 %f4, %f3; setp.eq.f32 %p6, %f2, 0f00000000; setp.eq.f32 %p7, %f4, 0f00000000; and.pred %p8, %p6, %p7; @%p8 bra $L__BB0_5; bra.uni $L__BB0_2; $L__BB0_5: mov.b32 %r28, %f1; shr.s32 %r29, %r28, 31; and.b32 %r30, %r29, 1078530011; mov.b32 %r31, %f3; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r30, %r32; mov.b32 %f35, %r33; bra.uni $L__BB0_6; $L__BB0_2: setp.eq.f32 %p9, %f2, 0f7F800000; setp.eq.f32 %p10, %f4, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: mov.b32 %r23, %f1; setp.lt.s32 %p15, %r23, 0; selp.b32 %r24, 1075235812, 1061752795, %p15; mov.b32 %r25, %f3; and.b32 %r26, %r25, -2147483648; or.b32 %r27, %r24, %r26; mov.b32 %f35, %r27; bra.uni $L__BB0_6; $L__BB0_3: max.f32 %f9, %f4, %f2; min.f32 %f10, %f4, %f2; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; 
sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f4, %f2; selp.f32 %f29, %f28, %f26, %p12; mov.b32 %r18, %f1; setp.lt.s32 %p13, %r18, 0; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r19, %f32; mov.b32 %r20, %f3; and.b32 %r21, %r20, -2147483648; or.b32 %r22, %r21, %r19; mov.b32 %f33, %r22; add.f32 %f34, %f2, %f4; setp.le.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f33, %f34, %p14; $L__BB0_6: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f35; $L__BB0_7: ret; } ` setPhi_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<16>; .reg .f32 %f<36>; .reg .b32 %r<34>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [setPhi_param_0]; ld.param.u64 %rd3, [setPhi_param_1]; ld.param.u64 %rd4, [setPhi_param_2]; ld.param.u32 %r4, [setPhi_param_3]; ld.param.u32 %r5, [setPhi_param_4]; ld.param.u32 %r6, [setPhi_param_5]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; cvta.to.global.u64 %rd5, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; cvt.s64.s32 %rd1, %r17; mul.wide.s32 %rd6, %r17, 4; add.s64 %rd7, %rd5, %rd6; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f1, [%rd9]; abs.f32 %f2, %f1; ld.global.nc.f32 %f3, [%rd7]; abs.f32 %f4, %f3; setp.eq.f32 %p6, %f2, 0f00000000; setp.eq.f32 %p7, %f4, 0f00000000; and.pred %p8, 
%p6, %p7; @%p8 bra $L__BB0_5; bra.uni $L__BB0_2; $L__BB0_5: mov.b32 %r28, %f1; shr.s32 %r29, %r28, 31; and.b32 %r30, %r29, 1078530011; mov.b32 %r31, %f3; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r30, %r32; mov.b32 %f35, %r33; bra.uni $L__BB0_6; $L__BB0_2: setp.eq.f32 %p9, %f2, 0f7F800000; setp.eq.f32 %p10, %f4, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: mov.b32 %r23, %f1; setp.lt.s32 %p15, %r23, 0; selp.b32 %r24, 1075235812, 1061752795, %p15; mov.b32 %r25, %f3; and.b32 %r26, %r25, -2147483648; or.b32 %r27, %r24, %r26; mov.b32 %f35, %r27; bra.uni $L__BB0_6; $L__BB0_3: max.f32 %f9, %f4, %f2; min.f32 %f10, %f4, %f2; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f4, %f2; selp.f32 %f29, %f28, %f26, %p12; mov.b32 %r18, %f1; setp.lt.s32 %p13, %r18, 0; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r19, %f32; mov.b32 %r20, %f3; and.b32 %r21, %r20, -2147483648; or.b32 %r22, %r21, %r19; mov.b32 %f33, %r22; add.f32 %f34, %f2, %f4; setp.le.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f33, %f34, %p14; $L__BB0_6: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f35; $L__BB0_7: ret; } ` setPhi_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 
setPhi_param_5 ) { .reg .pred %p<16>; .reg .f32 %f<36>; .reg .b32 %r<34>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [setPhi_param_0]; ld.param.u64 %rd3, [setPhi_param_1]; ld.param.u64 %rd4, [setPhi_param_2]; ld.param.u32 %r4, [setPhi_param_3]; ld.param.u32 %r5, [setPhi_param_4]; ld.param.u32 %r6, [setPhi_param_5]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; cvta.to.global.u64 %rd5, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; cvt.s64.s32 %rd1, %r17; mul.wide.s32 %rd6, %r17, 4; add.s64 %rd7, %rd5, %rd6; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f1, [%rd9]; abs.f32 %f2, %f1; ld.global.nc.f32 %f3, [%rd7]; abs.f32 %f4, %f3; setp.eq.f32 %p6, %f2, 0f00000000; setp.eq.f32 %p7, %f4, 0f00000000; and.pred %p8, %p6, %p7; @%p8 bra $L__BB0_5; bra.uni $L__BB0_2; $L__BB0_5: mov.b32 %r28, %f1; shr.s32 %r29, %r28, 31; and.b32 %r30, %r29, 1078530011; mov.b32 %r31, %f3; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r30, %r32; mov.b32 %f35, %r33; bra.uni $L__BB0_6; $L__BB0_2: setp.eq.f32 %p9, %f2, 0f7F800000; setp.eq.f32 %p10, %f4, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: mov.b32 %r23, %f1; setp.lt.s32 %p15, %r23, 0; selp.b32 %r24, 1075235812, 1061752795, %p15; mov.b32 %r25, %f3; and.b32 %r26, %r25, -2147483648; or.b32 %r27, %r24, %r26; mov.b32 %f35, %r27; bra.uni $L__BB0_6; $L__BB0_3: max.f32 %f9, %f4, %f2; min.f32 %f10, %f4, %f2; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 
%f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f4, %f2; selp.f32 %f29, %f28, %f26, %p12; mov.b32 %r18, %f1; setp.lt.s32 %p13, %r18, 0; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r19, %f32; mov.b32 %r20, %f3; and.b32 %r21, %r20, -2147483648; or.b32 %r22, %r21, %r19; mov.b32 %f33, %r22; add.f32 %f34, %f2, %f4; setp.le.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f33, %f34, %p14; $L__BB0_6: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f35; $L__BB0_7: ret; } ` setPhi_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<16>; .reg .f32 %f<36>; .reg .b32 %r<34>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [setPhi_param_0]; ld.param.u64 %rd3, [setPhi_param_1]; ld.param.u64 %rd4, [setPhi_param_2]; ld.param.u32 %r4, [setPhi_param_3]; ld.param.u32 %r5, [setPhi_param_4]; ld.param.u32 %r6, [setPhi_param_5]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; cvta.to.global.u64 %rd5, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; 
mad.lo.s32 %r17, %r16, %r4, %r1; cvt.s64.s32 %rd1, %r17; mul.wide.s32 %rd6, %r17, 4; add.s64 %rd7, %rd5, %rd6; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f1, [%rd9]; abs.f32 %f2, %f1; ld.global.nc.f32 %f3, [%rd7]; abs.f32 %f4, %f3; setp.eq.f32 %p6, %f2, 0f00000000; setp.eq.f32 %p7, %f4, 0f00000000; and.pred %p8, %p6, %p7; @%p8 bra $L__BB0_5; bra.uni $L__BB0_2; $L__BB0_5: mov.b32 %r28, %f1; shr.s32 %r29, %r28, 31; and.b32 %r30, %r29, 1078530011; mov.b32 %r31, %f3; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r30, %r32; mov.b32 %f35, %r33; bra.uni $L__BB0_6; $L__BB0_2: setp.eq.f32 %p9, %f2, 0f7F800000; setp.eq.f32 %p10, %f4, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: mov.b32 %r23, %f1; setp.lt.s32 %p15, %r23, 0; selp.b32 %r24, 1075235812, 1061752795, %p15; mov.b32 %r25, %f3; and.b32 %r26, %r25, -2147483648; or.b32 %r27, %r24, %r26; mov.b32 %f35, %r27; bra.uni $L__BB0_6; $L__BB0_3: max.f32 %f9, %f4, %f2; min.f32 %f10, %f4, %f2; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f4, %f2; selp.f32 %f29, %f28, %f26, %p12; mov.b32 %r18, %f1; setp.lt.s32 %p13, %r18, 0; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r19, %f32; mov.b32 %r20, %f3; and.b32 %r21, %r20, -2147483648; or.b32 %r22, %r21, %r19; mov.b32 %f33, %r22; add.f32 %f34, %f2, %f4; setp.le.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f33, %f34, %p14; $L__BB0_6: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, 
%rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f35; $L__BB0_7: ret; } ` setPhi_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<16>; .reg .f32 %f<36>; .reg .b32 %r<34>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [setPhi_param_0]; ld.param.u64 %rd3, [setPhi_param_1]; ld.param.u64 %rd4, [setPhi_param_2]; ld.param.u32 %r4, [setPhi_param_3]; ld.param.u32 %r5, [setPhi_param_4]; ld.param.u32 %r6, [setPhi_param_5]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; cvta.to.global.u64 %rd5, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; cvt.s64.s32 %rd1, %r17; mul.wide.s32 %rd6, %r17, 4; add.s64 %rd7, %rd5, %rd6; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f1, [%rd9]; abs.f32 %f2, %f1; ld.global.nc.f32 %f3, [%rd7]; abs.f32 %f4, %f3; setp.eq.f32 %p6, %f2, 0f00000000; setp.eq.f32 %p7, %f4, 0f00000000; and.pred %p8, %p6, %p7; @%p8 bra $L__BB0_5; bra.uni $L__BB0_2; $L__BB0_5: mov.b32 %r28, %f1; shr.s32 %r29, %r28, 31; and.b32 %r30, %r29, 1078530011; mov.b32 %r31, %f3; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r30, %r32; mov.b32 %f35, %r33; bra.uni $L__BB0_6; $L__BB0_2: setp.eq.f32 %p9, %f2, 0f7F800000; setp.eq.f32 %p10, %f4, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: mov.b32 %r23, %f1; setp.lt.s32 %p15, %r23, 0; selp.b32 %r24, 1075235812, 
1061752795, %p15; mov.b32 %r25, %f3; and.b32 %r26, %r25, -2147483648; or.b32 %r27, %r24, %r26; mov.b32 %f35, %r27; bra.uni $L__BB0_6; $L__BB0_3: max.f32 %f9, %f4, %f2; min.f32 %f10, %f4, %f2; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f4, %f2; selp.f32 %f29, %f28, %f26, %p12; mov.b32 %r18, %f1; setp.lt.s32 %p13, %r18, 0; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r19, %f32; mov.b32 %r20, %f3; and.b32 %r21, %r20, -2147483648; or.b32 %r22, %r21, %r19; mov.b32 %f33, %r22; add.f32 %f34, %f2, %f4; setp.le.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f33, %f34, %p14; $L__BB0_6: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f35; $L__BB0_7: ret; } ` setPhi_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<16>; .reg .f32 %f<36>; .reg .b32 %r<34>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [setPhi_param_0]; ld.param.u64 %rd3, [setPhi_param_1]; ld.param.u64 %rd4, [setPhi_param_2]; ld.param.u32 %r4, [setPhi_param_3]; ld.param.u32 %r5, [setPhi_param_4]; ld.param.u32 %r6, [setPhi_param_5]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; 
mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; cvta.to.global.u64 %rd5, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; cvt.s64.s32 %rd1, %r17; mul.wide.s32 %rd6, %r17, 4; add.s64 %rd7, %rd5, %rd6; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f1, [%rd9]; abs.f32 %f2, %f1; ld.global.nc.f32 %f3, [%rd7]; abs.f32 %f4, %f3; setp.eq.f32 %p6, %f2, 0f00000000; setp.eq.f32 %p7, %f4, 0f00000000; and.pred %p8, %p6, %p7; @%p8 bra $L__BB0_5; bra.uni $L__BB0_2; $L__BB0_5: mov.b32 %r28, %f1; shr.s32 %r29, %r28, 31; and.b32 %r30, %r29, 1078530011; mov.b32 %r31, %f3; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r30, %r32; mov.b32 %f35, %r33; bra.uni $L__BB0_6; $L__BB0_2: setp.eq.f32 %p9, %f2, 0f7F800000; setp.eq.f32 %p10, %f4, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: mov.b32 %r23, %f1; setp.lt.s32 %p15, %r23, 0; selp.b32 %r24, 1075235812, 1061752795, %p15; mov.b32 %r25, %f3; and.b32 %r26, %r25, -2147483648; or.b32 %r27, %r24, %r26; mov.b32 %f35, %r27; bra.uni $L__BB0_6; $L__BB0_3: max.f32 %f9, %f4, %f2; min.f32 %f10, %f4, %f2; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f4, %f2; selp.f32 %f29, %f28, %f26, %p12; mov.b32 %r18, %f1; setp.lt.s32 %p13, %r18, 0; 
mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r19, %f32; mov.b32 %r20, %f3; and.b32 %r21, %r20, -2147483648; or.b32 %r22, %r21, %r19; mov.b32 %f33, %r22; add.f32 %f34, %f2, %f4; setp.le.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f33, %f34, %p14; $L__BB0_6: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f35; $L__BB0_7: ret; } ` setPhi_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<16>; .reg .f32 %f<36>; .reg .b32 %r<34>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [setPhi_param_0]; ld.param.u64 %rd3, [setPhi_param_1]; ld.param.u64 %rd4, [setPhi_param_2]; ld.param.u32 %r4, [setPhi_param_3]; ld.param.u32 %r5, [setPhi_param_4]; ld.param.u32 %r6, [setPhi_param_5]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; cvta.to.global.u64 %rd5, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; cvt.s64.s32 %rd1, %r17; mul.wide.s32 %rd6, %r17, 4; add.s64 %rd7, %rd5, %rd6; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f1, [%rd9]; abs.f32 %f2, %f1; ld.global.nc.f32 %f3, [%rd7]; abs.f32 %f4, %f3; setp.eq.f32 %p6, %f2, 0f00000000; setp.eq.f32 %p7, %f4, 0f00000000; and.pred %p8, %p6, %p7; @%p8 bra $L__BB0_5; bra.uni $L__BB0_2; $L__BB0_5: mov.b32 %r28, %f1; shr.s32 %r29, %r28, 31; and.b32 %r30, %r29, 
1078530011; mov.b32 %r31, %f3; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r30, %r32; mov.b32 %f35, %r33; bra.uni $L__BB0_6; $L__BB0_2: setp.eq.f32 %p9, %f2, 0f7F800000; setp.eq.f32 %p10, %f4, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: mov.b32 %r23, %f1; setp.lt.s32 %p15, %r23, 0; selp.b32 %r24, 1075235812, 1061752795, %p15; mov.b32 %r25, %f3; and.b32 %r26, %r25, -2147483648; or.b32 %r27, %r24, %r26; mov.b32 %f35, %r27; bra.uni $L__BB0_6; $L__BB0_3: max.f32 %f9, %f4, %f2; min.f32 %f10, %f4, %f2; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f4, %f2; selp.f32 %f29, %f28, %f26, %p12; mov.b32 %r18, %f1; setp.lt.s32 %p13, %r18, 0; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r19, %f32; mov.b32 %r20, %f3; and.b32 %r21, %r20, -2147483648; or.b32 %r22, %r21, %r19; mov.b32 %f33, %r22; add.f32 %f34, %f2, %f4; setp.le.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f33, %f34, %p14; $L__BB0_6: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f35; $L__BB0_7: ret; } ` setPhi_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<16>; .reg .f32 %f<36>; .reg .b32 %r<34>; .reg .b64 %rd<13>; ld.param.u64 %rd2, 
[setPhi_param_0]; ld.param.u64 %rd3, [setPhi_param_1]; ld.param.u64 %rd4, [setPhi_param_2]; ld.param.u32 %r4, [setPhi_param_3]; ld.param.u32 %r5, [setPhi_param_4]; ld.param.u32 %r6, [setPhi_param_5]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_7; cvta.to.global.u64 %rd5, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; cvt.s64.s32 %rd1, %r17; mul.wide.s32 %rd6, %r17, 4; add.s64 %rd7, %rd5, %rd6; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f1, [%rd9]; abs.f32 %f2, %f1; ld.global.nc.f32 %f3, [%rd7]; abs.f32 %f4, %f3; setp.eq.f32 %p6, %f2, 0f00000000; setp.eq.f32 %p7, %f4, 0f00000000; and.pred %p8, %p6, %p7; @%p8 bra $L__BB0_5; bra.uni $L__BB0_2; $L__BB0_5: mov.b32 %r28, %f1; shr.s32 %r29, %r28, 31; and.b32 %r30, %r29, 1078530011; mov.b32 %r31, %f3; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r30, %r32; mov.b32 %f35, %r33; bra.uni $L__BB0_6; $L__BB0_2: setp.eq.f32 %p9, %f2, 0f7F800000; setp.eq.f32 %p10, %f4, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: mov.b32 %r23, %f1; setp.lt.s32 %p15, %r23, 0; selp.b32 %r24, 1075235812, 1061752795, %p15; mov.b32 %r25, %f3; and.b32 %r26, %r25, -2147483648; or.b32 %r27, %r24, %r26; mov.b32 %f35, %r27; bra.uni $L__BB0_6; $L__BB0_3: max.f32 %f9, %f4, %f2; min.f32 %f10, %f4, %f2; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 
%f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f4, %f2; selp.f32 %f29, %f28, %f26, %p12; mov.b32 %r18, %f1; setp.lt.s32 %p13, %r18, 0; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r19, %f32; mov.b32 %r20, %f3; and.b32 %r21, %r20, -2147483648; or.b32 %r22, %r21, %r19; mov.b32 %f33, %r22; add.f32 %f34, %f2, %f4; setp.le.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f33, %f34, %p14; $L__BB0_6: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f35; $L__BB0_7: ret; } ` ) 3-3.11.1/cuda/reduce.go000066400000000000000000000054521503346766200144730ustar00rootroot00000000000000package cuda

import (
	"math"
	"unsafe"

	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/data"
	"github.com/mumax/3/util"
)

//#include "reduce.h"
import "C"

// Block size for reduce kernels.
// Taken from reduce.h so that the Go launch configuration and the CUDA
// kernels agree on the same value.
const REDUCE_BLOCKSIZE = C.REDUCE_BLOCKSIZE

// Sum returns the sum of all elements of in.
// in must have exactly one component.
func Sum(in *data.Slice) float32 {
	util.Argument(in.NComp() == 1)
	out := reduceBuf(0)
	k_reducesum_async(in.DevPtr(0), out, 0, in.Len(), reducecfg)
	return copyback(out)
}

// Dot returns the dot product of a and b, accumulated over all components.
// a and b must have the same number of components and the same length.
func Dot(a, b *data.Slice) float32 {
	nComp := a.NComp()
	util.Argument(nComp == b.NComp())
	// The kernel is launched with a.Len() for both operands, so a shorter b
	// would be read past its end on the device. Validate the lengths up
	// front, as MaxVecDiff does for its operands.
	util.Argument(a.Len() == b.Len())
	out := reduceBuf(0)
	// not async over components
	for c := 0; c < nComp; c++ {
		k_reducedot_async(a.DevPtr(c), b.DevPtr(c), out, 0, a.Len(), reducecfg) // all components add to out
	}
	return copyback(out)
}

// MaxAbs returns the maximum of the absolute values of all elements of in.
// in must have exactly one component.
func MaxAbs(in *data.Slice) float32 {
	util.Argument(in.NComp() == 1)
	out := reduceBuf(0)
	k_reducemaxabs_async(in.DevPtr(0), out, 0, in.Len(), reducecfg)
	return copyback(out)
}

// Maximum of the norms of all vectors (x[i], y[i], z[i]).
// // max_i sqrt( x[i]*x[i] + y[i]*y[i] + z[i]*z[i] ) func MaxVecNorm(v *data.Slice) float64 { out := reduceBuf(0) k_reducemaxvecnorm2_async(v.DevPtr(0), v.DevPtr(1), v.DevPtr(2), out, 0, v.Len(), reducecfg) return math.Sqrt(float64(copyback(out))) } // Maximum of the norms of the difference between all vectors (x1,y1,z1) and (x2,y2,z2) // // (dx, dy, dz) = (x1, y1, z1) - (x2, y2, z2) // max_i sqrt( dx[i]*dx[i] + dy[i]*dy[i] + dz[i]*dz[i] ) func MaxVecDiff(x, y *data.Slice) float64 { util.Argument(x.Len() == y.Len()) out := reduceBuf(0) k_reducemaxvecdiff2_async(x.DevPtr(0), x.DevPtr(1), x.DevPtr(2), y.DevPtr(0), y.DevPtr(1), y.DevPtr(2), out, 0, x.Len(), reducecfg) return math.Sqrt(float64(copyback(out))) } var reduceBuffers chan unsafe.Pointer // pool of 1-float CUDA buffers for reduce // return a 1-float CUDA reduction buffer from a pool // initialized to initVal func reduceBuf(initVal float32) unsafe.Pointer { if reduceBuffers == nil { initReduceBuf() } buf := <-reduceBuffers cu.MemsetD32Async(cu.DevicePtr(uintptr(buf)), math.Float32bits(initVal), 1, stream0) return buf } // copy back single float result from GPU and recycle buffer func copyback(buf unsafe.Pointer) float32 { var result float32 MemCpyDtoH(unsafe.Pointer(&result), buf, cu.SIZEOF_FLOAT32) reduceBuffers <- buf return result } // initialize pool of 1-float CUDA reduction buffers func initReduceBuf() { const N = 128 reduceBuffers = make(chan unsafe.Pointer, N) for i := 0; i < N; i++ { reduceBuffers <- MemAlloc(1 * cu.SIZEOF_FLOAT32) } } // launch configuration for reduce kernels // 8 is typ. number of multiprocessors. // could be improved but takes hardly ~1% of execution time var reducecfg = &config{Grid: cu.Dim3{X: 8, Y: 1, Z: 1}, Block: cu.Dim3{X: REDUCE_BLOCKSIZE, Y: 1, Z: 1}} 3-3.11.1/cuda/reduce.h000066400000000000000000000044441503346766200143150ustar00rootroot00000000000000#ifndef _REDUCE_H_ #define _REDUCE_H_ // Block size for reduce kernels. 
#define REDUCE_BLOCKSIZE 512

// This macro expands to a reduce kernel with arbitrary reduce operation.
// Ugly, perhaps, but arguably nicer than some 1000+ line C++ template.
// load(i): loads element i, possibly pre-processing the data
// op(a, b): reduce operation. e.g. sum
// atomicOp(a, b): atomic reduce operation in global mem.
//
// The expansion is the whole kernel body and uses three identifiers that it
// does not declare, so the enclosing kernel must provide them:
//   dst:     pointer handed to atomicOp together with the block's result
//   initVal: starting value of every thread's private accumulator
//   n:       number of elements; load(i) is evaluated for i < n only
//
// Steps, in order:
//   1. Each thread starts at its global index and advances by
//      gridDim.x * blockDim.x, folding load(i) into its accumulator with op.
//   2. The accumulators are stored in shared memory (sdata) and combined
//      pairwise, with a __syncthreads() after every halving, while the
//      half-width s is larger than 32.
//   3. Threads 0..31 finish the remaining steps (offsets 32 down to 1)
//      through a volatile pointer, without barriers in between.
//   4. Thread 0 merges the block's result into dst with atomicOp.
//
// Launch constraints that follow from the indexing:
//   - sdata holds REDUCE_BLOCKSIZE floats and is indexed by threadIdx.x,
//     so blockDim.x must not exceed REDUCE_BLOCKSIZE.
//   - step 3 reads smem[tid + 32] for tid < 32, so blockDim.x should be at
//     least 64 for those slots to have been written.
//   - NOTE(review): the halving loop presumably expects blockDim.x to be a
//     power of two -- confirm against the launch configuration.
//   - NOTE(review): initVal seeds every thread's accumulator, so it
//     presumably has to be neutral for op (the visible Go callers pass 0).
//   - NOTE(review): step 3 relies on volatile accesses alone for ordering
//     between the threads of the last warp; confirm this is still valid on
//     the targeted architectures (see the CUDA documentation on __syncwarp).
#define reduce(load, op, atomicOp)                      \
    __shared__ float sdata[REDUCE_BLOCKSIZE];           \
    int tid = threadIdx.x;                              \
    int i =  blockIdx.x * blockDim.x + threadIdx.x;     \
                                                        \
    float mine = initVal;                               \
    int stride = gridDim.x * blockDim.x;                \
    while (i < n) {                                     \
        mine = op(mine, load(i));                       \
        i += stride;                                    \
    }                                                   \
    sdata[tid] = mine;                                  \
    __syncthreads();                                    \
                                                        \
    for (unsigned int s=blockDim.x/2; s>32; s>>=1) {    \
        if (tid < s){                                   \
            sdata[tid] = op(sdata[tid], sdata[tid + s]);\
        }                                               \
        __syncthreads();                                \
    }                                                   \
                                                        \
    if (tid < 32) {                                     \
        volatile float* smem = sdata;                   \
        smem[tid] = op(smem[tid], smem[tid + 32]);      \
        smem[tid] = op(smem[tid], smem[tid + 16]);      \
        smem[tid] = op(smem[tid], smem[tid +  8]);      \
        smem[tid] = op(smem[tid], smem[tid +  4]);      \
        smem[tid] = op(smem[tid], smem[tid +  2]);      \
        smem[tid] = op(smem[tid], smem[tid +  1]);      \
    }                                                   \
                                                        \
    if (tid == 0) { atomicOp(dst, sdata[0]); }          \

// Based on "Optimizing parallel reduction in CUDA" by Mark Harris.
#endif 3-3.11.1/cuda/reduce_test.go000066400000000000000000000031411503346766200155230ustar00rootroot00000000000000package cuda import ( "testing" "unsafe" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // test input data var in1, in2, in3 *data.Slice func initTest() { if in1 != nil { return } { inh1 := make([]float32, 1000) for i := range inh1 { inh1[i] = float32(i) } in1 = toGPU(inh1) } { inh2 := make([]float32, 100000) for i := range inh2 { inh2[i] = -float32(i) / 100 } in2 = toGPU(inh2) } } func toGPU(list []float32) *data.Slice { mesh := [3]int{1, 1, len(list)} h := sliceFromList([][]float32{list}, mesh) d := NewSlice(1, mesh) data.Copy(d, h) return d } func TestReduceSum(t *testing.T) { initTest() result := Sum(in1) if result != 499500 { t.Error("got:", result) } } func TestReduceDot(t *testing.T) { initTest() // test for 1 comp a := toGPU([]float32{1, 2, 3, 4, 5}) b := toGPU([]float32{5, 4, 3, -1, 2}) result := Dot(a, b) if result != 5+8+9-4+10 { t.Error("got:", result) } // test for 3 comp const N = 32 mesh := [3]int{1, 1, N} c := NewSlice(3, mesh) d := NewSlice(3, mesh) Memset(c, 1, 2, 3) Memset(d, 4, 5, 6) result = Dot(c, d) if result != N*(4+10+18) { t.Error("got:", result) } } func TestReduceMaxAbs(t *testing.T) { result := MaxAbs(in1) if result != 999 { t.Error("got:", result) } result = MaxAbs(in2) if result != 999.99 { t.Error("got:", result) } } func sliceFromList(arr [][]float32, size [3]int) *data.Slice { ptrs := make([]unsafe.Pointer, len(arr)) for i := range ptrs { util.Argument(len(arr[i]) == prod(size)) ptrs[i] = unsafe.Pointer(&arr[i][0]) } return data.SliceFromPtrs(size, data.CPUMemory, ptrs) } 3-3.11.1/cuda/reducedot.cu000066400000000000000000000004051503346766200151750ustar00rootroot00000000000000#include "reduce.h" #include "sum.h" #define load_prod(i) (x1[i] * x2[i]) extern "C" __global__ void reducedot(float* __restrict__ x1, float* __restrict__ x2, float*__restrict__ dst, float initVal, int n) { reduce(load_prod, sum, atomicAdd) 
} 3-3.11.1/cuda/reducedot_wrapper.go000066400000000000000000001713061503346766200167440ustar00rootroot00000000000000package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for reducedot kernel
// Stays zero until the first call of k_reducedot_async loads the kernel.
var reducedot_code cu.Function

// Stores the arguments for reducedot kernel invocation
// argptr holds the addresses of the arg_* fields (set up in init); the
// embedded mutex is locked by k_reducedot_async while it fills the fields
// and launches the kernel.
type reducedot_args_t struct {
	arg_x1      unsafe.Pointer
	arg_x2      unsafe.Pointer
	arg_dst     unsafe.Pointer
	arg_initVal float32
	arg_n       int
	argptr      [5]unsafe.Pointer
	sync.Mutex
}

// Stores the arguments for reducedot kernel invocation
// Single package-level instance shared by all calls.
var reducedot_args reducedot_args_t

func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	reducedot_args.argptr[0] = unsafe.Pointer(&reducedot_args.arg_x1)
	reducedot_args.argptr[1] = unsafe.Pointer(&reducedot_args.arg_x2)
	reducedot_args.argptr[2] = unsafe.Pointer(&reducedot_args.arg_dst)
	reducedot_args.argptr[3] = unsafe.Pointer(&reducedot_args.arg_initVal)
	reducedot_args.argptr[4] = unsafe.Pointer(&reducedot_args.arg_n)
}

// Wrapper for reducedot CUDA kernel, asynchronous.
// Stores the arguments in the shared reducedot_args struct and launches the
// kernel on stream0 with the grid and block dimensions taken from cfg.
// When Synchronous is set, the launch is bracketed by Sync() calls and timed
// under the name "reducedot".
func k_reducedot_async(x1 unsafe.Pointer, x2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("reducedot")
	}

	// Held until return: the argument struct is shared between all callers
	// and must not change between filling it and the launch below.
	reducedot_args.Lock()
	defer reducedot_args.Unlock()

	// Load the kernel on first use.
	if reducedot_code == 0 {
		reducedot_code = fatbinLoad(reducedot_map, "reducedot")
	}

	reducedot_args.arg_x1 = x1
	reducedot_args.arg_x2 = x2
	reducedot_args.arg_dst = dst
	reducedot_args.arg_initVal = initVal
	reducedot_args.arg_n = n

	args := reducedot_args.argptr[:]
	cu.LaunchKernel(reducedot_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("reducedot")
	}
}

// maps compute capability on PTX code for reducedot kernel.
var reducedot_map = map[int]string{0: "", 50: reducedot_ptx_50, 52: reducedot_ptx_52, 53: reducedot_ptx_53, 60: reducedot_ptx_60, 61: reducedot_ptx_61, 62: reducedot_ptx_62, 70: reducedot_ptx_70, 72: reducedot_ptx_72, 75: reducedot_ptx_75, 80: reducedot_ptx_80, 86: reducedot_ptx_86, 87: reducedot_ptx_87, 89: reducedot_ptx_89, 90: reducedot_ptx_90} // reducedot PTX code for various compute capabilities. const ( reducedot_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<37>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd12, [reducedot_param_0]; ld.param.u64 %rd13, [reducedot_param_1]; ld.param.u64 %rd11, [reducedot_param_2]; ld.param.f32 %f50, [reducedot_param_3]; ld.param.u32 %r17, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r34, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; fma.rn.f32 %f50, %f11, %f10, %f50; add.s32 %r34, %r34, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: 
mul.wide.s32 %rd15, %r34, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f12, [%rd17]; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f12, %f50; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f15, [%rd19]; ld.global.nc.f32 %f16, [%rd18]; fma.rn.f32 %f17, %f16, %f15, %f14; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f18, [%rd21]; ld.global.nc.f32 %f19, [%rd20]; fma.rn.f32 %f20, %f19, %f18, %f17; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f21, [%rd23]; ld.global.nc.f32 %f22, [%rd22]; fma.rn.f32 %f50, %f22, %f21, %f20; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducedotE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; add.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; add.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; add.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; add.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; add.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; 
ld.volatile.shared.f32 %f39, [%r14]; add.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; add.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd24, %rd11; atom.global.add.f32 %f45, [%rd24], %f44; $L__BB0_15: ret; } ` reducedot_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<37>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd12, [reducedot_param_0]; ld.param.u64 %rd13, [reducedot_param_1]; ld.param.u64 %rd11, [reducedot_param_2]; ld.param.f32 %f50, [reducedot_param_3]; ld.param.u32 %r17, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r34, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; fma.rn.f32 %f50, %f11, %f10, %f50; add.s32 %r34, %r34, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 
%rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r34, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f12, [%rd17]; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f12, %f50; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f15, [%rd19]; ld.global.nc.f32 %f16, [%rd18]; fma.rn.f32 %f17, %f16, %f15, %f14; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f18, [%rd21]; ld.global.nc.f32 %f19, [%rd20]; fma.rn.f32 %f20, %f19, %f18, %f17; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f21, [%rd23]; ld.global.nc.f32 %f22, [%rd22]; fma.rn.f32 %f50, %f22, %f21, %f20; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducedotE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; add.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; add.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; add.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; add.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; add.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, 
[%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; add.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; add.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd24, %rd11; atom.global.add.f32 %f45, [%rd24], %f44; $L__BB0_15: ret; } ` reducedot_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<37>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd12, [reducedot_param_0]; ld.param.u64 %rd13, [reducedot_param_1]; ld.param.u64 %rd11, [reducedot_param_2]; ld.param.f32 %f50, [reducedot_param_3]; ld.param.u32 %r17, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r34, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; fma.rn.f32 %f50, %f11, %f10, %f50; add.s32 %r34, %r34, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; 
mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r34, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f12, [%rd17]; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f12, %f50; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f15, [%rd19]; ld.global.nc.f32 %f16, [%rd18]; fma.rn.f32 %f17, %f16, %f15, %f14; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f18, [%rd21]; ld.global.nc.f32 %f19, [%rd20]; fma.rn.f32 %f20, %f19, %f18, %f17; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f21, [%rd23]; ld.global.nc.f32 %f22, [%rd22]; fma.rn.f32 %f50, %f22, %f21, %f20; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducedotE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; add.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; add.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; add.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; add.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; add.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; 
ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; add.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; add.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd24, %rd11; atom.global.add.f32 %f45, [%rd24], %f44; $L__BB0_15: ret; } ` reducedot_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<37>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd12, [reducedot_param_0]; ld.param.u64 %rd13, [reducedot_param_1]; ld.param.u64 %rd11, [reducedot_param_2]; ld.param.f32 %f50, [reducedot_param_3]; ld.param.u32 %r17, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r34, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; fma.rn.f32 %f50, %f11, %f10, %f50; add.s32 %r34, %r34, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 
3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r34, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f12, [%rd17]; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f12, %f50; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f15, [%rd19]; ld.global.nc.f32 %f16, [%rd18]; fma.rn.f32 %f17, %f16, %f15, %f14; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f18, [%rd21]; ld.global.nc.f32 %f19, [%rd20]; fma.rn.f32 %f20, %f19, %f18, %f17; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f21, [%rd23]; ld.global.nc.f32 %f22, [%rd22]; fma.rn.f32 %f50, %f22, %f21, %f20; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducedotE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; add.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; add.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; add.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; add.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; add.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], 
%f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; add.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; add.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd24, %rd11; atom.global.add.f32 %f45, [%rd24], %f44; $L__BB0_15: ret; } ` reducedot_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<37>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd12, [reducedot_param_0]; ld.param.u64 %rd13, [reducedot_param_1]; ld.param.u64 %rd11, [reducedot_param_2]; ld.param.f32 %f50, [reducedot_param_3]; ld.param.u32 %r17, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r34, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; fma.rn.f32 %f50, %f11, %f10, %f50; add.s32 %r34, %r34, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, 
%r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r34, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f12, [%rd17]; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f12, %f50; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f15, [%rd19]; ld.global.nc.f32 %f16, [%rd18]; fma.rn.f32 %f17, %f16, %f15, %f14; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f18, [%rd21]; ld.global.nc.f32 %f19, [%rd20]; fma.rn.f32 %f20, %f19, %f18, %f17; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f21, [%rd23]; ld.global.nc.f32 %f22, [%rd22]; fma.rn.f32 %f50, %f22, %f21, %f20; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducedotE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; add.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; add.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; add.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; add.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; add.f32 %f37, %f36, %f35; st.volatile.shared.f32 
[%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; add.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; add.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd24, %rd11; atom.global.add.f32 %f45, [%rd24], %f44; $L__BB0_15: ret; } ` reducedot_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<37>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd12, [reducedot_param_0]; ld.param.u64 %rd13, [reducedot_param_1]; ld.param.u64 %rd11, [reducedot_param_2]; ld.param.f32 %f50, [reducedot_param_3]; ld.param.u32 %r17, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r34, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; fma.rn.f32 %f50, %f11, %f10, %f50; add.s32 %r34, %r34, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: 
setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r34, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f12, [%rd17]; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f12, %f50; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f15, [%rd19]; ld.global.nc.f32 %f16, [%rd18]; fma.rn.f32 %f17, %f16, %f15, %f14; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f18, [%rd21]; ld.global.nc.f32 %f19, [%rd20]; fma.rn.f32 %f20, %f19, %f18, %f17; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f21, [%rd23]; ld.global.nc.f32 %f22, [%rd22]; fma.rn.f32 %f50, %f22, %f21, %f20; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducedotE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; add.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; add.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; add.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; add.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; add.f32 %f37, %f36, %f35; 
st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; add.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; add.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd24, %rd11; atom.global.add.f32 %f45, [%rd24], %f44; $L__BB0_15: ret; } ` reducedot_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<37>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd12, [reducedot_param_0]; ld.param.u64 %rd13, [reducedot_param_1]; ld.param.u64 %rd11, [reducedot_param_2]; ld.param.f32 %f50, [reducedot_param_3]; ld.param.u32 %r17, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r34, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; fma.rn.f32 %f50, %f11, %f10, %f50; add.s32 %r34, %r34, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra 
$L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r34, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f12, [%rd17]; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f12, %f50; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f15, [%rd19]; ld.global.nc.f32 %f16, [%rd18]; fma.rn.f32 %f17, %f16, %f15, %f14; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f18, [%rd21]; ld.global.nc.f32 %f19, [%rd20]; fma.rn.f32 %f20, %f19, %f18, %f17; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f21, [%rd23]; ld.global.nc.f32 %f22, [%rd22]; fma.rn.f32 %f50, %f22, %f21, %f20; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducedotE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; add.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; add.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; add.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; add.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; add.f32 %f37, 
%f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; add.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; add.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd24, %rd11; atom.global.add.f32 %f45, [%rd24], %f44; $L__BB0_15: ret; } ` reducedot_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<37>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd12, [reducedot_param_0]; ld.param.u64 %rd13, [reducedot_param_1]; ld.param.u64 %rd11, [reducedot_param_2]; ld.param.f32 %f50, [reducedot_param_3]; ld.param.u32 %r17, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r34, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; fma.rn.f32 %f50, %f11, %f10, %f50; add.s32 %r34, %r34, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 
bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r34, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f12, [%rd17]; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f12, %f50; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f15, [%rd19]; ld.global.nc.f32 %f16, [%rd18]; fma.rn.f32 %f17, %f16, %f15, %f14; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f18, [%rd21]; ld.global.nc.f32 %f19, [%rd20]; fma.rn.f32 %f20, %f19, %f18, %f17; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f21, [%rd23]; ld.global.nc.f32 %f22, [%rd22]; fma.rn.f32 %f50, %f22, %f21, %f20; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducedotE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; add.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; add.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; add.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; add.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; add.f32 
%f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; add.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; add.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd24, %rd11; atom.global.add.f32 %f45, [%rd24], %f44; $L__BB0_15: ret; } ` reducedot_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<37>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd12, [reducedot_param_0]; ld.param.u64 %rd13, [reducedot_param_1]; ld.param.u64 %rd11, [reducedot_param_2]; ld.param.f32 %f50, [reducedot_param_3]; ld.param.u32 %r17, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r34, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; fma.rn.f32 %f50, %f11, %f10, %f50; add.s32 %r34, %r34, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; 
@%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r34, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f12, [%rd17]; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f12, %f50; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f15, [%rd19]; ld.global.nc.f32 %f16, [%rd18]; fma.rn.f32 %f17, %f16, %f15, %f14; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f18, [%rd21]; ld.global.nc.f32 %f19, [%rd20]; fma.rn.f32 %f20, %f19, %f18, %f17; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f21, [%rd23]; ld.global.nc.f32 %f22, [%rd22]; fma.rn.f32 %f50, %f22, %f21, %f20; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducedotE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; add.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; add.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; add.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; add.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; 
add.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; add.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; add.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd24, %rd11; atom.global.add.f32 %f45, [%rd24], %f44; $L__BB0_15: ret; } ` reducedot_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<37>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd12, [reducedot_param_0]; ld.param.u64 %rd13, [reducedot_param_1]; ld.param.u64 %rd11, [reducedot_param_2]; ld.param.f32 %f50, [reducedot_param_3]; ld.param.u32 %r17, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r34, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; fma.rn.f32 %f50, %f11, %f10, %f50; add.s32 %r34, %r34, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, 
%r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r34, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f12, [%rd17]; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f12, %f50; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f15, [%rd19]; ld.global.nc.f32 %f16, [%rd18]; fma.rn.f32 %f17, %f16, %f15, %f14; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f18, [%rd21]; ld.global.nc.f32 %f19, [%rd20]; fma.rn.f32 %f20, %f19, %f18, %f17; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f21, [%rd23]; ld.global.nc.f32 %f22, [%rd22]; fma.rn.f32 %f50, %f22, %f21, %f20; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducedotE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; add.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; add.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; add.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; add.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, 
[%r14]; add.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; add.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; add.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd24, %rd11; atom.global.add.f32 %f45, [%rd24], %f44; $L__BB0_15: ret; } ` reducedot_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<37>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd12, [reducedot_param_0]; ld.param.u64 %rd13, [reducedot_param_1]; ld.param.u64 %rd11, [reducedot_param_2]; ld.param.f32 %f50, [reducedot_param_3]; ld.param.u32 %r17, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r34, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; fma.rn.f32 %f50, %f11, %f10, %f50; add.s32 %r34, %r34, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r33, %r33, -1; 
setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r34, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f12, [%rd17]; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f12, %f50; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f15, [%rd19]; ld.global.nc.f32 %f16, [%rd18]; fma.rn.f32 %f17, %f16, %f15, %f14; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f18, [%rd21]; ld.global.nc.f32 %f19, [%rd20]; fma.rn.f32 %f20, %f19, %f18, %f17; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f21, [%rd23]; ld.global.nc.f32 %f22, [%rd22]; fma.rn.f32 %f50, %f22, %f21, %f20; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducedotE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; add.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; add.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; add.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; add.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; 
ld.volatile.shared.f32 %f36, [%r14]; add.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; add.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; add.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd24, %rd11; atom.global.add.f32 %f45, [%rd24], %f44; $L__BB0_15: ret; } ` reducedot_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<37>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd12, [reducedot_param_0]; ld.param.u64 %rd13, [reducedot_param_1]; ld.param.u64 %rd11, [reducedot_param_2]; ld.param.f32 %f50, [reducedot_param_3]; ld.param.u32 %r17, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r34, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; fma.rn.f32 %f50, %f11, %f10, %f50; add.s32 %r34, %r34, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; 
add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r34, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f12, [%rd17]; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f12, %f50; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f15, [%rd19]; ld.global.nc.f32 %f16, [%rd18]; fma.rn.f32 %f17, %f16, %f15, %f14; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f18, [%rd21]; ld.global.nc.f32 %f19, [%rd20]; fma.rn.f32 %f20, %f19, %f18, %f17; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f21, [%rd23]; ld.global.nc.f32 %f22, [%rd22]; fma.rn.f32 %f50, %f22, %f21, %f20; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducedotE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; add.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; add.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; add.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; add.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, 
[%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; add.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; add.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; add.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd24, %rd11; atom.global.add.f32 %f45, [%rd24], %f44; $L__BB0_15: ret; } ` reducedot_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<37>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd12, [reducedot_param_0]; ld.param.u64 %rd13, [reducedot_param_1]; ld.param.u64 %rd11, [reducedot_param_2]; ld.param.f32 %f50, [reducedot_param_3]; ld.param.u32 %r17, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r34, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; fma.rn.f32 %f50, %f11, %f10, %f50; add.s32 %r34, %r34, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, 
%rd4; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r34, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f12, [%rd17]; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f12, %f50; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f15, [%rd19]; ld.global.nc.f32 %f16, [%rd18]; fma.rn.f32 %f17, %f16, %f15, %f14; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f18, [%rd21]; ld.global.nc.f32 %f19, [%rd20]; fma.rn.f32 %f20, %f19, %f18, %f17; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f21, [%rd23]; ld.global.nc.f32 %f22, [%rd22]; fma.rn.f32 %f50, %f22, %f21, %f20; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducedotE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; add.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; add.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; add.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; add.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 
%f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; add.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; add.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; add.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd24, %rd11; atom.global.add.f32 %f45, [%rd24], %f44; $L__BB0_15: ret; } ` reducedot_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<37>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd12, [reducedot_param_0]; ld.param.u64 %rd13, [reducedot_param_1]; ld.param.u64 %rd11, [reducedot_param_2]; ld.param.f32 %f50, [reducedot_param_3]; ld.param.u32 %r17, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r34, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; fma.rn.f32 %f50, %f11, %f10, %f50; add.s32 %r34, %r34, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, 
%rd25, %rd4; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r34, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f12, [%rd17]; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f12, %f50; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f15, [%rd19]; ld.global.nc.f32 %f16, [%rd18]; fma.rn.f32 %f17, %f16, %f15, %f14; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f18, [%rd21]; ld.global.nc.f32 %f19, [%rd20]; fma.rn.f32 %f20, %f19, %f18, %f17; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f21, [%rd23]; ld.global.nc.f32 %f22, [%rd22]; fma.rn.f32 %f50, %f22, %f21, %f20; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducedotE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; add.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; add.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; add.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; add.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; 
ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; add.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; add.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; add.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd24, %rd11; atom.global.add.f32 %f45, [%rd24], %f44; $L__BB0_15: ret; } ` ) 3-3.11.1/cuda/reducemaxabs.cu000066400000000000000000000003551503346766200156660ustar00rootroot00000000000000#include "reduce.h" #include "atomicf.h" #define load_fabs(i) fabs(src[i]) extern "C" __global__ void reducemaxabs(float* __restrict__ src, float* __restrict__ dst, float initVal, int n) { reduce(load_fabs, fmax, atomicFmaxabs) } 3-3.11.1/cuda/reducemaxabs_wrapper.go000066400000000000000000001605501503346766200174300ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for reducemaxabs kernel var reducemaxabs_code cu.Function // Stores the arguments for reducemaxabs kernel invocation type reducemaxabs_args_t struct { arg_src unsafe.Pointer arg_dst unsafe.Pointer arg_initVal float32 arg_n int argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for reducemaxabs kernel invocation var reducemaxabs_args reducemaxabs_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
reducemaxabs_args.argptr[0] = unsafe.Pointer(&reducemaxabs_args.arg_src) reducemaxabs_args.argptr[1] = unsafe.Pointer(&reducemaxabs_args.arg_dst) reducemaxabs_args.argptr[2] = unsafe.Pointer(&reducemaxabs_args.arg_initVal) reducemaxabs_args.argptr[3] = unsafe.Pointer(&reducemaxabs_args.arg_n) } // Wrapper for reducemaxabs CUDA kernel, asynchronous. func k_reducemaxabs_async(src unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { if Synchronous { // debug Sync() timer.Start("reducemaxabs") } reducemaxabs_args.Lock() defer reducemaxabs_args.Unlock() if reducemaxabs_code == 0 { reducemaxabs_code = fatbinLoad(reducemaxabs_map, "reducemaxabs") } reducemaxabs_args.arg_src = src reducemaxabs_args.arg_dst = dst reducemaxabs_args.arg_initVal = initVal reducemaxabs_args.arg_n = n args := reducemaxabs_args.argptr[:] cu.LaunchKernel(reducemaxabs_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("reducemaxabs") } } // maps compute capability on PTX code for reducemaxabs kernel. var reducemaxabs_map = map[int]string{0: "", 50: reducemaxabs_ptx_50, 52: reducemaxabs_ptx_52, 53: reducemaxabs_ptx_53, 60: reducemaxabs_ptx_60, 61: reducemaxabs_ptx_61, 62: reducemaxabs_ptx_62, 70: reducemaxabs_ptx_70, 72: reducemaxabs_ptx_72, 75: reducemaxabs_ptx_75, 80: reducemaxabs_ptx_80, 86: reducemaxabs_ptx_86, 87: reducemaxabs_ptx_87, 89: reducemaxabs_ptx_89, 90: reducemaxabs_ptx_90} // reducemaxabs PTX code for various compute capabilities. 
const ( reducemaxabs_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<39>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd8, [reducemaxabs_param_0]; ld.param.u64 %rd7, [reducemaxabs_param_1]; ld.param.f32 %f50, [reducemaxabs_param_2]; ld.param.u32 %r17, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r36, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; abs.f32 %f11, %f10; max.f32 %f50, %f50, %f11; add.s32 %r36, %r36, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r36, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f12, [%rd11]; abs.f32 %f13, %f12; max.f32 %f14, %f50, %f13; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f15, [%rd12]; abs.f32 %f16, %f15; max.f32 %f17, %f14, %f16; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f18, [%rd13]; abs.f32 %f19, %f18; max.f32 %f20, %f17, %f19; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f21, [%rd14]; abs.f32 %f22, %f21; max.f32 %f50, %f20, %f22; add.s32 %r36, %r27, %r4; 
setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ12reducemaxabsE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; max.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; max.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; max.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; max.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f45, %f44; mov.b32 %r32, %f45; cvta.to.global.u64 %rd15, %rd7; atom.global.max.s32 %r33, [%rd15], %r32; $L__BB0_15: ret; } ` reducemaxabs_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<11>; .reg .f32 
%f<51>; .reg .b32 %r<39>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd8, [reducemaxabs_param_0]; ld.param.u64 %rd7, [reducemaxabs_param_1]; ld.param.f32 %f50, [reducemaxabs_param_2]; ld.param.u32 %r17, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r36, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; abs.f32 %f11, %f10; max.f32 %f50, %f50, %f11; add.s32 %r36, %r36, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r36, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f12, [%rd11]; abs.f32 %f13, %f12; max.f32 %f14, %f50, %f13; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f15, [%rd12]; abs.f32 %f16, %f15; max.f32 %f17, %f14, %f16; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f18, [%rd13]; abs.f32 %f19, %f18; max.f32 %f20, %f17, %f19; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f21, [%rd14]; abs.f32 %f22, %f21; max.f32 %f50, %f20, %f22; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ12reducemaxabsE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 
bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; max.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; max.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; max.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; max.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f45, %f44; mov.b32 %r32, %f45; cvta.to.global.u64 %rd15, %rd7; atom.global.max.s32 %r33, [%rd15], %r32; $L__BB0_15: ret; } ` reducemaxabs_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<39>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd8, [reducemaxabs_param_0]; ld.param.u64 %rd7, [reducemaxabs_param_1]; ld.param.f32 %f50, [reducemaxabs_param_2]; ld.param.u32 %r17, [reducemaxabs_param_3]; 
cvta.to.global.u64 %rd1, %rd8; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r36, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; abs.f32 %f11, %f10; max.f32 %f50, %f50, %f11; add.s32 %r36, %r36, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r36, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f12, [%rd11]; abs.f32 %f13, %f12; max.f32 %f14, %f50, %f13; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f15, [%rd12]; abs.f32 %f16, %f15; max.f32 %f17, %f14, %f16; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f18, [%rd13]; abs.f32 %f19, %f18; max.f32 %f20, %f17, %f19; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f21, [%rd14]; abs.f32 %f22, %f21; max.f32 %f50, %f20, %f22; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ12reducemaxabsE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; max.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: 
setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; max.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; max.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; max.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f45, %f44; mov.b32 %r32, %f45; cvta.to.global.u64 %rd15, %rd7; atom.global.max.s32 %r33, [%rd15], %r32; $L__BB0_15: ret; } ` reducemaxabs_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<39>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd8, [reducemaxabs_param_0]; ld.param.u64 %rd7, [reducemaxabs_param_1]; ld.param.f32 %f50, [reducemaxabs_param_2]; ld.param.u32 %r17, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 
%r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r36, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; abs.f32 %f11, %f10; max.f32 %f50, %f50, %f11; add.s32 %r36, %r36, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r36, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f12, [%rd11]; abs.f32 %f13, %f12; max.f32 %f14, %f50, %f13; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f15, [%rd12]; abs.f32 %f16, %f15; max.f32 %f17, %f14, %f16; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f18, [%rd13]; abs.f32 %f19, %f18; max.f32 %f20, %f17, %f19; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f21, [%rd14]; abs.f32 %f22, %f21; max.f32 %f50, %f20, %f22; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ12reducemaxabsE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; max.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; max.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; max.f32 %f31, %f30, %f29; 
st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; max.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; max.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f45, %f44; mov.b32 %r32, %f45; cvta.to.global.u64 %rd15, %rd7; atom.global.max.s32 %r33, [%rd15], %r32; $L__BB0_15: ret; } ` reducemaxabs_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<39>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd8, [reducemaxabs_param_0]; ld.param.u64 %rd7, [reducemaxabs_param_1]; ld.param.f32 %f50, [reducemaxabs_param_2]; ld.param.u32 %r17, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r36, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, 
[%rd16]; abs.f32 %f11, %f10; max.f32 %f50, %f50, %f11; add.s32 %r36, %r36, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r36, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f12, [%rd11]; abs.f32 %f13, %f12; max.f32 %f14, %f50, %f13; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f15, [%rd12]; abs.f32 %f16, %f15; max.f32 %f17, %f14, %f16; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f18, [%rd13]; abs.f32 %f19, %f18; max.f32 %f20, %f17, %f19; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f21, [%rd14]; abs.f32 %f22, %f21; max.f32 %f50, %f20, %f22; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ12reducemaxabsE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; max.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; max.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; max.f32 %f37, %f36, %f35; 
st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; max.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; max.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f45, %f44; mov.b32 %r32, %f45; cvta.to.global.u64 %rd15, %rd7; atom.global.max.s32 %r33, [%rd15], %r32; $L__BB0_15: ret; } ` reducemaxabs_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<39>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd8, [reducemaxabs_param_0]; ld.param.u64 %rd7, [reducemaxabs_param_1]; ld.param.f32 %f50, [reducemaxabs_param_2]; ld.param.u32 %r17, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r36, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; abs.f32 %f11, %f10; max.f32 %f50, %f50, %f11; add.s32 %r36, %r36, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: 
mul.wide.s32 %rd10, %r36, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f12, [%rd11]; abs.f32 %f13, %f12; max.f32 %f14, %f50, %f13; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f15, [%rd12]; abs.f32 %f16, %f15; max.f32 %f17, %f14, %f16; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f18, [%rd13]; abs.f32 %f19, %f18; max.f32 %f20, %f17, %f19; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f21, [%rd14]; abs.f32 %f22, %f21; max.f32 %f50, %f20, %f22; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ12reducemaxabsE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; max.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; max.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; max.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; max.f32 %f43, %f42, %f41; 
st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f45, %f44; mov.b32 %r32, %f45; cvta.to.global.u64 %rd15, %rd7; atom.global.max.s32 %r33, [%rd15], %r32; $L__BB0_15: ret; } ` reducemaxabs_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<39>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd8, [reducemaxabs_param_0]; ld.param.u64 %rd7, [reducemaxabs_param_1]; ld.param.f32 %f50, [reducemaxabs_param_2]; ld.param.u32 %r17, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r36, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; abs.f32 %f11, %f10; max.f32 %f50, %f50, %f11; add.s32 %r36, %r36, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r36, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f12, [%rd11]; abs.f32 %f13, %f12; max.f32 %f14, %f50, %f13; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f15, [%rd12]; abs.f32 %f16, %f15; max.f32 %f17, %f14, %f16; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, 
%r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f18, [%rd13]; abs.f32 %f19, %f18; max.f32 %f20, %f17, %f19; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f21, [%rd14]; abs.f32 %f22, %f21; max.f32 %f50, %f20, %f22; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ12reducemaxabsE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; max.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; max.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; max.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; max.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f45, %f44; mov.b32 %r32, %f45; cvta.to.global.u64 %rd15, %rd7; atom.global.max.s32 %r33, [%rd15], %r32; $L__BB0_15: ret; } ` reducemaxabs_ptx_72 = 
` .version 8.5 .target sm_72 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<39>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd8, [reducemaxabs_param_0]; ld.param.u64 %rd7, [reducemaxabs_param_1]; ld.param.f32 %f50, [reducemaxabs_param_2]; ld.param.u32 %r17, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r36, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; abs.f32 %f11, %f10; max.f32 %f50, %f50, %f11; add.s32 %r36, %r36, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r36, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f12, [%rd11]; abs.f32 %f13, %f12; max.f32 %f14, %f50, %f13; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f15, [%rd12]; abs.f32 %f16, %f15; max.f32 %f17, %f14, %f16; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f18, [%rd13]; abs.f32 %f19, %f18; max.f32 %f20, %f17, %f19; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f21, [%rd14]; abs.f32 %f22, %f21; max.f32 %f50, %f20, %f22; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 
bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ12reducemaxabsE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; max.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; max.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; max.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; max.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f45, %f44; mov.b32 %r32, %f45; cvta.to.global.u64 %rd15, %rd7; atom.global.max.s32 %r33, [%rd15], %r32; $L__BB0_15: ret; } ` reducemaxabs_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<39>; .reg 
.b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd8, [reducemaxabs_param_0]; ld.param.u64 %rd7, [reducemaxabs_param_1]; ld.param.f32 %f50, [reducemaxabs_param_2]; ld.param.u32 %r17, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r36, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; abs.f32 %f11, %f10; max.f32 %f50, %f50, %f11; add.s32 %r36, %r36, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r36, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f12, [%rd11]; abs.f32 %f13, %f12; max.f32 %f14, %f50, %f13; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f15, [%rd12]; abs.f32 %f16, %f15; max.f32 %f17, %f14, %f16; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f18, [%rd13]; abs.f32 %f19, %f18; max.f32 %f20, %f17, %f19; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f21, [%rd14]; abs.f32 %f22, %f21; max.f32 %f50, %f20, %f22; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ12reducemaxabsE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 
%f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; max.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; max.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; max.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; max.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f45, %f44; mov.b32 %r32, %f45; cvta.to.global.u64 %rd15, %rd7; atom.global.max.s32 %r33, [%rd15], %r32; $L__BB0_15: ret; } ` reducemaxabs_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<39>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd8, [reducemaxabs_param_0]; ld.param.u64 %rd7, [reducemaxabs_param_1]; ld.param.f32 %f50, [reducemaxabs_param_2]; ld.param.u32 %r17, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 
%r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r36, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; abs.f32 %f11, %f10; max.f32 %f50, %f50, %f11; add.s32 %r36, %r36, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r36, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f12, [%rd11]; abs.f32 %f13, %f12; max.f32 %f14, %f50, %f13; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f15, [%rd12]; abs.f32 %f16, %f15; max.f32 %f17, %f14, %f16; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f18, [%rd13]; abs.f32 %f19, %f18; max.f32 %f20, %f17, %f19; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f21, [%rd14]; abs.f32 %f22, %f21; max.f32 %f50, %f20, %f22; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ12reducemaxabsE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; max.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; 
ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; max.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; max.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; max.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f45, %f44; mov.b32 %r32, %f45; cvta.to.global.u64 %rd15, %rd7; atom.global.max.s32 %r33, [%rd15], %r32; $L__BB0_15: ret; } ` reducemaxabs_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<39>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd8, [reducemaxabs_param_0]; ld.param.u64 %rd7, [reducemaxabs_param_1]; ld.param.f32 %f50, [reducemaxabs_param_2]; ld.param.u32 %r17, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 
%r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r36, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; abs.f32 %f11, %f10; max.f32 %f50, %f50, %f11; add.s32 %r36, %r36, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r36, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f12, [%rd11]; abs.f32 %f13, %f12; max.f32 %f14, %f50, %f13; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f15, [%rd12]; abs.f32 %f16, %f15; max.f32 %f17, %f14, %f16; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f18, [%rd13]; abs.f32 %f19, %f18; max.f32 %f20, %f17, %f19; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f21, [%rd14]; abs.f32 %f22, %f21; max.f32 %f50, %f20, %f22; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ12reducemaxabsE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; max.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; max.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; 
ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; max.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; max.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f45, %f44; mov.b32 %r32, %f45; cvta.to.global.u64 %rd15, %rd7; atom.global.max.s32 %r33, [%rd15], %r32; $L__BB0_15: ret; } ` reducemaxabs_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<39>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd8, [reducemaxabs_param_0]; ld.param.u64 %rd7, [reducemaxabs_param_1]; ld.param.f32 %f50, [reducemaxabs_param_2]; ld.param.u32 %r17, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r36, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; abs.f32 %f11, %f10; max.f32 
%f50, %f50, %f11; add.s32 %r36, %r36, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r36, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f12, [%rd11]; abs.f32 %f13, %f12; max.f32 %f14, %f50, %f13; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f15, [%rd12]; abs.f32 %f16, %f15; max.f32 %f17, %f14, %f16; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f18, [%rd13]; abs.f32 %f19, %f18; max.f32 %f20, %f17, %f19; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f21, [%rd14]; abs.f32 %f22, %f21; max.f32 %f50, %f20, %f22; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ12reducemaxabsE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; max.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; max.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; 
ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; max.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; max.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f45, %f44; mov.b32 %r32, %f45; cvta.to.global.u64 %rd15, %rd7; atom.global.max.s32 %r33, [%rd15], %r32; $L__BB0_15: ret; } ` reducemaxabs_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<39>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd8, [reducemaxabs_param_0]; ld.param.u64 %rd7, [reducemaxabs_param_1]; ld.param.f32 %f50, [reducemaxabs_param_2]; ld.param.u32 %r17, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r36, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; abs.f32 %f11, %f10; max.f32 %f50, %f50, %f11; add.s32 %r36, %r36, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r36, 4; add.s64 %rd11, 
%rd1, %rd10; ld.global.nc.f32 %f12, [%rd11]; abs.f32 %f13, %f12; max.f32 %f14, %f50, %f13; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f15, [%rd12]; abs.f32 %f16, %f15; max.f32 %f17, %f14, %f16; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f18, [%rd13]; abs.f32 %f19, %f18; max.f32 %f20, %f17, %f19; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f21, [%rd14]; abs.f32 %f22, %f21; max.f32 %f50, %f20, %f22; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ12reducemaxabsE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; max.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; max.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; max.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; max.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: 
setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f45, %f44; mov.b32 %r32, %f45; cvta.to.global.u64 %rd15, %rd7; atom.global.max.s32 %r33, [%rd15], %r32; $L__BB0_15: ret; } ` reducemaxabs_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<51>; .reg .b32 %r<39>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd8, [reducemaxabs_param_0]; ld.param.u64 %rd7, [reducemaxabs_param_1]; ld.param.f32 %f50, [reducemaxabs_param_2]; ld.param.u32 %r17, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r36, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; abs.f32 %f11, %f10; max.f32 %f50, %f50, %f11; add.s32 %r36, %r36, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r36, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f12, [%rd11]; abs.f32 %f13, %f12; max.f32 %f14, %f50, %f13; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f15, [%rd12]; abs.f32 %f16, %f15; max.f32 %f17, %f14, %f16; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 
%f18, [%rd13]; abs.f32 %f19, %f18; max.f32 %f20, %f17, %f19; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f21, [%rd14]; abs.f32 %f22, %f21; max.f32 %f50, %f20, %f22; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ12reducemaxabsE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f50; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f23, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f24, [%r31]; max.f32 %f25, %f23, %f24; st.shared.f32 [%r14], %f25; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f26, [%r14]; ld.volatile.shared.f32 %f27, [%r14+128]; max.f32 %f28, %f26, %f27; st.volatile.shared.f32 [%r14], %f28; ld.volatile.shared.f32 %f29, [%r14+64]; ld.volatile.shared.f32 %f30, [%r14]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r14], %f31; ld.volatile.shared.f32 %f32, [%r14+32]; ld.volatile.shared.f32 %f33, [%r14]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r14], %f34; ld.volatile.shared.f32 %f35, [%r14+16]; ld.volatile.shared.f32 %f36, [%r14]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r14], %f37; ld.volatile.shared.f32 %f38, [%r14+8]; ld.volatile.shared.f32 %f39, [%r14]; max.f32 %f40, %f39, %f38; st.volatile.shared.f32 [%r14], %f40; ld.volatile.shared.f32 %f41, [%r14+4]; ld.volatile.shared.f32 %f42, [%r14]; max.f32 %f43, %f42, %f41; st.volatile.shared.f32 [%r14], %f43; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f44, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f45, %f44; mov.b32 %r32, %f45; cvta.to.global.u64 %rd15, %rd7; atom.global.max.s32 %r33, [%rd15], %r32; $L__BB0_15: ret; } ` ) 
3-3.11.1/cuda/reducemaxdiff.cu000066400000000000000000000004251503346766200160270ustar00rootroot00000000000000#include "reduce.h"
#include "atomicf.h"

// Per-element loader passed to the reduce macro: the absolute value of the
// difference between element i of src1 and element i of src2.
#define load_diff(i) fabs(src1[i] - src2[i])

// Kernel entry point. Its entire body is the reduce(...) macro, instantiated
// with load_diff, fmax and atomicFmaxabs.
// NOTE(review): the macro itself is not visible here (presumably it comes from
// reduce.h). It looks like it folds load_diff(i) over i in [0, n) with fmax,
// starting from initVal, and merges the result into *dst through
// atomicFmaxabs -- confirm against reduce.h / atomicf.h.
extern "C" __global__ void
reducemaxdiff(float* __restrict__ src1, float* __restrict__ src2, float* __restrict__ dst, float initVal, int n) {
    reduce(load_diff, fmax, atomicFmaxabs)
}
3-3.11.1/cuda/reducemaxdiff_wrapper.go000066400000000000000000002023521503346766200175700ustar00rootroot00000000000000package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for reducemaxdiff kernel.
// It is compared against 0 and filled in by the launch wrapper on first use.
var reducemaxdiff_code cu.Function

// Stores the arguments for reducemaxdiff kernel invocation.
// The arg_* fields hold one value per kernel parameter (src1, src2, dst,
// initVal, n); argptr holds the address of each of those fields, in the same
// order. The embedded sync.Mutex is locked by the launch wrapper while it
// fills in the fields and launches the kernel.
type reducemaxdiff_args_t struct {
	arg_src1    unsafe.Pointer
	arg_src2    unsafe.Pointer
	arg_dst     unsafe.Pointer
	arg_initVal float32
	arg_n       int
	argptr      [5]unsafe.Pointer
	sync.Mutex
}

// Stores the arguments for reducemaxdiff kernel invocation.
// A single package-level instance is shared by every call of the wrapper.
var reducemaxdiff_args reducemaxdiff_args_t

// init wires argptr[k] to the address of the k-th arg_* field, so later calls
// only have to overwrite the field values.
func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	reducemaxdiff_args.argptr[0] = unsafe.Pointer(&reducemaxdiff_args.arg_src1)
	reducemaxdiff_args.argptr[1] = unsafe.Pointer(&reducemaxdiff_args.arg_src2)
	reducemaxdiff_args.argptr[2] = unsafe.Pointer(&reducemaxdiff_args.arg_dst)
	reducemaxdiff_args.argptr[3] = unsafe.Pointer(&reducemaxdiff_args.arg_initVal)
	reducemaxdiff_args.argptr[4] = unsafe.Pointer(&reducemaxdiff_args.arg_n)
}

// Wrapper for reducemaxdiff CUDA kernel, asynchronous.
func k_reducemaxdiff_async(src1 unsafe.Pointer, src2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { if Synchronous { // debug Sync() timer.Start("reducemaxdiff") } reducemaxdiff_args.Lock() defer reducemaxdiff_args.Unlock() if reducemaxdiff_code == 0 { reducemaxdiff_code = fatbinLoad(reducemaxdiff_map, "reducemaxdiff") } reducemaxdiff_args.arg_src1 = src1 reducemaxdiff_args.arg_src2 = src2 reducemaxdiff_args.arg_dst = dst reducemaxdiff_args.arg_initVal = initVal reducemaxdiff_args.arg_n = n args := reducemaxdiff_args.argptr[:] cu.LaunchKernel(reducemaxdiff_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("reducemaxdiff") } } // maps compute capability on PTX code for reducemaxdiff kernel. var reducemaxdiff_map = map[int]string{0: "", 50: reducemaxdiff_ptx_50, 52: reducemaxdiff_ptx_52, 53: reducemaxdiff_ptx_53, 60: reducemaxdiff_ptx_60, 61: reducemaxdiff_ptx_61, 62: reducemaxdiff_ptx_62, 70: reducemaxdiff_ptx_70, 72: reducemaxdiff_ptx_72, 75: reducemaxdiff_ptx_75, 80: reducemaxdiff_ptx_80, 86: reducemaxdiff_ptx_86, 87: reducemaxdiff_ptx_87, 89: reducemaxdiff_ptx_89, 90: reducemaxdiff_ptx_90} // reducemaxdiff PTX code for various compute capabilities. 
const ( reducemaxdiff_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<61>; .reg .b32 %r<39>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd12, [reducemaxdiff_param_0]; ld.param.u64 %rd13, [reducemaxdiff_param_1]; ld.param.u64 %rd11, [reducemaxdiff_param_2]; ld.param.f32 %f60, [reducemaxdiff_param_3]; ld.param.u32 %r17, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r36, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; sub.f32 %f12, %f11, %f10; abs.f32 %f13, %f12; max.f32 %f60, %f60, %f13; add.s32 %r36, %r36, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r36, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f14, [%rd17]; ld.global.nc.f32 %f15, [%rd16]; sub.f32 %f16, %f15, %f14; abs.f32 %f17, %f16; max.f32 %f18, %f60, %f17; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f19, [%rd19]; 
ld.global.nc.f32 %f20, [%rd18]; sub.f32 %f21, %f20, %f19; abs.f32 %f22, %f21; max.f32 %f23, %f18, %f22; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f24, [%rd21]; ld.global.nc.f32 %f25, [%rd20]; sub.f32 %f26, %f25, %f24; abs.f32 %f27, %f26; max.f32 %f28, %f23, %f27; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f29, [%rd23]; ld.global.nc.f32 %f30, [%rd22]; sub.f32 %f31, %f30, %f29; abs.f32 %f32, %f31; max.f32 %f60, %f28, %f32; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ13reducemaxdiffE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f60; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f33, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f34, [%r31]; max.f32 %f35, %f33, %f34; st.shared.f32 [%r14], %f35; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f36, [%r14]; ld.volatile.shared.f32 %f37, [%r14+128]; max.f32 %f38, %f36, %f37; st.volatile.shared.f32 [%r14], %f38; ld.volatile.shared.f32 %f39, [%r14+64]; ld.volatile.shared.f32 %f40, [%r14]; max.f32 %f41, %f40, %f39; st.volatile.shared.f32 [%r14], %f41; ld.volatile.shared.f32 %f42, [%r14+32]; ld.volatile.shared.f32 %f43, [%r14]; max.f32 %f44, %f43, %f42; st.volatile.shared.f32 [%r14], %f44; ld.volatile.shared.f32 %f45, [%r14+16]; ld.volatile.shared.f32 %f46, [%r14]; max.f32 %f47, %f46, %f45; st.volatile.shared.f32 [%r14], %f47; ld.volatile.shared.f32 %f48, [%r14+8]; ld.volatile.shared.f32 %f49, [%r14]; max.f32 %f50, %f49, %f48; st.volatile.shared.f32 [%r14], %f50; ld.volatile.shared.f32 %f51, [%r14+4]; ld.volatile.shared.f32 %f52, [%r14]; 
max.f32 %f53, %f52, %f51; st.volatile.shared.f32 [%r14], %f53; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f54, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f55, %f54; mov.b32 %r32, %f55; cvta.to.global.u64 %rd24, %rd11; atom.global.max.s32 %r33, [%rd24], %r32; $L__BB0_15: ret; } ` reducemaxdiff_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<61>; .reg .b32 %r<39>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd12, [reducemaxdiff_param_0]; ld.param.u64 %rd13, [reducemaxdiff_param_1]; ld.param.u64 %rd11, [reducemaxdiff_param_2]; ld.param.f32 %f60, [reducemaxdiff_param_3]; ld.param.u32 %r17, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r36, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; sub.f32 %f12, %f11, %f10; abs.f32 %f13, %f12; max.f32 %f60, %f60, %f13; add.s32 %r36, %r36, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 
%rd15, %r36, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f14, [%rd17]; ld.global.nc.f32 %f15, [%rd16]; sub.f32 %f16, %f15, %f14; abs.f32 %f17, %f16; max.f32 %f18, %f60, %f17; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f19, [%rd19]; ld.global.nc.f32 %f20, [%rd18]; sub.f32 %f21, %f20, %f19; abs.f32 %f22, %f21; max.f32 %f23, %f18, %f22; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f24, [%rd21]; ld.global.nc.f32 %f25, [%rd20]; sub.f32 %f26, %f25, %f24; abs.f32 %f27, %f26; max.f32 %f28, %f23, %f27; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f29, [%rd23]; ld.global.nc.f32 %f30, [%rd22]; sub.f32 %f31, %f30, %f29; abs.f32 %f32, %f31; max.f32 %f60, %f28, %f32; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ13reducemaxdiffE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f60; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f33, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f34, [%r31]; max.f32 %f35, %f33, %f34; st.shared.f32 [%r14], %f35; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f36, [%r14]; ld.volatile.shared.f32 %f37, [%r14+128]; max.f32 %f38, %f36, %f37; st.volatile.shared.f32 [%r14], %f38; ld.volatile.shared.f32 %f39, [%r14+64]; ld.volatile.shared.f32 %f40, [%r14]; max.f32 %f41, %f40, %f39; st.volatile.shared.f32 [%r14], %f41; ld.volatile.shared.f32 %f42, [%r14+32]; ld.volatile.shared.f32 %f43, [%r14]; max.f32 %f44, %f43, %f42; st.volatile.shared.f32 [%r14], %f44; ld.volatile.shared.f32 %f45, [%r14+16]; 
ld.volatile.shared.f32 %f46, [%r14]; max.f32 %f47, %f46, %f45; st.volatile.shared.f32 [%r14], %f47; ld.volatile.shared.f32 %f48, [%r14+8]; ld.volatile.shared.f32 %f49, [%r14]; max.f32 %f50, %f49, %f48; st.volatile.shared.f32 [%r14], %f50; ld.volatile.shared.f32 %f51, [%r14+4]; ld.volatile.shared.f32 %f52, [%r14]; max.f32 %f53, %f52, %f51; st.volatile.shared.f32 [%r14], %f53; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f54, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f55, %f54; mov.b32 %r32, %f55; cvta.to.global.u64 %rd24, %rd11; atom.global.max.s32 %r33, [%rd24], %r32; $L__BB0_15: ret; } ` reducemaxdiff_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<61>; .reg .b32 %r<39>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd12, [reducemaxdiff_param_0]; ld.param.u64 %rd13, [reducemaxdiff_param_1]; ld.param.u64 %rd11, [reducemaxdiff_param_2]; ld.param.f32 %f60, [reducemaxdiff_param_3]; ld.param.u32 %r17, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r36, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; sub.f32 %f12, 
%f11, %f10; abs.f32 %f13, %f12; max.f32 %f60, %f60, %f13; add.s32 %r36, %r36, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r36, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f14, [%rd17]; ld.global.nc.f32 %f15, [%rd16]; sub.f32 %f16, %f15, %f14; abs.f32 %f17, %f16; max.f32 %f18, %f60, %f17; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f19, [%rd19]; ld.global.nc.f32 %f20, [%rd18]; sub.f32 %f21, %f20, %f19; abs.f32 %f22, %f21; max.f32 %f23, %f18, %f22; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f24, [%rd21]; ld.global.nc.f32 %f25, [%rd20]; sub.f32 %f26, %f25, %f24; abs.f32 %f27, %f26; max.f32 %f28, %f23, %f27; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f29, [%rd23]; ld.global.nc.f32 %f30, [%rd22]; sub.f32 %f31, %f30, %f29; abs.f32 %f32, %f31; max.f32 %f60, %f28, %f32; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ13reducemaxdiffE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f60; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f33, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f34, [%r31]; max.f32 %f35, %f33, %f34; st.shared.f32 [%r14], %f35; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f36, [%r14]; ld.volatile.shared.f32 %f37, [%r14+128]; max.f32 %f38, %f36, %f37; st.volatile.shared.f32 [%r14], %f38; 
ld.volatile.shared.f32 %f39, [%r14+64]; ld.volatile.shared.f32 %f40, [%r14]; max.f32 %f41, %f40, %f39; st.volatile.shared.f32 [%r14], %f41; ld.volatile.shared.f32 %f42, [%r14+32]; ld.volatile.shared.f32 %f43, [%r14]; max.f32 %f44, %f43, %f42; st.volatile.shared.f32 [%r14], %f44; ld.volatile.shared.f32 %f45, [%r14+16]; ld.volatile.shared.f32 %f46, [%r14]; max.f32 %f47, %f46, %f45; st.volatile.shared.f32 [%r14], %f47; ld.volatile.shared.f32 %f48, [%r14+8]; ld.volatile.shared.f32 %f49, [%r14]; max.f32 %f50, %f49, %f48; st.volatile.shared.f32 [%r14], %f50; ld.volatile.shared.f32 %f51, [%r14+4]; ld.volatile.shared.f32 %f52, [%r14]; max.f32 %f53, %f52, %f51; st.volatile.shared.f32 [%r14], %f53; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f54, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f55, %f54; mov.b32 %r32, %f55; cvta.to.global.u64 %rd24, %rd11; atom.global.max.s32 %r33, [%rd24], %r32; $L__BB0_15: ret; } ` reducemaxdiff_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<61>; .reg .b32 %r<39>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd12, [reducemaxdiff_param_0]; ld.param.u64 %rd13, [reducemaxdiff_param_1]; ld.param.u64 %rd11, [reducemaxdiff_param_2]; ld.param.f32 %f60, [reducemaxdiff_param_3]; ld.param.u32 %r17, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, 
%r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r36, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; sub.f32 %f12, %f11, %f10; abs.f32 %f13, %f12; max.f32 %f60, %f60, %f13; add.s32 %r36, %r36, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r36, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f14, [%rd17]; ld.global.nc.f32 %f15, [%rd16]; sub.f32 %f16, %f15, %f14; abs.f32 %f17, %f16; max.f32 %f18, %f60, %f17; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f19, [%rd19]; ld.global.nc.f32 %f20, [%rd18]; sub.f32 %f21, %f20, %f19; abs.f32 %f22, %f21; max.f32 %f23, %f18, %f22; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f24, [%rd21]; ld.global.nc.f32 %f25, [%rd20]; sub.f32 %f26, %f25, %f24; abs.f32 %f27, %f26; max.f32 %f28, %f23, %f27; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f29, [%rd23]; ld.global.nc.f32 %f30, [%rd22]; sub.f32 %f31, %f30, %f29; abs.f32 %f32, %f31; max.f32 %f60, %f28, %f32; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ13reducemaxdiffE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f60; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f33, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f34, [%r31]; max.f32 %f35, %f33, %f34; st.shared.f32 [%r14], 
%f35; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f36, [%r14]; ld.volatile.shared.f32 %f37, [%r14+128]; max.f32 %f38, %f36, %f37; st.volatile.shared.f32 [%r14], %f38; ld.volatile.shared.f32 %f39, [%r14+64]; ld.volatile.shared.f32 %f40, [%r14]; max.f32 %f41, %f40, %f39; st.volatile.shared.f32 [%r14], %f41; ld.volatile.shared.f32 %f42, [%r14+32]; ld.volatile.shared.f32 %f43, [%r14]; max.f32 %f44, %f43, %f42; st.volatile.shared.f32 [%r14], %f44; ld.volatile.shared.f32 %f45, [%r14+16]; ld.volatile.shared.f32 %f46, [%r14]; max.f32 %f47, %f46, %f45; st.volatile.shared.f32 [%r14], %f47; ld.volatile.shared.f32 %f48, [%r14+8]; ld.volatile.shared.f32 %f49, [%r14]; max.f32 %f50, %f49, %f48; st.volatile.shared.f32 [%r14], %f50; ld.volatile.shared.f32 %f51, [%r14+4]; ld.volatile.shared.f32 %f52, [%r14]; max.f32 %f53, %f52, %f51; st.volatile.shared.f32 [%r14], %f53; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f54, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f55, %f54; mov.b32 %r32, %f55; cvta.to.global.u64 %rd24, %rd11; atom.global.max.s32 %r33, [%rd24], %r32; $L__BB0_15: ret; } ` reducemaxdiff_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<61>; .reg .b32 %r<39>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd12, [reducemaxdiff_param_0]; ld.param.u64 %rd13, [reducemaxdiff_param_1]; ld.param.u64 %rd11, [reducemaxdiff_param_2]; ld.param.f32 %f60, [reducemaxdiff_param_3]; ld.param.u32 %r17, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r38, %ntid.x; 
mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r36, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; sub.f32 %f12, %f11, %f10; abs.f32 %f13, %f12; max.f32 %f60, %f60, %f13; add.s32 %r36, %r36, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r36, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f14, [%rd17]; ld.global.nc.f32 %f15, [%rd16]; sub.f32 %f16, %f15, %f14; abs.f32 %f17, %f16; max.f32 %f18, %f60, %f17; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f19, [%rd19]; ld.global.nc.f32 %f20, [%rd18]; sub.f32 %f21, %f20, %f19; abs.f32 %f22, %f21; max.f32 %f23, %f18, %f22; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f24, [%rd21]; ld.global.nc.f32 %f25, [%rd20]; sub.f32 %f26, %f25, %f24; abs.f32 %f27, %f26; max.f32 %f28, %f23, %f27; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f29, [%rd23]; ld.global.nc.f32 %f30, [%rd22]; sub.f32 %f31, %f30, %f29; abs.f32 %f32, %f31; max.f32 %f60, %f28, %f32; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ13reducemaxdiffE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f60; 
bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f33, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f34, [%r31]; max.f32 %f35, %f33, %f34; st.shared.f32 [%r14], %f35; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f36, [%r14]; ld.volatile.shared.f32 %f37, [%r14+128]; max.f32 %f38, %f36, %f37; st.volatile.shared.f32 [%r14], %f38; ld.volatile.shared.f32 %f39, [%r14+64]; ld.volatile.shared.f32 %f40, [%r14]; max.f32 %f41, %f40, %f39; st.volatile.shared.f32 [%r14], %f41; ld.volatile.shared.f32 %f42, [%r14+32]; ld.volatile.shared.f32 %f43, [%r14]; max.f32 %f44, %f43, %f42; st.volatile.shared.f32 [%r14], %f44; ld.volatile.shared.f32 %f45, [%r14+16]; ld.volatile.shared.f32 %f46, [%r14]; max.f32 %f47, %f46, %f45; st.volatile.shared.f32 [%r14], %f47; ld.volatile.shared.f32 %f48, [%r14+8]; ld.volatile.shared.f32 %f49, [%r14]; max.f32 %f50, %f49, %f48; st.volatile.shared.f32 [%r14], %f50; ld.volatile.shared.f32 %f51, [%r14+4]; ld.volatile.shared.f32 %f52, [%r14]; max.f32 %f53, %f52, %f51; st.volatile.shared.f32 [%r14], %f53; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f54, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f55, %f54; mov.b32 %r32, %f55; cvta.to.global.u64 %rd24, %rd11; atom.global.max.s32 %r33, [%rd24], %r32; $L__BB0_15: ret; } ` reducemaxdiff_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<61>; .reg .b32 %r<39>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 
%rd12, [reducemaxdiff_param_0]; ld.param.u64 %rd13, [reducemaxdiff_param_1]; ld.param.u64 %rd11, [reducemaxdiff_param_2]; ld.param.f32 %f60, [reducemaxdiff_param_3]; ld.param.u32 %r17, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r36, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; sub.f32 %f12, %f11, %f10; abs.f32 %f13, %f12; max.f32 %f60, %f60, %f13; add.s32 %r36, %r36, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r36, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f14, [%rd17]; ld.global.nc.f32 %f15, [%rd16]; sub.f32 %f16, %f15, %f14; abs.f32 %f17, %f16; max.f32 %f18, %f60, %f17; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f19, [%rd19]; ld.global.nc.f32 %f20, [%rd18]; sub.f32 %f21, %f20, %f19; abs.f32 %f22, %f21; max.f32 %f23, %f18, %f22; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f24, [%rd21]; ld.global.nc.f32 %f25, [%rd20]; sub.f32 %f26, %f25, %f24; abs.f32 %f27, %f26; max.f32 %f28, %f23, %f27; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f29, [%rd23]; 
ld.global.nc.f32 %f30, [%rd22]; sub.f32 %f31, %f30, %f29; abs.f32 %f32, %f31; max.f32 %f60, %f28, %f32; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ13reducemaxdiffE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f60; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f33, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f34, [%r31]; max.f32 %f35, %f33, %f34; st.shared.f32 [%r14], %f35; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f36, [%r14]; ld.volatile.shared.f32 %f37, [%r14+128]; max.f32 %f38, %f36, %f37; st.volatile.shared.f32 [%r14], %f38; ld.volatile.shared.f32 %f39, [%r14+64]; ld.volatile.shared.f32 %f40, [%r14]; max.f32 %f41, %f40, %f39; st.volatile.shared.f32 [%r14], %f41; ld.volatile.shared.f32 %f42, [%r14+32]; ld.volatile.shared.f32 %f43, [%r14]; max.f32 %f44, %f43, %f42; st.volatile.shared.f32 [%r14], %f44; ld.volatile.shared.f32 %f45, [%r14+16]; ld.volatile.shared.f32 %f46, [%r14]; max.f32 %f47, %f46, %f45; st.volatile.shared.f32 [%r14], %f47; ld.volatile.shared.f32 %f48, [%r14+8]; ld.volatile.shared.f32 %f49, [%r14]; max.f32 %f50, %f49, %f48; st.volatile.shared.f32 [%r14], %f50; ld.volatile.shared.f32 %f51, [%r14+4]; ld.volatile.shared.f32 %f52, [%r14]; max.f32 %f53, %f52, %f51; st.volatile.shared.f32 [%r14], %f53; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f54, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f55, %f54; mov.b32 %r32, %f55; cvta.to.global.u64 %rd24, %rd11; atom.global.max.s32 %r33, [%rd24], %r32; $L__BB0_15: ret; } ` reducemaxdiff_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 
reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<61>; .reg .b32 %r<39>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd12, [reducemaxdiff_param_0]; ld.param.u64 %rd13, [reducemaxdiff_param_1]; ld.param.u64 %rd11, [reducemaxdiff_param_2]; ld.param.f32 %f60, [reducemaxdiff_param_3]; ld.param.u32 %r17, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r36, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; sub.f32 %f12, %f11, %f10; abs.f32 %f13, %f12; max.f32 %f60, %f60, %f13; add.s32 %r36, %r36, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r36, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f14, [%rd17]; ld.global.nc.f32 %f15, [%rd16]; sub.f32 %f16, %f15, %f14; abs.f32 %f17, %f16; max.f32 %f18, %f60, %f17; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f19, [%rd19]; ld.global.nc.f32 %f20, [%rd18]; sub.f32 %f21, %f20, %f19; abs.f32 %f22, %f21; max.f32 %f23, %f18, %f22; add.s32 %r25, %r36, %r4; add.s32 %r26, 
%r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f24, [%rd21]; ld.global.nc.f32 %f25, [%rd20]; sub.f32 %f26, %f25, %f24; abs.f32 %f27, %f26; max.f32 %f28, %f23, %f27; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f29, [%rd23]; ld.global.nc.f32 %f30, [%rd22]; sub.f32 %f31, %f30, %f29; abs.f32 %f32, %f31; max.f32 %f60, %f28, %f32; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ13reducemaxdiffE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f60; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f33, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f34, [%r31]; max.f32 %f35, %f33, %f34; st.shared.f32 [%r14], %f35; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f36, [%r14]; ld.volatile.shared.f32 %f37, [%r14+128]; max.f32 %f38, %f36, %f37; st.volatile.shared.f32 [%r14], %f38; ld.volatile.shared.f32 %f39, [%r14+64]; ld.volatile.shared.f32 %f40, [%r14]; max.f32 %f41, %f40, %f39; st.volatile.shared.f32 [%r14], %f41; ld.volatile.shared.f32 %f42, [%r14+32]; ld.volatile.shared.f32 %f43, [%r14]; max.f32 %f44, %f43, %f42; st.volatile.shared.f32 [%r14], %f44; ld.volatile.shared.f32 %f45, [%r14+16]; ld.volatile.shared.f32 %f46, [%r14]; max.f32 %f47, %f46, %f45; st.volatile.shared.f32 [%r14], %f47; ld.volatile.shared.f32 %f48, [%r14+8]; ld.volatile.shared.f32 %f49, [%r14]; max.f32 %f50, %f49, %f48; st.volatile.shared.f32 [%r14], %f50; ld.volatile.shared.f32 %f51, [%r14+4]; ld.volatile.shared.f32 %f52, [%r14]; max.f32 %f53, %f52, %f51; st.volatile.shared.f32 [%r14], %f53; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f54, 
[_ZZ13reducemaxdiffE5sdata]; abs.f32 %f55, %f54; mov.b32 %r32, %f55; cvta.to.global.u64 %rd24, %rd11; atom.global.max.s32 %r33, [%rd24], %r32; $L__BB0_15: ret; } ` reducemaxdiff_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<61>; .reg .b32 %r<39>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd12, [reducemaxdiff_param_0]; ld.param.u64 %rd13, [reducemaxdiff_param_1]; ld.param.u64 %rd11, [reducemaxdiff_param_2]; ld.param.f32 %f60, [reducemaxdiff_param_3]; ld.param.u32 %r17, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r36, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; sub.f32 %f12, %f11, %f10; abs.f32 %f13, %f12; max.f32 %f60, %f60, %f13; add.s32 %r36, %r36, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r36, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f14, [%rd17]; ld.global.nc.f32 %f15, [%rd16]; sub.f32 
%f16, %f15, %f14; abs.f32 %f17, %f16; max.f32 %f18, %f60, %f17; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f19, [%rd19]; ld.global.nc.f32 %f20, [%rd18]; sub.f32 %f21, %f20, %f19; abs.f32 %f22, %f21; max.f32 %f23, %f18, %f22; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f24, [%rd21]; ld.global.nc.f32 %f25, [%rd20]; sub.f32 %f26, %f25, %f24; abs.f32 %f27, %f26; max.f32 %f28, %f23, %f27; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f29, [%rd23]; ld.global.nc.f32 %f30, [%rd22]; sub.f32 %f31, %f30, %f29; abs.f32 %f32, %f31; max.f32 %f60, %f28, %f32; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ13reducemaxdiffE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f60; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f33, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f34, [%r31]; max.f32 %f35, %f33, %f34; st.shared.f32 [%r14], %f35; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f36, [%r14]; ld.volatile.shared.f32 %f37, [%r14+128]; max.f32 %f38, %f36, %f37; st.volatile.shared.f32 [%r14], %f38; ld.volatile.shared.f32 %f39, [%r14+64]; ld.volatile.shared.f32 %f40, [%r14]; max.f32 %f41, %f40, %f39; st.volatile.shared.f32 [%r14], %f41; ld.volatile.shared.f32 %f42, [%r14+32]; ld.volatile.shared.f32 %f43, [%r14]; max.f32 %f44, %f43, %f42; st.volatile.shared.f32 [%r14], %f44; ld.volatile.shared.f32 %f45, [%r14+16]; ld.volatile.shared.f32 %f46, [%r14]; max.f32 %f47, %f46, %f45; st.volatile.shared.f32 [%r14], %f47; ld.volatile.shared.f32 %f48, [%r14+8]; ld.volatile.shared.f32 
%f49, [%r14]; max.f32 %f50, %f49, %f48; st.volatile.shared.f32 [%r14], %f50; ld.volatile.shared.f32 %f51, [%r14+4]; ld.volatile.shared.f32 %f52, [%r14]; max.f32 %f53, %f52, %f51; st.volatile.shared.f32 [%r14], %f53; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f54, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f55, %f54; mov.b32 %r32, %f55; cvta.to.global.u64 %rd24, %rd11; atom.global.max.s32 %r33, [%rd24], %r32; $L__BB0_15: ret; } ` reducemaxdiff_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<61>; .reg .b32 %r<39>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd12, [reducemaxdiff_param_0]; ld.param.u64 %rd13, [reducemaxdiff_param_1]; ld.param.u64 %rd11, [reducemaxdiff_param_2]; ld.param.f32 %f60, [reducemaxdiff_param_3]; ld.param.u32 %r17, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r36, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; sub.f32 %f12, %f11, %f10; abs.f32 %f13, %f12; max.f32 %f60, %f60, %f13; add.s32 %r36, %r36, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r35, %r35, -1; 
setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r36, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f14, [%rd17]; ld.global.nc.f32 %f15, [%rd16]; sub.f32 %f16, %f15, %f14; abs.f32 %f17, %f16; max.f32 %f18, %f60, %f17; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f19, [%rd19]; ld.global.nc.f32 %f20, [%rd18]; sub.f32 %f21, %f20, %f19; abs.f32 %f22, %f21; max.f32 %f23, %f18, %f22; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f24, [%rd21]; ld.global.nc.f32 %f25, [%rd20]; sub.f32 %f26, %f25, %f24; abs.f32 %f27, %f26; max.f32 %f28, %f23, %f27; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f29, [%rd23]; ld.global.nc.f32 %f30, [%rd22]; sub.f32 %f31, %f30, %f29; abs.f32 %f32, %f31; max.f32 %f60, %f28, %f32; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ13reducemaxdiffE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f60; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f33, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f34, [%r31]; max.f32 %f35, %f33, %f34; st.shared.f32 [%r14], %f35; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f36, [%r14]; ld.volatile.shared.f32 %f37, [%r14+128]; max.f32 %f38, %f36, %f37; st.volatile.shared.f32 [%r14], %f38; ld.volatile.shared.f32 %f39, [%r14+64]; ld.volatile.shared.f32 %f40, [%r14]; max.f32 %f41, %f40, %f39; st.volatile.shared.f32 [%r14], %f41; ld.volatile.shared.f32 %f42, [%r14+32]; 
ld.volatile.shared.f32 %f43, [%r14]; max.f32 %f44, %f43, %f42; st.volatile.shared.f32 [%r14], %f44; ld.volatile.shared.f32 %f45, [%r14+16]; ld.volatile.shared.f32 %f46, [%r14]; max.f32 %f47, %f46, %f45; st.volatile.shared.f32 [%r14], %f47; ld.volatile.shared.f32 %f48, [%r14+8]; ld.volatile.shared.f32 %f49, [%r14]; max.f32 %f50, %f49, %f48; st.volatile.shared.f32 [%r14], %f50; ld.volatile.shared.f32 %f51, [%r14+4]; ld.volatile.shared.f32 %f52, [%r14]; max.f32 %f53, %f52, %f51; st.volatile.shared.f32 [%r14], %f53; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f54, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f55, %f54; mov.b32 %r32, %f55; cvta.to.global.u64 %rd24, %rd11; atom.global.max.s32 %r33, [%rd24], %r32; $L__BB0_15: ret; } ` reducemaxdiff_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<61>; .reg .b32 %r<39>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd12, [reducemaxdiff_param_0]; ld.param.u64 %rd13, [reducemaxdiff_param_1]; ld.param.u64 %rd11, [reducemaxdiff_param_2]; ld.param.f32 %f60, [reducemaxdiff_param_3]; ld.param.u32 %r17, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r36, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; 
add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; sub.f32 %f12, %f11, %f10; abs.f32 %f13, %f12; max.f32 %f60, %f60, %f13; add.s32 %r36, %r36, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r36, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f14, [%rd17]; ld.global.nc.f32 %f15, [%rd16]; sub.f32 %f16, %f15, %f14; abs.f32 %f17, %f16; max.f32 %f18, %f60, %f17; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f19, [%rd19]; ld.global.nc.f32 %f20, [%rd18]; sub.f32 %f21, %f20, %f19; abs.f32 %f22, %f21; max.f32 %f23, %f18, %f22; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f24, [%rd21]; ld.global.nc.f32 %f25, [%rd20]; sub.f32 %f26, %f25, %f24; abs.f32 %f27, %f26; max.f32 %f28, %f23, %f27; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f29, [%rd23]; ld.global.nc.f32 %f30, [%rd22]; sub.f32 %f31, %f30, %f29; abs.f32 %f32, %f31; max.f32 %f60, %f28, %f32; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ13reducemaxdiffE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f60; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f33, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f34, [%r31]; max.f32 %f35, %f33, %f34; st.shared.f32 [%r14], %f35; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 
%f36, [%r14]; ld.volatile.shared.f32 %f37, [%r14+128]; max.f32 %f38, %f36, %f37; st.volatile.shared.f32 [%r14], %f38; ld.volatile.shared.f32 %f39, [%r14+64]; ld.volatile.shared.f32 %f40, [%r14]; max.f32 %f41, %f40, %f39; st.volatile.shared.f32 [%r14], %f41; ld.volatile.shared.f32 %f42, [%r14+32]; ld.volatile.shared.f32 %f43, [%r14]; max.f32 %f44, %f43, %f42; st.volatile.shared.f32 [%r14], %f44; ld.volatile.shared.f32 %f45, [%r14+16]; ld.volatile.shared.f32 %f46, [%r14]; max.f32 %f47, %f46, %f45; st.volatile.shared.f32 [%r14], %f47; ld.volatile.shared.f32 %f48, [%r14+8]; ld.volatile.shared.f32 %f49, [%r14]; max.f32 %f50, %f49, %f48; st.volatile.shared.f32 [%r14], %f50; ld.volatile.shared.f32 %f51, [%r14+4]; ld.volatile.shared.f32 %f52, [%r14]; max.f32 %f53, %f52, %f51; st.volatile.shared.f32 [%r14], %f53; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f54, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f55, %f54; mov.b32 %r32, %f55; cvta.to.global.u64 %rd24, %rd11; atom.global.max.s32 %r33, [%rd24], %r32; $L__BB0_15: ret; } ` reducemaxdiff_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<61>; .reg .b32 %r<39>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd12, [reducemaxdiff_param_0]; ld.param.u64 %rd13, [reducemaxdiff_param_1]; ld.param.u64 %rd11, [reducemaxdiff_param_2]; ld.param.f32 %f60, [reducemaxdiff_param_3]; ld.param.u32 %r17, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra 
$L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r36, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; sub.f32 %f12, %f11, %f10; abs.f32 %f13, %f12; max.f32 %f60, %f60, %f13; add.s32 %r36, %r36, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r36, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f14, [%rd17]; ld.global.nc.f32 %f15, [%rd16]; sub.f32 %f16, %f15, %f14; abs.f32 %f17, %f16; max.f32 %f18, %f60, %f17; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f19, [%rd19]; ld.global.nc.f32 %f20, [%rd18]; sub.f32 %f21, %f20, %f19; abs.f32 %f22, %f21; max.f32 %f23, %f18, %f22; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f24, [%rd21]; ld.global.nc.f32 %f25, [%rd20]; sub.f32 %f26, %f25, %f24; abs.f32 %f27, %f26; max.f32 %f28, %f23, %f27; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f29, [%rd23]; ld.global.nc.f32 %f30, [%rd22]; sub.f32 %f31, %f30, %f29; abs.f32 %f32, %f31; max.f32 %f60, %f28, %f32; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ13reducemaxdiffE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f60; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f33, [%r14]; 
shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f34, [%r31]; max.f32 %f35, %f33, %f34; st.shared.f32 [%r14], %f35; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f36, [%r14]; ld.volatile.shared.f32 %f37, [%r14+128]; max.f32 %f38, %f36, %f37; st.volatile.shared.f32 [%r14], %f38; ld.volatile.shared.f32 %f39, [%r14+64]; ld.volatile.shared.f32 %f40, [%r14]; max.f32 %f41, %f40, %f39; st.volatile.shared.f32 [%r14], %f41; ld.volatile.shared.f32 %f42, [%r14+32]; ld.volatile.shared.f32 %f43, [%r14]; max.f32 %f44, %f43, %f42; st.volatile.shared.f32 [%r14], %f44; ld.volatile.shared.f32 %f45, [%r14+16]; ld.volatile.shared.f32 %f46, [%r14]; max.f32 %f47, %f46, %f45; st.volatile.shared.f32 [%r14], %f47; ld.volatile.shared.f32 %f48, [%r14+8]; ld.volatile.shared.f32 %f49, [%r14]; max.f32 %f50, %f49, %f48; st.volatile.shared.f32 [%r14], %f50; ld.volatile.shared.f32 %f51, [%r14+4]; ld.volatile.shared.f32 %f52, [%r14]; max.f32 %f53, %f52, %f51; st.volatile.shared.f32 [%r14], %f53; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f54, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f55, %f54; mov.b32 %r32, %f55; cvta.to.global.u64 %rd24, %rd11; atom.global.max.s32 %r33, [%rd24], %r32; $L__BB0_15: ret; } ` reducemaxdiff_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<61>; .reg .b32 %r<39>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd12, [reducemaxdiff_param_0]; ld.param.u64 %rd13, [reducemaxdiff_param_1]; ld.param.u64 %rd11, [reducemaxdiff_param_2]; ld.param.f32 %f60, [reducemaxdiff_param_3]; 
ld.param.u32 %r17, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r36, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; sub.f32 %f12, %f11, %f10; abs.f32 %f13, %f12; max.f32 %f60, %f60, %f13; add.s32 %r36, %r36, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r36, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f14, [%rd17]; ld.global.nc.f32 %f15, [%rd16]; sub.f32 %f16, %f15, %f14; abs.f32 %f17, %f16; max.f32 %f18, %f60, %f17; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f19, [%rd19]; ld.global.nc.f32 %f20, [%rd18]; sub.f32 %f21, %f20, %f19; abs.f32 %f22, %f21; max.f32 %f23, %f18, %f22; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f24, [%rd21]; ld.global.nc.f32 %f25, [%rd20]; sub.f32 %f26, %f25, %f24; abs.f32 %f27, %f26; max.f32 %f28, %f23, %f27; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f29, [%rd23]; ld.global.nc.f32 %f30, [%rd22]; sub.f32 %f31, %f30, %f29; abs.f32 %f32, %f31; max.f32 %f60, %f28, %f32; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; 
$L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ13reducemaxdiffE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f60; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f33, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f34, [%r31]; max.f32 %f35, %f33, %f34; st.shared.f32 [%r14], %f35; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f36, [%r14]; ld.volatile.shared.f32 %f37, [%r14+128]; max.f32 %f38, %f36, %f37; st.volatile.shared.f32 [%r14], %f38; ld.volatile.shared.f32 %f39, [%r14+64]; ld.volatile.shared.f32 %f40, [%r14]; max.f32 %f41, %f40, %f39; st.volatile.shared.f32 [%r14], %f41; ld.volatile.shared.f32 %f42, [%r14+32]; ld.volatile.shared.f32 %f43, [%r14]; max.f32 %f44, %f43, %f42; st.volatile.shared.f32 [%r14], %f44; ld.volatile.shared.f32 %f45, [%r14+16]; ld.volatile.shared.f32 %f46, [%r14]; max.f32 %f47, %f46, %f45; st.volatile.shared.f32 [%r14], %f47; ld.volatile.shared.f32 %f48, [%r14+8]; ld.volatile.shared.f32 %f49, [%r14]; max.f32 %f50, %f49, %f48; st.volatile.shared.f32 [%r14], %f50; ld.volatile.shared.f32 %f51, [%r14+4]; ld.volatile.shared.f32 %f52, [%r14]; max.f32 %f53, %f52, %f51; st.volatile.shared.f32 [%r14], %f53; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f54, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f55, %f54; mov.b32 %r32, %f55; cvta.to.global.u64 %rd24, %rd11; atom.global.max.s32 %r33, [%rd24], %r32; $L__BB0_15: ret; } ` reducemaxdiff_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<11>; .reg .f32 
%f<61>; .reg .b32 %r<39>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd12, [reducemaxdiff_param_0]; ld.param.u64 %rd13, [reducemaxdiff_param_1]; ld.param.u64 %rd11, [reducemaxdiff_param_2]; ld.param.f32 %f60, [reducemaxdiff_param_3]; ld.param.u32 %r17, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r36, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; sub.f32 %f12, %f11, %f10; abs.f32 %f13, %f12; max.f32 %f60, %f60, %f13; add.s32 %r36, %r36, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r36, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f14, [%rd17]; ld.global.nc.f32 %f15, [%rd16]; sub.f32 %f16, %f15, %f14; abs.f32 %f17, %f16; max.f32 %f18, %f60, %f17; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f19, [%rd19]; ld.global.nc.f32 %f20, [%rd18]; sub.f32 %f21, %f20, %f19; abs.f32 %f22, %f21; max.f32 %f23, %f18, %f22; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f24, [%rd21]; ld.global.nc.f32 %f25, [%rd20]; sub.f32 %f26, %f25, %f24; abs.f32 %f27, %f26; max.f32 %f28, %f23, 
%f27; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f29, [%rd23]; ld.global.nc.f32 %f30, [%rd22]; sub.f32 %f31, %f30, %f29; abs.f32 %f32, %f31; max.f32 %f60, %f28, %f32; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ13reducemaxdiffE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f60; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f33, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f34, [%r31]; max.f32 %f35, %f33, %f34; st.shared.f32 [%r14], %f35; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f36, [%r14]; ld.volatile.shared.f32 %f37, [%r14+128]; max.f32 %f38, %f36, %f37; st.volatile.shared.f32 [%r14], %f38; ld.volatile.shared.f32 %f39, [%r14+64]; ld.volatile.shared.f32 %f40, [%r14]; max.f32 %f41, %f40, %f39; st.volatile.shared.f32 [%r14], %f41; ld.volatile.shared.f32 %f42, [%r14+32]; ld.volatile.shared.f32 %f43, [%r14]; max.f32 %f44, %f43, %f42; st.volatile.shared.f32 [%r14], %f44; ld.volatile.shared.f32 %f45, [%r14+16]; ld.volatile.shared.f32 %f46, [%r14]; max.f32 %f47, %f46, %f45; st.volatile.shared.f32 [%r14], %f47; ld.volatile.shared.f32 %f48, [%r14+8]; ld.volatile.shared.f32 %f49, [%r14]; max.f32 %f50, %f49, %f48; st.volatile.shared.f32 [%r14], %f50; ld.volatile.shared.f32 %f51, [%r14+4]; ld.volatile.shared.f32 %f52, [%r14]; max.f32 %f53, %f52, %f51; st.volatile.shared.f32 [%r14], %f53; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f54, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f55, %f54; mov.b32 %r32, %f55; cvta.to.global.u64 %rd24, %rd11; atom.global.max.s32 %r33, [%rd24], %r32; $L__BB0_15: ret; } ` reducemaxdiff_ptx_90 = ` .version 
8.5 .target sm_90 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<11>; .reg .f32 %f<61>; .reg .b32 %r<39>; .reg .b64 %rd<27>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd12, [reducemaxdiff_param_0]; ld.param.u64 %rd13, [reducemaxdiff_param_1]; ld.param.u64 %rd11, [reducemaxdiff_param_2]; ld.param.f32 %f60, [reducemaxdiff_param_3]; ld.param.u32 %r17, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd12; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd14, %r36, 4; add.s64 %rd26, %rd1, %rd14; mul.wide.s32 %rd4, %r4, 4; add.s64 %rd25, %rd2, %rd14; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd26]; ld.global.nc.f32 %f11, [%rd25]; sub.f32 %f12, %f11, %f10; abs.f32 %f13, %f12; max.f32 %f60, %f60, %f13; add.s32 %r36, %r36, %r4; add.s64 %rd26, %rd26, %rd4; add.s64 %rd25, %rd25, %rd4; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd10, %r4, 4; $L__BB0_6: mul.wide.s32 %rd15, %r36, 4; add.s64 %rd16, %rd2, %rd15; add.s64 %rd17, %rd1, %rd15; ld.global.nc.f32 %f14, [%rd17]; ld.global.nc.f32 %f15, [%rd16]; sub.f32 %f16, %f15, %f14; abs.f32 %f17, %f16; max.f32 %f18, %f60, %f17; add.s64 %rd18, %rd16, %rd10; add.s64 %rd19, %rd17, %rd10; ld.global.nc.f32 %f19, [%rd19]; ld.global.nc.f32 %f20, [%rd18]; sub.f32 %f21, 
%f20, %f19; abs.f32 %f22, %f21; max.f32 %f23, %f18, %f22; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd20, %rd18, %rd10; add.s64 %rd21, %rd19, %rd10; ld.global.nc.f32 %f24, [%rd21]; ld.global.nc.f32 %f25, [%rd20]; sub.f32 %f26, %f25, %f24; abs.f32 %f27, %f26; max.f32 %f28, %f23, %f27; add.s32 %r27, %r26, %r4; add.s64 %rd22, %rd20, %rd10; add.s64 %rd23, %rd21, %rd10; ld.global.nc.f32 %f29, [%rd23]; ld.global.nc.f32 %f30, [%rd22]; sub.f32 %f31, %f30, %f29; abs.f32 %f32, %f31; max.f32 %f60, %f28, %f32; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ13reducemaxdiffE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f60; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f33, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f34, [%r31]; max.f32 %f35, %f33, %f34; st.shared.f32 [%r14], %f35; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f36, [%r14]; ld.volatile.shared.f32 %f37, [%r14+128]; max.f32 %f38, %f36, %f37; st.volatile.shared.f32 [%r14], %f38; ld.volatile.shared.f32 %f39, [%r14+64]; ld.volatile.shared.f32 %f40, [%r14]; max.f32 %f41, %f40, %f39; st.volatile.shared.f32 [%r14], %f41; ld.volatile.shared.f32 %f42, [%r14+32]; ld.volatile.shared.f32 %f43, [%r14]; max.f32 %f44, %f43, %f42; st.volatile.shared.f32 [%r14], %f44; ld.volatile.shared.f32 %f45, [%r14+16]; ld.volatile.shared.f32 %f46, [%r14]; max.f32 %f47, %f46, %f45; st.volatile.shared.f32 [%r14], %f47; ld.volatile.shared.f32 %f48, [%r14+8]; ld.volatile.shared.f32 %f49, [%r14]; max.f32 %f50, %f49, %f48; st.volatile.shared.f32 [%r14], %f50; ld.volatile.shared.f32 %f51, [%r14+4]; ld.volatile.shared.f32 %f52, [%r14]; max.f32 %f53, %f52, %f51; 
st.volatile.shared.f32 [%r14], %f53; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f54, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f55, %f54; mov.b32 %r32, %f55; cvta.to.global.u64 %rd24, %rd11; atom.global.max.s32 %r33, [%rd24], %r32; $L__BB0_15: ret; } ` ) 3-3.11.1/cuda/reducemaxvecdiff2.cu000066400000000000000000000007521503346766200166120ustar00rootroot00000000000000#include "reduce.h" #include "atomicf.h" #include "float3.h" #define load_vecdiff2(i) \ pow2(x1[i] - x2[i]) + \ pow2(y1[i] - y2[i]) + \ pow2(z1[i] - z2[i]) \ extern "C" __global__ void reducemaxvecdiff2(float* __restrict__ x1, float* __restrict__ y1, float* __restrict__ z1, float* __restrict__ x2, float* __restrict__ y2, float* __restrict__ z2, float* __restrict__ dst, float initVal, int n) { reduce(load_vecdiff2, fmax, atomicFmaxabs) } 3-3.11.1/cuda/reducemaxvecdiff2_wrapper.go000066400000000000000000003200341503346766200203460ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for reducemaxvecdiff2 kernel var reducemaxvecdiff2_code cu.Function // Stores the arguments for reducemaxvecdiff2 kernel invocation type reducemaxvecdiff2_args_t struct { arg_x1 unsafe.Pointer arg_y1 unsafe.Pointer arg_z1 unsafe.Pointer arg_x2 unsafe.Pointer arg_y2 unsafe.Pointer arg_z2 unsafe.Pointer arg_dst unsafe.Pointer arg_initVal float32 arg_n int argptr [9]unsafe.Pointer sync.Mutex } // Stores the arguments for reducemaxvecdiff2 kernel invocation var reducemaxvecdiff2_args reducemaxvecdiff2_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
reducemaxvecdiff2_args.argptr[0] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_x1) reducemaxvecdiff2_args.argptr[1] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_y1) reducemaxvecdiff2_args.argptr[2] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_z1) reducemaxvecdiff2_args.argptr[3] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_x2) reducemaxvecdiff2_args.argptr[4] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_y2) reducemaxvecdiff2_args.argptr[5] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_z2) reducemaxvecdiff2_args.argptr[6] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_dst) reducemaxvecdiff2_args.argptr[7] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_initVal) reducemaxvecdiff2_args.argptr[8] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_n) } // Wrapper for reducemaxvecdiff2 CUDA kernel, asynchronous. func k_reducemaxvecdiff2_async(x1 unsafe.Pointer, y1 unsafe.Pointer, z1 unsafe.Pointer, x2 unsafe.Pointer, y2 unsafe.Pointer, z2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { if Synchronous { // debug Sync() timer.Start("reducemaxvecdiff2") } reducemaxvecdiff2_args.Lock() defer reducemaxvecdiff2_args.Unlock() if reducemaxvecdiff2_code == 0 { reducemaxvecdiff2_code = fatbinLoad(reducemaxvecdiff2_map, "reducemaxvecdiff2") } reducemaxvecdiff2_args.arg_x1 = x1 reducemaxvecdiff2_args.arg_y1 = y1 reducemaxvecdiff2_args.arg_z1 = z1 reducemaxvecdiff2_args.arg_x2 = x2 reducemaxvecdiff2_args.arg_y2 = y2 reducemaxvecdiff2_args.arg_z2 = z2 reducemaxvecdiff2_args.arg_dst = dst reducemaxvecdiff2_args.arg_initVal = initVal reducemaxvecdiff2_args.arg_n = n args := reducemaxvecdiff2_args.argptr[:] cu.LaunchKernel(reducemaxvecdiff2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("reducemaxvecdiff2") } } // maps compute capability on PTX code for reducemaxvecdiff2 kernel. 
var reducemaxvecdiff2_map = map[int]string{0: "", 50: reducemaxvecdiff2_ptx_50, 52: reducemaxvecdiff2_ptx_52, 53: reducemaxvecdiff2_ptx_53, 60: reducemaxvecdiff2_ptx_60, 61: reducemaxvecdiff2_ptx_61, 62: reducemaxvecdiff2_ptx_62, 70: reducemaxvecdiff2_ptx_70, 72: reducemaxvecdiff2_ptx_72, 75: reducemaxvecdiff2_ptx_75, 80: reducemaxvecdiff2_ptx_80, 86: reducemaxvecdiff2_ptx_86, 87: reducemaxvecdiff2_ptx_87, 89: reducemaxvecdiff2_ptx_89, 90: reducemaxvecdiff2_ptx_90} // reducemaxvecdiff2 PTX code for various compute capabilities. const ( reducemaxvecdiff2_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<11>; .reg .f32 %f<101>; .reg .b32 %r<39>; .reg .b64 %rd<70>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd31, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd27, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd32, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd28, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd29, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd33, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd30, [reducemaxvecdiff2_param_6]; ld.param.f32 %f100, [reducemaxvecdiff2_param_7]; ld.param.u32 %r17, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd33; cvta.to.global.u64 %rd2, %rd32; cvta.to.global.u64 %rd3, %rd31; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; 
add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd34, %r36, 4; add.s64 %rd69, %rd1, %rd34; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd68, %rd2, %rd34; cvta.to.global.u64 %rd35, %rd29; add.s64 %rd67, %rd35, %rd34; cvta.to.global.u64 %rd36, %rd27; add.s64 %rd66, %rd36, %rd34; cvta.to.global.u64 %rd37, %rd28; add.s64 %rd65, %rd37, %rd34; add.s64 %rd64, %rd3, %rd34; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd65]; ld.global.nc.f32 %f11, [%rd64]; sub.f32 %f12, %f11, %f10; ld.global.nc.f32 %f13, [%rd67]; ld.global.nc.f32 %f14, [%rd66]; sub.f32 %f15, %f14, %f13; mul.f32 %f16, %f15, %f15; fma.rn.f32 %f17, %f12, %f12, %f16; ld.global.nc.f32 %f18, [%rd69]; ld.global.nc.f32 %f19, [%rd68]; sub.f32 %f20, %f19, %f18; fma.rn.f32 %f21, %f20, %f20, %f17; max.f32 %f100, %f100, %f21; add.s32 %r36, %r36, %r4; add.s64 %rd69, %rd69, %rd5; add.s64 %rd68, %rd68, %rd5; add.s64 %rd67, %rd67, %rd5; add.s64 %rd66, %rd66, %rd5; add.s64 %rd65, %rd65, %rd5; add.s64 %rd64, %rd64, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd23, %r4, 4; cvta.to.global.u64 %rd24, %rd28; cvta.to.global.u64 %rd25, %rd27; cvta.to.global.u64 %rd26, %rd29; $L__BB0_6: mul.wide.s32 %rd38, %r36, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd24, %rd38; ld.global.nc.f32 %f22, [%rd40]; ld.global.nc.f32 %f23, [%rd39]; sub.f32 %f24, %f23, %f22; add.s64 %rd41, %rd25, %rd38; add.s64 %rd42, %rd26, %rd38; ld.global.nc.f32 %f25, [%rd42]; ld.global.nc.f32 %f26, [%rd41]; sub.f32 %f27, %f26, %f25; mul.f32 %f28, %f27, %f27; fma.rn.f32 %f29, %f24, %f24, %f28; add.s64 %rd43, %rd2, %rd38; add.s64 %rd44, %rd1, %rd38; ld.global.nc.f32 %f30, [%rd44]; ld.global.nc.f32 %f31, [%rd43]; sub.f32 %f32, %f31, %f30; fma.rn.f32 %f33, %f32, %f32, %f29; max.f32 %f34, %f100, %f33; add.s64 %rd45, %rd39, %rd23; add.s64 %rd46, %rd40, %rd23; 
ld.global.nc.f32 %f35, [%rd46]; ld.global.nc.f32 %f36, [%rd45]; sub.f32 %f37, %f36, %f35; add.s64 %rd47, %rd41, %rd23; add.s64 %rd48, %rd42, %rd23; ld.global.nc.f32 %f38, [%rd48]; ld.global.nc.f32 %f39, [%rd47]; sub.f32 %f40, %f39, %f38; mul.f32 %f41, %f40, %f40; fma.rn.f32 %f42, %f37, %f37, %f41; add.s64 %rd49, %rd43, %rd23; add.s64 %rd50, %rd44, %rd23; ld.global.nc.f32 %f43, [%rd50]; ld.global.nc.f32 %f44, [%rd49]; sub.f32 %f45, %f44, %f43; fma.rn.f32 %f46, %f45, %f45, %f42; max.f32 %f47, %f34, %f46; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd51, %rd45, %rd23; add.s64 %rd52, %rd46, %rd23; ld.global.nc.f32 %f48, [%rd52]; ld.global.nc.f32 %f49, [%rd51]; sub.f32 %f50, %f49, %f48; add.s64 %rd53, %rd47, %rd23; add.s64 %rd54, %rd48, %rd23; ld.global.nc.f32 %f51, [%rd54]; ld.global.nc.f32 %f52, [%rd53]; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, %f53; fma.rn.f32 %f55, %f50, %f50, %f54; add.s64 %rd55, %rd49, %rd23; add.s64 %rd56, %rd50, %rd23; ld.global.nc.f32 %f56, [%rd56]; ld.global.nc.f32 %f57, [%rd55]; sub.f32 %f58, %f57, %f56; fma.rn.f32 %f59, %f58, %f58, %f55; max.f32 %f60, %f47, %f59; add.s32 %r27, %r26, %r4; add.s64 %rd57, %rd51, %rd23; add.s64 %rd58, %rd52, %rd23; ld.global.nc.f32 %f61, [%rd58]; ld.global.nc.f32 %f62, [%rd57]; sub.f32 %f63, %f62, %f61; add.s64 %rd59, %rd53, %rd23; add.s64 %rd60, %rd54, %rd23; ld.global.nc.f32 %f64, [%rd60]; ld.global.nc.f32 %f65, [%rd59]; sub.f32 %f66, %f65, %f64; mul.f32 %f67, %f66, %f66; fma.rn.f32 %f68, %f63, %f63, %f67; add.s64 %rd61, %rd55, %rd23; add.s64 %rd62, %rd56, %rd23; ld.global.nc.f32 %f69, [%rd62]; ld.global.nc.f32 %f70, [%rd61]; sub.f32 %f71, %f70, %f69; fma.rn.f32 %f72, %f71, %f71, %f68; max.f32 %f100, %f60, %f72; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f100; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: 
shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f73, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f74, [%r31]; max.f32 %f75, %f73, %f74; st.shared.f32 [%r14], %f75; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f76, [%r14]; ld.volatile.shared.f32 %f77, [%r14+128]; max.f32 %f78, %f76, %f77; st.volatile.shared.f32 [%r14], %f78; ld.volatile.shared.f32 %f79, [%r14+64]; ld.volatile.shared.f32 %f80, [%r14]; max.f32 %f81, %f80, %f79; st.volatile.shared.f32 [%r14], %f81; ld.volatile.shared.f32 %f82, [%r14+32]; ld.volatile.shared.f32 %f83, [%r14]; max.f32 %f84, %f83, %f82; st.volatile.shared.f32 [%r14], %f84; ld.volatile.shared.f32 %f85, [%r14+16]; ld.volatile.shared.f32 %f86, [%r14]; max.f32 %f87, %f86, %f85; st.volatile.shared.f32 [%r14], %f87; ld.volatile.shared.f32 %f88, [%r14+8]; ld.volatile.shared.f32 %f89, [%r14]; max.f32 %f90, %f89, %f88; st.volatile.shared.f32 [%r14], %f90; ld.volatile.shared.f32 %f91, [%r14+4]; ld.volatile.shared.f32 %f92, [%r14]; max.f32 %f93, %f92, %f91; st.volatile.shared.f32 [%r14], %f93; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f94, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f95, %f94; cvta.to.global.u64 %rd63, %rd30; mov.b32 %r32, %f95; atom.global.max.s32 %r33, [%rd63], %r32; $L__BB0_15: ret; } ` reducemaxvecdiff2_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<11>; .reg .f32 %f<101>; 
.reg .b32 %r<39>; .reg .b64 %rd<70>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd31, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd27, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd32, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd28, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd29, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd33, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd30, [reducemaxvecdiff2_param_6]; ld.param.f32 %f100, [reducemaxvecdiff2_param_7]; ld.param.u32 %r17, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd33; cvta.to.global.u64 %rd2, %rd32; cvta.to.global.u64 %rd3, %rd31; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd34, %r36, 4; add.s64 %rd69, %rd1, %rd34; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd68, %rd2, %rd34; cvta.to.global.u64 %rd35, %rd29; add.s64 %rd67, %rd35, %rd34; cvta.to.global.u64 %rd36, %rd27; add.s64 %rd66, %rd36, %rd34; cvta.to.global.u64 %rd37, %rd28; add.s64 %rd65, %rd37, %rd34; add.s64 %rd64, %rd3, %rd34; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd65]; ld.global.nc.f32 %f11, [%rd64]; sub.f32 %f12, %f11, %f10; ld.global.nc.f32 %f13, [%rd67]; ld.global.nc.f32 %f14, [%rd66]; sub.f32 %f15, %f14, %f13; mul.f32 %f16, %f15, %f15; fma.rn.f32 %f17, %f12, %f12, %f16; ld.global.nc.f32 %f18, [%rd69]; ld.global.nc.f32 %f19, [%rd68]; sub.f32 %f20, %f19, %f18; fma.rn.f32 %f21, %f20, %f20, %f17; max.f32 %f100, %f100, %f21; add.s32 %r36, %r36, %r4; add.s64 %rd69, %rd69, %rd5; add.s64 %rd68, %rd68, %rd5; add.s64 %rd67, %rd67, %rd5; add.s64 %rd66, %rd66, %rd5; add.s64 %rd65, %rd65, %rd5; add.s64 %rd64, 
%rd64, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd23, %r4, 4; cvta.to.global.u64 %rd24, %rd28; cvta.to.global.u64 %rd25, %rd27; cvta.to.global.u64 %rd26, %rd29; $L__BB0_6: mul.wide.s32 %rd38, %r36, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd24, %rd38; ld.global.nc.f32 %f22, [%rd40]; ld.global.nc.f32 %f23, [%rd39]; sub.f32 %f24, %f23, %f22; add.s64 %rd41, %rd25, %rd38; add.s64 %rd42, %rd26, %rd38; ld.global.nc.f32 %f25, [%rd42]; ld.global.nc.f32 %f26, [%rd41]; sub.f32 %f27, %f26, %f25; mul.f32 %f28, %f27, %f27; fma.rn.f32 %f29, %f24, %f24, %f28; add.s64 %rd43, %rd2, %rd38; add.s64 %rd44, %rd1, %rd38; ld.global.nc.f32 %f30, [%rd44]; ld.global.nc.f32 %f31, [%rd43]; sub.f32 %f32, %f31, %f30; fma.rn.f32 %f33, %f32, %f32, %f29; max.f32 %f34, %f100, %f33; add.s64 %rd45, %rd39, %rd23; add.s64 %rd46, %rd40, %rd23; ld.global.nc.f32 %f35, [%rd46]; ld.global.nc.f32 %f36, [%rd45]; sub.f32 %f37, %f36, %f35; add.s64 %rd47, %rd41, %rd23; add.s64 %rd48, %rd42, %rd23; ld.global.nc.f32 %f38, [%rd48]; ld.global.nc.f32 %f39, [%rd47]; sub.f32 %f40, %f39, %f38; mul.f32 %f41, %f40, %f40; fma.rn.f32 %f42, %f37, %f37, %f41; add.s64 %rd49, %rd43, %rd23; add.s64 %rd50, %rd44, %rd23; ld.global.nc.f32 %f43, [%rd50]; ld.global.nc.f32 %f44, [%rd49]; sub.f32 %f45, %f44, %f43; fma.rn.f32 %f46, %f45, %f45, %f42; max.f32 %f47, %f34, %f46; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd51, %rd45, %rd23; add.s64 %rd52, %rd46, %rd23; ld.global.nc.f32 %f48, [%rd52]; ld.global.nc.f32 %f49, [%rd51]; sub.f32 %f50, %f49, %f48; add.s64 %rd53, %rd47, %rd23; add.s64 %rd54, %rd48, %rd23; ld.global.nc.f32 %f51, [%rd54]; ld.global.nc.f32 %f52, [%rd53]; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, %f53; fma.rn.f32 %f55, %f50, %f50, %f54; add.s64 %rd55, %rd49, %rd23; add.s64 %rd56, %rd50, %rd23; ld.global.nc.f32 %f56, [%rd56]; ld.global.nc.f32 %f57, [%rd55]; sub.f32 %f58, %f57, %f56; 
fma.rn.f32 %f59, %f58, %f58, %f55; max.f32 %f60, %f47, %f59; add.s32 %r27, %r26, %r4; add.s64 %rd57, %rd51, %rd23; add.s64 %rd58, %rd52, %rd23; ld.global.nc.f32 %f61, [%rd58]; ld.global.nc.f32 %f62, [%rd57]; sub.f32 %f63, %f62, %f61; add.s64 %rd59, %rd53, %rd23; add.s64 %rd60, %rd54, %rd23; ld.global.nc.f32 %f64, [%rd60]; ld.global.nc.f32 %f65, [%rd59]; sub.f32 %f66, %f65, %f64; mul.f32 %f67, %f66, %f66; fma.rn.f32 %f68, %f63, %f63, %f67; add.s64 %rd61, %rd55, %rd23; add.s64 %rd62, %rd56, %rd23; ld.global.nc.f32 %f69, [%rd62]; ld.global.nc.f32 %f70, [%rd61]; sub.f32 %f71, %f70, %f69; fma.rn.f32 %f72, %f71, %f71, %f68; max.f32 %f100, %f60, %f72; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f100; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f73, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f74, [%r31]; max.f32 %f75, %f73, %f74; st.shared.f32 [%r14], %f75; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f76, [%r14]; ld.volatile.shared.f32 %f77, [%r14+128]; max.f32 %f78, %f76, %f77; st.volatile.shared.f32 [%r14], %f78; ld.volatile.shared.f32 %f79, [%r14+64]; ld.volatile.shared.f32 %f80, [%r14]; max.f32 %f81, %f80, %f79; st.volatile.shared.f32 [%r14], %f81; ld.volatile.shared.f32 %f82, [%r14+32]; ld.volatile.shared.f32 %f83, [%r14]; max.f32 %f84, %f83, %f82; st.volatile.shared.f32 [%r14], %f84; ld.volatile.shared.f32 %f85, [%r14+16]; ld.volatile.shared.f32 %f86, [%r14]; max.f32 %f87, %f86, %f85; st.volatile.shared.f32 [%r14], %f87; ld.volatile.shared.f32 %f88, [%r14+8]; ld.volatile.shared.f32 %f89, [%r14]; max.f32 %f90, %f89, %f88; st.volatile.shared.f32 
[%r14], %f90; ld.volatile.shared.f32 %f91, [%r14+4]; ld.volatile.shared.f32 %f92, [%r14]; max.f32 %f93, %f92, %f91; st.volatile.shared.f32 [%r14], %f93; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f94, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f95, %f94; cvta.to.global.u64 %rd63, %rd30; mov.b32 %r32, %f95; atom.global.max.s32 %r33, [%rd63], %r32; $L__BB0_15: ret; } ` reducemaxvecdiff2_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<11>; .reg .f32 %f<101>; .reg .b32 %r<39>; .reg .b64 %rd<70>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd31, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd27, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd32, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd28, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd29, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd33, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd30, [reducemaxvecdiff2_param_6]; ld.param.f32 %f100, [reducemaxvecdiff2_param_7]; ld.param.u32 %r17, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd33; cvta.to.global.u64 %rd2, %rd32; cvta.to.global.u64 %rd3, %rd31; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; 
mul.wide.s32 %rd34, %r36, 4; add.s64 %rd69, %rd1, %rd34; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd68, %rd2, %rd34; cvta.to.global.u64 %rd35, %rd29; add.s64 %rd67, %rd35, %rd34; cvta.to.global.u64 %rd36, %rd27; add.s64 %rd66, %rd36, %rd34; cvta.to.global.u64 %rd37, %rd28; add.s64 %rd65, %rd37, %rd34; add.s64 %rd64, %rd3, %rd34; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd65]; ld.global.nc.f32 %f11, [%rd64]; sub.f32 %f12, %f11, %f10; ld.global.nc.f32 %f13, [%rd67]; ld.global.nc.f32 %f14, [%rd66]; sub.f32 %f15, %f14, %f13; mul.f32 %f16, %f15, %f15; fma.rn.f32 %f17, %f12, %f12, %f16; ld.global.nc.f32 %f18, [%rd69]; ld.global.nc.f32 %f19, [%rd68]; sub.f32 %f20, %f19, %f18; fma.rn.f32 %f21, %f20, %f20, %f17; max.f32 %f100, %f100, %f21; add.s32 %r36, %r36, %r4; add.s64 %rd69, %rd69, %rd5; add.s64 %rd68, %rd68, %rd5; add.s64 %rd67, %rd67, %rd5; add.s64 %rd66, %rd66, %rd5; add.s64 %rd65, %rd65, %rd5; add.s64 %rd64, %rd64, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd23, %r4, 4; cvta.to.global.u64 %rd24, %rd28; cvta.to.global.u64 %rd25, %rd27; cvta.to.global.u64 %rd26, %rd29; $L__BB0_6: mul.wide.s32 %rd38, %r36, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd24, %rd38; ld.global.nc.f32 %f22, [%rd40]; ld.global.nc.f32 %f23, [%rd39]; sub.f32 %f24, %f23, %f22; add.s64 %rd41, %rd25, %rd38; add.s64 %rd42, %rd26, %rd38; ld.global.nc.f32 %f25, [%rd42]; ld.global.nc.f32 %f26, [%rd41]; sub.f32 %f27, %f26, %f25; mul.f32 %f28, %f27, %f27; fma.rn.f32 %f29, %f24, %f24, %f28; add.s64 %rd43, %rd2, %rd38; add.s64 %rd44, %rd1, %rd38; ld.global.nc.f32 %f30, [%rd44]; ld.global.nc.f32 %f31, [%rd43]; sub.f32 %f32, %f31, %f30; fma.rn.f32 %f33, %f32, %f32, %f29; max.f32 %f34, %f100, %f33; add.s64 %rd45, %rd39, %rd23; add.s64 %rd46, %rd40, %rd23; ld.global.nc.f32 %f35, [%rd46]; ld.global.nc.f32 %f36, [%rd45]; sub.f32 %f37, %f36, %f35; add.s64 %rd47, %rd41, %rd23; add.s64 %rd48, %rd42, 
%rd23; ld.global.nc.f32 %f38, [%rd48]; ld.global.nc.f32 %f39, [%rd47]; sub.f32 %f40, %f39, %f38; mul.f32 %f41, %f40, %f40; fma.rn.f32 %f42, %f37, %f37, %f41; add.s64 %rd49, %rd43, %rd23; add.s64 %rd50, %rd44, %rd23; ld.global.nc.f32 %f43, [%rd50]; ld.global.nc.f32 %f44, [%rd49]; sub.f32 %f45, %f44, %f43; fma.rn.f32 %f46, %f45, %f45, %f42; max.f32 %f47, %f34, %f46; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd51, %rd45, %rd23; add.s64 %rd52, %rd46, %rd23; ld.global.nc.f32 %f48, [%rd52]; ld.global.nc.f32 %f49, [%rd51]; sub.f32 %f50, %f49, %f48; add.s64 %rd53, %rd47, %rd23; add.s64 %rd54, %rd48, %rd23; ld.global.nc.f32 %f51, [%rd54]; ld.global.nc.f32 %f52, [%rd53]; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, %f53; fma.rn.f32 %f55, %f50, %f50, %f54; add.s64 %rd55, %rd49, %rd23; add.s64 %rd56, %rd50, %rd23; ld.global.nc.f32 %f56, [%rd56]; ld.global.nc.f32 %f57, [%rd55]; sub.f32 %f58, %f57, %f56; fma.rn.f32 %f59, %f58, %f58, %f55; max.f32 %f60, %f47, %f59; add.s32 %r27, %r26, %r4; add.s64 %rd57, %rd51, %rd23; add.s64 %rd58, %rd52, %rd23; ld.global.nc.f32 %f61, [%rd58]; ld.global.nc.f32 %f62, [%rd57]; sub.f32 %f63, %f62, %f61; add.s64 %rd59, %rd53, %rd23; add.s64 %rd60, %rd54, %rd23; ld.global.nc.f32 %f64, [%rd60]; ld.global.nc.f32 %f65, [%rd59]; sub.f32 %f66, %f65, %f64; mul.f32 %f67, %f66, %f66; fma.rn.f32 %f68, %f63, %f63, %f67; add.s64 %rd61, %rd55, %rd23; add.s64 %rd62, %rd56, %rd23; ld.global.nc.f32 %f69, [%rd62]; ld.global.nc.f32 %f70, [%rd61]; sub.f32 %f71, %f70, %f69; fma.rn.f32 %f72, %f71, %f71, %f68; max.f32 %f100, %f60, %f72; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f100; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f73, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, 
%r30; ld.shared.f32 %f74, [%r31]; max.f32 %f75, %f73, %f74; st.shared.f32 [%r14], %f75; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f76, [%r14]; ld.volatile.shared.f32 %f77, [%r14+128]; max.f32 %f78, %f76, %f77; st.volatile.shared.f32 [%r14], %f78; ld.volatile.shared.f32 %f79, [%r14+64]; ld.volatile.shared.f32 %f80, [%r14]; max.f32 %f81, %f80, %f79; st.volatile.shared.f32 [%r14], %f81; ld.volatile.shared.f32 %f82, [%r14+32]; ld.volatile.shared.f32 %f83, [%r14]; max.f32 %f84, %f83, %f82; st.volatile.shared.f32 [%r14], %f84; ld.volatile.shared.f32 %f85, [%r14+16]; ld.volatile.shared.f32 %f86, [%r14]; max.f32 %f87, %f86, %f85; st.volatile.shared.f32 [%r14], %f87; ld.volatile.shared.f32 %f88, [%r14+8]; ld.volatile.shared.f32 %f89, [%r14]; max.f32 %f90, %f89, %f88; st.volatile.shared.f32 [%r14], %f90; ld.volatile.shared.f32 %f91, [%r14+4]; ld.volatile.shared.f32 %f92, [%r14]; max.f32 %f93, %f92, %f91; st.volatile.shared.f32 [%r14], %f93; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f94, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f95, %f94; cvta.to.global.u64 %rd63, %rd30; mov.b32 %r32, %f95; atom.global.max.s32 %r33, [%rd63], %r32; $L__BB0_15: ret; } ` reducemaxvecdiff2_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<11>; .reg .f32 %f<101>; .reg .b32 %r<39>; .reg .b64 %rd<70>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd31, 
[reducemaxvecdiff2_param_0]; ld.param.u64 %rd27, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd32, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd28, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd29, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd33, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd30, [reducemaxvecdiff2_param_6]; ld.param.f32 %f100, [reducemaxvecdiff2_param_7]; ld.param.u32 %r17, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd33; cvta.to.global.u64 %rd2, %rd32; cvta.to.global.u64 %rd3, %rd31; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd34, %r36, 4; add.s64 %rd69, %rd1, %rd34; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd68, %rd2, %rd34; cvta.to.global.u64 %rd35, %rd29; add.s64 %rd67, %rd35, %rd34; cvta.to.global.u64 %rd36, %rd27; add.s64 %rd66, %rd36, %rd34; cvta.to.global.u64 %rd37, %rd28; add.s64 %rd65, %rd37, %rd34; add.s64 %rd64, %rd3, %rd34; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd65]; ld.global.nc.f32 %f11, [%rd64]; sub.f32 %f12, %f11, %f10; ld.global.nc.f32 %f13, [%rd67]; ld.global.nc.f32 %f14, [%rd66]; sub.f32 %f15, %f14, %f13; mul.f32 %f16, %f15, %f15; fma.rn.f32 %f17, %f12, %f12, %f16; ld.global.nc.f32 %f18, [%rd69]; ld.global.nc.f32 %f19, [%rd68]; sub.f32 %f20, %f19, %f18; fma.rn.f32 %f21, %f20, %f20, %f17; max.f32 %f100, %f100, %f21; add.s32 %r36, %r36, %r4; add.s64 %rd69, %rd69, %rd5; add.s64 %rd68, %rd68, %rd5; add.s64 %rd67, %rd67, %rd5; add.s64 %rd66, %rd66, %rd5; add.s64 %rd65, %rd65, %rd5; add.s64 %rd64, %rd64, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra 
$L__BB0_7; mul.wide.s32 %rd23, %r4, 4; cvta.to.global.u64 %rd24, %rd28; cvta.to.global.u64 %rd25, %rd27; cvta.to.global.u64 %rd26, %rd29; $L__BB0_6: mul.wide.s32 %rd38, %r36, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd24, %rd38; ld.global.nc.f32 %f22, [%rd40]; ld.global.nc.f32 %f23, [%rd39]; sub.f32 %f24, %f23, %f22; add.s64 %rd41, %rd25, %rd38; add.s64 %rd42, %rd26, %rd38; ld.global.nc.f32 %f25, [%rd42]; ld.global.nc.f32 %f26, [%rd41]; sub.f32 %f27, %f26, %f25; mul.f32 %f28, %f27, %f27; fma.rn.f32 %f29, %f24, %f24, %f28; add.s64 %rd43, %rd2, %rd38; add.s64 %rd44, %rd1, %rd38; ld.global.nc.f32 %f30, [%rd44]; ld.global.nc.f32 %f31, [%rd43]; sub.f32 %f32, %f31, %f30; fma.rn.f32 %f33, %f32, %f32, %f29; max.f32 %f34, %f100, %f33; add.s64 %rd45, %rd39, %rd23; add.s64 %rd46, %rd40, %rd23; ld.global.nc.f32 %f35, [%rd46]; ld.global.nc.f32 %f36, [%rd45]; sub.f32 %f37, %f36, %f35; add.s64 %rd47, %rd41, %rd23; add.s64 %rd48, %rd42, %rd23; ld.global.nc.f32 %f38, [%rd48]; ld.global.nc.f32 %f39, [%rd47]; sub.f32 %f40, %f39, %f38; mul.f32 %f41, %f40, %f40; fma.rn.f32 %f42, %f37, %f37, %f41; add.s64 %rd49, %rd43, %rd23; add.s64 %rd50, %rd44, %rd23; ld.global.nc.f32 %f43, [%rd50]; ld.global.nc.f32 %f44, [%rd49]; sub.f32 %f45, %f44, %f43; fma.rn.f32 %f46, %f45, %f45, %f42; max.f32 %f47, %f34, %f46; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd51, %rd45, %rd23; add.s64 %rd52, %rd46, %rd23; ld.global.nc.f32 %f48, [%rd52]; ld.global.nc.f32 %f49, [%rd51]; sub.f32 %f50, %f49, %f48; add.s64 %rd53, %rd47, %rd23; add.s64 %rd54, %rd48, %rd23; ld.global.nc.f32 %f51, [%rd54]; ld.global.nc.f32 %f52, [%rd53]; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, %f53; fma.rn.f32 %f55, %f50, %f50, %f54; add.s64 %rd55, %rd49, %rd23; add.s64 %rd56, %rd50, %rd23; ld.global.nc.f32 %f56, [%rd56]; ld.global.nc.f32 %f57, [%rd55]; sub.f32 %f58, %f57, %f56; fma.rn.f32 %f59, %f58, %f58, %f55; max.f32 %f60, %f47, %f59; add.s32 %r27, %r26, %r4; add.s64 %rd57, %rd51, %rd23; add.s64 %rd58, %rd52, 
%rd23; ld.global.nc.f32 %f61, [%rd58]; ld.global.nc.f32 %f62, [%rd57]; sub.f32 %f63, %f62, %f61; add.s64 %rd59, %rd53, %rd23; add.s64 %rd60, %rd54, %rd23; ld.global.nc.f32 %f64, [%rd60]; ld.global.nc.f32 %f65, [%rd59]; sub.f32 %f66, %f65, %f64; mul.f32 %f67, %f66, %f66; fma.rn.f32 %f68, %f63, %f63, %f67; add.s64 %rd61, %rd55, %rd23; add.s64 %rd62, %rd56, %rd23; ld.global.nc.f32 %f69, [%rd62]; ld.global.nc.f32 %f70, [%rd61]; sub.f32 %f71, %f70, %f69; fma.rn.f32 %f72, %f71, %f71, %f68; max.f32 %f100, %f60, %f72; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f100; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f73, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f74, [%r31]; max.f32 %f75, %f73, %f74; st.shared.f32 [%r14], %f75; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f76, [%r14]; ld.volatile.shared.f32 %f77, [%r14+128]; max.f32 %f78, %f76, %f77; st.volatile.shared.f32 [%r14], %f78; ld.volatile.shared.f32 %f79, [%r14+64]; ld.volatile.shared.f32 %f80, [%r14]; max.f32 %f81, %f80, %f79; st.volatile.shared.f32 [%r14], %f81; ld.volatile.shared.f32 %f82, [%r14+32]; ld.volatile.shared.f32 %f83, [%r14]; max.f32 %f84, %f83, %f82; st.volatile.shared.f32 [%r14], %f84; ld.volatile.shared.f32 %f85, [%r14+16]; ld.volatile.shared.f32 %f86, [%r14]; max.f32 %f87, %f86, %f85; st.volatile.shared.f32 [%r14], %f87; ld.volatile.shared.f32 %f88, [%r14+8]; ld.volatile.shared.f32 %f89, [%r14]; max.f32 %f90, %f89, %f88; st.volatile.shared.f32 [%r14], %f90; ld.volatile.shared.f32 %f91, [%r14+4]; ld.volatile.shared.f32 %f92, [%r14]; max.f32 %f93, %f92, %f91; 
st.volatile.shared.f32 [%r14], %f93; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f94, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f95, %f94; cvta.to.global.u64 %rd63, %rd30; mov.b32 %r32, %f95; atom.global.max.s32 %r33, [%rd63], %r32; $L__BB0_15: ret; } ` reducemaxvecdiff2_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<11>; .reg .f32 %f<101>; .reg .b32 %r<39>; .reg .b64 %rd<70>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd31, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd27, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd32, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd28, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd29, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd33, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd30, [reducemaxvecdiff2_param_6]; ld.param.f32 %f100, [reducemaxvecdiff2_param_7]; ld.param.u32 %r17, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd33; cvta.to.global.u64 %rd2, %rd32; cvta.to.global.u64 %rd3, %rd31; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd34, %r36, 4; add.s64 %rd69, %rd1, %rd34; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd68, %rd2, %rd34; 
cvta.to.global.u64 %rd35, %rd29; add.s64 %rd67, %rd35, %rd34; cvta.to.global.u64 %rd36, %rd27; add.s64 %rd66, %rd36, %rd34; cvta.to.global.u64 %rd37, %rd28; add.s64 %rd65, %rd37, %rd34; add.s64 %rd64, %rd3, %rd34; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd65]; ld.global.nc.f32 %f11, [%rd64]; sub.f32 %f12, %f11, %f10; ld.global.nc.f32 %f13, [%rd67]; ld.global.nc.f32 %f14, [%rd66]; sub.f32 %f15, %f14, %f13; mul.f32 %f16, %f15, %f15; fma.rn.f32 %f17, %f12, %f12, %f16; ld.global.nc.f32 %f18, [%rd69]; ld.global.nc.f32 %f19, [%rd68]; sub.f32 %f20, %f19, %f18; fma.rn.f32 %f21, %f20, %f20, %f17; max.f32 %f100, %f100, %f21; add.s32 %r36, %r36, %r4; add.s64 %rd69, %rd69, %rd5; add.s64 %rd68, %rd68, %rd5; add.s64 %rd67, %rd67, %rd5; add.s64 %rd66, %rd66, %rd5; add.s64 %rd65, %rd65, %rd5; add.s64 %rd64, %rd64, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd23, %r4, 4; cvta.to.global.u64 %rd24, %rd28; cvta.to.global.u64 %rd25, %rd27; cvta.to.global.u64 %rd26, %rd29; $L__BB0_6: mul.wide.s32 %rd38, %r36, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd24, %rd38; ld.global.nc.f32 %f22, [%rd40]; ld.global.nc.f32 %f23, [%rd39]; sub.f32 %f24, %f23, %f22; add.s64 %rd41, %rd25, %rd38; add.s64 %rd42, %rd26, %rd38; ld.global.nc.f32 %f25, [%rd42]; ld.global.nc.f32 %f26, [%rd41]; sub.f32 %f27, %f26, %f25; mul.f32 %f28, %f27, %f27; fma.rn.f32 %f29, %f24, %f24, %f28; add.s64 %rd43, %rd2, %rd38; add.s64 %rd44, %rd1, %rd38; ld.global.nc.f32 %f30, [%rd44]; ld.global.nc.f32 %f31, [%rd43]; sub.f32 %f32, %f31, %f30; fma.rn.f32 %f33, %f32, %f32, %f29; max.f32 %f34, %f100, %f33; add.s64 %rd45, %rd39, %rd23; add.s64 %rd46, %rd40, %rd23; ld.global.nc.f32 %f35, [%rd46]; ld.global.nc.f32 %f36, [%rd45]; sub.f32 %f37, %f36, %f35; add.s64 %rd47, %rd41, %rd23; add.s64 %rd48, %rd42, %rd23; ld.global.nc.f32 %f38, [%rd48]; ld.global.nc.f32 %f39, [%rd47]; sub.f32 %f40, %f39, %f38; mul.f32 %f41, 
%f40, %f40; fma.rn.f32 %f42, %f37, %f37, %f41; add.s64 %rd49, %rd43, %rd23; add.s64 %rd50, %rd44, %rd23; ld.global.nc.f32 %f43, [%rd50]; ld.global.nc.f32 %f44, [%rd49]; sub.f32 %f45, %f44, %f43; fma.rn.f32 %f46, %f45, %f45, %f42; max.f32 %f47, %f34, %f46; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd51, %rd45, %rd23; add.s64 %rd52, %rd46, %rd23; ld.global.nc.f32 %f48, [%rd52]; ld.global.nc.f32 %f49, [%rd51]; sub.f32 %f50, %f49, %f48; add.s64 %rd53, %rd47, %rd23; add.s64 %rd54, %rd48, %rd23; ld.global.nc.f32 %f51, [%rd54]; ld.global.nc.f32 %f52, [%rd53]; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, %f53; fma.rn.f32 %f55, %f50, %f50, %f54; add.s64 %rd55, %rd49, %rd23; add.s64 %rd56, %rd50, %rd23; ld.global.nc.f32 %f56, [%rd56]; ld.global.nc.f32 %f57, [%rd55]; sub.f32 %f58, %f57, %f56; fma.rn.f32 %f59, %f58, %f58, %f55; max.f32 %f60, %f47, %f59; add.s32 %r27, %r26, %r4; add.s64 %rd57, %rd51, %rd23; add.s64 %rd58, %rd52, %rd23; ld.global.nc.f32 %f61, [%rd58]; ld.global.nc.f32 %f62, [%rd57]; sub.f32 %f63, %f62, %f61; add.s64 %rd59, %rd53, %rd23; add.s64 %rd60, %rd54, %rd23; ld.global.nc.f32 %f64, [%rd60]; ld.global.nc.f32 %f65, [%rd59]; sub.f32 %f66, %f65, %f64; mul.f32 %f67, %f66, %f66; fma.rn.f32 %f68, %f63, %f63, %f67; add.s64 %rd61, %rd55, %rd23; add.s64 %rd62, %rd56, %rd23; ld.global.nc.f32 %f69, [%rd62]; ld.global.nc.f32 %f70, [%rd61]; sub.f32 %f71, %f70, %f69; fma.rn.f32 %f72, %f71, %f71, %f68; max.f32 %f100, %f60, %f72; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f100; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f73, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f74, [%r31]; max.f32 %f75, %f73, %f74; st.shared.f32 [%r14], %f75; $L__BB0_10: bar.sync 0; 
setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f76, [%r14]; ld.volatile.shared.f32 %f77, [%r14+128]; max.f32 %f78, %f76, %f77; st.volatile.shared.f32 [%r14], %f78; ld.volatile.shared.f32 %f79, [%r14+64]; ld.volatile.shared.f32 %f80, [%r14]; max.f32 %f81, %f80, %f79; st.volatile.shared.f32 [%r14], %f81; ld.volatile.shared.f32 %f82, [%r14+32]; ld.volatile.shared.f32 %f83, [%r14]; max.f32 %f84, %f83, %f82; st.volatile.shared.f32 [%r14], %f84; ld.volatile.shared.f32 %f85, [%r14+16]; ld.volatile.shared.f32 %f86, [%r14]; max.f32 %f87, %f86, %f85; st.volatile.shared.f32 [%r14], %f87; ld.volatile.shared.f32 %f88, [%r14+8]; ld.volatile.shared.f32 %f89, [%r14]; max.f32 %f90, %f89, %f88; st.volatile.shared.f32 [%r14], %f90; ld.volatile.shared.f32 %f91, [%r14+4]; ld.volatile.shared.f32 %f92, [%r14]; max.f32 %f93, %f92, %f91; st.volatile.shared.f32 [%r14], %f93; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f94, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f95, %f94; cvta.to.global.u64 %rd63, %rd30; mov.b32 %r32, %f95; atom.global.max.s32 %r33, [%rd63], %r32; $L__BB0_15: ret; } ` reducemaxvecdiff2_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<11>; .reg .f32 %f<101>; .reg .b32 %r<39>; .reg .b64 %rd<70>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd31, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd27, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd32, 
[reducemaxvecdiff2_param_2]; ld.param.u64 %rd28, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd29, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd33, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd30, [reducemaxvecdiff2_param_6]; ld.param.f32 %f100, [reducemaxvecdiff2_param_7]; ld.param.u32 %r17, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd33; cvta.to.global.u64 %rd2, %rd32; cvta.to.global.u64 %rd3, %rd31; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd34, %r36, 4; add.s64 %rd69, %rd1, %rd34; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd68, %rd2, %rd34; cvta.to.global.u64 %rd35, %rd29; add.s64 %rd67, %rd35, %rd34; cvta.to.global.u64 %rd36, %rd27; add.s64 %rd66, %rd36, %rd34; cvta.to.global.u64 %rd37, %rd28; add.s64 %rd65, %rd37, %rd34; add.s64 %rd64, %rd3, %rd34; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd65]; ld.global.nc.f32 %f11, [%rd64]; sub.f32 %f12, %f11, %f10; ld.global.nc.f32 %f13, [%rd67]; ld.global.nc.f32 %f14, [%rd66]; sub.f32 %f15, %f14, %f13; mul.f32 %f16, %f15, %f15; fma.rn.f32 %f17, %f12, %f12, %f16; ld.global.nc.f32 %f18, [%rd69]; ld.global.nc.f32 %f19, [%rd68]; sub.f32 %f20, %f19, %f18; fma.rn.f32 %f21, %f20, %f20, %f17; max.f32 %f100, %f100, %f21; add.s32 %r36, %r36, %r4; add.s64 %rd69, %rd69, %rd5; add.s64 %rd68, %rd68, %rd5; add.s64 %rd67, %rd67, %rd5; add.s64 %rd66, %rd66, %rd5; add.s64 %rd65, %rd65, %rd5; add.s64 %rd64, %rd64, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd23, %r4, 4; cvta.to.global.u64 %rd24, %rd28; cvta.to.global.u64 %rd25, %rd27; 
cvta.to.global.u64 %rd26, %rd29; $L__BB0_6: mul.wide.s32 %rd38, %r36, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd24, %rd38; ld.global.nc.f32 %f22, [%rd40]; ld.global.nc.f32 %f23, [%rd39]; sub.f32 %f24, %f23, %f22; add.s64 %rd41, %rd25, %rd38; add.s64 %rd42, %rd26, %rd38; ld.global.nc.f32 %f25, [%rd42]; ld.global.nc.f32 %f26, [%rd41]; sub.f32 %f27, %f26, %f25; mul.f32 %f28, %f27, %f27; fma.rn.f32 %f29, %f24, %f24, %f28; add.s64 %rd43, %rd2, %rd38; add.s64 %rd44, %rd1, %rd38; ld.global.nc.f32 %f30, [%rd44]; ld.global.nc.f32 %f31, [%rd43]; sub.f32 %f32, %f31, %f30; fma.rn.f32 %f33, %f32, %f32, %f29; max.f32 %f34, %f100, %f33; add.s64 %rd45, %rd39, %rd23; add.s64 %rd46, %rd40, %rd23; ld.global.nc.f32 %f35, [%rd46]; ld.global.nc.f32 %f36, [%rd45]; sub.f32 %f37, %f36, %f35; add.s64 %rd47, %rd41, %rd23; add.s64 %rd48, %rd42, %rd23; ld.global.nc.f32 %f38, [%rd48]; ld.global.nc.f32 %f39, [%rd47]; sub.f32 %f40, %f39, %f38; mul.f32 %f41, %f40, %f40; fma.rn.f32 %f42, %f37, %f37, %f41; add.s64 %rd49, %rd43, %rd23; add.s64 %rd50, %rd44, %rd23; ld.global.nc.f32 %f43, [%rd50]; ld.global.nc.f32 %f44, [%rd49]; sub.f32 %f45, %f44, %f43; fma.rn.f32 %f46, %f45, %f45, %f42; max.f32 %f47, %f34, %f46; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd51, %rd45, %rd23; add.s64 %rd52, %rd46, %rd23; ld.global.nc.f32 %f48, [%rd52]; ld.global.nc.f32 %f49, [%rd51]; sub.f32 %f50, %f49, %f48; add.s64 %rd53, %rd47, %rd23; add.s64 %rd54, %rd48, %rd23; ld.global.nc.f32 %f51, [%rd54]; ld.global.nc.f32 %f52, [%rd53]; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, %f53; fma.rn.f32 %f55, %f50, %f50, %f54; add.s64 %rd55, %rd49, %rd23; add.s64 %rd56, %rd50, %rd23; ld.global.nc.f32 %f56, [%rd56]; ld.global.nc.f32 %f57, [%rd55]; sub.f32 %f58, %f57, %f56; fma.rn.f32 %f59, %f58, %f58, %f55; max.f32 %f60, %f47, %f59; add.s32 %r27, %r26, %r4; add.s64 %rd57, %rd51, %rd23; add.s64 %rd58, %rd52, %rd23; ld.global.nc.f32 %f61, [%rd58]; ld.global.nc.f32 %f62, [%rd57]; sub.f32 %f63, %f62, %f61; add.s64 
%rd59, %rd53, %rd23; add.s64 %rd60, %rd54, %rd23; ld.global.nc.f32 %f64, [%rd60]; ld.global.nc.f32 %f65, [%rd59]; sub.f32 %f66, %f65, %f64; mul.f32 %f67, %f66, %f66; fma.rn.f32 %f68, %f63, %f63, %f67; add.s64 %rd61, %rd55, %rd23; add.s64 %rd62, %rd56, %rd23; ld.global.nc.f32 %f69, [%rd62]; ld.global.nc.f32 %f70, [%rd61]; sub.f32 %f71, %f70, %f69; fma.rn.f32 %f72, %f71, %f71, %f68; max.f32 %f100, %f60, %f72; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f100; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f73, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f74, [%r31]; max.f32 %f75, %f73, %f74; st.shared.f32 [%r14], %f75; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f76, [%r14]; ld.volatile.shared.f32 %f77, [%r14+128]; max.f32 %f78, %f76, %f77; st.volatile.shared.f32 [%r14], %f78; ld.volatile.shared.f32 %f79, [%r14+64]; ld.volatile.shared.f32 %f80, [%r14]; max.f32 %f81, %f80, %f79; st.volatile.shared.f32 [%r14], %f81; ld.volatile.shared.f32 %f82, [%r14+32]; ld.volatile.shared.f32 %f83, [%r14]; max.f32 %f84, %f83, %f82; st.volatile.shared.f32 [%r14], %f84; ld.volatile.shared.f32 %f85, [%r14+16]; ld.volatile.shared.f32 %f86, [%r14]; max.f32 %f87, %f86, %f85; st.volatile.shared.f32 [%r14], %f87; ld.volatile.shared.f32 %f88, [%r14+8]; ld.volatile.shared.f32 %f89, [%r14]; max.f32 %f90, %f89, %f88; st.volatile.shared.f32 [%r14], %f90; ld.volatile.shared.f32 %f91, [%r14+4]; ld.volatile.shared.f32 %f92, [%r14]; max.f32 %f93, %f92, %f91; st.volatile.shared.f32 [%r14], %f93; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f94, 
[_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f95, %f94; cvta.to.global.u64 %rd63, %rd30; mov.b32 %r32, %f95; atom.global.max.s32 %r33, [%rd63], %r32; $L__BB0_15: ret; } ` reducemaxvecdiff2_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<11>; .reg .f32 %f<101>; .reg .b32 %r<39>; .reg .b64 %rd<70>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd31, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd27, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd32, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd28, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd29, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd33, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd30, [reducemaxvecdiff2_param_6]; ld.param.f32 %f100, [reducemaxvecdiff2_param_7]; ld.param.u32 %r17, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd33; cvta.to.global.u64 %rd2, %rd32; cvta.to.global.u64 %rd3, %rd31; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd34, %r36, 4; add.s64 %rd69, %rd1, %rd34; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd68, %rd2, %rd34; cvta.to.global.u64 %rd35, %rd29; add.s64 %rd67, %rd35, %rd34; cvta.to.global.u64 %rd36, %rd27; add.s64 %rd66, %rd36, %rd34; 
cvta.to.global.u64 %rd37, %rd28; add.s64 %rd65, %rd37, %rd34; add.s64 %rd64, %rd3, %rd34; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd65]; ld.global.nc.f32 %f11, [%rd64]; sub.f32 %f12, %f11, %f10; ld.global.nc.f32 %f13, [%rd67]; ld.global.nc.f32 %f14, [%rd66]; sub.f32 %f15, %f14, %f13; mul.f32 %f16, %f15, %f15; fma.rn.f32 %f17, %f12, %f12, %f16; ld.global.nc.f32 %f18, [%rd69]; ld.global.nc.f32 %f19, [%rd68]; sub.f32 %f20, %f19, %f18; fma.rn.f32 %f21, %f20, %f20, %f17; max.f32 %f100, %f100, %f21; add.s32 %r36, %r36, %r4; add.s64 %rd69, %rd69, %rd5; add.s64 %rd68, %rd68, %rd5; add.s64 %rd67, %rd67, %rd5; add.s64 %rd66, %rd66, %rd5; add.s64 %rd65, %rd65, %rd5; add.s64 %rd64, %rd64, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd23, %r4, 4; cvta.to.global.u64 %rd24, %rd28; cvta.to.global.u64 %rd25, %rd27; cvta.to.global.u64 %rd26, %rd29; $L__BB0_6: mul.wide.s32 %rd38, %r36, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd24, %rd38; ld.global.nc.f32 %f22, [%rd40]; ld.global.nc.f32 %f23, [%rd39]; sub.f32 %f24, %f23, %f22; add.s64 %rd41, %rd25, %rd38; add.s64 %rd42, %rd26, %rd38; ld.global.nc.f32 %f25, [%rd42]; ld.global.nc.f32 %f26, [%rd41]; sub.f32 %f27, %f26, %f25; mul.f32 %f28, %f27, %f27; fma.rn.f32 %f29, %f24, %f24, %f28; add.s64 %rd43, %rd2, %rd38; add.s64 %rd44, %rd1, %rd38; ld.global.nc.f32 %f30, [%rd44]; ld.global.nc.f32 %f31, [%rd43]; sub.f32 %f32, %f31, %f30; fma.rn.f32 %f33, %f32, %f32, %f29; max.f32 %f34, %f100, %f33; add.s64 %rd45, %rd39, %rd23; add.s64 %rd46, %rd40, %rd23; ld.global.nc.f32 %f35, [%rd46]; ld.global.nc.f32 %f36, [%rd45]; sub.f32 %f37, %f36, %f35; add.s64 %rd47, %rd41, %rd23; add.s64 %rd48, %rd42, %rd23; ld.global.nc.f32 %f38, [%rd48]; ld.global.nc.f32 %f39, [%rd47]; sub.f32 %f40, %f39, %f38; mul.f32 %f41, %f40, %f40; fma.rn.f32 %f42, %f37, %f37, %f41; add.s64 %rd49, %rd43, %rd23; add.s64 %rd50, %rd44, %rd23; ld.global.nc.f32 
%f43, [%rd50]; ld.global.nc.f32 %f44, [%rd49]; sub.f32 %f45, %f44, %f43; fma.rn.f32 %f46, %f45, %f45, %f42; max.f32 %f47, %f34, %f46; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd51, %rd45, %rd23; add.s64 %rd52, %rd46, %rd23; ld.global.nc.f32 %f48, [%rd52]; ld.global.nc.f32 %f49, [%rd51]; sub.f32 %f50, %f49, %f48; add.s64 %rd53, %rd47, %rd23; add.s64 %rd54, %rd48, %rd23; ld.global.nc.f32 %f51, [%rd54]; ld.global.nc.f32 %f52, [%rd53]; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, %f53; fma.rn.f32 %f55, %f50, %f50, %f54; add.s64 %rd55, %rd49, %rd23; add.s64 %rd56, %rd50, %rd23; ld.global.nc.f32 %f56, [%rd56]; ld.global.nc.f32 %f57, [%rd55]; sub.f32 %f58, %f57, %f56; fma.rn.f32 %f59, %f58, %f58, %f55; max.f32 %f60, %f47, %f59; add.s32 %r27, %r26, %r4; add.s64 %rd57, %rd51, %rd23; add.s64 %rd58, %rd52, %rd23; ld.global.nc.f32 %f61, [%rd58]; ld.global.nc.f32 %f62, [%rd57]; sub.f32 %f63, %f62, %f61; add.s64 %rd59, %rd53, %rd23; add.s64 %rd60, %rd54, %rd23; ld.global.nc.f32 %f64, [%rd60]; ld.global.nc.f32 %f65, [%rd59]; sub.f32 %f66, %f65, %f64; mul.f32 %f67, %f66, %f66; fma.rn.f32 %f68, %f63, %f63, %f67; add.s64 %rd61, %rd55, %rd23; add.s64 %rd62, %rd56, %rd23; ld.global.nc.f32 %f69, [%rd62]; ld.global.nc.f32 %f70, [%rd61]; sub.f32 %f71, %f70, %f69; fma.rn.f32 %f72, %f71, %f71, %f68; max.f32 %f100, %f60, %f72; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f100; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f73, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f74, [%r31]; max.f32 %f75, %f73, %f74; st.shared.f32 [%r14], %f75; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra 
$L__BB0_13; ld.volatile.shared.f32 %f76, [%r14]; ld.volatile.shared.f32 %f77, [%r14+128]; max.f32 %f78, %f76, %f77; st.volatile.shared.f32 [%r14], %f78; ld.volatile.shared.f32 %f79, [%r14+64]; ld.volatile.shared.f32 %f80, [%r14]; max.f32 %f81, %f80, %f79; st.volatile.shared.f32 [%r14], %f81; ld.volatile.shared.f32 %f82, [%r14+32]; ld.volatile.shared.f32 %f83, [%r14]; max.f32 %f84, %f83, %f82; st.volatile.shared.f32 [%r14], %f84; ld.volatile.shared.f32 %f85, [%r14+16]; ld.volatile.shared.f32 %f86, [%r14]; max.f32 %f87, %f86, %f85; st.volatile.shared.f32 [%r14], %f87; ld.volatile.shared.f32 %f88, [%r14+8]; ld.volatile.shared.f32 %f89, [%r14]; max.f32 %f90, %f89, %f88; st.volatile.shared.f32 [%r14], %f90; ld.volatile.shared.f32 %f91, [%r14+4]; ld.volatile.shared.f32 %f92, [%r14]; max.f32 %f93, %f92, %f91; st.volatile.shared.f32 [%r14], %f93; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f94, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f95, %f94; cvta.to.global.u64 %rd63, %rd30; mov.b32 %r32, %f95; atom.global.max.s32 %r33, [%rd63], %r32; $L__BB0_15: ret; } ` reducemaxvecdiff2_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<11>; .reg .f32 %f<101>; .reg .b32 %r<39>; .reg .b64 %rd<70>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd31, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd27, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd32, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd28, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd29, [reducemaxvecdiff2_param_4]; ld.param.u64 
%rd33, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd30, [reducemaxvecdiff2_param_6]; ld.param.f32 %f100, [reducemaxvecdiff2_param_7]; ld.param.u32 %r17, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd33; cvta.to.global.u64 %rd2, %rd32; cvta.to.global.u64 %rd3, %rd31; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd34, %r36, 4; add.s64 %rd69, %rd1, %rd34; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd68, %rd2, %rd34; cvta.to.global.u64 %rd35, %rd29; add.s64 %rd67, %rd35, %rd34; cvta.to.global.u64 %rd36, %rd27; add.s64 %rd66, %rd36, %rd34; cvta.to.global.u64 %rd37, %rd28; add.s64 %rd65, %rd37, %rd34; add.s64 %rd64, %rd3, %rd34; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd65]; ld.global.nc.f32 %f11, [%rd64]; sub.f32 %f12, %f11, %f10; ld.global.nc.f32 %f13, [%rd67]; ld.global.nc.f32 %f14, [%rd66]; sub.f32 %f15, %f14, %f13; mul.f32 %f16, %f15, %f15; fma.rn.f32 %f17, %f12, %f12, %f16; ld.global.nc.f32 %f18, [%rd69]; ld.global.nc.f32 %f19, [%rd68]; sub.f32 %f20, %f19, %f18; fma.rn.f32 %f21, %f20, %f20, %f17; max.f32 %f100, %f100, %f21; add.s32 %r36, %r36, %r4; add.s64 %rd69, %rd69, %rd5; add.s64 %rd68, %rd68, %rd5; add.s64 %rd67, %rd67, %rd5; add.s64 %rd66, %rd66, %rd5; add.s64 %rd65, %rd65, %rd5; add.s64 %rd64, %rd64, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd23, %r4, 4; cvta.to.global.u64 %rd24, %rd28; cvta.to.global.u64 %rd25, %rd27; cvta.to.global.u64 %rd26, %rd29; $L__BB0_6: mul.wide.s32 %rd38, %r36, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd24, %rd38; 
ld.global.nc.f32 %f22, [%rd40]; ld.global.nc.f32 %f23, [%rd39]; sub.f32 %f24, %f23, %f22; add.s64 %rd41, %rd25, %rd38; add.s64 %rd42, %rd26, %rd38; ld.global.nc.f32 %f25, [%rd42]; ld.global.nc.f32 %f26, [%rd41]; sub.f32 %f27, %f26, %f25; mul.f32 %f28, %f27, %f27; fma.rn.f32 %f29, %f24, %f24, %f28; add.s64 %rd43, %rd2, %rd38; add.s64 %rd44, %rd1, %rd38; ld.global.nc.f32 %f30, [%rd44]; ld.global.nc.f32 %f31, [%rd43]; sub.f32 %f32, %f31, %f30; fma.rn.f32 %f33, %f32, %f32, %f29; max.f32 %f34, %f100, %f33; add.s64 %rd45, %rd39, %rd23; add.s64 %rd46, %rd40, %rd23; ld.global.nc.f32 %f35, [%rd46]; ld.global.nc.f32 %f36, [%rd45]; sub.f32 %f37, %f36, %f35; add.s64 %rd47, %rd41, %rd23; add.s64 %rd48, %rd42, %rd23; ld.global.nc.f32 %f38, [%rd48]; ld.global.nc.f32 %f39, [%rd47]; sub.f32 %f40, %f39, %f38; mul.f32 %f41, %f40, %f40; fma.rn.f32 %f42, %f37, %f37, %f41; add.s64 %rd49, %rd43, %rd23; add.s64 %rd50, %rd44, %rd23; ld.global.nc.f32 %f43, [%rd50]; ld.global.nc.f32 %f44, [%rd49]; sub.f32 %f45, %f44, %f43; fma.rn.f32 %f46, %f45, %f45, %f42; max.f32 %f47, %f34, %f46; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd51, %rd45, %rd23; add.s64 %rd52, %rd46, %rd23; ld.global.nc.f32 %f48, [%rd52]; ld.global.nc.f32 %f49, [%rd51]; sub.f32 %f50, %f49, %f48; add.s64 %rd53, %rd47, %rd23; add.s64 %rd54, %rd48, %rd23; ld.global.nc.f32 %f51, [%rd54]; ld.global.nc.f32 %f52, [%rd53]; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, %f53; fma.rn.f32 %f55, %f50, %f50, %f54; add.s64 %rd55, %rd49, %rd23; add.s64 %rd56, %rd50, %rd23; ld.global.nc.f32 %f56, [%rd56]; ld.global.nc.f32 %f57, [%rd55]; sub.f32 %f58, %f57, %f56; fma.rn.f32 %f59, %f58, %f58, %f55; max.f32 %f60, %f47, %f59; add.s32 %r27, %r26, %r4; add.s64 %rd57, %rd51, %rd23; add.s64 %rd58, %rd52, %rd23; ld.global.nc.f32 %f61, [%rd58]; ld.global.nc.f32 %f62, [%rd57]; sub.f32 %f63, %f62, %f61; add.s64 %rd59, %rd53, %rd23; add.s64 %rd60, %rd54, %rd23; ld.global.nc.f32 %f64, [%rd60]; ld.global.nc.f32 %f65, [%rd59]; sub.f32 %f66, 
%f65, %f64; mul.f32 %f67, %f66, %f66; fma.rn.f32 %f68, %f63, %f63, %f67; add.s64 %rd61, %rd55, %rd23; add.s64 %rd62, %rd56, %rd23; ld.global.nc.f32 %f69, [%rd62]; ld.global.nc.f32 %f70, [%rd61]; sub.f32 %f71, %f70, %f69; fma.rn.f32 %f72, %f71, %f71, %f68; max.f32 %f100, %f60, %f72; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f100; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f73, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f74, [%r31]; max.f32 %f75, %f73, %f74; st.shared.f32 [%r14], %f75; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f76, [%r14]; ld.volatile.shared.f32 %f77, [%r14+128]; max.f32 %f78, %f76, %f77; st.volatile.shared.f32 [%r14], %f78; ld.volatile.shared.f32 %f79, [%r14+64]; ld.volatile.shared.f32 %f80, [%r14]; max.f32 %f81, %f80, %f79; st.volatile.shared.f32 [%r14], %f81; ld.volatile.shared.f32 %f82, [%r14+32]; ld.volatile.shared.f32 %f83, [%r14]; max.f32 %f84, %f83, %f82; st.volatile.shared.f32 [%r14], %f84; ld.volatile.shared.f32 %f85, [%r14+16]; ld.volatile.shared.f32 %f86, [%r14]; max.f32 %f87, %f86, %f85; st.volatile.shared.f32 [%r14], %f87; ld.volatile.shared.f32 %f88, [%r14+8]; ld.volatile.shared.f32 %f89, [%r14]; max.f32 %f90, %f89, %f88; st.volatile.shared.f32 [%r14], %f90; ld.volatile.shared.f32 %f91, [%r14+4]; ld.volatile.shared.f32 %f92, [%r14]; max.f32 %f93, %f92, %f91; st.volatile.shared.f32 [%r14], %f93; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f94, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f95, %f94; cvta.to.global.u64 %rd63, %rd30; mov.b32 %r32, %f95; atom.global.max.s32 %r33, 
[%rd63], %r32; $L__BB0_15: ret; } ` reducemaxvecdiff2_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<11>; .reg .f32 %f<101>; .reg .b32 %r<39>; .reg .b64 %rd<70>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd31, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd27, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd32, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd28, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd29, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd33, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd30, [reducemaxvecdiff2_param_6]; ld.param.f32 %f100, [reducemaxvecdiff2_param_7]; ld.param.u32 %r17, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd33; cvta.to.global.u64 %rd2, %rd32; cvta.to.global.u64 %rd3, %rd31; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd34, %r36, 4; add.s64 %rd69, %rd1, %rd34; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd68, %rd2, %rd34; cvta.to.global.u64 %rd35, %rd29; add.s64 %rd67, %rd35, %rd34; cvta.to.global.u64 %rd36, %rd27; add.s64 %rd66, %rd36, %rd34; cvta.to.global.u64 %rd37, %rd28; add.s64 %rd65, %rd37, %rd34; add.s64 %rd64, %rd3, %rd34; $L__BB0_3: .pragma "nounroll"; 
ld.global.nc.f32 %f10, [%rd65]; ld.global.nc.f32 %f11, [%rd64]; sub.f32 %f12, %f11, %f10; ld.global.nc.f32 %f13, [%rd67]; ld.global.nc.f32 %f14, [%rd66]; sub.f32 %f15, %f14, %f13; mul.f32 %f16, %f15, %f15; fma.rn.f32 %f17, %f12, %f12, %f16; ld.global.nc.f32 %f18, [%rd69]; ld.global.nc.f32 %f19, [%rd68]; sub.f32 %f20, %f19, %f18; fma.rn.f32 %f21, %f20, %f20, %f17; max.f32 %f100, %f100, %f21; add.s32 %r36, %r36, %r4; add.s64 %rd69, %rd69, %rd5; add.s64 %rd68, %rd68, %rd5; add.s64 %rd67, %rd67, %rd5; add.s64 %rd66, %rd66, %rd5; add.s64 %rd65, %rd65, %rd5; add.s64 %rd64, %rd64, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd23, %r4, 4; cvta.to.global.u64 %rd24, %rd28; cvta.to.global.u64 %rd25, %rd27; cvta.to.global.u64 %rd26, %rd29; $L__BB0_6: mul.wide.s32 %rd38, %r36, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd24, %rd38; ld.global.nc.f32 %f22, [%rd40]; ld.global.nc.f32 %f23, [%rd39]; sub.f32 %f24, %f23, %f22; add.s64 %rd41, %rd25, %rd38; add.s64 %rd42, %rd26, %rd38; ld.global.nc.f32 %f25, [%rd42]; ld.global.nc.f32 %f26, [%rd41]; sub.f32 %f27, %f26, %f25; mul.f32 %f28, %f27, %f27; fma.rn.f32 %f29, %f24, %f24, %f28; add.s64 %rd43, %rd2, %rd38; add.s64 %rd44, %rd1, %rd38; ld.global.nc.f32 %f30, [%rd44]; ld.global.nc.f32 %f31, [%rd43]; sub.f32 %f32, %f31, %f30; fma.rn.f32 %f33, %f32, %f32, %f29; max.f32 %f34, %f100, %f33; add.s64 %rd45, %rd39, %rd23; add.s64 %rd46, %rd40, %rd23; ld.global.nc.f32 %f35, [%rd46]; ld.global.nc.f32 %f36, [%rd45]; sub.f32 %f37, %f36, %f35; add.s64 %rd47, %rd41, %rd23; add.s64 %rd48, %rd42, %rd23; ld.global.nc.f32 %f38, [%rd48]; ld.global.nc.f32 %f39, [%rd47]; sub.f32 %f40, %f39, %f38; mul.f32 %f41, %f40, %f40; fma.rn.f32 %f42, %f37, %f37, %f41; add.s64 %rd49, %rd43, %rd23; add.s64 %rd50, %rd44, %rd23; ld.global.nc.f32 %f43, [%rd50]; ld.global.nc.f32 %f44, [%rd49]; sub.f32 %f45, %f44, %f43; fma.rn.f32 %f46, %f45, %f45, %f42; max.f32 %f47, 
%f34, %f46; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd51, %rd45, %rd23; add.s64 %rd52, %rd46, %rd23; ld.global.nc.f32 %f48, [%rd52]; ld.global.nc.f32 %f49, [%rd51]; sub.f32 %f50, %f49, %f48; add.s64 %rd53, %rd47, %rd23; add.s64 %rd54, %rd48, %rd23; ld.global.nc.f32 %f51, [%rd54]; ld.global.nc.f32 %f52, [%rd53]; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, %f53; fma.rn.f32 %f55, %f50, %f50, %f54; add.s64 %rd55, %rd49, %rd23; add.s64 %rd56, %rd50, %rd23; ld.global.nc.f32 %f56, [%rd56]; ld.global.nc.f32 %f57, [%rd55]; sub.f32 %f58, %f57, %f56; fma.rn.f32 %f59, %f58, %f58, %f55; max.f32 %f60, %f47, %f59; add.s32 %r27, %r26, %r4; add.s64 %rd57, %rd51, %rd23; add.s64 %rd58, %rd52, %rd23; ld.global.nc.f32 %f61, [%rd58]; ld.global.nc.f32 %f62, [%rd57]; sub.f32 %f63, %f62, %f61; add.s64 %rd59, %rd53, %rd23; add.s64 %rd60, %rd54, %rd23; ld.global.nc.f32 %f64, [%rd60]; ld.global.nc.f32 %f65, [%rd59]; sub.f32 %f66, %f65, %f64; mul.f32 %f67, %f66, %f66; fma.rn.f32 %f68, %f63, %f63, %f67; add.s64 %rd61, %rd55, %rd23; add.s64 %rd62, %rd56, %rd23; ld.global.nc.f32 %f69, [%rd62]; ld.global.nc.f32 %f70, [%rd61]; sub.f32 %f71, %f70, %f69; fma.rn.f32 %f72, %f71, %f71, %f68; max.f32 %f100, %f60, %f72; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f100; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f73, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f74, [%r31]; max.f32 %f75, %f73, %f74; st.shared.f32 [%r14], %f75; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f76, [%r14]; ld.volatile.shared.f32 %f77, [%r14+128]; max.f32 %f78, %f76, %f77; 
st.volatile.shared.f32 [%r14], %f78; ld.volatile.shared.f32 %f79, [%r14+64]; ld.volatile.shared.f32 %f80, [%r14]; max.f32 %f81, %f80, %f79; st.volatile.shared.f32 [%r14], %f81; ld.volatile.shared.f32 %f82, [%r14+32]; ld.volatile.shared.f32 %f83, [%r14]; max.f32 %f84, %f83, %f82; st.volatile.shared.f32 [%r14], %f84; ld.volatile.shared.f32 %f85, [%r14+16]; ld.volatile.shared.f32 %f86, [%r14]; max.f32 %f87, %f86, %f85; st.volatile.shared.f32 [%r14], %f87; ld.volatile.shared.f32 %f88, [%r14+8]; ld.volatile.shared.f32 %f89, [%r14]; max.f32 %f90, %f89, %f88; st.volatile.shared.f32 [%r14], %f90; ld.volatile.shared.f32 %f91, [%r14+4]; ld.volatile.shared.f32 %f92, [%r14]; max.f32 %f93, %f92, %f91; st.volatile.shared.f32 [%r14], %f93; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f94, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f95, %f94; cvta.to.global.u64 %rd63, %rd30; mov.b32 %r32, %f95; atom.global.max.s32 %r33, [%rd63], %r32; $L__BB0_15: ret; } ` reducemaxvecdiff2_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<11>; .reg .f32 %f<101>; .reg .b32 %r<39>; .reg .b64 %rd<70>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd31, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd27, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd32, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd28, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd29, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd33, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd30, [reducemaxvecdiff2_param_6]; ld.param.f32 %f100, 
[reducemaxvecdiff2_param_7]; ld.param.u32 %r17, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd33; cvta.to.global.u64 %rd2, %rd32; cvta.to.global.u64 %rd3, %rd31; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd34, %r36, 4; add.s64 %rd69, %rd1, %rd34; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd68, %rd2, %rd34; cvta.to.global.u64 %rd35, %rd29; add.s64 %rd67, %rd35, %rd34; cvta.to.global.u64 %rd36, %rd27; add.s64 %rd66, %rd36, %rd34; cvta.to.global.u64 %rd37, %rd28; add.s64 %rd65, %rd37, %rd34; add.s64 %rd64, %rd3, %rd34; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd65]; ld.global.nc.f32 %f11, [%rd64]; sub.f32 %f12, %f11, %f10; ld.global.nc.f32 %f13, [%rd67]; ld.global.nc.f32 %f14, [%rd66]; sub.f32 %f15, %f14, %f13; mul.f32 %f16, %f15, %f15; fma.rn.f32 %f17, %f12, %f12, %f16; ld.global.nc.f32 %f18, [%rd69]; ld.global.nc.f32 %f19, [%rd68]; sub.f32 %f20, %f19, %f18; fma.rn.f32 %f21, %f20, %f20, %f17; max.f32 %f100, %f100, %f21; add.s32 %r36, %r36, %r4; add.s64 %rd69, %rd69, %rd5; add.s64 %rd68, %rd68, %rd5; add.s64 %rd67, %rd67, %rd5; add.s64 %rd66, %rd66, %rd5; add.s64 %rd65, %rd65, %rd5; add.s64 %rd64, %rd64, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd23, %r4, 4; cvta.to.global.u64 %rd24, %rd28; cvta.to.global.u64 %rd25, %rd27; cvta.to.global.u64 %rd26, %rd29; $L__BB0_6: mul.wide.s32 %rd38, %r36, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd24, %rd38; ld.global.nc.f32 %f22, [%rd40]; ld.global.nc.f32 %f23, [%rd39]; sub.f32 %f24, %f23, %f22; add.s64 %rd41, %rd25, 
%rd38; add.s64 %rd42, %rd26, %rd38; ld.global.nc.f32 %f25, [%rd42]; ld.global.nc.f32 %f26, [%rd41]; sub.f32 %f27, %f26, %f25; mul.f32 %f28, %f27, %f27; fma.rn.f32 %f29, %f24, %f24, %f28; add.s64 %rd43, %rd2, %rd38; add.s64 %rd44, %rd1, %rd38; ld.global.nc.f32 %f30, [%rd44]; ld.global.nc.f32 %f31, [%rd43]; sub.f32 %f32, %f31, %f30; fma.rn.f32 %f33, %f32, %f32, %f29; max.f32 %f34, %f100, %f33; add.s64 %rd45, %rd39, %rd23; add.s64 %rd46, %rd40, %rd23; ld.global.nc.f32 %f35, [%rd46]; ld.global.nc.f32 %f36, [%rd45]; sub.f32 %f37, %f36, %f35; add.s64 %rd47, %rd41, %rd23; add.s64 %rd48, %rd42, %rd23; ld.global.nc.f32 %f38, [%rd48]; ld.global.nc.f32 %f39, [%rd47]; sub.f32 %f40, %f39, %f38; mul.f32 %f41, %f40, %f40; fma.rn.f32 %f42, %f37, %f37, %f41; add.s64 %rd49, %rd43, %rd23; add.s64 %rd50, %rd44, %rd23; ld.global.nc.f32 %f43, [%rd50]; ld.global.nc.f32 %f44, [%rd49]; sub.f32 %f45, %f44, %f43; fma.rn.f32 %f46, %f45, %f45, %f42; max.f32 %f47, %f34, %f46; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd51, %rd45, %rd23; add.s64 %rd52, %rd46, %rd23; ld.global.nc.f32 %f48, [%rd52]; ld.global.nc.f32 %f49, [%rd51]; sub.f32 %f50, %f49, %f48; add.s64 %rd53, %rd47, %rd23; add.s64 %rd54, %rd48, %rd23; ld.global.nc.f32 %f51, [%rd54]; ld.global.nc.f32 %f52, [%rd53]; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, %f53; fma.rn.f32 %f55, %f50, %f50, %f54; add.s64 %rd55, %rd49, %rd23; add.s64 %rd56, %rd50, %rd23; ld.global.nc.f32 %f56, [%rd56]; ld.global.nc.f32 %f57, [%rd55]; sub.f32 %f58, %f57, %f56; fma.rn.f32 %f59, %f58, %f58, %f55; max.f32 %f60, %f47, %f59; add.s32 %r27, %r26, %r4; add.s64 %rd57, %rd51, %rd23; add.s64 %rd58, %rd52, %rd23; ld.global.nc.f32 %f61, [%rd58]; ld.global.nc.f32 %f62, [%rd57]; sub.f32 %f63, %f62, %f61; add.s64 %rd59, %rd53, %rd23; add.s64 %rd60, %rd54, %rd23; ld.global.nc.f32 %f64, [%rd60]; ld.global.nc.f32 %f65, [%rd59]; sub.f32 %f66, %f65, %f64; mul.f32 %f67, %f66, %f66; fma.rn.f32 %f68, %f63, %f63, %f67; add.s64 %rd61, %rd55, %rd23; add.s64 
%rd62, %rd56, %rd23; ld.global.nc.f32 %f69, [%rd62]; ld.global.nc.f32 %f70, [%rd61]; sub.f32 %f71, %f70, %f69; fma.rn.f32 %f72, %f71, %f71, %f68; max.f32 %f100, %f60, %f72; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f100; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f73, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f74, [%r31]; max.f32 %f75, %f73, %f74; st.shared.f32 [%r14], %f75; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f76, [%r14]; ld.volatile.shared.f32 %f77, [%r14+128]; max.f32 %f78, %f76, %f77; st.volatile.shared.f32 [%r14], %f78; ld.volatile.shared.f32 %f79, [%r14+64]; ld.volatile.shared.f32 %f80, [%r14]; max.f32 %f81, %f80, %f79; st.volatile.shared.f32 [%r14], %f81; ld.volatile.shared.f32 %f82, [%r14+32]; ld.volatile.shared.f32 %f83, [%r14]; max.f32 %f84, %f83, %f82; st.volatile.shared.f32 [%r14], %f84; ld.volatile.shared.f32 %f85, [%r14+16]; ld.volatile.shared.f32 %f86, [%r14]; max.f32 %f87, %f86, %f85; st.volatile.shared.f32 [%r14], %f87; ld.volatile.shared.f32 %f88, [%r14+8]; ld.volatile.shared.f32 %f89, [%r14]; max.f32 %f90, %f89, %f88; st.volatile.shared.f32 [%r14], %f90; ld.volatile.shared.f32 %f91, [%r14+4]; ld.volatile.shared.f32 %f92, [%r14]; max.f32 %f93, %f92, %f91; st.volatile.shared.f32 [%r14], %f93; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f94, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f95, %f94; cvta.to.global.u64 %rd63, %rd30; mov.b32 %r32, %f95; atom.global.max.s32 %r33, [%rd63], %r32; $L__BB0_15: ret; } ` reducemaxvecdiff2_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // 
.globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<11>; .reg .f32 %f<101>; .reg .b32 %r<39>; .reg .b64 %rd<70>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd31, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd27, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd32, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd28, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd29, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd33, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd30, [reducemaxvecdiff2_param_6]; ld.param.f32 %f100, [reducemaxvecdiff2_param_7]; ld.param.u32 %r17, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd33; cvta.to.global.u64 %rd2, %rd32; cvta.to.global.u64 %rd3, %rd31; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd34, %r36, 4; add.s64 %rd69, %rd1, %rd34; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd68, %rd2, %rd34; cvta.to.global.u64 %rd35, %rd29; add.s64 %rd67, %rd35, %rd34; cvta.to.global.u64 %rd36, %rd27; add.s64 %rd66, %rd36, %rd34; cvta.to.global.u64 %rd37, %rd28; add.s64 %rd65, %rd37, %rd34; add.s64 %rd64, %rd3, %rd34; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd65]; ld.global.nc.f32 %f11, [%rd64]; sub.f32 %f12, %f11, %f10; ld.global.nc.f32 %f13, [%rd67]; 
ld.global.nc.f32 %f14, [%rd66]; sub.f32 %f15, %f14, %f13; mul.f32 %f16, %f15, %f15; fma.rn.f32 %f17, %f12, %f12, %f16; ld.global.nc.f32 %f18, [%rd69]; ld.global.nc.f32 %f19, [%rd68]; sub.f32 %f20, %f19, %f18; fma.rn.f32 %f21, %f20, %f20, %f17; max.f32 %f100, %f100, %f21; add.s32 %r36, %r36, %r4; add.s64 %rd69, %rd69, %rd5; add.s64 %rd68, %rd68, %rd5; add.s64 %rd67, %rd67, %rd5; add.s64 %rd66, %rd66, %rd5; add.s64 %rd65, %rd65, %rd5; add.s64 %rd64, %rd64, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd23, %r4, 4; cvta.to.global.u64 %rd24, %rd28; cvta.to.global.u64 %rd25, %rd27; cvta.to.global.u64 %rd26, %rd29; $L__BB0_6: mul.wide.s32 %rd38, %r36, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd24, %rd38; ld.global.nc.f32 %f22, [%rd40]; ld.global.nc.f32 %f23, [%rd39]; sub.f32 %f24, %f23, %f22; add.s64 %rd41, %rd25, %rd38; add.s64 %rd42, %rd26, %rd38; ld.global.nc.f32 %f25, [%rd42]; ld.global.nc.f32 %f26, [%rd41]; sub.f32 %f27, %f26, %f25; mul.f32 %f28, %f27, %f27; fma.rn.f32 %f29, %f24, %f24, %f28; add.s64 %rd43, %rd2, %rd38; add.s64 %rd44, %rd1, %rd38; ld.global.nc.f32 %f30, [%rd44]; ld.global.nc.f32 %f31, [%rd43]; sub.f32 %f32, %f31, %f30; fma.rn.f32 %f33, %f32, %f32, %f29; max.f32 %f34, %f100, %f33; add.s64 %rd45, %rd39, %rd23; add.s64 %rd46, %rd40, %rd23; ld.global.nc.f32 %f35, [%rd46]; ld.global.nc.f32 %f36, [%rd45]; sub.f32 %f37, %f36, %f35; add.s64 %rd47, %rd41, %rd23; add.s64 %rd48, %rd42, %rd23; ld.global.nc.f32 %f38, [%rd48]; ld.global.nc.f32 %f39, [%rd47]; sub.f32 %f40, %f39, %f38; mul.f32 %f41, %f40, %f40; fma.rn.f32 %f42, %f37, %f37, %f41; add.s64 %rd49, %rd43, %rd23; add.s64 %rd50, %rd44, %rd23; ld.global.nc.f32 %f43, [%rd50]; ld.global.nc.f32 %f44, [%rd49]; sub.f32 %f45, %f44, %f43; fma.rn.f32 %f46, %f45, %f45, %f42; max.f32 %f47, %f34, %f46; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd51, %rd45, %rd23; add.s64 %rd52, %rd46, %rd23; 
ld.global.nc.f32 %f48, [%rd52]; ld.global.nc.f32 %f49, [%rd51]; sub.f32 %f50, %f49, %f48; add.s64 %rd53, %rd47, %rd23; add.s64 %rd54, %rd48, %rd23; ld.global.nc.f32 %f51, [%rd54]; ld.global.nc.f32 %f52, [%rd53]; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, %f53; fma.rn.f32 %f55, %f50, %f50, %f54; add.s64 %rd55, %rd49, %rd23; add.s64 %rd56, %rd50, %rd23; ld.global.nc.f32 %f56, [%rd56]; ld.global.nc.f32 %f57, [%rd55]; sub.f32 %f58, %f57, %f56; fma.rn.f32 %f59, %f58, %f58, %f55; max.f32 %f60, %f47, %f59; add.s32 %r27, %r26, %r4; add.s64 %rd57, %rd51, %rd23; add.s64 %rd58, %rd52, %rd23; ld.global.nc.f32 %f61, [%rd58]; ld.global.nc.f32 %f62, [%rd57]; sub.f32 %f63, %f62, %f61; add.s64 %rd59, %rd53, %rd23; add.s64 %rd60, %rd54, %rd23; ld.global.nc.f32 %f64, [%rd60]; ld.global.nc.f32 %f65, [%rd59]; sub.f32 %f66, %f65, %f64; mul.f32 %f67, %f66, %f66; fma.rn.f32 %f68, %f63, %f63, %f67; add.s64 %rd61, %rd55, %rd23; add.s64 %rd62, %rd56, %rd23; ld.global.nc.f32 %f69, [%rd62]; ld.global.nc.f32 %f70, [%rd61]; sub.f32 %f71, %f70, %f69; fma.rn.f32 %f72, %f71, %f71, %f68; max.f32 %f100, %f60, %f72; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f100; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f73, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f74, [%r31]; max.f32 %f75, %f73, %f74; st.shared.f32 [%r14], %f75; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f76, [%r14]; ld.volatile.shared.f32 %f77, [%r14+128]; max.f32 %f78, %f76, %f77; st.volatile.shared.f32 [%r14], %f78; ld.volatile.shared.f32 %f79, [%r14+64]; ld.volatile.shared.f32 %f80, [%r14]; max.f32 %f81, %f80, 
%f79; st.volatile.shared.f32 [%r14], %f81; ld.volatile.shared.f32 %f82, [%r14+32]; ld.volatile.shared.f32 %f83, [%r14]; max.f32 %f84, %f83, %f82; st.volatile.shared.f32 [%r14], %f84; ld.volatile.shared.f32 %f85, [%r14+16]; ld.volatile.shared.f32 %f86, [%r14]; max.f32 %f87, %f86, %f85; st.volatile.shared.f32 [%r14], %f87; ld.volatile.shared.f32 %f88, [%r14+8]; ld.volatile.shared.f32 %f89, [%r14]; max.f32 %f90, %f89, %f88; st.volatile.shared.f32 [%r14], %f90; ld.volatile.shared.f32 %f91, [%r14+4]; ld.volatile.shared.f32 %f92, [%r14]; max.f32 %f93, %f92, %f91; st.volatile.shared.f32 [%r14], %f93; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f94, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f95, %f94; cvta.to.global.u64 %rd63, %rd30; mov.b32 %r32, %f95; atom.global.max.s32 %r33, [%rd63], %r32; $L__BB0_15: ret; } ` reducemaxvecdiff2_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<11>; .reg .f32 %f<101>; .reg .b32 %r<39>; .reg .b64 %rd<70>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd31, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd27, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd32, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd28, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd29, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd33, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd30, [reducemaxvecdiff2_param_6]; ld.param.f32 %f100, [reducemaxvecdiff2_param_7]; ld.param.u32 %r17, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd33; cvta.to.global.u64 %rd2, %rd32; 
cvta.to.global.u64 %rd3, %rd31; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd34, %r36, 4; add.s64 %rd69, %rd1, %rd34; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd68, %rd2, %rd34; cvta.to.global.u64 %rd35, %rd29; add.s64 %rd67, %rd35, %rd34; cvta.to.global.u64 %rd36, %rd27; add.s64 %rd66, %rd36, %rd34; cvta.to.global.u64 %rd37, %rd28; add.s64 %rd65, %rd37, %rd34; add.s64 %rd64, %rd3, %rd34; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd65]; ld.global.nc.f32 %f11, [%rd64]; sub.f32 %f12, %f11, %f10; ld.global.nc.f32 %f13, [%rd67]; ld.global.nc.f32 %f14, [%rd66]; sub.f32 %f15, %f14, %f13; mul.f32 %f16, %f15, %f15; fma.rn.f32 %f17, %f12, %f12, %f16; ld.global.nc.f32 %f18, [%rd69]; ld.global.nc.f32 %f19, [%rd68]; sub.f32 %f20, %f19, %f18; fma.rn.f32 %f21, %f20, %f20, %f17; max.f32 %f100, %f100, %f21; add.s32 %r36, %r36, %r4; add.s64 %rd69, %rd69, %rd5; add.s64 %rd68, %rd68, %rd5; add.s64 %rd67, %rd67, %rd5; add.s64 %rd66, %rd66, %rd5; add.s64 %rd65, %rd65, %rd5; add.s64 %rd64, %rd64, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd23, %r4, 4; cvta.to.global.u64 %rd24, %rd28; cvta.to.global.u64 %rd25, %rd27; cvta.to.global.u64 %rd26, %rd29; $L__BB0_6: mul.wide.s32 %rd38, %r36, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd24, %rd38; ld.global.nc.f32 %f22, [%rd40]; ld.global.nc.f32 %f23, [%rd39]; sub.f32 %f24, %f23, %f22; add.s64 %rd41, %rd25, %rd38; add.s64 %rd42, %rd26, %rd38; ld.global.nc.f32 %f25, [%rd42]; ld.global.nc.f32 %f26, [%rd41]; sub.f32 %f27, %f26, %f25; mul.f32 %f28, %f27, 
%f27; fma.rn.f32 %f29, %f24, %f24, %f28; add.s64 %rd43, %rd2, %rd38; add.s64 %rd44, %rd1, %rd38; ld.global.nc.f32 %f30, [%rd44]; ld.global.nc.f32 %f31, [%rd43]; sub.f32 %f32, %f31, %f30; fma.rn.f32 %f33, %f32, %f32, %f29; max.f32 %f34, %f100, %f33; add.s64 %rd45, %rd39, %rd23; add.s64 %rd46, %rd40, %rd23; ld.global.nc.f32 %f35, [%rd46]; ld.global.nc.f32 %f36, [%rd45]; sub.f32 %f37, %f36, %f35; add.s64 %rd47, %rd41, %rd23; add.s64 %rd48, %rd42, %rd23; ld.global.nc.f32 %f38, [%rd48]; ld.global.nc.f32 %f39, [%rd47]; sub.f32 %f40, %f39, %f38; mul.f32 %f41, %f40, %f40; fma.rn.f32 %f42, %f37, %f37, %f41; add.s64 %rd49, %rd43, %rd23; add.s64 %rd50, %rd44, %rd23; ld.global.nc.f32 %f43, [%rd50]; ld.global.nc.f32 %f44, [%rd49]; sub.f32 %f45, %f44, %f43; fma.rn.f32 %f46, %f45, %f45, %f42; max.f32 %f47, %f34, %f46; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd51, %rd45, %rd23; add.s64 %rd52, %rd46, %rd23; ld.global.nc.f32 %f48, [%rd52]; ld.global.nc.f32 %f49, [%rd51]; sub.f32 %f50, %f49, %f48; add.s64 %rd53, %rd47, %rd23; add.s64 %rd54, %rd48, %rd23; ld.global.nc.f32 %f51, [%rd54]; ld.global.nc.f32 %f52, [%rd53]; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, %f53; fma.rn.f32 %f55, %f50, %f50, %f54; add.s64 %rd55, %rd49, %rd23; add.s64 %rd56, %rd50, %rd23; ld.global.nc.f32 %f56, [%rd56]; ld.global.nc.f32 %f57, [%rd55]; sub.f32 %f58, %f57, %f56; fma.rn.f32 %f59, %f58, %f58, %f55; max.f32 %f60, %f47, %f59; add.s32 %r27, %r26, %r4; add.s64 %rd57, %rd51, %rd23; add.s64 %rd58, %rd52, %rd23; ld.global.nc.f32 %f61, [%rd58]; ld.global.nc.f32 %f62, [%rd57]; sub.f32 %f63, %f62, %f61; add.s64 %rd59, %rd53, %rd23; add.s64 %rd60, %rd54, %rd23; ld.global.nc.f32 %f64, [%rd60]; ld.global.nc.f32 %f65, [%rd59]; sub.f32 %f66, %f65, %f64; mul.f32 %f67, %f66, %f66; fma.rn.f32 %f68, %f63, %f63, %f67; add.s64 %rd61, %rd55, %rd23; add.s64 %rd62, %rd56, %rd23; ld.global.nc.f32 %f69, [%rd62]; ld.global.nc.f32 %f70, [%rd61]; sub.f32 %f71, %f70, %f69; fma.rn.f32 %f72, %f71, %f71, %f68; 
max.f32 %f100, %f60, %f72; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f100; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f73, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f74, [%r31]; max.f32 %f75, %f73, %f74; st.shared.f32 [%r14], %f75; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f76, [%r14]; ld.volatile.shared.f32 %f77, [%r14+128]; max.f32 %f78, %f76, %f77; st.volatile.shared.f32 [%r14], %f78; ld.volatile.shared.f32 %f79, [%r14+64]; ld.volatile.shared.f32 %f80, [%r14]; max.f32 %f81, %f80, %f79; st.volatile.shared.f32 [%r14], %f81; ld.volatile.shared.f32 %f82, [%r14+32]; ld.volatile.shared.f32 %f83, [%r14]; max.f32 %f84, %f83, %f82; st.volatile.shared.f32 [%r14], %f84; ld.volatile.shared.f32 %f85, [%r14+16]; ld.volatile.shared.f32 %f86, [%r14]; max.f32 %f87, %f86, %f85; st.volatile.shared.f32 [%r14], %f87; ld.volatile.shared.f32 %f88, [%r14+8]; ld.volatile.shared.f32 %f89, [%r14]; max.f32 %f90, %f89, %f88; st.volatile.shared.f32 [%r14], %f90; ld.volatile.shared.f32 %f91, [%r14+4]; ld.volatile.shared.f32 %f92, [%r14]; max.f32 %f93, %f92, %f91; st.volatile.shared.f32 [%r14], %f93; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f94, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f95, %f94; cvta.to.global.u64 %rd63, %rd30; mov.b32 %r32, %f95; atom.global.max.s32 %r33, [%rd63], %r32; $L__BB0_15: ret; } ` reducemaxvecdiff2_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 
reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<11>; .reg .f32 %f<101>; .reg .b32 %r<39>; .reg .b64 %rd<70>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd31, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd27, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd32, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd28, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd29, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd33, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd30, [reducemaxvecdiff2_param_6]; ld.param.f32 %f100, [reducemaxvecdiff2_param_7]; ld.param.u32 %r17, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd33; cvta.to.global.u64 %rd2, %rd32; cvta.to.global.u64 %rd3, %rd31; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd34, %r36, 4; add.s64 %rd69, %rd1, %rd34; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd68, %rd2, %rd34; cvta.to.global.u64 %rd35, %rd29; add.s64 %rd67, %rd35, %rd34; cvta.to.global.u64 %rd36, %rd27; add.s64 %rd66, %rd36, %rd34; cvta.to.global.u64 %rd37, %rd28; add.s64 %rd65, %rd37, %rd34; add.s64 %rd64, %rd3, %rd34; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd65]; ld.global.nc.f32 %f11, [%rd64]; sub.f32 %f12, %f11, %f10; ld.global.nc.f32 %f13, [%rd67]; ld.global.nc.f32 %f14, [%rd66]; sub.f32 %f15, %f14, %f13; mul.f32 %f16, %f15, %f15; fma.rn.f32 %f17, %f12, %f12, %f16; ld.global.nc.f32 %f18, [%rd69]; 
ld.global.nc.f32 %f19, [%rd68]; sub.f32 %f20, %f19, %f18; fma.rn.f32 %f21, %f20, %f20, %f17; max.f32 %f100, %f100, %f21; add.s32 %r36, %r36, %r4; add.s64 %rd69, %rd69, %rd5; add.s64 %rd68, %rd68, %rd5; add.s64 %rd67, %rd67, %rd5; add.s64 %rd66, %rd66, %rd5; add.s64 %rd65, %rd65, %rd5; add.s64 %rd64, %rd64, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd23, %r4, 4; cvta.to.global.u64 %rd24, %rd28; cvta.to.global.u64 %rd25, %rd27; cvta.to.global.u64 %rd26, %rd29; $L__BB0_6: mul.wide.s32 %rd38, %r36, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd24, %rd38; ld.global.nc.f32 %f22, [%rd40]; ld.global.nc.f32 %f23, [%rd39]; sub.f32 %f24, %f23, %f22; add.s64 %rd41, %rd25, %rd38; add.s64 %rd42, %rd26, %rd38; ld.global.nc.f32 %f25, [%rd42]; ld.global.nc.f32 %f26, [%rd41]; sub.f32 %f27, %f26, %f25; mul.f32 %f28, %f27, %f27; fma.rn.f32 %f29, %f24, %f24, %f28; add.s64 %rd43, %rd2, %rd38; add.s64 %rd44, %rd1, %rd38; ld.global.nc.f32 %f30, [%rd44]; ld.global.nc.f32 %f31, [%rd43]; sub.f32 %f32, %f31, %f30; fma.rn.f32 %f33, %f32, %f32, %f29; max.f32 %f34, %f100, %f33; add.s64 %rd45, %rd39, %rd23; add.s64 %rd46, %rd40, %rd23; ld.global.nc.f32 %f35, [%rd46]; ld.global.nc.f32 %f36, [%rd45]; sub.f32 %f37, %f36, %f35; add.s64 %rd47, %rd41, %rd23; add.s64 %rd48, %rd42, %rd23; ld.global.nc.f32 %f38, [%rd48]; ld.global.nc.f32 %f39, [%rd47]; sub.f32 %f40, %f39, %f38; mul.f32 %f41, %f40, %f40; fma.rn.f32 %f42, %f37, %f37, %f41; add.s64 %rd49, %rd43, %rd23; add.s64 %rd50, %rd44, %rd23; ld.global.nc.f32 %f43, [%rd50]; ld.global.nc.f32 %f44, [%rd49]; sub.f32 %f45, %f44, %f43; fma.rn.f32 %f46, %f45, %f45, %f42; max.f32 %f47, %f34, %f46; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd51, %rd45, %rd23; add.s64 %rd52, %rd46, %rd23; ld.global.nc.f32 %f48, [%rd52]; ld.global.nc.f32 %f49, [%rd51]; sub.f32 %f50, %f49, %f48; add.s64 %rd53, %rd47, %rd23; add.s64 %rd54, %rd48, %rd23; 
ld.global.nc.f32 %f51, [%rd54]; ld.global.nc.f32 %f52, [%rd53]; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, %f53; fma.rn.f32 %f55, %f50, %f50, %f54; add.s64 %rd55, %rd49, %rd23; add.s64 %rd56, %rd50, %rd23; ld.global.nc.f32 %f56, [%rd56]; ld.global.nc.f32 %f57, [%rd55]; sub.f32 %f58, %f57, %f56; fma.rn.f32 %f59, %f58, %f58, %f55; max.f32 %f60, %f47, %f59; add.s32 %r27, %r26, %r4; add.s64 %rd57, %rd51, %rd23; add.s64 %rd58, %rd52, %rd23; ld.global.nc.f32 %f61, [%rd58]; ld.global.nc.f32 %f62, [%rd57]; sub.f32 %f63, %f62, %f61; add.s64 %rd59, %rd53, %rd23; add.s64 %rd60, %rd54, %rd23; ld.global.nc.f32 %f64, [%rd60]; ld.global.nc.f32 %f65, [%rd59]; sub.f32 %f66, %f65, %f64; mul.f32 %f67, %f66, %f66; fma.rn.f32 %f68, %f63, %f63, %f67; add.s64 %rd61, %rd55, %rd23; add.s64 %rd62, %rd56, %rd23; ld.global.nc.f32 %f69, [%rd62]; ld.global.nc.f32 %f70, [%rd61]; sub.f32 %f71, %f70, %f69; fma.rn.f32 %f72, %f71, %f71, %f68; max.f32 %f100, %f60, %f72; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f100; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f73, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f74, [%r31]; max.f32 %f75, %f73, %f74; st.shared.f32 [%r14], %f75; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f76, [%r14]; ld.volatile.shared.f32 %f77, [%r14+128]; max.f32 %f78, %f76, %f77; st.volatile.shared.f32 [%r14], %f78; ld.volatile.shared.f32 %f79, [%r14+64]; ld.volatile.shared.f32 %f80, [%r14]; max.f32 %f81, %f80, %f79; st.volatile.shared.f32 [%r14], %f81; ld.volatile.shared.f32 %f82, [%r14+32]; ld.volatile.shared.f32 %f83, [%r14]; max.f32 %f84, %f83, %f82; 
st.volatile.shared.f32 [%r14], %f84; ld.volatile.shared.f32 %f85, [%r14+16]; ld.volatile.shared.f32 %f86, [%r14]; max.f32 %f87, %f86, %f85; st.volatile.shared.f32 [%r14], %f87; ld.volatile.shared.f32 %f88, [%r14+8]; ld.volatile.shared.f32 %f89, [%r14]; max.f32 %f90, %f89, %f88; st.volatile.shared.f32 [%r14], %f90; ld.volatile.shared.f32 %f91, [%r14+4]; ld.volatile.shared.f32 %f92, [%r14]; max.f32 %f93, %f92, %f91; st.volatile.shared.f32 [%r14], %f93; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f94, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f95, %f94; cvta.to.global.u64 %rd63, %rd30; mov.b32 %r32, %f95; atom.global.max.s32 %r33, [%rd63], %r32; $L__BB0_15: ret; } ` reducemaxvecdiff2_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<11>; .reg .f32 %f<101>; .reg .b32 %r<39>; .reg .b64 %rd<70>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd31, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd27, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd32, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd28, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd29, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd33, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd30, [reducemaxvecdiff2_param_6]; ld.param.f32 %f100, [reducemaxvecdiff2_param_7]; ld.param.u32 %r17, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd33; cvta.to.global.u64 %rd2, %rd32; cvta.to.global.u64 %rd3, %rd31; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, 
%nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd34, %r36, 4; add.s64 %rd69, %rd1, %rd34; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd68, %rd2, %rd34; cvta.to.global.u64 %rd35, %rd29; add.s64 %rd67, %rd35, %rd34; cvta.to.global.u64 %rd36, %rd27; add.s64 %rd66, %rd36, %rd34; cvta.to.global.u64 %rd37, %rd28; add.s64 %rd65, %rd37, %rd34; add.s64 %rd64, %rd3, %rd34; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd65]; ld.global.nc.f32 %f11, [%rd64]; sub.f32 %f12, %f11, %f10; ld.global.nc.f32 %f13, [%rd67]; ld.global.nc.f32 %f14, [%rd66]; sub.f32 %f15, %f14, %f13; mul.f32 %f16, %f15, %f15; fma.rn.f32 %f17, %f12, %f12, %f16; ld.global.nc.f32 %f18, [%rd69]; ld.global.nc.f32 %f19, [%rd68]; sub.f32 %f20, %f19, %f18; fma.rn.f32 %f21, %f20, %f20, %f17; max.f32 %f100, %f100, %f21; add.s32 %r36, %r36, %r4; add.s64 %rd69, %rd69, %rd5; add.s64 %rd68, %rd68, %rd5; add.s64 %rd67, %rd67, %rd5; add.s64 %rd66, %rd66, %rd5; add.s64 %rd65, %rd65, %rd5; add.s64 %rd64, %rd64, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd23, %r4, 4; cvta.to.global.u64 %rd24, %rd28; cvta.to.global.u64 %rd25, %rd27; cvta.to.global.u64 %rd26, %rd29; $L__BB0_6: mul.wide.s32 %rd38, %r36, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd24, %rd38; ld.global.nc.f32 %f22, [%rd40]; ld.global.nc.f32 %f23, [%rd39]; sub.f32 %f24, %f23, %f22; add.s64 %rd41, %rd25, %rd38; add.s64 %rd42, %rd26, %rd38; ld.global.nc.f32 %f25, [%rd42]; ld.global.nc.f32 %f26, [%rd41]; sub.f32 %f27, %f26, %f25; mul.f32 %f28, %f27, %f27; fma.rn.f32 %f29, %f24, %f24, %f28; add.s64 %rd43, %rd2, %rd38; add.s64 %rd44, %rd1, %rd38; ld.global.nc.f32 %f30, [%rd44]; ld.global.nc.f32 
%f31, [%rd43]; sub.f32 %f32, %f31, %f30; fma.rn.f32 %f33, %f32, %f32, %f29; max.f32 %f34, %f100, %f33; add.s64 %rd45, %rd39, %rd23; add.s64 %rd46, %rd40, %rd23; ld.global.nc.f32 %f35, [%rd46]; ld.global.nc.f32 %f36, [%rd45]; sub.f32 %f37, %f36, %f35; add.s64 %rd47, %rd41, %rd23; add.s64 %rd48, %rd42, %rd23; ld.global.nc.f32 %f38, [%rd48]; ld.global.nc.f32 %f39, [%rd47]; sub.f32 %f40, %f39, %f38; mul.f32 %f41, %f40, %f40; fma.rn.f32 %f42, %f37, %f37, %f41; add.s64 %rd49, %rd43, %rd23; add.s64 %rd50, %rd44, %rd23; ld.global.nc.f32 %f43, [%rd50]; ld.global.nc.f32 %f44, [%rd49]; sub.f32 %f45, %f44, %f43; fma.rn.f32 %f46, %f45, %f45, %f42; max.f32 %f47, %f34, %f46; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd51, %rd45, %rd23; add.s64 %rd52, %rd46, %rd23; ld.global.nc.f32 %f48, [%rd52]; ld.global.nc.f32 %f49, [%rd51]; sub.f32 %f50, %f49, %f48; add.s64 %rd53, %rd47, %rd23; add.s64 %rd54, %rd48, %rd23; ld.global.nc.f32 %f51, [%rd54]; ld.global.nc.f32 %f52, [%rd53]; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, %f53; fma.rn.f32 %f55, %f50, %f50, %f54; add.s64 %rd55, %rd49, %rd23; add.s64 %rd56, %rd50, %rd23; ld.global.nc.f32 %f56, [%rd56]; ld.global.nc.f32 %f57, [%rd55]; sub.f32 %f58, %f57, %f56; fma.rn.f32 %f59, %f58, %f58, %f55; max.f32 %f60, %f47, %f59; add.s32 %r27, %r26, %r4; add.s64 %rd57, %rd51, %rd23; add.s64 %rd58, %rd52, %rd23; ld.global.nc.f32 %f61, [%rd58]; ld.global.nc.f32 %f62, [%rd57]; sub.f32 %f63, %f62, %f61; add.s64 %rd59, %rd53, %rd23; add.s64 %rd60, %rd54, %rd23; ld.global.nc.f32 %f64, [%rd60]; ld.global.nc.f32 %f65, [%rd59]; sub.f32 %f66, %f65, %f64; mul.f32 %f67, %f66, %f66; fma.rn.f32 %f68, %f63, %f63, %f67; add.s64 %rd61, %rd55, %rd23; add.s64 %rd62, %rd56, %rd23; ld.global.nc.f32 %f69, [%rd62]; ld.global.nc.f32 %f70, [%rd61]; sub.f32 %f71, %f70, %f69; fma.rn.f32 %f72, %f71, %f71, %f68; max.f32 %f100, %f60, %f72; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, 
_ZZ17reducemaxvecdiff2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f100; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f73, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f74, [%r31]; max.f32 %f75, %f73, %f74; st.shared.f32 [%r14], %f75; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f76, [%r14]; ld.volatile.shared.f32 %f77, [%r14+128]; max.f32 %f78, %f76, %f77; st.volatile.shared.f32 [%r14], %f78; ld.volatile.shared.f32 %f79, [%r14+64]; ld.volatile.shared.f32 %f80, [%r14]; max.f32 %f81, %f80, %f79; st.volatile.shared.f32 [%r14], %f81; ld.volatile.shared.f32 %f82, [%r14+32]; ld.volatile.shared.f32 %f83, [%r14]; max.f32 %f84, %f83, %f82; st.volatile.shared.f32 [%r14], %f84; ld.volatile.shared.f32 %f85, [%r14+16]; ld.volatile.shared.f32 %f86, [%r14]; max.f32 %f87, %f86, %f85; st.volatile.shared.f32 [%r14], %f87; ld.volatile.shared.f32 %f88, [%r14+8]; ld.volatile.shared.f32 %f89, [%r14]; max.f32 %f90, %f89, %f88; st.volatile.shared.f32 [%r14], %f90; ld.volatile.shared.f32 %f91, [%r14+4]; ld.volatile.shared.f32 %f92, [%r14]; max.f32 %f93, %f92, %f91; st.volatile.shared.f32 [%r14], %f93; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f94, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f95, %f94; cvta.to.global.u64 %rd63, %rd30; mov.b32 %r32, %f95; atom.global.max.s32 %r33, [%rd63], %r32; $L__BB0_15: ret; } ` ) 3-3.11.1/cuda/reducemaxvecnorm2.cu000066400000000000000000000005261503346766200166540ustar00rootroot00000000000000#include "reduce.h" #include "atomicf.h" #include "float3.h" #define load_vecnorm2(i) \ pow2(x[i]) + pow2(y[i]) + pow2(z[i]) extern "C" __global__ void reducemaxvecnorm2(float* __restrict__ x, float* __restrict__ y, float* __restrict__ z, float* 
__restrict__ dst, float initVal, int n) { reduce(load_vecnorm2, fmax, atomicFmaxabs) } 3-3.11.1/cuda/reducemaxvecnorm2_wrapper.go000066400000000000000000002310341503346766200204120ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for reducemaxvecnorm2 kernel var reducemaxvecnorm2_code cu.Function // Stores the arguments for reducemaxvecnorm2 kernel invocation type reducemaxvecnorm2_args_t struct { arg_x unsafe.Pointer arg_y unsafe.Pointer arg_z unsafe.Pointer arg_dst unsafe.Pointer arg_initVal float32 arg_n int argptr [6]unsafe.Pointer sync.Mutex } // Stores the arguments for reducemaxvecnorm2 kernel invocation var reducemaxvecnorm2_args reducemaxvecnorm2_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. reducemaxvecnorm2_args.argptr[0] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_x) reducemaxvecnorm2_args.argptr[1] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_y) reducemaxvecnorm2_args.argptr[2] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_z) reducemaxvecnorm2_args.argptr[3] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_dst) reducemaxvecnorm2_args.argptr[4] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_initVal) reducemaxvecnorm2_args.argptr[5] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_n) } // Wrapper for reducemaxvecnorm2 CUDA kernel, asynchronous. 
func k_reducemaxvecnorm2_async(x unsafe.Pointer, y unsafe.Pointer, z unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { if Synchronous { // debug Sync() timer.Start("reducemaxvecnorm2") } reducemaxvecnorm2_args.Lock() defer reducemaxvecnorm2_args.Unlock() if reducemaxvecnorm2_code == 0 { reducemaxvecnorm2_code = fatbinLoad(reducemaxvecnorm2_map, "reducemaxvecnorm2") } reducemaxvecnorm2_args.arg_x = x reducemaxvecnorm2_args.arg_y = y reducemaxvecnorm2_args.arg_z = z reducemaxvecnorm2_args.arg_dst = dst reducemaxvecnorm2_args.arg_initVal = initVal reducemaxvecnorm2_args.arg_n = n args := reducemaxvecnorm2_args.argptr[:] cu.LaunchKernel(reducemaxvecnorm2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("reducemaxvecnorm2") } } // maps compute capability on PTX code for reducemaxvecnorm2 kernel. var reducemaxvecnorm2_map = map[int]string{0: "", 50: reducemaxvecnorm2_ptx_50, 52: reducemaxvecnorm2_ptx_52, 53: reducemaxvecnorm2_ptx_53, 60: reducemaxvecnorm2_ptx_60, 61: reducemaxvecnorm2_ptx_61, 62: reducemaxvecnorm2_ptx_62, 70: reducemaxvecnorm2_ptx_70, 72: reducemaxvecnorm2_ptx_72, 75: reducemaxvecnorm2_ptx_75, 80: reducemaxvecnorm2_ptx_80, 86: reducemaxvecnorm2_ptx_86, 87: reducemaxvecnorm2_ptx_87, 89: reducemaxvecnorm2_ptx_89, 90: reducemaxvecnorm2_ptx_90} // reducemaxvecnorm2 PTX code for various compute capabilities. 
const ( reducemaxvecnorm2_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<11>; .reg .f32 %f<71>; .reg .b32 %r<39>; .reg .b64 %rd<37>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd16, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd17, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd18, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd15, [reducemaxvecnorm2_param_3]; ld.param.f32 %f70, [reducemaxvecnorm2_param_4]; ld.param.u32 %r17, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd18; cvta.to.global.u64 %rd2, %rd17; cvta.to.global.u64 %rd3, %rd16; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd19, %r36, 4; add.s64 %rd36, %rd1, %rd19; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd35, %rd2, %rd19; add.s64 %rd34, %rd3, %rd19; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd34]; ld.global.nc.f32 %f11, [%rd35]; mul.f32 %f12, %f11, %f11; fma.rn.f32 %f13, %f10, %f10, %f12; ld.global.nc.f32 %f14, [%rd36]; fma.rn.f32 %f15, %f14, %f14, %f13; max.f32 %f70, %f70, %f15; add.s32 %r36, %r36, %r4; add.s64 %rd36, %rd36, %rd5; add.s64 %rd35, %rd35, %rd5; add.s64 %rd34, %rd34, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd14, %r4, 4; $L__BB0_6: 
mul.wide.s32 %rd20, %r36, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f16, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f17, [%rd22]; mul.f32 %f18, %f17, %f17; fma.rn.f32 %f19, %f16, %f16, %f18; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f20, %f19; max.f32 %f22, %f70, %f21; add.s64 %rd24, %rd21, %rd14; ld.global.nc.f32 %f23, [%rd24]; add.s64 %rd25, %rd22, %rd14; ld.global.nc.f32 %f24, [%rd25]; mul.f32 %f25, %f24, %f24; fma.rn.f32 %f26, %f23, %f23, %f25; add.s64 %rd26, %rd23, %rd14; ld.global.nc.f32 %f27, [%rd26]; fma.rn.f32 %f28, %f27, %f27, %f26; max.f32 %f29, %f22, %f28; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd27, %rd24, %rd14; ld.global.nc.f32 %f30, [%rd27]; add.s64 %rd28, %rd25, %rd14; ld.global.nc.f32 %f31, [%rd28]; mul.f32 %f32, %f31, %f31; fma.rn.f32 %f33, %f30, %f30, %f32; add.s64 %rd29, %rd26, %rd14; ld.global.nc.f32 %f34, [%rd29]; fma.rn.f32 %f35, %f34, %f34, %f33; max.f32 %f36, %f29, %f35; add.s32 %r27, %r26, %r4; add.s64 %rd30, %rd27, %rd14; ld.global.nc.f32 %f37, [%rd30]; add.s64 %rd31, %rd28, %rd14; ld.global.nc.f32 %f38, [%rd31]; mul.f32 %f39, %f38, %f38; fma.rn.f32 %f40, %f37, %f37, %f39; add.s64 %rd32, %rd29, %rd14; ld.global.nc.f32 %f41, [%rd32]; fma.rn.f32 %f42, %f41, %f41, %f40; max.f32 %f70, %f36, %f42; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f70; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f43, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f44, [%r31]; max.f32 %f45, %f43, %f44; st.shared.f32 [%r14], %f45; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; 
ld.volatile.shared.f32 %f46, [%r14]; ld.volatile.shared.f32 %f47, [%r14+128]; max.f32 %f48, %f46, %f47; st.volatile.shared.f32 [%r14], %f48; ld.volatile.shared.f32 %f49, [%r14+64]; ld.volatile.shared.f32 %f50, [%r14]; max.f32 %f51, %f50, %f49; st.volatile.shared.f32 [%r14], %f51; ld.volatile.shared.f32 %f52, [%r14+32]; ld.volatile.shared.f32 %f53, [%r14]; max.f32 %f54, %f53, %f52; st.volatile.shared.f32 [%r14], %f54; ld.volatile.shared.f32 %f55, [%r14+16]; ld.volatile.shared.f32 %f56, [%r14]; max.f32 %f57, %f56, %f55; st.volatile.shared.f32 [%r14], %f57; ld.volatile.shared.f32 %f58, [%r14+8]; ld.volatile.shared.f32 %f59, [%r14]; max.f32 %f60, %f59, %f58; st.volatile.shared.f32 [%r14], %f60; ld.volatile.shared.f32 %f61, [%r14+4]; ld.volatile.shared.f32 %f62, [%r14]; max.f32 %f63, %f62, %f61; st.volatile.shared.f32 [%r14], %f63; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f64, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f65, %f64; mov.b32 %r32, %f65; cvta.to.global.u64 %rd33, %rd15; atom.global.max.s32 %r33, [%rd33], %r32; $L__BB0_15: ret; } ` reducemaxvecnorm2_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<11>; .reg .f32 %f<71>; .reg .b32 %r<39>; .reg .b64 %rd<37>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd16, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd17, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd18, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd15, [reducemaxvecnorm2_param_3]; ld.param.f32 %f70, [reducemaxvecnorm2_param_4]; ld.param.u32 %r17, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd18; cvta.to.global.u64 %rd2, %rd17; cvta.to.global.u64 %rd3, %rd16; 
mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd19, %r36, 4; add.s64 %rd36, %rd1, %rd19; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd35, %rd2, %rd19; add.s64 %rd34, %rd3, %rd19; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd34]; ld.global.nc.f32 %f11, [%rd35]; mul.f32 %f12, %f11, %f11; fma.rn.f32 %f13, %f10, %f10, %f12; ld.global.nc.f32 %f14, [%rd36]; fma.rn.f32 %f15, %f14, %f14, %f13; max.f32 %f70, %f70, %f15; add.s32 %r36, %r36, %r4; add.s64 %rd36, %rd36, %rd5; add.s64 %rd35, %rd35, %rd5; add.s64 %rd34, %rd34, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd14, %r4, 4; $L__BB0_6: mul.wide.s32 %rd20, %r36, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f16, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f17, [%rd22]; mul.f32 %f18, %f17, %f17; fma.rn.f32 %f19, %f16, %f16, %f18; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f20, %f19; max.f32 %f22, %f70, %f21; add.s64 %rd24, %rd21, %rd14; ld.global.nc.f32 %f23, [%rd24]; add.s64 %rd25, %rd22, %rd14; ld.global.nc.f32 %f24, [%rd25]; mul.f32 %f25, %f24, %f24; fma.rn.f32 %f26, %f23, %f23, %f25; add.s64 %rd26, %rd23, %rd14; ld.global.nc.f32 %f27, [%rd26]; fma.rn.f32 %f28, %f27, %f27, %f26; max.f32 %f29, %f22, %f28; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd27, %rd24, %rd14; ld.global.nc.f32 %f30, [%rd27]; add.s64 %rd28, %rd25, %rd14; ld.global.nc.f32 %f31, [%rd28]; mul.f32 %f32, %f31, %f31; fma.rn.f32 %f33, %f30, %f30, %f32; add.s64 %rd29, %rd26, %rd14; ld.global.nc.f32 %f34, 
[%rd29]; fma.rn.f32 %f35, %f34, %f34, %f33; max.f32 %f36, %f29, %f35; add.s32 %r27, %r26, %r4; add.s64 %rd30, %rd27, %rd14; ld.global.nc.f32 %f37, [%rd30]; add.s64 %rd31, %rd28, %rd14; ld.global.nc.f32 %f38, [%rd31]; mul.f32 %f39, %f38, %f38; fma.rn.f32 %f40, %f37, %f37, %f39; add.s64 %rd32, %rd29, %rd14; ld.global.nc.f32 %f41, [%rd32]; fma.rn.f32 %f42, %f41, %f41, %f40; max.f32 %f70, %f36, %f42; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f70; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f43, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f44, [%r31]; max.f32 %f45, %f43, %f44; st.shared.f32 [%r14], %f45; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f46, [%r14]; ld.volatile.shared.f32 %f47, [%r14+128]; max.f32 %f48, %f46, %f47; st.volatile.shared.f32 [%r14], %f48; ld.volatile.shared.f32 %f49, [%r14+64]; ld.volatile.shared.f32 %f50, [%r14]; max.f32 %f51, %f50, %f49; st.volatile.shared.f32 [%r14], %f51; ld.volatile.shared.f32 %f52, [%r14+32]; ld.volatile.shared.f32 %f53, [%r14]; max.f32 %f54, %f53, %f52; st.volatile.shared.f32 [%r14], %f54; ld.volatile.shared.f32 %f55, [%r14+16]; ld.volatile.shared.f32 %f56, [%r14]; max.f32 %f57, %f56, %f55; st.volatile.shared.f32 [%r14], %f57; ld.volatile.shared.f32 %f58, [%r14+8]; ld.volatile.shared.f32 %f59, [%r14]; max.f32 %f60, %f59, %f58; st.volatile.shared.f32 [%r14], %f60; ld.volatile.shared.f32 %f61, [%r14+4]; ld.volatile.shared.f32 %f62, [%r14]; max.f32 %f63, %f62, %f61; st.volatile.shared.f32 [%r14], %f63; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f64, 
[_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f65, %f64; mov.b32 %r32, %f65; cvta.to.global.u64 %rd33, %rd15; atom.global.max.s32 %r33, [%rd33], %r32; $L__BB0_15: ret; } ` reducemaxvecnorm2_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<11>; .reg .f32 %f<71>; .reg .b32 %r<39>; .reg .b64 %rd<37>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd16, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd17, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd18, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd15, [reducemaxvecnorm2_param_3]; ld.param.f32 %f70, [reducemaxvecnorm2_param_4]; ld.param.u32 %r17, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd18; cvta.to.global.u64 %rd2, %rd17; cvta.to.global.u64 %rd3, %rd16; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd19, %r36, 4; add.s64 %rd36, %rd1, %rd19; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd35, %rd2, %rd19; add.s64 %rd34, %rd3, %rd19; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd34]; ld.global.nc.f32 %f11, [%rd35]; mul.f32 %f12, %f11, %f11; fma.rn.f32 %f13, %f10, %f10, %f12; ld.global.nc.f32 %f14, [%rd36]; fma.rn.f32 %f15, %f14, %f14, %f13; max.f32 %f70, %f70, %f15; add.s32 %r36, %r36, %r4; add.s64 %rd36, %rd36, %rd5; add.s64 %rd35, %rd35, %rd5; add.s64 %rd34, %rd34, %rd5; 
add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd14, %r4, 4; $L__BB0_6: mul.wide.s32 %rd20, %r36, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f16, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f17, [%rd22]; mul.f32 %f18, %f17, %f17; fma.rn.f32 %f19, %f16, %f16, %f18; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f20, %f19; max.f32 %f22, %f70, %f21; add.s64 %rd24, %rd21, %rd14; ld.global.nc.f32 %f23, [%rd24]; add.s64 %rd25, %rd22, %rd14; ld.global.nc.f32 %f24, [%rd25]; mul.f32 %f25, %f24, %f24; fma.rn.f32 %f26, %f23, %f23, %f25; add.s64 %rd26, %rd23, %rd14; ld.global.nc.f32 %f27, [%rd26]; fma.rn.f32 %f28, %f27, %f27, %f26; max.f32 %f29, %f22, %f28; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd27, %rd24, %rd14; ld.global.nc.f32 %f30, [%rd27]; add.s64 %rd28, %rd25, %rd14; ld.global.nc.f32 %f31, [%rd28]; mul.f32 %f32, %f31, %f31; fma.rn.f32 %f33, %f30, %f30, %f32; add.s64 %rd29, %rd26, %rd14; ld.global.nc.f32 %f34, [%rd29]; fma.rn.f32 %f35, %f34, %f34, %f33; max.f32 %f36, %f29, %f35; add.s32 %r27, %r26, %r4; add.s64 %rd30, %rd27, %rd14; ld.global.nc.f32 %f37, [%rd30]; add.s64 %rd31, %rd28, %rd14; ld.global.nc.f32 %f38, [%rd31]; mul.f32 %f39, %f38, %f38; fma.rn.f32 %f40, %f37, %f37, %f39; add.s64 %rd32, %rd29, %rd14; ld.global.nc.f32 %f41, [%rd32]; fma.rn.f32 %f42, %f41, %f41, %f40; max.f32 %f70, %f36, %f42; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f70; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f43, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f44, [%r31]; max.f32 %f45, %f43, %f44; st.shared.f32 [%r14], %f45; 
$L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f46, [%r14]; ld.volatile.shared.f32 %f47, [%r14+128]; max.f32 %f48, %f46, %f47; st.volatile.shared.f32 [%r14], %f48; ld.volatile.shared.f32 %f49, [%r14+64]; ld.volatile.shared.f32 %f50, [%r14]; max.f32 %f51, %f50, %f49; st.volatile.shared.f32 [%r14], %f51; ld.volatile.shared.f32 %f52, [%r14+32]; ld.volatile.shared.f32 %f53, [%r14]; max.f32 %f54, %f53, %f52; st.volatile.shared.f32 [%r14], %f54; ld.volatile.shared.f32 %f55, [%r14+16]; ld.volatile.shared.f32 %f56, [%r14]; max.f32 %f57, %f56, %f55; st.volatile.shared.f32 [%r14], %f57; ld.volatile.shared.f32 %f58, [%r14+8]; ld.volatile.shared.f32 %f59, [%r14]; max.f32 %f60, %f59, %f58; st.volatile.shared.f32 [%r14], %f60; ld.volatile.shared.f32 %f61, [%r14+4]; ld.volatile.shared.f32 %f62, [%r14]; max.f32 %f63, %f62, %f61; st.volatile.shared.f32 [%r14], %f63; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f64, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f65, %f64; mov.b32 %r32, %f65; cvta.to.global.u64 %rd33, %rd15; atom.global.max.s32 %r33, [%rd33], %r32; $L__BB0_15: ret; } ` reducemaxvecnorm2_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<11>; .reg .f32 %f<71>; .reg .b32 %r<39>; .reg .b64 %rd<37>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd16, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd17, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd18, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd15, [reducemaxvecnorm2_param_3]; ld.param.f32 %f70, 
[reducemaxvecnorm2_param_4]; ld.param.u32 %r17, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd18; cvta.to.global.u64 %rd2, %rd17; cvta.to.global.u64 %rd3, %rd16; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd19, %r36, 4; add.s64 %rd36, %rd1, %rd19; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd35, %rd2, %rd19; add.s64 %rd34, %rd3, %rd19; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd34]; ld.global.nc.f32 %f11, [%rd35]; mul.f32 %f12, %f11, %f11; fma.rn.f32 %f13, %f10, %f10, %f12; ld.global.nc.f32 %f14, [%rd36]; fma.rn.f32 %f15, %f14, %f14, %f13; max.f32 %f70, %f70, %f15; add.s32 %r36, %r36, %r4; add.s64 %rd36, %rd36, %rd5; add.s64 %rd35, %rd35, %rd5; add.s64 %rd34, %rd34, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd14, %r4, 4; $L__BB0_6: mul.wide.s32 %rd20, %r36, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f16, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f17, [%rd22]; mul.f32 %f18, %f17, %f17; fma.rn.f32 %f19, %f16, %f16, %f18; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f20, %f19; max.f32 %f22, %f70, %f21; add.s64 %rd24, %rd21, %rd14; ld.global.nc.f32 %f23, [%rd24]; add.s64 %rd25, %rd22, %rd14; ld.global.nc.f32 %f24, [%rd25]; mul.f32 %f25, %f24, %f24; fma.rn.f32 %f26, %f23, %f23, %f25; add.s64 %rd26, %rd23, %rd14; ld.global.nc.f32 %f27, [%rd26]; fma.rn.f32 %f28, %f27, %f27, %f26; max.f32 %f29, %f22, %f28; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd27, %rd24, %rd14; ld.global.nc.f32 %f30, [%rd27]; 
add.s64 %rd28, %rd25, %rd14; ld.global.nc.f32 %f31, [%rd28]; mul.f32 %f32, %f31, %f31; fma.rn.f32 %f33, %f30, %f30, %f32; add.s64 %rd29, %rd26, %rd14; ld.global.nc.f32 %f34, [%rd29]; fma.rn.f32 %f35, %f34, %f34, %f33; max.f32 %f36, %f29, %f35; add.s32 %r27, %r26, %r4; add.s64 %rd30, %rd27, %rd14; ld.global.nc.f32 %f37, [%rd30]; add.s64 %rd31, %rd28, %rd14; ld.global.nc.f32 %f38, [%rd31]; mul.f32 %f39, %f38, %f38; fma.rn.f32 %f40, %f37, %f37, %f39; add.s64 %rd32, %rd29, %rd14; ld.global.nc.f32 %f41, [%rd32]; fma.rn.f32 %f42, %f41, %f41, %f40; max.f32 %f70, %f36, %f42; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f70; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f43, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f44, [%r31]; max.f32 %f45, %f43, %f44; st.shared.f32 [%r14], %f45; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f46, [%r14]; ld.volatile.shared.f32 %f47, [%r14+128]; max.f32 %f48, %f46, %f47; st.volatile.shared.f32 [%r14], %f48; ld.volatile.shared.f32 %f49, [%r14+64]; ld.volatile.shared.f32 %f50, [%r14]; max.f32 %f51, %f50, %f49; st.volatile.shared.f32 [%r14], %f51; ld.volatile.shared.f32 %f52, [%r14+32]; ld.volatile.shared.f32 %f53, [%r14]; max.f32 %f54, %f53, %f52; st.volatile.shared.f32 [%r14], %f54; ld.volatile.shared.f32 %f55, [%r14+16]; ld.volatile.shared.f32 %f56, [%r14]; max.f32 %f57, %f56, %f55; st.volatile.shared.f32 [%r14], %f57; ld.volatile.shared.f32 %f58, [%r14+8]; ld.volatile.shared.f32 %f59, [%r14]; max.f32 %f60, %f59, %f58; st.volatile.shared.f32 [%r14], %f60; ld.volatile.shared.f32 %f61, [%r14+4]; ld.volatile.shared.f32 
%f62, [%r14]; max.f32 %f63, %f62, %f61; st.volatile.shared.f32 [%r14], %f63; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f64, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f65, %f64; mov.b32 %r32, %f65; cvta.to.global.u64 %rd33, %rd15; atom.global.max.s32 %r33, [%rd33], %r32; $L__BB0_15: ret; } ` reducemaxvecnorm2_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<11>; .reg .f32 %f<71>; .reg .b32 %r<39>; .reg .b64 %rd<37>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd16, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd17, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd18, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd15, [reducemaxvecnorm2_param_3]; ld.param.f32 %f70, [reducemaxvecnorm2_param_4]; ld.param.u32 %r17, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd18; cvta.to.global.u64 %rd2, %rd17; cvta.to.global.u64 %rd3, %rd16; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd19, %r36, 4; add.s64 %rd36, %rd1, %rd19; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd35, %rd2, %rd19; add.s64 %rd34, %rd3, %rd19; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd34]; ld.global.nc.f32 %f11, [%rd35]; mul.f32 %f12, %f11, %f11; fma.rn.f32 %f13, %f10, %f10, %f12; ld.global.nc.f32 %f14, [%rd36]; fma.rn.f32 %f15, 
%f14, %f14, %f13; max.f32 %f70, %f70, %f15; add.s32 %r36, %r36, %r4; add.s64 %rd36, %rd36, %rd5; add.s64 %rd35, %rd35, %rd5; add.s64 %rd34, %rd34, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd14, %r4, 4; $L__BB0_6: mul.wide.s32 %rd20, %r36, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f16, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f17, [%rd22]; mul.f32 %f18, %f17, %f17; fma.rn.f32 %f19, %f16, %f16, %f18; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f20, %f19; max.f32 %f22, %f70, %f21; add.s64 %rd24, %rd21, %rd14; ld.global.nc.f32 %f23, [%rd24]; add.s64 %rd25, %rd22, %rd14; ld.global.nc.f32 %f24, [%rd25]; mul.f32 %f25, %f24, %f24; fma.rn.f32 %f26, %f23, %f23, %f25; add.s64 %rd26, %rd23, %rd14; ld.global.nc.f32 %f27, [%rd26]; fma.rn.f32 %f28, %f27, %f27, %f26; max.f32 %f29, %f22, %f28; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd27, %rd24, %rd14; ld.global.nc.f32 %f30, [%rd27]; add.s64 %rd28, %rd25, %rd14; ld.global.nc.f32 %f31, [%rd28]; mul.f32 %f32, %f31, %f31; fma.rn.f32 %f33, %f30, %f30, %f32; add.s64 %rd29, %rd26, %rd14; ld.global.nc.f32 %f34, [%rd29]; fma.rn.f32 %f35, %f34, %f34, %f33; max.f32 %f36, %f29, %f35; add.s32 %r27, %r26, %r4; add.s64 %rd30, %rd27, %rd14; ld.global.nc.f32 %f37, [%rd30]; add.s64 %rd31, %rd28, %rd14; ld.global.nc.f32 %f38, [%rd31]; mul.f32 %f39, %f38, %f38; fma.rn.f32 %f40, %f37, %f37, %f39; add.s64 %rd32, %rd29, %rd14; ld.global.nc.f32 %f41, [%rd32]; fma.rn.f32 %f42, %f41, %f41, %f40; max.f32 %f70, %f36, %f42; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f70; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; 
ld.shared.f32 %f43, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f44, [%r31]; max.f32 %f45, %f43, %f44; st.shared.f32 [%r14], %f45; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f46, [%r14]; ld.volatile.shared.f32 %f47, [%r14+128]; max.f32 %f48, %f46, %f47; st.volatile.shared.f32 [%r14], %f48; ld.volatile.shared.f32 %f49, [%r14+64]; ld.volatile.shared.f32 %f50, [%r14]; max.f32 %f51, %f50, %f49; st.volatile.shared.f32 [%r14], %f51; ld.volatile.shared.f32 %f52, [%r14+32]; ld.volatile.shared.f32 %f53, [%r14]; max.f32 %f54, %f53, %f52; st.volatile.shared.f32 [%r14], %f54; ld.volatile.shared.f32 %f55, [%r14+16]; ld.volatile.shared.f32 %f56, [%r14]; max.f32 %f57, %f56, %f55; st.volatile.shared.f32 [%r14], %f57; ld.volatile.shared.f32 %f58, [%r14+8]; ld.volatile.shared.f32 %f59, [%r14]; max.f32 %f60, %f59, %f58; st.volatile.shared.f32 [%r14], %f60; ld.volatile.shared.f32 %f61, [%r14+4]; ld.volatile.shared.f32 %f62, [%r14]; max.f32 %f63, %f62, %f61; st.volatile.shared.f32 [%r14], %f63; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f64, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f65, %f64; mov.b32 %r32, %f65; cvta.to.global.u64 %rd33, %rd15; atom.global.max.s32 %r33, [%rd33], %r32; $L__BB0_15: ret; } ` reducemaxvecnorm2_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<11>; .reg .f32 %f<71>; .reg .b32 %r<39>; .reg .b64 %rd<37>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd16, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd17, 
[reducemaxvecnorm2_param_1]; ld.param.u64 %rd18, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd15, [reducemaxvecnorm2_param_3]; ld.param.f32 %f70, [reducemaxvecnorm2_param_4]; ld.param.u32 %r17, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd18; cvta.to.global.u64 %rd2, %rd17; cvta.to.global.u64 %rd3, %rd16; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd19, %r36, 4; add.s64 %rd36, %rd1, %rd19; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd35, %rd2, %rd19; add.s64 %rd34, %rd3, %rd19; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd34]; ld.global.nc.f32 %f11, [%rd35]; mul.f32 %f12, %f11, %f11; fma.rn.f32 %f13, %f10, %f10, %f12; ld.global.nc.f32 %f14, [%rd36]; fma.rn.f32 %f15, %f14, %f14, %f13; max.f32 %f70, %f70, %f15; add.s32 %r36, %r36, %r4; add.s64 %rd36, %rd36, %rd5; add.s64 %rd35, %rd35, %rd5; add.s64 %rd34, %rd34, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd14, %r4, 4; $L__BB0_6: mul.wide.s32 %rd20, %r36, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f16, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f17, [%rd22]; mul.f32 %f18, %f17, %f17; fma.rn.f32 %f19, %f16, %f16, %f18; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f20, %f19; max.f32 %f22, %f70, %f21; add.s64 %rd24, %rd21, %rd14; ld.global.nc.f32 %f23, [%rd24]; add.s64 %rd25, %rd22, %rd14; ld.global.nc.f32 %f24, [%rd25]; mul.f32 %f25, %f24, %f24; fma.rn.f32 %f26, %f23, %f23, %f25; add.s64 %rd26, %rd23, %rd14; ld.global.nc.f32 %f27, [%rd26]; fma.rn.f32 %f28, %f27, 
%f27, %f26; max.f32 %f29, %f22, %f28; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd27, %rd24, %rd14; ld.global.nc.f32 %f30, [%rd27]; add.s64 %rd28, %rd25, %rd14; ld.global.nc.f32 %f31, [%rd28]; mul.f32 %f32, %f31, %f31; fma.rn.f32 %f33, %f30, %f30, %f32; add.s64 %rd29, %rd26, %rd14; ld.global.nc.f32 %f34, [%rd29]; fma.rn.f32 %f35, %f34, %f34, %f33; max.f32 %f36, %f29, %f35; add.s32 %r27, %r26, %r4; add.s64 %rd30, %rd27, %rd14; ld.global.nc.f32 %f37, [%rd30]; add.s64 %rd31, %rd28, %rd14; ld.global.nc.f32 %f38, [%rd31]; mul.f32 %f39, %f38, %f38; fma.rn.f32 %f40, %f37, %f37, %f39; add.s64 %rd32, %rd29, %rd14; ld.global.nc.f32 %f41, [%rd32]; fma.rn.f32 %f42, %f41, %f41, %f40; max.f32 %f70, %f36, %f42; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f70; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f43, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f44, [%r31]; max.f32 %f45, %f43, %f44; st.shared.f32 [%r14], %f45; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f46, [%r14]; ld.volatile.shared.f32 %f47, [%r14+128]; max.f32 %f48, %f46, %f47; st.volatile.shared.f32 [%r14], %f48; ld.volatile.shared.f32 %f49, [%r14+64]; ld.volatile.shared.f32 %f50, [%r14]; max.f32 %f51, %f50, %f49; st.volatile.shared.f32 [%r14], %f51; ld.volatile.shared.f32 %f52, [%r14+32]; ld.volatile.shared.f32 %f53, [%r14]; max.f32 %f54, %f53, %f52; st.volatile.shared.f32 [%r14], %f54; ld.volatile.shared.f32 %f55, [%r14+16]; ld.volatile.shared.f32 %f56, [%r14]; max.f32 %f57, %f56, %f55; st.volatile.shared.f32 [%r14], %f57; ld.volatile.shared.f32 %f58, [%r14+8]; 
ld.volatile.shared.f32 %f59, [%r14]; max.f32 %f60, %f59, %f58; st.volatile.shared.f32 [%r14], %f60; ld.volatile.shared.f32 %f61, [%r14+4]; ld.volatile.shared.f32 %f62, [%r14]; max.f32 %f63, %f62, %f61; st.volatile.shared.f32 [%r14], %f63; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f64, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f65, %f64; mov.b32 %r32, %f65; cvta.to.global.u64 %rd33, %rd15; atom.global.max.s32 %r33, [%rd33], %r32; $L__BB0_15: ret; } ` reducemaxvecnorm2_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<11>; .reg .f32 %f<71>; .reg .b32 %r<39>; .reg .b64 %rd<37>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd16, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd17, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd18, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd15, [reducemaxvecnorm2_param_3]; ld.param.f32 %f70, [reducemaxvecnorm2_param_4]; ld.param.u32 %r17, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd18; cvta.to.global.u64 %rd2, %rd17; cvta.to.global.u64 %rd3, %rd16; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd19, %r36, 4; add.s64 %rd36, %rd1, %rd19; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd35, %rd2, %rd19; add.s64 %rd34, %rd3, %rd19; $L__BB0_3: .pragma "nounroll"; 
ld.global.nc.f32 %f10, [%rd34]; ld.global.nc.f32 %f11, [%rd35]; mul.f32 %f12, %f11, %f11; fma.rn.f32 %f13, %f10, %f10, %f12; ld.global.nc.f32 %f14, [%rd36]; fma.rn.f32 %f15, %f14, %f14, %f13; max.f32 %f70, %f70, %f15; add.s32 %r36, %r36, %r4; add.s64 %rd36, %rd36, %rd5; add.s64 %rd35, %rd35, %rd5; add.s64 %rd34, %rd34, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd14, %r4, 4; $L__BB0_6: mul.wide.s32 %rd20, %r36, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f16, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f17, [%rd22]; mul.f32 %f18, %f17, %f17; fma.rn.f32 %f19, %f16, %f16, %f18; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f20, %f19; max.f32 %f22, %f70, %f21; add.s64 %rd24, %rd21, %rd14; ld.global.nc.f32 %f23, [%rd24]; add.s64 %rd25, %rd22, %rd14; ld.global.nc.f32 %f24, [%rd25]; mul.f32 %f25, %f24, %f24; fma.rn.f32 %f26, %f23, %f23, %f25; add.s64 %rd26, %rd23, %rd14; ld.global.nc.f32 %f27, [%rd26]; fma.rn.f32 %f28, %f27, %f27, %f26; max.f32 %f29, %f22, %f28; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd27, %rd24, %rd14; ld.global.nc.f32 %f30, [%rd27]; add.s64 %rd28, %rd25, %rd14; ld.global.nc.f32 %f31, [%rd28]; mul.f32 %f32, %f31, %f31; fma.rn.f32 %f33, %f30, %f30, %f32; add.s64 %rd29, %rd26, %rd14; ld.global.nc.f32 %f34, [%rd29]; fma.rn.f32 %f35, %f34, %f34, %f33; max.f32 %f36, %f29, %f35; add.s32 %r27, %r26, %r4; add.s64 %rd30, %rd27, %rd14; ld.global.nc.f32 %f37, [%rd30]; add.s64 %rd31, %rd28, %rd14; ld.global.nc.f32 %f38, [%rd31]; mul.f32 %f39, %f38, %f38; fma.rn.f32 %f40, %f37, %f37, %f39; add.s64 %rd32, %rd29, %rd14; ld.global.nc.f32 %f41, [%rd32]; fma.rn.f32 %f42, %f41, %f41, %f40; max.f32 %f70, %f36, %f42; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r14, %r29, %r28; 
st.shared.f32 [%r14], %f70; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f43, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f44, [%r31]; max.f32 %f45, %f43, %f44; st.shared.f32 [%r14], %f45; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f46, [%r14]; ld.volatile.shared.f32 %f47, [%r14+128]; max.f32 %f48, %f46, %f47; st.volatile.shared.f32 [%r14], %f48; ld.volatile.shared.f32 %f49, [%r14+64]; ld.volatile.shared.f32 %f50, [%r14]; max.f32 %f51, %f50, %f49; st.volatile.shared.f32 [%r14], %f51; ld.volatile.shared.f32 %f52, [%r14+32]; ld.volatile.shared.f32 %f53, [%r14]; max.f32 %f54, %f53, %f52; st.volatile.shared.f32 [%r14], %f54; ld.volatile.shared.f32 %f55, [%r14+16]; ld.volatile.shared.f32 %f56, [%r14]; max.f32 %f57, %f56, %f55; st.volatile.shared.f32 [%r14], %f57; ld.volatile.shared.f32 %f58, [%r14+8]; ld.volatile.shared.f32 %f59, [%r14]; max.f32 %f60, %f59, %f58; st.volatile.shared.f32 [%r14], %f60; ld.volatile.shared.f32 %f61, [%r14+4]; ld.volatile.shared.f32 %f62, [%r14]; max.f32 %f63, %f62, %f61; st.volatile.shared.f32 [%r14], %f63; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f64, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f65, %f64; mov.b32 %r32, %f65; cvta.to.global.u64 %rd33, %rd15; atom.global.max.s32 %r33, [%rd33], %r32; $L__BB0_15: ret; } ` reducemaxvecnorm2_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<11>; .reg .f32 %f<71>; .reg .b32 %r<39>; 
.reg .b64 %rd<37>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd16, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd17, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd18, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd15, [reducemaxvecnorm2_param_3]; ld.param.f32 %f70, [reducemaxvecnorm2_param_4]; ld.param.u32 %r17, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd18; cvta.to.global.u64 %rd2, %rd17; cvta.to.global.u64 %rd3, %rd16; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd19, %r36, 4; add.s64 %rd36, %rd1, %rd19; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd35, %rd2, %rd19; add.s64 %rd34, %rd3, %rd19; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd34]; ld.global.nc.f32 %f11, [%rd35]; mul.f32 %f12, %f11, %f11; fma.rn.f32 %f13, %f10, %f10, %f12; ld.global.nc.f32 %f14, [%rd36]; fma.rn.f32 %f15, %f14, %f14, %f13; max.f32 %f70, %f70, %f15; add.s32 %r36, %r36, %r4; add.s64 %rd36, %rd36, %rd5; add.s64 %rd35, %rd35, %rd5; add.s64 %rd34, %rd34, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd14, %r4, 4; $L__BB0_6: mul.wide.s32 %rd20, %r36, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f16, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f17, [%rd22]; mul.f32 %f18, %f17, %f17; fma.rn.f32 %f19, %f16, %f16, %f18; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f20, %f19; max.f32 %f22, %f70, %f21; add.s64 %rd24, %rd21, %rd14; ld.global.nc.f32 %f23, [%rd24]; add.s64 %rd25, %rd22, %rd14; 
ld.global.nc.f32 %f24, [%rd25]; mul.f32 %f25, %f24, %f24; fma.rn.f32 %f26, %f23, %f23, %f25; add.s64 %rd26, %rd23, %rd14; ld.global.nc.f32 %f27, [%rd26]; fma.rn.f32 %f28, %f27, %f27, %f26; max.f32 %f29, %f22, %f28; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd27, %rd24, %rd14; ld.global.nc.f32 %f30, [%rd27]; add.s64 %rd28, %rd25, %rd14; ld.global.nc.f32 %f31, [%rd28]; mul.f32 %f32, %f31, %f31; fma.rn.f32 %f33, %f30, %f30, %f32; add.s64 %rd29, %rd26, %rd14; ld.global.nc.f32 %f34, [%rd29]; fma.rn.f32 %f35, %f34, %f34, %f33; max.f32 %f36, %f29, %f35; add.s32 %r27, %r26, %r4; add.s64 %rd30, %rd27, %rd14; ld.global.nc.f32 %f37, [%rd30]; add.s64 %rd31, %rd28, %rd14; ld.global.nc.f32 %f38, [%rd31]; mul.f32 %f39, %f38, %f38; fma.rn.f32 %f40, %f37, %f37, %f39; add.s64 %rd32, %rd29, %rd14; ld.global.nc.f32 %f41, [%rd32]; fma.rn.f32 %f42, %f41, %f41, %f40; max.f32 %f70, %f36, %f42; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f70; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f43, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f44, [%r31]; max.f32 %f45, %f43, %f44; st.shared.f32 [%r14], %f45; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f46, [%r14]; ld.volatile.shared.f32 %f47, [%r14+128]; max.f32 %f48, %f46, %f47; st.volatile.shared.f32 [%r14], %f48; ld.volatile.shared.f32 %f49, [%r14+64]; ld.volatile.shared.f32 %f50, [%r14]; max.f32 %f51, %f50, %f49; st.volatile.shared.f32 [%r14], %f51; ld.volatile.shared.f32 %f52, [%r14+32]; ld.volatile.shared.f32 %f53, [%r14]; max.f32 %f54, %f53, %f52; st.volatile.shared.f32 [%r14], %f54; 
ld.volatile.shared.f32 %f55, [%r14+16]; ld.volatile.shared.f32 %f56, [%r14]; max.f32 %f57, %f56, %f55; st.volatile.shared.f32 [%r14], %f57; ld.volatile.shared.f32 %f58, [%r14+8]; ld.volatile.shared.f32 %f59, [%r14]; max.f32 %f60, %f59, %f58; st.volatile.shared.f32 [%r14], %f60; ld.volatile.shared.f32 %f61, [%r14+4]; ld.volatile.shared.f32 %f62, [%r14]; max.f32 %f63, %f62, %f61; st.volatile.shared.f32 [%r14], %f63; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f64, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f65, %f64; mov.b32 %r32, %f65; cvta.to.global.u64 %rd33, %rd15; atom.global.max.s32 %r33, [%rd33], %r32; $L__BB0_15: ret; } ` reducemaxvecnorm2_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<11>; .reg .f32 %f<71>; .reg .b32 %r<39>; .reg .b64 %rd<37>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd16, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd17, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd18, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd15, [reducemaxvecnorm2_param_3]; ld.param.f32 %f70, [reducemaxvecnorm2_param_4]; ld.param.u32 %r17, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd18; cvta.to.global.u64 %rd2, %rd17; cvta.to.global.u64 %rd3, %rd16; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; 
mul.wide.s32 %rd19, %r36, 4; add.s64 %rd36, %rd1, %rd19; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd35, %rd2, %rd19; add.s64 %rd34, %rd3, %rd19; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd34]; ld.global.nc.f32 %f11, [%rd35]; mul.f32 %f12, %f11, %f11; fma.rn.f32 %f13, %f10, %f10, %f12; ld.global.nc.f32 %f14, [%rd36]; fma.rn.f32 %f15, %f14, %f14, %f13; max.f32 %f70, %f70, %f15; add.s32 %r36, %r36, %r4; add.s64 %rd36, %rd36, %rd5; add.s64 %rd35, %rd35, %rd5; add.s64 %rd34, %rd34, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd14, %r4, 4; $L__BB0_6: mul.wide.s32 %rd20, %r36, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f16, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f17, [%rd22]; mul.f32 %f18, %f17, %f17; fma.rn.f32 %f19, %f16, %f16, %f18; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f20, %f19; max.f32 %f22, %f70, %f21; add.s64 %rd24, %rd21, %rd14; ld.global.nc.f32 %f23, [%rd24]; add.s64 %rd25, %rd22, %rd14; ld.global.nc.f32 %f24, [%rd25]; mul.f32 %f25, %f24, %f24; fma.rn.f32 %f26, %f23, %f23, %f25; add.s64 %rd26, %rd23, %rd14; ld.global.nc.f32 %f27, [%rd26]; fma.rn.f32 %f28, %f27, %f27, %f26; max.f32 %f29, %f22, %f28; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd27, %rd24, %rd14; ld.global.nc.f32 %f30, [%rd27]; add.s64 %rd28, %rd25, %rd14; ld.global.nc.f32 %f31, [%rd28]; mul.f32 %f32, %f31, %f31; fma.rn.f32 %f33, %f30, %f30, %f32; add.s64 %rd29, %rd26, %rd14; ld.global.nc.f32 %f34, [%rd29]; fma.rn.f32 %f35, %f34, %f34, %f33; max.f32 %f36, %f29, %f35; add.s32 %r27, %r26, %r4; add.s64 %rd30, %rd27, %rd14; ld.global.nc.f32 %f37, [%rd30]; add.s64 %rd31, %rd28, %rd14; ld.global.nc.f32 %f38, [%rd31]; mul.f32 %f39, %f38, %f38; fma.rn.f32 %f40, %f37, %f37, %f39; add.s64 %rd32, %rd29, %rd14; ld.global.nc.f32 %f41, [%rd32]; fma.rn.f32 %f42, %f41, %f41, %f40; max.f32 %f70, %f36, %f42; add.s32 %r36, 
%r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f70; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f43, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f44, [%r31]; max.f32 %f45, %f43, %f44; st.shared.f32 [%r14], %f45; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f46, [%r14]; ld.volatile.shared.f32 %f47, [%r14+128]; max.f32 %f48, %f46, %f47; st.volatile.shared.f32 [%r14], %f48; ld.volatile.shared.f32 %f49, [%r14+64]; ld.volatile.shared.f32 %f50, [%r14]; max.f32 %f51, %f50, %f49; st.volatile.shared.f32 [%r14], %f51; ld.volatile.shared.f32 %f52, [%r14+32]; ld.volatile.shared.f32 %f53, [%r14]; max.f32 %f54, %f53, %f52; st.volatile.shared.f32 [%r14], %f54; ld.volatile.shared.f32 %f55, [%r14+16]; ld.volatile.shared.f32 %f56, [%r14]; max.f32 %f57, %f56, %f55; st.volatile.shared.f32 [%r14], %f57; ld.volatile.shared.f32 %f58, [%r14+8]; ld.volatile.shared.f32 %f59, [%r14]; max.f32 %f60, %f59, %f58; st.volatile.shared.f32 [%r14], %f60; ld.volatile.shared.f32 %f61, [%r14+4]; ld.volatile.shared.f32 %f62, [%r14]; max.f32 %f63, %f62, %f61; st.volatile.shared.f32 [%r14], %f63; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f64, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f65, %f64; mov.b32 %r32, %f65; cvta.to.global.u64 %rd33, %rd15; atom.global.max.s32 %r33, [%rd33], %r32; $L__BB0_15: ret; } ` reducemaxvecnorm2_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 
reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<11>; .reg .f32 %f<71>; .reg .b32 %r<39>; .reg .b64 %rd<37>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd16, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd17, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd18, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd15, [reducemaxvecnorm2_param_3]; ld.param.f32 %f70, [reducemaxvecnorm2_param_4]; ld.param.u32 %r17, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd18; cvta.to.global.u64 %rd2, %rd17; cvta.to.global.u64 %rd3, %rd16; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd19, %r36, 4; add.s64 %rd36, %rd1, %rd19; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd35, %rd2, %rd19; add.s64 %rd34, %rd3, %rd19; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd34]; ld.global.nc.f32 %f11, [%rd35]; mul.f32 %f12, %f11, %f11; fma.rn.f32 %f13, %f10, %f10, %f12; ld.global.nc.f32 %f14, [%rd36]; fma.rn.f32 %f15, %f14, %f14, %f13; max.f32 %f70, %f70, %f15; add.s32 %r36, %r36, %r4; add.s64 %rd36, %rd36, %rd5; add.s64 %rd35, %rd35, %rd5; add.s64 %rd34, %rd34, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd14, %r4, 4; $L__BB0_6: mul.wide.s32 %rd20, %r36, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f16, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f17, [%rd22]; mul.f32 %f18, %f17, %f17; fma.rn.f32 %f19, %f16, %f16, %f18; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f20, [%rd23]; 
fma.rn.f32 %f21, %f20, %f20, %f19; max.f32 %f22, %f70, %f21; add.s64 %rd24, %rd21, %rd14; ld.global.nc.f32 %f23, [%rd24]; add.s64 %rd25, %rd22, %rd14; ld.global.nc.f32 %f24, [%rd25]; mul.f32 %f25, %f24, %f24; fma.rn.f32 %f26, %f23, %f23, %f25; add.s64 %rd26, %rd23, %rd14; ld.global.nc.f32 %f27, [%rd26]; fma.rn.f32 %f28, %f27, %f27, %f26; max.f32 %f29, %f22, %f28; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd27, %rd24, %rd14; ld.global.nc.f32 %f30, [%rd27]; add.s64 %rd28, %rd25, %rd14; ld.global.nc.f32 %f31, [%rd28]; mul.f32 %f32, %f31, %f31; fma.rn.f32 %f33, %f30, %f30, %f32; add.s64 %rd29, %rd26, %rd14; ld.global.nc.f32 %f34, [%rd29]; fma.rn.f32 %f35, %f34, %f34, %f33; max.f32 %f36, %f29, %f35; add.s32 %r27, %r26, %r4; add.s64 %rd30, %rd27, %rd14; ld.global.nc.f32 %f37, [%rd30]; add.s64 %rd31, %rd28, %rd14; ld.global.nc.f32 %f38, [%rd31]; mul.f32 %f39, %f38, %f38; fma.rn.f32 %f40, %f37, %f37, %f39; add.s64 %rd32, %rd29, %rd14; ld.global.nc.f32 %f41, [%rd32]; fma.rn.f32 %f42, %f41, %f41, %f40; max.f32 %f70, %f36, %f42; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f70; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f43, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f44, [%r31]; max.f32 %f45, %f43, %f44; st.shared.f32 [%r14], %f45; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f46, [%r14]; ld.volatile.shared.f32 %f47, [%r14+128]; max.f32 %f48, %f46, %f47; st.volatile.shared.f32 [%r14], %f48; ld.volatile.shared.f32 %f49, [%r14+64]; ld.volatile.shared.f32 %f50, [%r14]; max.f32 %f51, %f50, %f49; st.volatile.shared.f32 [%r14], %f51; 
ld.volatile.shared.f32 %f52, [%r14+32]; ld.volatile.shared.f32 %f53, [%r14]; max.f32 %f54, %f53, %f52; st.volatile.shared.f32 [%r14], %f54; ld.volatile.shared.f32 %f55, [%r14+16]; ld.volatile.shared.f32 %f56, [%r14]; max.f32 %f57, %f56, %f55; st.volatile.shared.f32 [%r14], %f57; ld.volatile.shared.f32 %f58, [%r14+8]; ld.volatile.shared.f32 %f59, [%r14]; max.f32 %f60, %f59, %f58; st.volatile.shared.f32 [%r14], %f60; ld.volatile.shared.f32 %f61, [%r14+4]; ld.volatile.shared.f32 %f62, [%r14]; max.f32 %f63, %f62, %f61; st.volatile.shared.f32 [%r14], %f63; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f64, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f65, %f64; mov.b32 %r32, %f65; cvta.to.global.u64 %rd33, %rd15; atom.global.max.s32 %r33, [%rd33], %r32; $L__BB0_15: ret; } ` reducemaxvecnorm2_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<11>; .reg .f32 %f<71>; .reg .b32 %r<39>; .reg .b64 %rd<37>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd16, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd17, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd18, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd15, [reducemaxvecnorm2_param_3]; ld.param.f32 %f70, [reducemaxvecnorm2_param_4]; ld.param.u32 %r17, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd18; cvta.to.global.u64 %rd2, %rd17; cvta.to.global.u64 %rd3, %rd16; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; 
add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd19, %r36, 4; add.s64 %rd36, %rd1, %rd19; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd35, %rd2, %rd19; add.s64 %rd34, %rd3, %rd19; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd34]; ld.global.nc.f32 %f11, [%rd35]; mul.f32 %f12, %f11, %f11; fma.rn.f32 %f13, %f10, %f10, %f12; ld.global.nc.f32 %f14, [%rd36]; fma.rn.f32 %f15, %f14, %f14, %f13; max.f32 %f70, %f70, %f15; add.s32 %r36, %r36, %r4; add.s64 %rd36, %rd36, %rd5; add.s64 %rd35, %rd35, %rd5; add.s64 %rd34, %rd34, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd14, %r4, 4; $L__BB0_6: mul.wide.s32 %rd20, %r36, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f16, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f17, [%rd22]; mul.f32 %f18, %f17, %f17; fma.rn.f32 %f19, %f16, %f16, %f18; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f20, %f19; max.f32 %f22, %f70, %f21; add.s64 %rd24, %rd21, %rd14; ld.global.nc.f32 %f23, [%rd24]; add.s64 %rd25, %rd22, %rd14; ld.global.nc.f32 %f24, [%rd25]; mul.f32 %f25, %f24, %f24; fma.rn.f32 %f26, %f23, %f23, %f25; add.s64 %rd26, %rd23, %rd14; ld.global.nc.f32 %f27, [%rd26]; fma.rn.f32 %f28, %f27, %f27, %f26; max.f32 %f29, %f22, %f28; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd27, %rd24, %rd14; ld.global.nc.f32 %f30, [%rd27]; add.s64 %rd28, %rd25, %rd14; ld.global.nc.f32 %f31, [%rd28]; mul.f32 %f32, %f31, %f31; fma.rn.f32 %f33, %f30, %f30, %f32; add.s64 %rd29, %rd26, %rd14; ld.global.nc.f32 %f34, [%rd29]; fma.rn.f32 %f35, %f34, %f34, %f33; max.f32 %f36, %f29, %f35; add.s32 %r27, %r26, %r4; add.s64 %rd30, %rd27, %rd14; ld.global.nc.f32 %f37, [%rd30]; add.s64 %rd31, %rd28, %rd14; ld.global.nc.f32 %f38, [%rd31]; mul.f32 %f39, %f38, %f38; fma.rn.f32 %f40, %f37, %f37, 
%f39; add.s64 %rd32, %rd29, %rd14; ld.global.nc.f32 %f41, [%rd32]; fma.rn.f32 %f42, %f41, %f41, %f40; max.f32 %f70, %f36, %f42; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f70; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f43, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f44, [%r31]; max.f32 %f45, %f43, %f44; st.shared.f32 [%r14], %f45; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f46, [%r14]; ld.volatile.shared.f32 %f47, [%r14+128]; max.f32 %f48, %f46, %f47; st.volatile.shared.f32 [%r14], %f48; ld.volatile.shared.f32 %f49, [%r14+64]; ld.volatile.shared.f32 %f50, [%r14]; max.f32 %f51, %f50, %f49; st.volatile.shared.f32 [%r14], %f51; ld.volatile.shared.f32 %f52, [%r14+32]; ld.volatile.shared.f32 %f53, [%r14]; max.f32 %f54, %f53, %f52; st.volatile.shared.f32 [%r14], %f54; ld.volatile.shared.f32 %f55, [%r14+16]; ld.volatile.shared.f32 %f56, [%r14]; max.f32 %f57, %f56, %f55; st.volatile.shared.f32 [%r14], %f57; ld.volatile.shared.f32 %f58, [%r14+8]; ld.volatile.shared.f32 %f59, [%r14]; max.f32 %f60, %f59, %f58; st.volatile.shared.f32 [%r14], %f60; ld.volatile.shared.f32 %f61, [%r14+4]; ld.volatile.shared.f32 %f62, [%r14]; max.f32 %f63, %f62, %f61; st.volatile.shared.f32 [%r14], %f63; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f64, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f65, %f64; mov.b32 %r32, %f65; cvta.to.global.u64 %rd33, %rd15; atom.global.max.s32 %r33, [%rd33], %r32; $L__BB0_15: ret; } ` reducemaxvecnorm2_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry 
reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<11>; .reg .f32 %f<71>; .reg .b32 %r<39>; .reg .b64 %rd<37>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd16, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd17, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd18, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd15, [reducemaxvecnorm2_param_3]; ld.param.f32 %f70, [reducemaxvecnorm2_param_4]; ld.param.u32 %r17, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd18; cvta.to.global.u64 %rd2, %rd17; cvta.to.global.u64 %rd3, %rd16; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd19, %r36, 4; add.s64 %rd36, %rd1, %rd19; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd35, %rd2, %rd19; add.s64 %rd34, %rd3, %rd19; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd34]; ld.global.nc.f32 %f11, [%rd35]; mul.f32 %f12, %f11, %f11; fma.rn.f32 %f13, %f10, %f10, %f12; ld.global.nc.f32 %f14, [%rd36]; fma.rn.f32 %f15, %f14, %f14, %f13; max.f32 %f70, %f70, %f15; add.s32 %r36, %r36, %r4; add.s64 %rd36, %rd36, %rd5; add.s64 %rd35, %rd35, %rd5; add.s64 %rd34, %rd34, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd14, %r4, 4; $L__BB0_6: mul.wide.s32 %rd20, %r36, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f16, [%rd21]; add.s64 %rd22, %rd2, %rd20; 
ld.global.nc.f32 %f17, [%rd22]; mul.f32 %f18, %f17, %f17; fma.rn.f32 %f19, %f16, %f16, %f18; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f20, %f19; max.f32 %f22, %f70, %f21; add.s64 %rd24, %rd21, %rd14; ld.global.nc.f32 %f23, [%rd24]; add.s64 %rd25, %rd22, %rd14; ld.global.nc.f32 %f24, [%rd25]; mul.f32 %f25, %f24, %f24; fma.rn.f32 %f26, %f23, %f23, %f25; add.s64 %rd26, %rd23, %rd14; ld.global.nc.f32 %f27, [%rd26]; fma.rn.f32 %f28, %f27, %f27, %f26; max.f32 %f29, %f22, %f28; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd27, %rd24, %rd14; ld.global.nc.f32 %f30, [%rd27]; add.s64 %rd28, %rd25, %rd14; ld.global.nc.f32 %f31, [%rd28]; mul.f32 %f32, %f31, %f31; fma.rn.f32 %f33, %f30, %f30, %f32; add.s64 %rd29, %rd26, %rd14; ld.global.nc.f32 %f34, [%rd29]; fma.rn.f32 %f35, %f34, %f34, %f33; max.f32 %f36, %f29, %f35; add.s32 %r27, %r26, %r4; add.s64 %rd30, %rd27, %rd14; ld.global.nc.f32 %f37, [%rd30]; add.s64 %rd31, %rd28, %rd14; ld.global.nc.f32 %f38, [%rd31]; mul.f32 %f39, %f38, %f38; fma.rn.f32 %f40, %f37, %f37, %f39; add.s64 %rd32, %rd29, %rd14; ld.global.nc.f32 %f41, [%rd32]; fma.rn.f32 %f42, %f41, %f41, %f40; max.f32 %f70, %f36, %f42; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f70; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f43, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f44, [%r31]; max.f32 %f45, %f43, %f44; st.shared.f32 [%r14], %f45; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f46, [%r14]; ld.volatile.shared.f32 %f47, [%r14+128]; max.f32 %f48, %f46, %f47; st.volatile.shared.f32 [%r14], 
%f48; ld.volatile.shared.f32 %f49, [%r14+64]; ld.volatile.shared.f32 %f50, [%r14]; max.f32 %f51, %f50, %f49; st.volatile.shared.f32 [%r14], %f51; ld.volatile.shared.f32 %f52, [%r14+32]; ld.volatile.shared.f32 %f53, [%r14]; max.f32 %f54, %f53, %f52; st.volatile.shared.f32 [%r14], %f54; ld.volatile.shared.f32 %f55, [%r14+16]; ld.volatile.shared.f32 %f56, [%r14]; max.f32 %f57, %f56, %f55; st.volatile.shared.f32 [%r14], %f57; ld.volatile.shared.f32 %f58, [%r14+8]; ld.volatile.shared.f32 %f59, [%r14]; max.f32 %f60, %f59, %f58; st.volatile.shared.f32 [%r14], %f60; ld.volatile.shared.f32 %f61, [%r14+4]; ld.volatile.shared.f32 %f62, [%r14]; max.f32 %f63, %f62, %f61; st.volatile.shared.f32 [%r14], %f63; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f64, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f65, %f64; mov.b32 %r32, %f65; cvta.to.global.u64 %rd33, %rd15; atom.global.max.s32 %r33, [%rd33], %r32; $L__BB0_15: ret; } ` reducemaxvecnorm2_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<11>; .reg .f32 %f<71>; .reg .b32 %r<39>; .reg .b64 %rd<37>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd16, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd17, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd18, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd15, [reducemaxvecnorm2_param_3]; ld.param.f32 %f70, [reducemaxvecnorm2_param_4]; ld.param.u32 %r17, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd18; cvta.to.global.u64 %rd2, %rd17; cvta.to.global.u64 %rd3, %rd16; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; 
mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd19, %r36, 4; add.s64 %rd36, %rd1, %rd19; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd35, %rd2, %rd19; add.s64 %rd34, %rd3, %rd19; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd34]; ld.global.nc.f32 %f11, [%rd35]; mul.f32 %f12, %f11, %f11; fma.rn.f32 %f13, %f10, %f10, %f12; ld.global.nc.f32 %f14, [%rd36]; fma.rn.f32 %f15, %f14, %f14, %f13; max.f32 %f70, %f70, %f15; add.s32 %r36, %r36, %r4; add.s64 %rd36, %rd36, %rd5; add.s64 %rd35, %rd35, %rd5; add.s64 %rd34, %rd34, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd14, %r4, 4; $L__BB0_6: mul.wide.s32 %rd20, %r36, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f16, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f17, [%rd22]; mul.f32 %f18, %f17, %f17; fma.rn.f32 %f19, %f16, %f16, %f18; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f20, %f19; max.f32 %f22, %f70, %f21; add.s64 %rd24, %rd21, %rd14; ld.global.nc.f32 %f23, [%rd24]; add.s64 %rd25, %rd22, %rd14; ld.global.nc.f32 %f24, [%rd25]; mul.f32 %f25, %f24, %f24; fma.rn.f32 %f26, %f23, %f23, %f25; add.s64 %rd26, %rd23, %rd14; ld.global.nc.f32 %f27, [%rd26]; fma.rn.f32 %f28, %f27, %f27, %f26; max.f32 %f29, %f22, %f28; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd27, %rd24, %rd14; ld.global.nc.f32 %f30, [%rd27]; add.s64 %rd28, %rd25, %rd14; ld.global.nc.f32 %f31, [%rd28]; mul.f32 %f32, %f31, %f31; fma.rn.f32 %f33, %f30, %f30, %f32; add.s64 %rd29, %rd26, %rd14; ld.global.nc.f32 %f34, [%rd29]; fma.rn.f32 %f35, %f34, %f34, %f33; max.f32 %f36, %f29, %f35; add.s32 %r27, %r26, %r4; add.s64 %rd30, %rd27, %rd14; 
ld.global.nc.f32 %f37, [%rd30]; add.s64 %rd31, %rd28, %rd14; ld.global.nc.f32 %f38, [%rd31]; mul.f32 %f39, %f38, %f38; fma.rn.f32 %f40, %f37, %f37, %f39; add.s64 %rd32, %rd29, %rd14; ld.global.nc.f32 %f41, [%rd32]; fma.rn.f32 %f42, %f41, %f41, %f40; max.f32 %f70, %f36, %f42; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f70; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f43, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f44, [%r31]; max.f32 %f45, %f43, %f44; st.shared.f32 [%r14], %f45; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f46, [%r14]; ld.volatile.shared.f32 %f47, [%r14+128]; max.f32 %f48, %f46, %f47; st.volatile.shared.f32 [%r14], %f48; ld.volatile.shared.f32 %f49, [%r14+64]; ld.volatile.shared.f32 %f50, [%r14]; max.f32 %f51, %f50, %f49; st.volatile.shared.f32 [%r14], %f51; ld.volatile.shared.f32 %f52, [%r14+32]; ld.volatile.shared.f32 %f53, [%r14]; max.f32 %f54, %f53, %f52; st.volatile.shared.f32 [%r14], %f54; ld.volatile.shared.f32 %f55, [%r14+16]; ld.volatile.shared.f32 %f56, [%r14]; max.f32 %f57, %f56, %f55; st.volatile.shared.f32 [%r14], %f57; ld.volatile.shared.f32 %f58, [%r14+8]; ld.volatile.shared.f32 %f59, [%r14]; max.f32 %f60, %f59, %f58; st.volatile.shared.f32 [%r14], %f60; ld.volatile.shared.f32 %f61, [%r14+4]; ld.volatile.shared.f32 %f62, [%r14]; max.f32 %f63, %f62, %f61; st.volatile.shared.f32 [%r14], %f63; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f64, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f65, %f64; mov.b32 %r32, %f65; cvta.to.global.u64 %rd33, %rd15; atom.global.max.s32 %r33, [%rd33], 
%r32; $L__BB0_15: ret; } ` reducemaxvecnorm2_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<11>; .reg .f32 %f<71>; .reg .b32 %r<39>; .reg .b64 %rd<37>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd16, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd17, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd18, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd15, [reducemaxvecnorm2_param_3]; ld.param.f32 %f70, [reducemaxvecnorm2_param_4]; ld.param.u32 %r17, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd18; cvta.to.global.u64 %rd2, %rd17; cvta.to.global.u64 %rd3, %rd16; mov.u32 %r38, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r36, %r18, %r38, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r38; setp.ge.s32 %p1, %r36, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r36, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r35, %r24, 3; setp.eq.s32 %p2, %r35, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd19, %r36, 4; add.s64 %rd36, %rd1, %rd19; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd35, %rd2, %rd19; add.s64 %rd34, %rd3, %rd19; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd34]; ld.global.nc.f32 %f11, [%rd35]; mul.f32 %f12, %f11, %f11; fma.rn.f32 %f13, %f10, %f10, %f12; ld.global.nc.f32 %f14, [%rd36]; fma.rn.f32 %f15, %f14, %f14, %f13; max.f32 %f70, %f70, %f15; add.s32 %r36, %r36, %r4; add.s64 %rd36, %rd36, %rd5; add.s64 %rd35, %rd35, %rd5; add.s64 %rd34, %rd34, %rd5; add.s32 %r35, %r35, -1; setp.ne.s32 %p3, %r35, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd14, 
%r4, 4; $L__BB0_6: mul.wide.s32 %rd20, %r36, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f16, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f17, [%rd22]; mul.f32 %f18, %f17, %f17; fma.rn.f32 %f19, %f16, %f16, %f18; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f20, %f19; max.f32 %f22, %f70, %f21; add.s64 %rd24, %rd21, %rd14; ld.global.nc.f32 %f23, [%rd24]; add.s64 %rd25, %rd22, %rd14; ld.global.nc.f32 %f24, [%rd25]; mul.f32 %f25, %f24, %f24; fma.rn.f32 %f26, %f23, %f23, %f25; add.s64 %rd26, %rd23, %rd14; ld.global.nc.f32 %f27, [%rd26]; fma.rn.f32 %f28, %f27, %f27, %f26; max.f32 %f29, %f22, %f28; add.s32 %r25, %r36, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd27, %rd24, %rd14; ld.global.nc.f32 %f30, [%rd27]; add.s64 %rd28, %rd25, %rd14; ld.global.nc.f32 %f31, [%rd28]; mul.f32 %f32, %f31, %f31; fma.rn.f32 %f33, %f30, %f30, %f32; add.s64 %rd29, %rd26, %rd14; ld.global.nc.f32 %f34, [%rd29]; fma.rn.f32 %f35, %f34, %f34, %f33; max.f32 %f36, %f29, %f35; add.s32 %r27, %r26, %r4; add.s64 %rd30, %rd27, %rd14; ld.global.nc.f32 %f37, [%rd30]; add.s64 %rd31, %rd28, %rd14; ld.global.nc.f32 %f38, [%rd31]; mul.f32 %f39, %f38, %f38; fma.rn.f32 %f40, %f37, %f37, %f39; add.s64 %rd32, %rd29, %rd14; ld.global.nc.f32 %f41, [%rd32]; fma.rn.f32 %f42, %f41, %f41, %f40; max.f32 %f70, %f36, %f42; add.s32 %r36, %r27, %r4; setp.lt.s32 %p5, %r36, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f70; bar.sync 0; setp.lt.u32 %p6, %r38, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r38, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f43, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f44, [%r31]; max.f32 %f45, %f43, %f44; st.shared.f32 [%r14], %f45; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r38, 131; mov.u32 %r38, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; 
ld.volatile.shared.f32 %f46, [%r14]; ld.volatile.shared.f32 %f47, [%r14+128]; max.f32 %f48, %f46, %f47; st.volatile.shared.f32 [%r14], %f48; ld.volatile.shared.f32 %f49, [%r14+64]; ld.volatile.shared.f32 %f50, [%r14]; max.f32 %f51, %f50, %f49; st.volatile.shared.f32 [%r14], %f51; ld.volatile.shared.f32 %f52, [%r14+32]; ld.volatile.shared.f32 %f53, [%r14]; max.f32 %f54, %f53, %f52; st.volatile.shared.f32 [%r14], %f54; ld.volatile.shared.f32 %f55, [%r14+16]; ld.volatile.shared.f32 %f56, [%r14]; max.f32 %f57, %f56, %f55; st.volatile.shared.f32 [%r14], %f57; ld.volatile.shared.f32 %f58, [%r14+8]; ld.volatile.shared.f32 %f59, [%r14]; max.f32 %f60, %f59, %f58; st.volatile.shared.f32 [%r14], %f60; ld.volatile.shared.f32 %f61, [%r14+4]; ld.volatile.shared.f32 %f62, [%r14]; max.f32 %f63, %f62, %f61; st.volatile.shared.f32 [%r14], %f63; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f64, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f65, %f64; mov.b32 %r32, %f65; cvta.to.global.u64 %rd33, %rd15; atom.global.max.s32 %r33, [%rd33], %r32; $L__BB0_15: ret; } ` ) 3-3.11.1/cuda/reducesum.cu000066400000000000000000000003211503346766200152100ustar00rootroot00000000000000#include "reduce.h" #include "sum.h" #define load(i) src[i] extern "C" __global__ void reducesum(float* __restrict__ src, float*__restrict__ dst, float initVal, int n) { reduce(load, sum, atomicAdd) } 3-3.11.1/cuda/reducesum_wrapper.go000066400000000000000000001527661503346766200167730ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for reducesum kernel var reducesum_code cu.Function // Stores the arguments for reducesum kernel invocation type reducesum_args_t struct { arg_src unsafe.Pointer arg_dst unsafe.Pointer arg_initVal float32 arg_n int argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for reducesum kernel invocation var reducesum_args reducesum_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. reducesum_args.argptr[0] = unsafe.Pointer(&reducesum_args.arg_src) reducesum_args.argptr[1] = unsafe.Pointer(&reducesum_args.arg_dst) reducesum_args.argptr[2] = unsafe.Pointer(&reducesum_args.arg_initVal) reducesum_args.argptr[3] = unsafe.Pointer(&reducesum_args.arg_n) } // Wrapper for reducesum CUDA kernel, asynchronous. func k_reducesum_async(src unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { if Synchronous { // debug Sync() timer.Start("reducesum") } reducesum_args.Lock() defer reducesum_args.Unlock() if reducesum_code == 0 { reducesum_code = fatbinLoad(reducesum_map, "reducesum") } reducesum_args.arg_src = src reducesum_args.arg_dst = dst reducesum_args.arg_initVal = initVal reducesum_args.arg_n = n args := reducesum_args.argptr[:] cu.LaunchKernel(reducesum_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("reducesum") } } // maps compute capability on PTX code for reducesum kernel. var reducesum_map = map[int]string{0: "", 50: reducesum_ptx_50, 52: reducesum_ptx_52, 53: reducesum_ptx_53, 60: reducesum_ptx_60, 61: reducesum_ptx_61, 62: reducesum_ptx_62, 70: reducesum_ptx_70, 72: reducesum_ptx_72, 75: reducesum_ptx_75, 80: reducesum_ptx_80, 86: reducesum_ptx_86, 87: reducesum_ptx_87, 89: reducesum_ptx_89, 90: reducesum_ptx_90} // reducesum PTX code for various compute capabilities. 
const ( reducesum_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<46>; .reg .b32 %r<37>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd8, [reducesum_param_0]; ld.param.u64 %rd7, [reducesum_param_1]; ld.param.f32 %f45, [reducesum_param_2]; ld.param.u32 %r17, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r34, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; add.f32 %f45, %f45, %f10; add.s32 %r34, %r34, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r34, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f11, [%rd11]; add.f32 %f12, %f45, %f11; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f13, [%rd12]; add.f32 %f14, %f12, %f13; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f15, [%rd13]; add.f32 %f16, %f14, %f15; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f17, [%rd14]; add.f32 %f45, %f16, %f17; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducesumE5sdata; add.s32 %r14, %r29, 
%r28; st.shared.f32 [%r14], %f45; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f18, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f19, [%r31]; add.f32 %f20, %f18, %f19; st.shared.f32 [%r14], %f20; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f21, [%r14]; ld.volatile.shared.f32 %f22, [%r14+128]; add.f32 %f23, %f21, %f22; st.volatile.shared.f32 [%r14], %f23; ld.volatile.shared.f32 %f24, [%r14+64]; ld.volatile.shared.f32 %f25, [%r14]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r14], %f26; ld.volatile.shared.f32 %f27, [%r14+32]; ld.volatile.shared.f32 %f28, [%r14]; add.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r14], %f29; ld.volatile.shared.f32 %f30, [%r14+16]; ld.volatile.shared.f32 %f31, [%r14]; add.f32 %f32, %f31, %f30; st.volatile.shared.f32 [%r14], %f32; ld.volatile.shared.f32 %f33, [%r14+8]; ld.volatile.shared.f32 %f34, [%r14]; add.f32 %f35, %f34, %f33; st.volatile.shared.f32 [%r14], %f35; ld.volatile.shared.f32 %f36, [%r14+4]; ld.volatile.shared.f32 %f37, [%r14]; add.f32 %f38, %f37, %f36; st.volatile.shared.f32 [%r14], %f38; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f39, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd15, %rd7; atom.global.add.f32 %f40, [%rd15], %f39; $L__BB0_15: ret; } ` reducesum_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<46>; .reg .b32 %r<37>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd8, [reducesum_param_0]; ld.param.u64 %rd7, [reducesum_param_1]; ld.param.f32 
%f45, [reducesum_param_2]; ld.param.u32 %r17, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r34, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; add.f32 %f45, %f45, %f10; add.s32 %r34, %r34, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r34, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f11, [%rd11]; add.f32 %f12, %f45, %f11; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f13, [%rd12]; add.f32 %f14, %f12, %f13; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f15, [%rd13]; add.f32 %f16, %f14, %f15; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f17, [%rd14]; add.f32 %f45, %f16, %f17; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducesumE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f45; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f18, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f19, [%r31]; add.f32 %f20, %f18, %f19; st.shared.f32 [%r14], %f20; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; 
ld.volatile.shared.f32 %f21, [%r14]; ld.volatile.shared.f32 %f22, [%r14+128]; add.f32 %f23, %f21, %f22; st.volatile.shared.f32 [%r14], %f23; ld.volatile.shared.f32 %f24, [%r14+64]; ld.volatile.shared.f32 %f25, [%r14]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r14], %f26; ld.volatile.shared.f32 %f27, [%r14+32]; ld.volatile.shared.f32 %f28, [%r14]; add.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r14], %f29; ld.volatile.shared.f32 %f30, [%r14+16]; ld.volatile.shared.f32 %f31, [%r14]; add.f32 %f32, %f31, %f30; st.volatile.shared.f32 [%r14], %f32; ld.volatile.shared.f32 %f33, [%r14+8]; ld.volatile.shared.f32 %f34, [%r14]; add.f32 %f35, %f34, %f33; st.volatile.shared.f32 [%r14], %f35; ld.volatile.shared.f32 %f36, [%r14+4]; ld.volatile.shared.f32 %f37, [%r14]; add.f32 %f38, %f37, %f36; st.volatile.shared.f32 [%r14], %f38; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f39, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd15, %rd7; atom.global.add.f32 %f40, [%rd15], %f39; $L__BB0_15: ret; } ` reducesum_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<46>; .reg .b32 %r<37>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd8, [reducesum_param_0]; ld.param.u64 %rd7, [reducesum_param_1]; ld.param.f32 %f45, [reducesum_param_2]; ld.param.u32 %r17, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, 
%r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r34, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; add.f32 %f45, %f45, %f10; add.s32 %r34, %r34, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r34, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f11, [%rd11]; add.f32 %f12, %f45, %f11; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f13, [%rd12]; add.f32 %f14, %f12, %f13; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f15, [%rd13]; add.f32 %f16, %f14, %f15; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f17, [%rd14]; add.f32 %f45, %f16, %f17; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducesumE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f45; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f18, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f19, [%r31]; add.f32 %f20, %f18, %f19; st.shared.f32 [%r14], %f20; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f21, [%r14]; ld.volatile.shared.f32 %f22, [%r14+128]; add.f32 %f23, %f21, %f22; st.volatile.shared.f32 [%r14], %f23; ld.volatile.shared.f32 %f24, [%r14+64]; ld.volatile.shared.f32 %f25, [%r14]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r14], %f26; ld.volatile.shared.f32 %f27, [%r14+32]; ld.volatile.shared.f32 %f28, [%r14]; add.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r14], %f29; ld.volatile.shared.f32 %f30, [%r14+16]; ld.volatile.shared.f32 
%f31, [%r14]; add.f32 %f32, %f31, %f30; st.volatile.shared.f32 [%r14], %f32; ld.volatile.shared.f32 %f33, [%r14+8]; ld.volatile.shared.f32 %f34, [%r14]; add.f32 %f35, %f34, %f33; st.volatile.shared.f32 [%r14], %f35; ld.volatile.shared.f32 %f36, [%r14+4]; ld.volatile.shared.f32 %f37, [%r14]; add.f32 %f38, %f37, %f36; st.volatile.shared.f32 [%r14], %f38; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f39, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd15, %rd7; atom.global.add.f32 %f40, [%rd15], %f39; $L__BB0_15: ret; } ` reducesum_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<46>; .reg .b32 %r<37>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd8, [reducesum_param_0]; ld.param.u64 %rd7, [reducesum_param_1]; ld.param.f32 %f45, [reducesum_param_2]; ld.param.u32 %r17, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r34, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; add.f32 %f45, %f45, %f10; add.s32 %r34, %r34, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r34, 4; add.s64 %rd11, %rd1, %rd10; 
ld.global.nc.f32 %f11, [%rd11]; add.f32 %f12, %f45, %f11; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f13, [%rd12]; add.f32 %f14, %f12, %f13; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f15, [%rd13]; add.f32 %f16, %f14, %f15; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f17, [%rd14]; add.f32 %f45, %f16, %f17; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducesumE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f45; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f18, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f19, [%r31]; add.f32 %f20, %f18, %f19; st.shared.f32 [%r14], %f20; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f21, [%r14]; ld.volatile.shared.f32 %f22, [%r14+128]; add.f32 %f23, %f21, %f22; st.volatile.shared.f32 [%r14], %f23; ld.volatile.shared.f32 %f24, [%r14+64]; ld.volatile.shared.f32 %f25, [%r14]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r14], %f26; ld.volatile.shared.f32 %f27, [%r14+32]; ld.volatile.shared.f32 %f28, [%r14]; add.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r14], %f29; ld.volatile.shared.f32 %f30, [%r14+16]; ld.volatile.shared.f32 %f31, [%r14]; add.f32 %f32, %f31, %f30; st.volatile.shared.f32 [%r14], %f32; ld.volatile.shared.f32 %f33, [%r14+8]; ld.volatile.shared.f32 %f34, [%r14]; add.f32 %f35, %f34, %f33; st.volatile.shared.f32 [%r14], %f35; ld.volatile.shared.f32 %f36, [%r14+4]; ld.volatile.shared.f32 %f37, [%r14]; add.f32 %f38, %f37, %f36; st.volatile.shared.f32 [%r14], %f38; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f39, [_ZZ9reducesumE5sdata]; 
cvta.to.global.u64 %rd15, %rd7; atom.global.add.f32 %f40, [%rd15], %f39; $L__BB0_15: ret; } ` reducesum_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<46>; .reg .b32 %r<37>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd8, [reducesum_param_0]; ld.param.u64 %rd7, [reducesum_param_1]; ld.param.f32 %f45, [reducesum_param_2]; ld.param.u32 %r17, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r34, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; add.f32 %f45, %f45, %f10; add.s32 %r34, %r34, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r34, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f11, [%rd11]; add.f32 %f12, %f45, %f11; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f13, [%rd12]; add.f32 %f14, %f12, %f13; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f15, [%rd13]; add.f32 %f16, %f14, %f15; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f17, [%rd14]; add.f32 %f45, %f16, %f17; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; 
$L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducesumE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f45; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f18, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f19, [%r31]; add.f32 %f20, %f18, %f19; st.shared.f32 [%r14], %f20; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f21, [%r14]; ld.volatile.shared.f32 %f22, [%r14+128]; add.f32 %f23, %f21, %f22; st.volatile.shared.f32 [%r14], %f23; ld.volatile.shared.f32 %f24, [%r14+64]; ld.volatile.shared.f32 %f25, [%r14]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r14], %f26; ld.volatile.shared.f32 %f27, [%r14+32]; ld.volatile.shared.f32 %f28, [%r14]; add.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r14], %f29; ld.volatile.shared.f32 %f30, [%r14+16]; ld.volatile.shared.f32 %f31, [%r14]; add.f32 %f32, %f31, %f30; st.volatile.shared.f32 [%r14], %f32; ld.volatile.shared.f32 %f33, [%r14+8]; ld.volatile.shared.f32 %f34, [%r14]; add.f32 %f35, %f34, %f33; st.volatile.shared.f32 [%r14], %f35; ld.volatile.shared.f32 %f36, [%r14+4]; ld.volatile.shared.f32 %f37, [%r14]; add.f32 %f38, %f37, %f36; st.volatile.shared.f32 [%r14], %f38; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f39, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd15, %rd7; atom.global.add.f32 %f40, [%rd15], %f39; $L__BB0_15: ret; } ` reducesum_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<46>; .reg .b32 %r<37>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; 
ld.param.u64 %rd8, [reducesum_param_0]; ld.param.u64 %rd7, [reducesum_param_1]; ld.param.f32 %f45, [reducesum_param_2]; ld.param.u32 %r17, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r34, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; add.f32 %f45, %f45, %f10; add.s32 %r34, %r34, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r34, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f11, [%rd11]; add.f32 %f12, %f45, %f11; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f13, [%rd12]; add.f32 %f14, %f12, %f13; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f15, [%rd13]; add.f32 %f16, %f14, %f15; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f17, [%rd14]; add.f32 %f45, %f16, %f17; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducesumE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f45; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f18, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f19, [%r31]; add.f32 %f20, %f18, %f19; st.shared.f32 [%r14], %f20; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; 
mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f21, [%r14]; ld.volatile.shared.f32 %f22, [%r14+128]; add.f32 %f23, %f21, %f22; st.volatile.shared.f32 [%r14], %f23; ld.volatile.shared.f32 %f24, [%r14+64]; ld.volatile.shared.f32 %f25, [%r14]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r14], %f26; ld.volatile.shared.f32 %f27, [%r14+32]; ld.volatile.shared.f32 %f28, [%r14]; add.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r14], %f29; ld.volatile.shared.f32 %f30, [%r14+16]; ld.volatile.shared.f32 %f31, [%r14]; add.f32 %f32, %f31, %f30; st.volatile.shared.f32 [%r14], %f32; ld.volatile.shared.f32 %f33, [%r14+8]; ld.volatile.shared.f32 %f34, [%r14]; add.f32 %f35, %f34, %f33; st.volatile.shared.f32 [%r14], %f35; ld.volatile.shared.f32 %f36, [%r14+4]; ld.volatile.shared.f32 %f37, [%r14]; add.f32 %f38, %f37, %f36; st.volatile.shared.f32 [%r14], %f38; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f39, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd15, %rd7; atom.global.add.f32 %f40, [%rd15], %f39; $L__BB0_15: ret; } ` reducesum_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<46>; .reg .b32 %r<37>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd8, [reducesum_param_0]; ld.param.u64 %rd7, [reducesum_param_1]; ld.param.f32 %f45, [reducesum_param_2]; ld.param.u32 %r17, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, 
%r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r34, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; add.f32 %f45, %f45, %f10; add.s32 %r34, %r34, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r34, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f11, [%rd11]; add.f32 %f12, %f45, %f11; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f13, [%rd12]; add.f32 %f14, %f12, %f13; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f15, [%rd13]; add.f32 %f16, %f14, %f15; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f17, [%rd14]; add.f32 %f45, %f16, %f17; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducesumE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f45; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f18, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f19, [%r31]; add.f32 %f20, %f18, %f19; st.shared.f32 [%r14], %f20; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f21, [%r14]; ld.volatile.shared.f32 %f22, [%r14+128]; add.f32 %f23, %f21, %f22; st.volatile.shared.f32 [%r14], %f23; ld.volatile.shared.f32 %f24, [%r14+64]; ld.volatile.shared.f32 %f25, [%r14]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r14], %f26; ld.volatile.shared.f32 %f27, [%r14+32]; ld.volatile.shared.f32 %f28, [%r14]; add.f32 %f29, %f28, %f27; 
st.volatile.shared.f32 [%r14], %f29; ld.volatile.shared.f32 %f30, [%r14+16]; ld.volatile.shared.f32 %f31, [%r14]; add.f32 %f32, %f31, %f30; st.volatile.shared.f32 [%r14], %f32; ld.volatile.shared.f32 %f33, [%r14+8]; ld.volatile.shared.f32 %f34, [%r14]; add.f32 %f35, %f34, %f33; st.volatile.shared.f32 [%r14], %f35; ld.volatile.shared.f32 %f36, [%r14+4]; ld.volatile.shared.f32 %f37, [%r14]; add.f32 %f38, %f37, %f36; st.volatile.shared.f32 [%r14], %f38; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f39, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd15, %rd7; atom.global.add.f32 %f40, [%rd15], %f39; $L__BB0_15: ret; } ` reducesum_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<46>; .reg .b32 %r<37>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd8, [reducesum_param_0]; ld.param.u64 %rd7, [reducesum_param_1]; ld.param.f32 %f45, [reducesum_param_2]; ld.param.u32 %r17, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r34, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; add.f32 %f45, %f45, %f10; add.s32 %r34, %r34, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; 
mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r34, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f11, [%rd11]; add.f32 %f12, %f45, %f11; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f13, [%rd12]; add.f32 %f14, %f12, %f13; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f15, [%rd13]; add.f32 %f16, %f14, %f15; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f17, [%rd14]; add.f32 %f45, %f16, %f17; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducesumE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f45; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f18, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f19, [%r31]; add.f32 %f20, %f18, %f19; st.shared.f32 [%r14], %f20; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f21, [%r14]; ld.volatile.shared.f32 %f22, [%r14+128]; add.f32 %f23, %f21, %f22; st.volatile.shared.f32 [%r14], %f23; ld.volatile.shared.f32 %f24, [%r14+64]; ld.volatile.shared.f32 %f25, [%r14]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r14], %f26; ld.volatile.shared.f32 %f27, [%r14+32]; ld.volatile.shared.f32 %f28, [%r14]; add.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r14], %f29; ld.volatile.shared.f32 %f30, [%r14+16]; ld.volatile.shared.f32 %f31, [%r14]; add.f32 %f32, %f31, %f30; st.volatile.shared.f32 [%r14], %f32; ld.volatile.shared.f32 %f33, [%r14+8]; ld.volatile.shared.f32 %f34, [%r14]; add.f32 %f35, %f34, %f33; st.volatile.shared.f32 [%r14], %f35; ld.volatile.shared.f32 %f36, [%r14+4]; ld.volatile.shared.f32 %f37, [%r14]; add.f32 %f38, %f37, %f36; st.volatile.shared.f32 [%r14], %f38; $L__BB0_13: setp.ne.s32 
%p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f39, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd15, %rd7; atom.global.add.f32 %f40, [%rd15], %f39; $L__BB0_15: ret; } ` reducesum_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<46>; .reg .b32 %r<37>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd8, [reducesum_param_0]; ld.param.u64 %rd7, [reducesum_param_1]; ld.param.f32 %f45, [reducesum_param_2]; ld.param.u32 %r17, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r34, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; add.f32 %f45, %f45, %f10; add.s32 %r34, %r34, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r34, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f11, [%rd11]; add.f32 %f12, %f45, %f11; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f13, [%rd12]; add.f32 %f14, %f12, %f13; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f15, [%rd13]; add.f32 %f16, %f14, %f15; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f17, [%rd14]; add.f32 %f45, %f16, %f17; 
add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducesumE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f45; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f18, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f19, [%r31]; add.f32 %f20, %f18, %f19; st.shared.f32 [%r14], %f20; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f21, [%r14]; ld.volatile.shared.f32 %f22, [%r14+128]; add.f32 %f23, %f21, %f22; st.volatile.shared.f32 [%r14], %f23; ld.volatile.shared.f32 %f24, [%r14+64]; ld.volatile.shared.f32 %f25, [%r14]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r14], %f26; ld.volatile.shared.f32 %f27, [%r14+32]; ld.volatile.shared.f32 %f28, [%r14]; add.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r14], %f29; ld.volatile.shared.f32 %f30, [%r14+16]; ld.volatile.shared.f32 %f31, [%r14]; add.f32 %f32, %f31, %f30; st.volatile.shared.f32 [%r14], %f32; ld.volatile.shared.f32 %f33, [%r14+8]; ld.volatile.shared.f32 %f34, [%r14]; add.f32 %f35, %f34, %f33; st.volatile.shared.f32 [%r14], %f35; ld.volatile.shared.f32 %f36, [%r14+4]; ld.volatile.shared.f32 %f37, [%r14]; add.f32 %f38, %f37, %f36; st.volatile.shared.f32 [%r14], %f38; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f39, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd15, %rd7; atom.global.add.f32 %f40, [%rd15], %f39; $L__BB0_15: ret; } ` reducesum_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<46>; .reg .b32 %r<37>; .reg .b64 %rd<17>; 
// demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd8, [reducesum_param_0]; ld.param.u64 %rd7, [reducesum_param_1]; ld.param.f32 %f45, [reducesum_param_2]; ld.param.u32 %r17, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r34, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; add.f32 %f45, %f45, %f10; add.s32 %r34, %r34, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r34, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f11, [%rd11]; add.f32 %f12, %f45, %f11; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f13, [%rd12]; add.f32 %f14, %f12, %f13; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f15, [%rd13]; add.f32 %f16, %f14, %f15; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f17, [%rd14]; add.f32 %f45, %f16, %f17; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducesumE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f45; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f18, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f19, [%r31]; add.f32 %f20, %f18, %f19; st.shared.f32 
[%r14], %f20; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f21, [%r14]; ld.volatile.shared.f32 %f22, [%r14+128]; add.f32 %f23, %f21, %f22; st.volatile.shared.f32 [%r14], %f23; ld.volatile.shared.f32 %f24, [%r14+64]; ld.volatile.shared.f32 %f25, [%r14]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r14], %f26; ld.volatile.shared.f32 %f27, [%r14+32]; ld.volatile.shared.f32 %f28, [%r14]; add.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r14], %f29; ld.volatile.shared.f32 %f30, [%r14+16]; ld.volatile.shared.f32 %f31, [%r14]; add.f32 %f32, %f31, %f30; st.volatile.shared.f32 [%r14], %f32; ld.volatile.shared.f32 %f33, [%r14+8]; ld.volatile.shared.f32 %f34, [%r14]; add.f32 %f35, %f34, %f33; st.volatile.shared.f32 [%r14], %f35; ld.volatile.shared.f32 %f36, [%r14+4]; ld.volatile.shared.f32 %f37, [%r14]; add.f32 %f38, %f37, %f36; st.volatile.shared.f32 [%r14], %f38; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f39, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd15, %rd7; atom.global.add.f32 %f40, [%rd15], %f39; $L__BB0_15: ret; } ` reducesum_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<46>; .reg .b32 %r<37>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd8, [reducesum_param_0]; ld.param.u64 %rd7, [reducesum_param_1]; ld.param.f32 %f45, [reducesum_param_2]; ld.param.u32 %r17, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, 
%r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r34, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; add.f32 %f45, %f45, %f10; add.s32 %r34, %r34, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r34, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f11, [%rd11]; add.f32 %f12, %f45, %f11; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f13, [%rd12]; add.f32 %f14, %f12, %f13; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f15, [%rd13]; add.f32 %f16, %f14, %f15; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f17, [%rd14]; add.f32 %f45, %f16, %f17; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducesumE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f45; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f18, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f19, [%r31]; add.f32 %f20, %f18, %f19; st.shared.f32 [%r14], %f20; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f21, [%r14]; ld.volatile.shared.f32 %f22, [%r14+128]; add.f32 %f23, %f21, %f22; st.volatile.shared.f32 [%r14], %f23; ld.volatile.shared.f32 %f24, [%r14+64]; ld.volatile.shared.f32 %f25, [%r14]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r14], %f26; ld.volatile.shared.f32 %f27, [%r14+32]; 
ld.volatile.shared.f32 %f28, [%r14]; add.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r14], %f29; ld.volatile.shared.f32 %f30, [%r14+16]; ld.volatile.shared.f32 %f31, [%r14]; add.f32 %f32, %f31, %f30; st.volatile.shared.f32 [%r14], %f32; ld.volatile.shared.f32 %f33, [%r14+8]; ld.volatile.shared.f32 %f34, [%r14]; add.f32 %f35, %f34, %f33; st.volatile.shared.f32 [%r14], %f35; ld.volatile.shared.f32 %f36, [%r14+4]; ld.volatile.shared.f32 %f37, [%r14]; add.f32 %f38, %f37, %f36; st.volatile.shared.f32 [%r14], %f38; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f39, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd15, %rd7; atom.global.add.f32 %f40, [%rd15], %f39; $L__BB0_15: ret; } ` reducesum_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<46>; .reg .b32 %r<37>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd8, [reducesum_param_0]; ld.param.u64 %rd7, [reducesum_param_1]; ld.param.f32 %f45, [reducesum_param_2]; ld.param.u32 %r17, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r34, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; add.f32 %f45, %f45, %f10; add.s32 %r34, %r34, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; 
$L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r34, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f11, [%rd11]; add.f32 %f12, %f45, %f11; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f13, [%rd12]; add.f32 %f14, %f12, %f13; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f15, [%rd13]; add.f32 %f16, %f14, %f15; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f17, [%rd14]; add.f32 %f45, %f16, %f17; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducesumE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f45; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f18, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f19, [%r31]; add.f32 %f20, %f18, %f19; st.shared.f32 [%r14], %f20; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f21, [%r14]; ld.volatile.shared.f32 %f22, [%r14+128]; add.f32 %f23, %f21, %f22; st.volatile.shared.f32 [%r14], %f23; ld.volatile.shared.f32 %f24, [%r14+64]; ld.volatile.shared.f32 %f25, [%r14]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r14], %f26; ld.volatile.shared.f32 %f27, [%r14+32]; ld.volatile.shared.f32 %f28, [%r14]; add.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r14], %f29; ld.volatile.shared.f32 %f30, [%r14+16]; ld.volatile.shared.f32 %f31, [%r14]; add.f32 %f32, %f31, %f30; st.volatile.shared.f32 [%r14], %f32; ld.volatile.shared.f32 %f33, [%r14+8]; ld.volatile.shared.f32 %f34, [%r14]; add.f32 %f35, %f34, %f33; st.volatile.shared.f32 [%r14], %f35; ld.volatile.shared.f32 %f36, [%r14+4]; ld.volatile.shared.f32 %f37, [%r14]; add.f32 %f38, %f37, %f36; 
st.volatile.shared.f32 [%r14], %f38; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f39, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd15, %rd7; atom.global.add.f32 %f40, [%rd15], %f39; $L__BB0_15: ret; } ` reducesum_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<11>; .reg .f32 %f<46>; .reg .b32 %r<37>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd8, [reducesum_param_0]; ld.param.u64 %rd7, [reducesum_param_1]; ld.param.f32 %f45, [reducesum_param_2]; ld.param.u32 %r17, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r34, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; add.f32 %f45, %f45, %f10; add.s32 %r34, %r34, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r34, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f11, [%rd11]; add.f32 %f12, %f45, %f11; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f13, [%rd12]; add.f32 %f14, %f12, %f13; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f15, [%rd13]; add.f32 %f16, %f14, %f15; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, 
%rd6; ld.global.nc.f32 %f17, [%rd14]; add.f32 %f45, %f16, %f17; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducesumE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f45; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f18, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; ld.shared.f32 %f19, [%r31]; add.f32 %f20, %f18, %f19; st.shared.f32 [%r14], %f20; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f21, [%r14]; ld.volatile.shared.f32 %f22, [%r14+128]; add.f32 %f23, %f21, %f22; st.volatile.shared.f32 [%r14], %f23; ld.volatile.shared.f32 %f24, [%r14+64]; ld.volatile.shared.f32 %f25, [%r14]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r14], %f26; ld.volatile.shared.f32 %f27, [%r14+32]; ld.volatile.shared.f32 %f28, [%r14]; add.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r14], %f29; ld.volatile.shared.f32 %f30, [%r14+16]; ld.volatile.shared.f32 %f31, [%r14]; add.f32 %f32, %f31, %f30; st.volatile.shared.f32 [%r14], %f32; ld.volatile.shared.f32 %f33, [%r14+8]; ld.volatile.shared.f32 %f34, [%r14]; add.f32 %f35, %f34, %f33; st.volatile.shared.f32 [%r14], %f35; ld.volatile.shared.f32 %f36, [%r14+4]; ld.volatile.shared.f32 %f37, [%r14]; add.f32 %f38, %f37, %f36; st.volatile.shared.f32 [%r14], %f38; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f39, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd15, %rd7; atom.global.add.f32 %f40, [%rd15], %f39; $L__BB0_15: ret; } ` reducesum_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg 
.pred %p<11>; .reg .f32 %f<46>; .reg .b32 %r<37>; .reg .b64 %rd<17>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd8, [reducesum_param_0]; ld.param.u64 %rd7, [reducesum_param_1]; ld.param.f32 %f45, [reducesum_param_2]; ld.param.u32 %r17, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd8; mov.u32 %r36, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r34, %r18, %r36, %r2; mov.u32 %r19, %nctaid.x; mul.lo.s32 %r4, %r19, %r36; setp.ge.s32 %p1, %r34, %r17; @%p1 bra $L__BB0_7; add.s32 %r20, %r4, %r17; add.s32 %r21, %r34, %r4; not.b32 %r22, %r21; add.s32 %r23, %r20, %r22; div.u32 %r5, %r23, %r4; add.s32 %r24, %r5, 1; and.b32 %r33, %r24, 3; setp.eq.s32 %p2, %r33, 0; @%p2 bra $L__BB0_4; mul.wide.s32 %rd9, %r34, 4; add.s64 %rd16, %rd1, %rd9; mul.wide.s32 %rd3, %r4, 4; $L__BB0_3: .pragma "nounroll"; ld.global.nc.f32 %f10, [%rd16]; add.f32 %f45, %f45, %f10; add.s32 %r34, %r34, %r4; add.s64 %rd16, %rd16, %rd3; add.s32 %r33, %r33, -1; setp.ne.s32 %p3, %r33, 0; @%p3 bra $L__BB0_3; $L__BB0_4: setp.lt.u32 %p4, %r5, 3; @%p4 bra $L__BB0_7; mul.wide.s32 %rd6, %r4, 4; $L__BB0_6: mul.wide.s32 %rd10, %r34, 4; add.s64 %rd11, %rd1, %rd10; ld.global.nc.f32 %f11, [%rd11]; add.f32 %f12, %f45, %f11; add.s64 %rd12, %rd11, %rd6; ld.global.nc.f32 %f13, [%rd12]; add.f32 %f14, %f12, %f13; add.s32 %r25, %r34, %r4; add.s32 %r26, %r25, %r4; add.s64 %rd13, %rd12, %rd6; ld.global.nc.f32 %f15, [%rd13]; add.f32 %f16, %f14, %f15; add.s32 %r27, %r26, %r4; add.s64 %rd14, %rd13, %rd6; ld.global.nc.f32 %f17, [%rd14]; add.f32 %f45, %f16, %f17; add.s32 %r34, %r27, %r4; setp.lt.s32 %p5, %r34, %r17; @%p5 bra $L__BB0_6; $L__BB0_7: shl.b32 %r28, %r2, 2; mov.u32 %r29, _ZZ9reducesumE5sdata; add.s32 %r14, %r29, %r28; st.shared.f32 [%r14], %f45; bar.sync 0; setp.lt.u32 %p6, %r36, 66; @%p6 bra $L__BB0_11; $L__BB0_8: shr.u32 %r16, %r36, 1; setp.ge.u32 %p7, %r2, %r16; @%p7 bra $L__BB0_10; ld.shared.f32 %f18, [%r14]; shl.b32 %r30, %r16, 2; add.s32 %r31, %r14, %r30; 
ld.shared.f32 %f19, [%r31]; add.f32 %f20, %f18, %f19; st.shared.f32 [%r14], %f20; $L__BB0_10: bar.sync 0; setp.gt.u32 %p8, %r36, 131; mov.u32 %r36, %r16; @%p8 bra $L__BB0_8; $L__BB0_11: setp.gt.s32 %p9, %r2, 31; @%p9 bra $L__BB0_13; ld.volatile.shared.f32 %f21, [%r14]; ld.volatile.shared.f32 %f22, [%r14+128]; add.f32 %f23, %f21, %f22; st.volatile.shared.f32 [%r14], %f23; ld.volatile.shared.f32 %f24, [%r14+64]; ld.volatile.shared.f32 %f25, [%r14]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r14], %f26; ld.volatile.shared.f32 %f27, [%r14+32]; ld.volatile.shared.f32 %f28, [%r14]; add.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r14], %f29; ld.volatile.shared.f32 %f30, [%r14+16]; ld.volatile.shared.f32 %f31, [%r14]; add.f32 %f32, %f31, %f30; st.volatile.shared.f32 [%r14], %f32; ld.volatile.shared.f32 %f33, [%r14+8]; ld.volatile.shared.f32 %f34, [%r14]; add.f32 %f35, %f34, %f33; st.volatile.shared.f32 [%r14], %f35; ld.volatile.shared.f32 %f36, [%r14+4]; ld.volatile.shared.f32 %f37, [%r14]; add.f32 %f38, %f37, %f36; st.volatile.shared.f32 [%r14], %f38; $L__BB0_13: setp.ne.s32 %p10, %r2, 0; @%p10 bra $L__BB0_15; ld.shared.f32 %f39, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd15, %rd7; atom.global.add.f32 %f40, [%rd15], %f39; $L__BB0_15: ret; } ` ) 3-3.11.1/cuda/region.go000066400000000000000000000024221503346766200145010ustar00rootroot00000000000000package cuda import ( "unsafe" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // dst += LUT[region], for vectors. Used to add terms to excitation. func RegionAddV(dst *data.Slice, lut LUTPtrs, regions *Bytes) { util.Argument(dst.NComp() == 3) N := dst.Len() cfg := make1DConf(N) k_regionaddv_async(dst.DevPtr(X), dst.DevPtr(Y), dst.DevPtr(Z), lut[X], lut[Y], lut[Z], regions.Ptr, N, cfg) } // dst += LUT[region], for scalar. Used to add terms to scalar excitation. 
func RegionAddS(dst *data.Slice, lut LUTPtr, regions *Bytes) { util.Argument(dst.NComp() == 1) N := dst.Len() cfg := make1DConf(N) k_regionadds_async(dst.DevPtr(0), unsafe.Pointer(lut), regions.Ptr, N, cfg) } // decode the regions+LUT pair into an uncompressed array func RegionDecode(dst *data.Slice, lut LUTPtr, regions *Bytes) { N := dst.Len() cfg := make1DConf(N) k_regiondecode_async(dst.DevPtr(0), unsafe.Pointer(lut), regions.Ptr, N, cfg) } // select the part of src within the specified region, set 0's everywhere else. func RegionSelect(dst, src *data.Slice, regions *Bytes, region byte) { util.Argument(dst.NComp() == src.NComp()) N := dst.Len() cfg := make1DConf(N) for c := 0; c < dst.NComp(); c++ { k_regionselect_async(dst.DevPtr(c), src.DevPtr(c), regions.Ptr, region, N, cfg) } } 3-3.11.1/cuda/regionadds.cu000066400000000000000000000005551503346766200153440ustar00rootroot00000000000000#include // add region-based scalar to dst: // dst[i] += LUT[region[i]] extern "C" __global__ void regionadds(float* __restrict__ dst, float* __restrict__ LUT, uint8_t* regions, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { uint8_t r = regions[i]; dst[i] += LUT[r]; } } 3-3.11.1/cuda/regionadds_wrapper.go000066400000000000000000000467021503346766200171060ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for regionadds kernel var regionadds_code cu.Function // Stores the arguments for regionadds kernel invocation type regionadds_args_t struct { arg_dst unsafe.Pointer arg_LUT unsafe.Pointer arg_regions unsafe.Pointer arg_N int argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for regionadds kernel invocation var regionadds_args regionadds_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
regionadds_args.argptr[0] = unsafe.Pointer(®ionadds_args.arg_dst) regionadds_args.argptr[1] = unsafe.Pointer(®ionadds_args.arg_LUT) regionadds_args.argptr[2] = unsafe.Pointer(®ionadds_args.arg_regions) regionadds_args.argptr[3] = unsafe.Pointer(®ionadds_args.arg_N) } // Wrapper for regionadds CUDA kernel, asynchronous. func k_regionadds_async(dst unsafe.Pointer, LUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("regionadds") } regionadds_args.Lock() defer regionadds_args.Unlock() if regionadds_code == 0 { regionadds_code = fatbinLoad(regionadds_map, "regionadds") } regionadds_args.arg_dst = dst regionadds_args.arg_LUT = LUT regionadds_args.arg_regions = regions regionadds_args.arg_N = N args := regionadds_args.argptr[:] cu.LaunchKernel(regionadds_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("regionadds") } } // maps compute capability on PTX code for regionadds kernel. var regionadds_map = map[int]string{0: "", 50: regionadds_ptx_50, 52: regionadds_ptx_52, 53: regionadds_ptx_53, 60: regionadds_ptx_60, 61: regionadds_ptx_61, 62: regionadds_ptx_62, 70: regionadds_ptx_70, 72: regionadds_ptx_72, 75: regionadds_ptx_75, 80: regionadds_ptx_80, 86: regionadds_ptx_86, 87: regionadds_ptx_87, 89: regionadds_ptx_89, 90: regionadds_ptx_90} // regionadds PTX code for various compute capabilities. 
const ( regionadds_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; $L__BB0_2: ret; } ` regionadds_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; 
ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; $L__BB0_2: ret; } ` regionadds_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; $L__BB0_2: ret; } ` regionadds_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, 
[regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; $L__BB0_2: ret; } ` regionadds_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; $L__BB0_2: ret; } ` regionadds_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // 
.globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; $L__BB0_2: ret; } ` regionadds_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; 
and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; $L__BB0_2: ret; } ` regionadds_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; $L__BB0_2: ret; } ` regionadds_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; 
mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; $L__BB0_2: ret; } ` regionadds_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; $L__BB0_2: ret; } ` regionadds_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 
regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; $L__BB0_2: ret; } ` regionadds_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; 
cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; $L__BB0_2: ret; } ` regionadds_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; $L__BB0_2: ret; } ` regionadds_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; 
mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/regionaddv.cu000066400000000000000000000010461503346766200153430ustar00rootroot00000000000000#include // add region-based vector to dst: // dst[i] += LUT[region[i]] extern "C" __global__ void regionaddv(float* __restrict__ dstx, float* __restrict__ dsty, float* __restrict__ dstz, float* __restrict__ LUTx, float* __restrict__ LUTy, float* __restrict__ LUTz, uint8_t* regions, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { uint8_t r = regions[i]; dstx[i] += LUTx[r]; dsty[i] += LUTy[r]; dstz[i] += LUTz[r]; } } 3-3.11.1/cuda/regionaddv_wrapper.go000066400000000000000000000761541503346766200171150ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for regionaddv kernel var regionaddv_code cu.Function // Stores the arguments for regionaddv kernel invocation type regionaddv_args_t struct { arg_dstx unsafe.Pointer arg_dsty unsafe.Pointer arg_dstz unsafe.Pointer arg_LUTx unsafe.Pointer arg_LUTy unsafe.Pointer arg_LUTz unsafe.Pointer arg_regions unsafe.Pointer arg_N int argptr [8]unsafe.Pointer sync.Mutex } // Stores the arguments for regionaddv kernel invocation var regionaddv_args regionaddv_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
regionaddv_args.argptr[0] = unsafe.Pointer(®ionaddv_args.arg_dstx) regionaddv_args.argptr[1] = unsafe.Pointer(®ionaddv_args.arg_dsty) regionaddv_args.argptr[2] = unsafe.Pointer(®ionaddv_args.arg_dstz) regionaddv_args.argptr[3] = unsafe.Pointer(®ionaddv_args.arg_LUTx) regionaddv_args.argptr[4] = unsafe.Pointer(®ionaddv_args.arg_LUTy) regionaddv_args.argptr[5] = unsafe.Pointer(®ionaddv_args.arg_LUTz) regionaddv_args.argptr[6] = unsafe.Pointer(®ionaddv_args.arg_regions) regionaddv_args.argptr[7] = unsafe.Pointer(®ionaddv_args.arg_N) } // Wrapper for regionaddv CUDA kernel, asynchronous. func k_regionaddv_async(dstx unsafe.Pointer, dsty unsafe.Pointer, dstz unsafe.Pointer, LUTx unsafe.Pointer, LUTy unsafe.Pointer, LUTz unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("regionaddv") } regionaddv_args.Lock() defer regionaddv_args.Unlock() if regionaddv_code == 0 { regionaddv_code = fatbinLoad(regionaddv_map, "regionaddv") } regionaddv_args.arg_dstx = dstx regionaddv_args.arg_dsty = dsty regionaddv_args.arg_dstz = dstz regionaddv_args.arg_LUTx = LUTx regionaddv_args.arg_LUTy = LUTy regionaddv_args.arg_LUTz = LUTz regionaddv_args.arg_regions = regions regionaddv_args.arg_N = N args := regionaddv_args.argptr[:] cu.LaunchKernel(regionaddv_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("regionaddv") } } // maps compute capability on PTX code for regionaddv kernel. var regionaddv_map = map[int]string{0: "", 50: regionaddv_ptx_50, 52: regionaddv_ptx_52, 53: regionaddv_ptx_53, 60: regionaddv_ptx_60, 61: regionaddv_ptx_61, 62: regionaddv_ptx_62, 70: regionaddv_ptx_70, 72: regionaddv_ptx_72, 75: regionaddv_ptx_75, 80: regionaddv_ptx_80, 86: regionaddv_ptx_86, 87: regionaddv_ptx_87, 89: regionaddv_ptx_89, 90: regionaddv_ptx_90} // regionaddv PTX code for various compute capabilities. 
const ( regionaddv_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; $L__BB0_2: ret; } ` regionaddv_ptx_52 = ` .version 8.5 .target 
sm_52 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; $L__BB0_2: ret; } ` regionaddv_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl regionaddv .visible 
.entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; $L__BB0_2: ret; } ` regionaddv_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, 
.param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; $L__BB0_2: ret; } ` regionaddv_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 
regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; $L__BB0_2: ret; } ` regionaddv_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, 
.param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; $L__BB0_2: ret; } ` regionaddv_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 
regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; $L__BB0_2: ret; } ` regionaddv_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, 
.param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; $L__BB0_2: ret; } ` regionaddv_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; 
.reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; $L__BB0_2: ret; } ` regionaddv_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; 
.reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; $L__BB0_2: ret; } ` regionaddv_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, 
[regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; $L__BB0_2: ret; } ` regionaddv_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, 
[regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; $L__BB0_2: ret; } ` regionaddv_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, 
[regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; $L__BB0_2: ret; } ` regionaddv_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, 
[regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/regiondecode.cu000066400000000000000000000005231503346766200156470ustar00rootroot00000000000000#include // decode the regions+LUT pair into an uncompressed array extern "C" __global__ void regiondecode(float* __restrict__ dst, float* __restrict__ LUT, uint8_t* regions, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { dst[i] = LUT[regions[i]]; } } 3-3.11.1/cuda/regiondecode_wrapper.go000066400000000000000000000461241503346766200174140ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for regiondecode kernel var regiondecode_code cu.Function // Stores the arguments for regiondecode kernel invocation type regiondecode_args_t struct { arg_dst unsafe.Pointer arg_LUT unsafe.Pointer arg_regions unsafe.Pointer arg_N int argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for regiondecode kernel invocation var regiondecode_args regiondecode_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. regiondecode_args.argptr[0] = unsafe.Pointer(®iondecode_args.arg_dst) regiondecode_args.argptr[1] = unsafe.Pointer(®iondecode_args.arg_LUT) regiondecode_args.argptr[2] = unsafe.Pointer(®iondecode_args.arg_regions) regiondecode_args.argptr[3] = unsafe.Pointer(®iondecode_args.arg_N) } // Wrapper for regiondecode CUDA kernel, asynchronous. func k_regiondecode_async(dst unsafe.Pointer, LUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("regiondecode") } regiondecode_args.Lock() defer regiondecode_args.Unlock() if regiondecode_code == 0 { regiondecode_code = fatbinLoad(regiondecode_map, "regiondecode") } regiondecode_args.arg_dst = dst regiondecode_args.arg_LUT = LUT regiondecode_args.arg_regions = regions regiondecode_args.arg_N = N args := regiondecode_args.argptr[:] cu.LaunchKernel(regiondecode_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("regiondecode") } } // maps compute capability on PTX code for regiondecode kernel. 
var regiondecode_map = map[int]string{0: "", 50: regiondecode_ptx_50, 52: regiondecode_ptx_52, 53: regiondecode_ptx_53, 60: regiondecode_ptx_60, 61: regiondecode_ptx_61, 62: regiondecode_ptx_62, 70: regiondecode_ptx_70, 72: regiondecode_ptx_72, 75: regiondecode_ptx_75, 80: regiondecode_ptx_80, 86: regiondecode_ptx_86, 87: regiondecode_ptx_87, 89: regiondecode_ptx_89, 90: regiondecode_ptx_90} // regiondecode PTX code for various compute capabilities. const ( regiondecode_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; $L__BB0_2: ret; } ` regiondecode_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, 
[regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; $L__BB0_2: ret; } ` regiondecode_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; $L__BB0_2: ret; } ` regiondecode_ptx_60 = ` .version 8.5 .target sm_60 .address_size 
64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; $L__BB0_2: ret; } ` regiondecode_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; 
and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; $L__BB0_2: ret; } ` regiondecode_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; $L__BB0_2: ret; } ` regiondecode_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 
%r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; $L__BB0_2: ret; } ` regiondecode_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; $L__BB0_2: ret; } ` regiondecode_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; 
.reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; $L__BB0_2: ret; } ` regiondecode_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; $L__BB0_2: 
ret; } ` regiondecode_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; $L__BB0_2: ret; } ` regiondecode_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 
%rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; $L__BB0_2: ret; } ` regiondecode_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; $L__BB0_2: ret; } ` regiondecode_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, 
%ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/regionselect.cu000066400000000000000000000004731503346766200157070ustar00rootroot00000000000000#include extern "C" __global__ void regionselect(float* __restrict__ dst, float* __restrict__ src, uint8_t* regions, uint8_t region, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { dst[i] = (regions[i] == region? src[i]: 0.0f); } } 3-3.11.1/cuda/regionselect_wrapper.go000066400000000000000000000513351503346766200174500ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for regionselect kernel var regionselect_code cu.Function // Stores the arguments for regionselect kernel invocation type regionselect_args_t struct { arg_dst unsafe.Pointer arg_src unsafe.Pointer arg_regions unsafe.Pointer arg_region byte arg_N int argptr [5]unsafe.Pointer sync.Mutex } // Stores the arguments for regionselect kernel invocation var regionselect_args regionselect_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
regionselect_args.argptr[0] = unsafe.Pointer(®ionselect_args.arg_dst) regionselect_args.argptr[1] = unsafe.Pointer(®ionselect_args.arg_src) regionselect_args.argptr[2] = unsafe.Pointer(®ionselect_args.arg_regions) regionselect_args.argptr[3] = unsafe.Pointer(®ionselect_args.arg_region) regionselect_args.argptr[4] = unsafe.Pointer(®ionselect_args.arg_N) } // Wrapper for regionselect CUDA kernel, asynchronous. func k_regionselect_async(dst unsafe.Pointer, src unsafe.Pointer, regions unsafe.Pointer, region byte, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("regionselect") } regionselect_args.Lock() defer regionselect_args.Unlock() if regionselect_code == 0 { regionselect_code = fatbinLoad(regionselect_map, "regionselect") } regionselect_args.arg_dst = dst regionselect_args.arg_src = src regionselect_args.arg_regions = regions regionselect_args.arg_region = region regionselect_args.arg_N = N args := regionselect_args.argptr[:] cu.LaunchKernel(regionselect_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("regionselect") } } // maps compute capability on PTX code for regionselect kernel. var regionselect_map = map[int]string{0: "", 50: regionselect_ptx_50, 52: regionselect_ptx_52, 53: regionselect_ptx_53, 60: regionselect_ptx_60, 61: regionselect_ptx_61, 62: regionselect_ptx_62, 70: regionselect_ptx_70, 72: regionselect_ptx_72, 75: regionselect_ptx_75, 80: regionselect_ptx_80, 86: regionselect_ptx_86, 87: regionselect_ptx_87, 89: regionselect_ptx_89, 90: regionselect_ptx_90} // regionselect PTX code for various compute capabilities. 
const ( regionselect_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u8 %rs1, [regionselect_param_3]; ld.param.u64 %rd2, [regionselect_param_0]; ld.param.u64 %rd3, [regionselect_param_1]; ld.param.u64 %rd4, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs2, [%rd6]; setp.ne.s16 %p2, %rs2, %rs1; mov.f32 %f4, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 2; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; $L__BB0_3: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; $L__BB0_4: ret; } ` regionselect_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u8 %rs1, [regionselect_param_3]; ld.param.u64 %rd2, [regionselect_param_0]; ld.param.u64 %rd3, [regionselect_param_1]; ld.param.u64 %rd4, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, 
%tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs2, [%rd6]; setp.ne.s16 %p2, %rs2, %rs1; mov.f32 %f4, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 2; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; $L__BB0_3: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; $L__BB0_4: ret; } ` regionselect_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u8 %rs1, [regionselect_param_3]; ld.param.u64 %rd2, [regionselect_param_0]; ld.param.u64 %rd3, [regionselect_param_1]; ld.param.u64 %rd4, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs2, [%rd6]; setp.ne.s16 %p2, %rs2, %rs1; mov.f32 %f4, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 2; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; $L__BB0_3: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; $L__BB0_4: ret; } ` regionselect_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 
regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u8 %rs1, [regionselect_param_3]; ld.param.u64 %rd2, [regionselect_param_0]; ld.param.u64 %rd3, [regionselect_param_1]; ld.param.u64 %rd4, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs2, [%rd6]; setp.ne.s16 %p2, %rs2, %rs1; mov.f32 %f4, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 2; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; $L__BB0_3: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; $L__BB0_4: ret; } ` regionselect_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u8 %rs1, [regionselect_param_3]; ld.param.u64 %rd2, [regionselect_param_0]; ld.param.u64 %rd3, [regionselect_param_1]; ld.param.u64 %rd4, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs2, [%rd6]; setp.ne.s16 %p2, 
%rs2, %rs1; mov.f32 %f4, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 2; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; $L__BB0_3: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; $L__BB0_4: ret; } ` regionselect_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u8 %rs1, [regionselect_param_3]; ld.param.u64 %rd2, [regionselect_param_0]; ld.param.u64 %rd3, [regionselect_param_1]; ld.param.u64 %rd4, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs2, [%rd6]; setp.ne.s16 %p2, %rs2, %rs1; mov.f32 %f4, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 2; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; $L__BB0_3: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; $L__BB0_4: ret; } ` regionselect_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u8 %rs1, [regionselect_param_3]; 
ld.param.u64 %rd2, [regionselect_param_0]; ld.param.u64 %rd3, [regionselect_param_1]; ld.param.u64 %rd4, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs2, [%rd6]; setp.ne.s16 %p2, %rs2, %rs1; mov.f32 %f4, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 2; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; $L__BB0_3: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; $L__BB0_4: ret; } ` regionselect_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u8 %rs1, [regionselect_param_3]; ld.param.u64 %rd2, [regionselect_param_0]; ld.param.u64 %rd3, [regionselect_param_1]; ld.param.u64 %rd4, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs2, [%rd6]; setp.ne.s16 %p2, %rs2, %rs1; mov.f32 %f4, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 2; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; $L__BB0_3: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, 
%rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; $L__BB0_4: ret; } ` regionselect_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u8 %rs1, [regionselect_param_3]; ld.param.u64 %rd2, [regionselect_param_0]; ld.param.u64 %rd3, [regionselect_param_1]; ld.param.u64 %rd4, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs2, [%rd6]; setp.ne.s16 %p2, %rs2, %rs1; mov.f32 %f4, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 2; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; $L__BB0_3: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; $L__BB0_4: ret; } ` regionselect_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u8 %rs1, [regionselect_param_3]; ld.param.u64 %rd2, [regionselect_param_0]; ld.param.u64 %rd3, [regionselect_param_1]; ld.param.u64 %rd4, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, 
%ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs2, [%rd6]; setp.ne.s16 %p2, %rs2, %rs1; mov.f32 %f4, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 2; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; $L__BB0_3: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; $L__BB0_4: ret; } ` regionselect_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u8 %rs1, [regionselect_param_3]; ld.param.u64 %rd2, [regionselect_param_0]; ld.param.u64 %rd3, [regionselect_param_1]; ld.param.u64 %rd4, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs2, [%rd6]; setp.ne.s16 %p2, %rs2, %rs1; mov.f32 %f4, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 2; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; $L__BB0_3: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; $L__BB0_4: ret; } ` regionselect_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 
regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u8 %rs1, [regionselect_param_3]; ld.param.u64 %rd2, [regionselect_param_0]; ld.param.u64 %rd3, [regionselect_param_1]; ld.param.u64 %rd4, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs2, [%rd6]; setp.ne.s16 %p2, %rs2, %rs1; mov.f32 %f4, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 2; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; $L__BB0_3: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; $L__BB0_4: ret; } ` regionselect_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u8 %rs1, [regionselect_param_3]; ld.param.u64 %rd2, [regionselect_param_0]; ld.param.u64 %rd3, [regionselect_param_1]; ld.param.u64 %rd4, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; 
add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs2, [%rd6]; setp.ne.s16 %p2, %rs2, %rs1; mov.f32 %f4, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 2; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; $L__BB0_3: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; $L__BB0_4: ret; } ` regionselect_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u8 %rs1, [regionselect_param_3]; ld.param.u64 %rd2, [regionselect_param_0]; ld.param.u64 %rd3, [regionselect_param_1]; ld.param.u64 %rd4, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_4; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs2, [%rd6]; setp.ne.s16 %p2, %rs2, %rs1; mov.f32 %f4, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 2; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; $L__BB0_3: cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; $L__BB0_4: ret; } ` ) 3-3.11.1/cuda/resize.cu000066400000000000000000000014321503346766200145210ustar00rootroot00000000000000 // Select and resize one layer for interactive output extern "C" __global__ void resize(float* __restrict__ dst, int Dx, int Dy, int Dz, float* __restrict__ src, int Sx, int Sy, int Sz, int layer, int scalex, int scaley) { int ix = blockIdx.x * 
blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; if (ix 0 && scaley > 0) cfg := make3DConf(dstsize) k_resize_async(dst.DevPtr(0), dstsize[X], dstsize[Y], dstsize[Z], src.DevPtr(0), srcsize[X], srcsize[Y], srcsize[Z], layer, scalex, scaley, cfg) } 3-3.11.1/cuda/resize_wrapper.go000066400000000000000000001707131503346766200162700ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for resize kernel var resize_code cu.Function // Stores the arguments for resize kernel invocation type resize_args_t struct { arg_dst unsafe.Pointer arg_Dx int arg_Dy int arg_Dz int arg_src unsafe.Pointer arg_Sx int arg_Sy int arg_Sz int arg_layer int arg_scalex int arg_scaley int argptr [11]unsafe.Pointer sync.Mutex } // Stores the arguments for resize kernel invocation var resize_args resize_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. resize_args.argptr[0] = unsafe.Pointer(&resize_args.arg_dst) resize_args.argptr[1] = unsafe.Pointer(&resize_args.arg_Dx) resize_args.argptr[2] = unsafe.Pointer(&resize_args.arg_Dy) resize_args.argptr[3] = unsafe.Pointer(&resize_args.arg_Dz) resize_args.argptr[4] = unsafe.Pointer(&resize_args.arg_src) resize_args.argptr[5] = unsafe.Pointer(&resize_args.arg_Sx) resize_args.argptr[6] = unsafe.Pointer(&resize_args.arg_Sy) resize_args.argptr[7] = unsafe.Pointer(&resize_args.arg_Sz) resize_args.argptr[8] = unsafe.Pointer(&resize_args.arg_layer) resize_args.argptr[9] = unsafe.Pointer(&resize_args.arg_scalex) resize_args.argptr[10] = unsafe.Pointer(&resize_args.arg_scaley) } // Wrapper for resize CUDA kernel, asynchronous. 
func k_resize_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, Sx int, Sy int, Sz int, layer int, scalex int, scaley int, cfg *config) { if Synchronous { // debug Sync() timer.Start("resize") } resize_args.Lock() defer resize_args.Unlock() if resize_code == 0 { resize_code = fatbinLoad(resize_map, "resize") } resize_args.arg_dst = dst resize_args.arg_Dx = Dx resize_args.arg_Dy = Dy resize_args.arg_Dz = Dz resize_args.arg_src = src resize_args.arg_Sx = Sx resize_args.arg_Sy = Sy resize_args.arg_Sz = Sz resize_args.arg_layer = layer resize_args.arg_scalex = scalex resize_args.arg_scaley = scaley args := resize_args.argptr[:] cu.LaunchKernel(resize_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("resize") } } // maps compute capability on PTX code for resize kernel. var resize_map = map[int]string{0: "", 50: resize_ptx_50, 52: resize_ptx_52, 53: resize_ptx_53, 60: resize_ptx_60, 61: resize_ptx_61, 62: resize_ptx_62, 70: resize_ptx_70, 72: resize_ptx_72, 75: resize_ptx_75, 80: resize_ptx_80, 86: resize_ptx_86, 87: resize_ptx_87, 89: resize_ptx_89, 90: resize_ptx_90} // resize PTX code for various compute capabilities. 
const ( resize_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<80>; .reg .b32 %r<49>; .reg .b64 %rd<11>; ld.param.u64 %rd4, [resize_param_0]; ld.param.u32 %r20, [resize_param_1]; ld.param.u32 %r26, [resize_param_2]; ld.param.u64 %rd5, [resize_param_4]; ld.param.u32 %r21, [resize_param_5]; ld.param.u32 %r22, [resize_param_6]; ld.param.u32 %r23, [resize_param_8]; ld.param.u32 %r24, [resize_param_9]; ld.param.u32 %r25, [resize_param_10]; cvta.to.global.u64 %rd1, %rd5; mov.u32 %r27, %ntid.x; mov.u32 %r28, %ctaid.x; mov.u32 %r29, %tid.x; mad.lo.s32 %r1, %r28, %r27, %r29; mov.u32 %r30, %ntid.y; mov.u32 %r31, %ctaid.y; mov.u32 %r32, %tid.y; mad.lo.s32 %r2, %r31, %r30, %r32; setp.ge.s32 %p1, %r1, %r20; setp.ge.s32 %p2, %r2, %r26; or.pred %p3, %p1, %p2; mov.f32 %f60, 0f00000000; mov.f32 %f61, 0f00000000; @%p3 bra $L__BB0_26; setp.lt.s32 %p4, %r25, 1; @%p4 bra $L__BB0_25; mul.lo.s32 %r3, %r1, %r24; setp.lt.s32 %p5, %r24, 1; @%p5 bra $L__BB0_25; add.s32 %r4, %r24, -1; and.b32 %r5, %r24, 3; sub.s32 %r6, %r24, %r5; mul.lo.s32 %r7, %r23, %r22; mul.lo.s32 %r8, %r2, %r25; mov.f32 %f61, 0f00000000; mov.u32 %r33, 0; mov.u32 %r45, %r33; mov.f32 %f60, %f61; $L__BB0_4: add.s32 %r10, %r45, %r8; add.s32 %r35, %r10, %r7; mul.lo.s32 %r11, %r35, %r21; setp.lt.u32 %p6, %r4, 3; mov.u32 %r48, %r33; @%p6 bra $L__BB0_15; mov.u32 %r48, 0; mov.u32 %r47, %r6; $L__BB0_6: add.s32 %r14, %r48, %r3; setp.ge.s32 %p7, %r14, %r21; setp.ge.s32 %p8, %r10, %r22; add.s32 %r37, %r14, %r11; mul.wide.s32 %rd6, %r37, 4; add.s64 %rd2, %rd1, %rd6; or.pred %p9, %p8, %p7; @%p9 bra $L__BB0_8; ld.global.nc.f32 %f46, [%rd2]; add.f32 
%f60, %f60, %f46; add.f32 %f61, %f61, 0f3F800000; $L__BB0_8: add.s32 %r38, %r14, 1; setp.ge.s32 %p10, %r38, %r21; or.pred %p12, %p8, %p10; @%p12 bra $L__BB0_10; ld.global.nc.f32 %f47, [%rd2+4]; add.f32 %f60, %f60, %f47; add.f32 %f61, %f61, 0f3F800000; $L__BB0_10: add.s32 %r39, %r14, 2; setp.ge.s32 %p13, %r39, %r21; or.pred %p15, %p8, %p13; @%p15 bra $L__BB0_12; ld.global.nc.f32 %f48, [%rd2+8]; add.f32 %f60, %f60, %f48; add.f32 %f61, %f61, 0f3F800000; $L__BB0_12: add.s32 %r40, %r14, 3; setp.ge.s32 %p16, %r40, %r21; or.pred %p18, %p8, %p16; @%p18 bra $L__BB0_14; ld.global.nc.f32 %f49, [%rd2+12]; add.f32 %f60, %f60, %f49; add.f32 %f61, %f61, 0f3F800000; $L__BB0_14: add.s32 %r48, %r48, 4; add.s32 %r47, %r47, -4; setp.ne.s32 %p19, %r47, 0; @%p19 bra $L__BB0_6; $L__BB0_15: setp.eq.s32 %p20, %r5, 0; @%p20 bra $L__BB0_24; setp.ge.s32 %p21, %r10, %r22; add.s32 %r18, %r48, %r3; setp.ge.s32 %p22, %r18, %r21; add.s32 %r41, %r18, %r11; mul.wide.s32 %rd7, %r41, 4; add.s64 %rd3, %rd1, %rd7; or.pred %p23, %p21, %p22; @%p23 bra $L__BB0_18; ld.global.nc.f32 %f50, [%rd3]; add.f32 %f60, %f60, %f50; add.f32 %f61, %f61, 0f3F800000; $L__BB0_18: setp.eq.s32 %p24, %r5, 1; @%p24 bra $L__BB0_24; add.s32 %r42, %r18, 1; setp.ge.s32 %p26, %r42, %r21; or.pred %p27, %p21, %p26; @%p27 bra $L__BB0_21; ld.global.nc.f32 %f51, [%rd3+4]; add.f32 %f60, %f60, %f51; add.f32 %f61, %f61, 0f3F800000; $L__BB0_21: setp.eq.s32 %p28, %r5, 2; @%p28 bra $L__BB0_24; add.s32 %r43, %r18, 2; setp.ge.s32 %p30, %r43, %r21; or.pred %p31, %p21, %p30; @%p31 bra $L__BB0_24; ld.global.nc.f32 %f52, [%rd3+8]; add.f32 %f60, %f60, %f52; add.f32 %f61, %f61, 0f3F800000; $L__BB0_24: add.s32 %r45, %r45, 1; setp.lt.s32 %p32, %r45, %r25; @%p32 bra $L__BB0_4; $L__BB0_25: mad.lo.s32 %r44, %r2, %r20, %r1; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd8, %rd9; div.rn.f32 %f53, %f60, %f61; st.global.f32 [%rd10], %f53; $L__BB0_26: ret; } ` resize_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl 
resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<80>; .reg .b32 %r<49>; .reg .b64 %rd<11>; ld.param.u64 %rd4, [resize_param_0]; ld.param.u32 %r20, [resize_param_1]; ld.param.u32 %r26, [resize_param_2]; ld.param.u64 %rd5, [resize_param_4]; ld.param.u32 %r21, [resize_param_5]; ld.param.u32 %r22, [resize_param_6]; ld.param.u32 %r23, [resize_param_8]; ld.param.u32 %r24, [resize_param_9]; ld.param.u32 %r25, [resize_param_10]; cvta.to.global.u64 %rd1, %rd5; mov.u32 %r27, %ntid.x; mov.u32 %r28, %ctaid.x; mov.u32 %r29, %tid.x; mad.lo.s32 %r1, %r28, %r27, %r29; mov.u32 %r30, %ntid.y; mov.u32 %r31, %ctaid.y; mov.u32 %r32, %tid.y; mad.lo.s32 %r2, %r31, %r30, %r32; setp.ge.s32 %p1, %r1, %r20; setp.ge.s32 %p2, %r2, %r26; or.pred %p3, %p1, %p2; mov.f32 %f60, 0f00000000; mov.f32 %f61, 0f00000000; @%p3 bra $L__BB0_26; setp.lt.s32 %p4, %r25, 1; @%p4 bra $L__BB0_25; mul.lo.s32 %r3, %r1, %r24; setp.lt.s32 %p5, %r24, 1; @%p5 bra $L__BB0_25; add.s32 %r4, %r24, -1; and.b32 %r5, %r24, 3; sub.s32 %r6, %r24, %r5; mul.lo.s32 %r7, %r23, %r22; mul.lo.s32 %r8, %r2, %r25; mov.f32 %f61, 0f00000000; mov.u32 %r33, 0; mov.u32 %r45, %r33; mov.f32 %f60, %f61; $L__BB0_4: add.s32 %r10, %r45, %r8; add.s32 %r35, %r10, %r7; mul.lo.s32 %r11, %r35, %r21; setp.lt.u32 %p6, %r4, 3; mov.u32 %r48, %r33; @%p6 bra $L__BB0_15; mov.u32 %r48, 0; mov.u32 %r47, %r6; $L__BB0_6: add.s32 %r14, %r48, %r3; setp.ge.s32 %p7, %r14, %r21; setp.ge.s32 %p8, %r10, %r22; add.s32 %r37, %r14, %r11; mul.wide.s32 %rd6, %r37, 4; add.s64 %rd2, %rd1, %rd6; or.pred %p9, %p8, %p7; @%p9 bra $L__BB0_8; ld.global.nc.f32 %f46, [%rd2]; add.f32 %f60, %f60, %f46; add.f32 %f61, %f61, 0f3F800000; $L__BB0_8: add.s32 %r38, %r14, 1; 
setp.ge.s32 %p10, %r38, %r21; or.pred %p12, %p8, %p10; @%p12 bra $L__BB0_10; ld.global.nc.f32 %f47, [%rd2+4]; add.f32 %f60, %f60, %f47; add.f32 %f61, %f61, 0f3F800000; $L__BB0_10: add.s32 %r39, %r14, 2; setp.ge.s32 %p13, %r39, %r21; or.pred %p15, %p8, %p13; @%p15 bra $L__BB0_12; ld.global.nc.f32 %f48, [%rd2+8]; add.f32 %f60, %f60, %f48; add.f32 %f61, %f61, 0f3F800000; $L__BB0_12: add.s32 %r40, %r14, 3; setp.ge.s32 %p16, %r40, %r21; or.pred %p18, %p8, %p16; @%p18 bra $L__BB0_14; ld.global.nc.f32 %f49, [%rd2+12]; add.f32 %f60, %f60, %f49; add.f32 %f61, %f61, 0f3F800000; $L__BB0_14: add.s32 %r48, %r48, 4; add.s32 %r47, %r47, -4; setp.ne.s32 %p19, %r47, 0; @%p19 bra $L__BB0_6; $L__BB0_15: setp.eq.s32 %p20, %r5, 0; @%p20 bra $L__BB0_24; setp.ge.s32 %p21, %r10, %r22; add.s32 %r18, %r48, %r3; setp.ge.s32 %p22, %r18, %r21; add.s32 %r41, %r18, %r11; mul.wide.s32 %rd7, %r41, 4; add.s64 %rd3, %rd1, %rd7; or.pred %p23, %p21, %p22; @%p23 bra $L__BB0_18; ld.global.nc.f32 %f50, [%rd3]; add.f32 %f60, %f60, %f50; add.f32 %f61, %f61, 0f3F800000; $L__BB0_18: setp.eq.s32 %p24, %r5, 1; @%p24 bra $L__BB0_24; add.s32 %r42, %r18, 1; setp.ge.s32 %p26, %r42, %r21; or.pred %p27, %p21, %p26; @%p27 bra $L__BB0_21; ld.global.nc.f32 %f51, [%rd3+4]; add.f32 %f60, %f60, %f51; add.f32 %f61, %f61, 0f3F800000; $L__BB0_21: setp.eq.s32 %p28, %r5, 2; @%p28 bra $L__BB0_24; add.s32 %r43, %r18, 2; setp.ge.s32 %p30, %r43, %r21; or.pred %p31, %p21, %p30; @%p31 bra $L__BB0_24; ld.global.nc.f32 %f52, [%rd3+8]; add.f32 %f60, %f60, %f52; add.f32 %f61, %f61, 0f3F800000; $L__BB0_24: add.s32 %r45, %r45, 1; setp.lt.s32 %p32, %r45, %r25; @%p32 bra $L__BB0_4; $L__BB0_25: mad.lo.s32 %r44, %r2, %r20, %r1; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd8, %rd9; div.rn.f32 %f53, %f60, %f61; st.global.f32 [%rd10], %f53; $L__BB0_26: ret; } ` resize_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 
resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<80>; .reg .b32 %r<49>; .reg .b64 %rd<11>; ld.param.u64 %rd4, [resize_param_0]; ld.param.u32 %r20, [resize_param_1]; ld.param.u32 %r26, [resize_param_2]; ld.param.u64 %rd5, [resize_param_4]; ld.param.u32 %r21, [resize_param_5]; ld.param.u32 %r22, [resize_param_6]; ld.param.u32 %r23, [resize_param_8]; ld.param.u32 %r24, [resize_param_9]; ld.param.u32 %r25, [resize_param_10]; cvta.to.global.u64 %rd1, %rd5; mov.u32 %r27, %ntid.x; mov.u32 %r28, %ctaid.x; mov.u32 %r29, %tid.x; mad.lo.s32 %r1, %r28, %r27, %r29; mov.u32 %r30, %ntid.y; mov.u32 %r31, %ctaid.y; mov.u32 %r32, %tid.y; mad.lo.s32 %r2, %r31, %r30, %r32; setp.ge.s32 %p1, %r1, %r20; setp.ge.s32 %p2, %r2, %r26; or.pred %p3, %p1, %p2; mov.f32 %f60, 0f00000000; mov.f32 %f61, 0f00000000; @%p3 bra $L__BB0_26; setp.lt.s32 %p4, %r25, 1; @%p4 bra $L__BB0_25; mul.lo.s32 %r3, %r1, %r24; setp.lt.s32 %p5, %r24, 1; @%p5 bra $L__BB0_25; add.s32 %r4, %r24, -1; and.b32 %r5, %r24, 3; sub.s32 %r6, %r24, %r5; mul.lo.s32 %r7, %r23, %r22; mul.lo.s32 %r8, %r2, %r25; mov.f32 %f61, 0f00000000; mov.u32 %r33, 0; mov.u32 %r45, %r33; mov.f32 %f60, %f61; $L__BB0_4: add.s32 %r10, %r45, %r8; add.s32 %r35, %r10, %r7; mul.lo.s32 %r11, %r35, %r21; setp.lt.u32 %p6, %r4, 3; mov.u32 %r48, %r33; @%p6 bra $L__BB0_15; mov.u32 %r48, 0; mov.u32 %r47, %r6; $L__BB0_6: add.s32 %r14, %r48, %r3; setp.ge.s32 %p7, %r14, %r21; setp.ge.s32 %p8, %r10, %r22; add.s32 %r37, %r14, %r11; mul.wide.s32 %rd6, %r37, 4; add.s64 %rd2, %rd1, %rd6; or.pred %p9, %p8, %p7; @%p9 bra $L__BB0_8; ld.global.nc.f32 %f46, [%rd2]; add.f32 %f60, %f60, %f46; add.f32 %f61, %f61, 0f3F800000; $L__BB0_8: add.s32 %r38, %r14, 1; setp.ge.s32 %p10, %r38, %r21; or.pred %p12, %p8, %p10; @%p12 bra 
$L__BB0_10; ld.global.nc.f32 %f47, [%rd2+4]; add.f32 %f60, %f60, %f47; add.f32 %f61, %f61, 0f3F800000; $L__BB0_10: add.s32 %r39, %r14, 2; setp.ge.s32 %p13, %r39, %r21; or.pred %p15, %p8, %p13; @%p15 bra $L__BB0_12; ld.global.nc.f32 %f48, [%rd2+8]; add.f32 %f60, %f60, %f48; add.f32 %f61, %f61, 0f3F800000; $L__BB0_12: add.s32 %r40, %r14, 3; setp.ge.s32 %p16, %r40, %r21; or.pred %p18, %p8, %p16; @%p18 bra $L__BB0_14; ld.global.nc.f32 %f49, [%rd2+12]; add.f32 %f60, %f60, %f49; add.f32 %f61, %f61, 0f3F800000; $L__BB0_14: add.s32 %r48, %r48, 4; add.s32 %r47, %r47, -4; setp.ne.s32 %p19, %r47, 0; @%p19 bra $L__BB0_6; $L__BB0_15: setp.eq.s32 %p20, %r5, 0; @%p20 bra $L__BB0_24; setp.ge.s32 %p21, %r10, %r22; add.s32 %r18, %r48, %r3; setp.ge.s32 %p22, %r18, %r21; add.s32 %r41, %r18, %r11; mul.wide.s32 %rd7, %r41, 4; add.s64 %rd3, %rd1, %rd7; or.pred %p23, %p21, %p22; @%p23 bra $L__BB0_18; ld.global.nc.f32 %f50, [%rd3]; add.f32 %f60, %f60, %f50; add.f32 %f61, %f61, 0f3F800000; $L__BB0_18: setp.eq.s32 %p24, %r5, 1; @%p24 bra $L__BB0_24; add.s32 %r42, %r18, 1; setp.ge.s32 %p26, %r42, %r21; or.pred %p27, %p21, %p26; @%p27 bra $L__BB0_21; ld.global.nc.f32 %f51, [%rd3+4]; add.f32 %f60, %f60, %f51; add.f32 %f61, %f61, 0f3F800000; $L__BB0_21: setp.eq.s32 %p28, %r5, 2; @%p28 bra $L__BB0_24; add.s32 %r43, %r18, 2; setp.ge.s32 %p30, %r43, %r21; or.pred %p31, %p21, %p30; @%p31 bra $L__BB0_24; ld.global.nc.f32 %f52, [%rd3+8]; add.f32 %f60, %f60, %f52; add.f32 %f61, %f61, 0f3F800000; $L__BB0_24: add.s32 %r45, %r45, 1; setp.lt.s32 %p32, %r45, %r25; @%p32 bra $L__BB0_4; $L__BB0_25: mad.lo.s32 %r44, %r2, %r20, %r1; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd8, %rd9; div.rn.f32 %f53, %f60, %f61; st.global.f32 [%rd10], %f53; $L__BB0_26: ret; } ` resize_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, 
.param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<80>; .reg .b32 %r<49>; .reg .b64 %rd<11>; ld.param.u64 %rd4, [resize_param_0]; ld.param.u32 %r20, [resize_param_1]; ld.param.u32 %r26, [resize_param_2]; ld.param.u64 %rd5, [resize_param_4]; ld.param.u32 %r21, [resize_param_5]; ld.param.u32 %r22, [resize_param_6]; ld.param.u32 %r23, [resize_param_8]; ld.param.u32 %r24, [resize_param_9]; ld.param.u32 %r25, [resize_param_10]; cvta.to.global.u64 %rd1, %rd5; mov.u32 %r27, %ntid.x; mov.u32 %r28, %ctaid.x; mov.u32 %r29, %tid.x; mad.lo.s32 %r1, %r28, %r27, %r29; mov.u32 %r30, %ntid.y; mov.u32 %r31, %ctaid.y; mov.u32 %r32, %tid.y; mad.lo.s32 %r2, %r31, %r30, %r32; setp.ge.s32 %p1, %r1, %r20; setp.ge.s32 %p2, %r2, %r26; or.pred %p3, %p1, %p2; mov.f32 %f60, 0f00000000; mov.f32 %f61, 0f00000000; @%p3 bra $L__BB0_26; setp.lt.s32 %p4, %r25, 1; @%p4 bra $L__BB0_25; mul.lo.s32 %r3, %r1, %r24; setp.lt.s32 %p5, %r24, 1; @%p5 bra $L__BB0_25; add.s32 %r4, %r24, -1; and.b32 %r5, %r24, 3; sub.s32 %r6, %r24, %r5; mul.lo.s32 %r7, %r23, %r22; mul.lo.s32 %r8, %r2, %r25; mov.f32 %f61, 0f00000000; mov.u32 %r33, 0; mov.u32 %r45, %r33; mov.f32 %f60, %f61; $L__BB0_4: add.s32 %r10, %r45, %r8; add.s32 %r35, %r10, %r7; mul.lo.s32 %r11, %r35, %r21; setp.lt.u32 %p6, %r4, 3; mov.u32 %r48, %r33; @%p6 bra $L__BB0_15; mov.u32 %r48, 0; mov.u32 %r47, %r6; $L__BB0_6: add.s32 %r14, %r48, %r3; setp.ge.s32 %p7, %r14, %r21; setp.ge.s32 %p8, %r10, %r22; add.s32 %r37, %r14, %r11; mul.wide.s32 %rd6, %r37, 4; add.s64 %rd2, %rd1, %rd6; or.pred %p9, %p8, %p7; @%p9 bra $L__BB0_8; ld.global.nc.f32 %f46, [%rd2]; add.f32 %f60, %f60, %f46; add.f32 %f61, %f61, 0f3F800000; $L__BB0_8: add.s32 %r38, %r14, 1; setp.ge.s32 %p10, %r38, %r21; or.pred %p12, %p8, %p10; @%p12 bra $L__BB0_10; ld.global.nc.f32 %f47, [%rd2+4]; add.f32 %f60, %f60, %f47; 
add.f32 %f61, %f61, 0f3F800000; $L__BB0_10: add.s32 %r39, %r14, 2; setp.ge.s32 %p13, %r39, %r21; or.pred %p15, %p8, %p13; @%p15 bra $L__BB0_12; ld.global.nc.f32 %f48, [%rd2+8]; add.f32 %f60, %f60, %f48; add.f32 %f61, %f61, 0f3F800000; $L__BB0_12: add.s32 %r40, %r14, 3; setp.ge.s32 %p16, %r40, %r21; or.pred %p18, %p8, %p16; @%p18 bra $L__BB0_14; ld.global.nc.f32 %f49, [%rd2+12]; add.f32 %f60, %f60, %f49; add.f32 %f61, %f61, 0f3F800000; $L__BB0_14: add.s32 %r48, %r48, 4; add.s32 %r47, %r47, -4; setp.ne.s32 %p19, %r47, 0; @%p19 bra $L__BB0_6; $L__BB0_15: setp.eq.s32 %p20, %r5, 0; @%p20 bra $L__BB0_24; setp.ge.s32 %p21, %r10, %r22; add.s32 %r18, %r48, %r3; setp.ge.s32 %p22, %r18, %r21; add.s32 %r41, %r18, %r11; mul.wide.s32 %rd7, %r41, 4; add.s64 %rd3, %rd1, %rd7; or.pred %p23, %p21, %p22; @%p23 bra $L__BB0_18; ld.global.nc.f32 %f50, [%rd3]; add.f32 %f60, %f60, %f50; add.f32 %f61, %f61, 0f3F800000; $L__BB0_18: setp.eq.s32 %p24, %r5, 1; @%p24 bra $L__BB0_24; add.s32 %r42, %r18, 1; setp.ge.s32 %p26, %r42, %r21; or.pred %p27, %p21, %p26; @%p27 bra $L__BB0_21; ld.global.nc.f32 %f51, [%rd3+4]; add.f32 %f60, %f60, %f51; add.f32 %f61, %f61, 0f3F800000; $L__BB0_21: setp.eq.s32 %p28, %r5, 2; @%p28 bra $L__BB0_24; add.s32 %r43, %r18, 2; setp.ge.s32 %p30, %r43, %r21; or.pred %p31, %p21, %p30; @%p31 bra $L__BB0_24; ld.global.nc.f32 %f52, [%rd3+8]; add.f32 %f60, %f60, %f52; add.f32 %f61, %f61, 0f3F800000; $L__BB0_24: add.s32 %r45, %r45, 1; setp.lt.s32 %p32, %r45, %r25; @%p32 bra $L__BB0_4; $L__BB0_25: mad.lo.s32 %r44, %r2, %r20, %r1; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd8, %rd9; div.rn.f32 %f53, %f60, %f61; st.global.f32 [%rd10], %f53; $L__BB0_26: ret; } ` resize_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 
resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<80>; .reg .b32 %r<49>; .reg .b64 %rd<11>; ld.param.u64 %rd4, [resize_param_0]; ld.param.u32 %r20, [resize_param_1]; ld.param.u32 %r26, [resize_param_2]; ld.param.u64 %rd5, [resize_param_4]; ld.param.u32 %r21, [resize_param_5]; ld.param.u32 %r22, [resize_param_6]; ld.param.u32 %r23, [resize_param_8]; ld.param.u32 %r24, [resize_param_9]; ld.param.u32 %r25, [resize_param_10]; cvta.to.global.u64 %rd1, %rd5; mov.u32 %r27, %ntid.x; mov.u32 %r28, %ctaid.x; mov.u32 %r29, %tid.x; mad.lo.s32 %r1, %r28, %r27, %r29; mov.u32 %r30, %ntid.y; mov.u32 %r31, %ctaid.y; mov.u32 %r32, %tid.y; mad.lo.s32 %r2, %r31, %r30, %r32; setp.ge.s32 %p1, %r1, %r20; setp.ge.s32 %p2, %r2, %r26; or.pred %p3, %p1, %p2; mov.f32 %f60, 0f00000000; mov.f32 %f61, 0f00000000; @%p3 bra $L__BB0_26; setp.lt.s32 %p4, %r25, 1; @%p4 bra $L__BB0_25; mul.lo.s32 %r3, %r1, %r24; setp.lt.s32 %p5, %r24, 1; @%p5 bra $L__BB0_25; add.s32 %r4, %r24, -1; and.b32 %r5, %r24, 3; sub.s32 %r6, %r24, %r5; mul.lo.s32 %r7, %r23, %r22; mul.lo.s32 %r8, %r2, %r25; mov.f32 %f61, 0f00000000; mov.u32 %r33, 0; mov.u32 %r45, %r33; mov.f32 %f60, %f61; $L__BB0_4: add.s32 %r10, %r45, %r8; add.s32 %r35, %r10, %r7; mul.lo.s32 %r11, %r35, %r21; setp.lt.u32 %p6, %r4, 3; mov.u32 %r48, %r33; @%p6 bra $L__BB0_15; mov.u32 %r48, 0; mov.u32 %r47, %r6; $L__BB0_6: add.s32 %r14, %r48, %r3; setp.ge.s32 %p7, %r14, %r21; setp.ge.s32 %p8, %r10, %r22; add.s32 %r37, %r14, %r11; mul.wide.s32 %rd6, %r37, 4; add.s64 %rd2, %rd1, %rd6; or.pred %p9, %p8, %p7; @%p9 bra $L__BB0_8; ld.global.nc.f32 %f46, [%rd2]; add.f32 %f60, %f60, %f46; add.f32 %f61, %f61, 0f3F800000; $L__BB0_8: add.s32 %r38, %r14, 1; setp.ge.s32 %p10, %r38, %r21; or.pred %p12, %p8, %p10; @%p12 bra $L__BB0_10; ld.global.nc.f32 %f47, [%rd2+4]; add.f32 %f60, %f60, %f47; add.f32 %f61, %f61, 0f3F800000; $L__BB0_10: add.s32 %r39, %r14, 2; 
setp.ge.s32 %p13, %r39, %r21; or.pred %p15, %p8, %p13; @%p15 bra $L__BB0_12; ld.global.nc.f32 %f48, [%rd2+8]; add.f32 %f60, %f60, %f48; add.f32 %f61, %f61, 0f3F800000; $L__BB0_12: add.s32 %r40, %r14, 3; setp.ge.s32 %p16, %r40, %r21; or.pred %p18, %p8, %p16; @%p18 bra $L__BB0_14; ld.global.nc.f32 %f49, [%rd2+12]; add.f32 %f60, %f60, %f49; add.f32 %f61, %f61, 0f3F800000; $L__BB0_14: add.s32 %r48, %r48, 4; add.s32 %r47, %r47, -4; setp.ne.s32 %p19, %r47, 0; @%p19 bra $L__BB0_6; $L__BB0_15: setp.eq.s32 %p20, %r5, 0; @%p20 bra $L__BB0_24; setp.ge.s32 %p21, %r10, %r22; add.s32 %r18, %r48, %r3; setp.ge.s32 %p22, %r18, %r21; add.s32 %r41, %r18, %r11; mul.wide.s32 %rd7, %r41, 4; add.s64 %rd3, %rd1, %rd7; or.pred %p23, %p21, %p22; @%p23 bra $L__BB0_18; ld.global.nc.f32 %f50, [%rd3]; add.f32 %f60, %f60, %f50; add.f32 %f61, %f61, 0f3F800000; $L__BB0_18: setp.eq.s32 %p24, %r5, 1; @%p24 bra $L__BB0_24; add.s32 %r42, %r18, 1; setp.ge.s32 %p26, %r42, %r21; or.pred %p27, %p21, %p26; @%p27 bra $L__BB0_21; ld.global.nc.f32 %f51, [%rd3+4]; add.f32 %f60, %f60, %f51; add.f32 %f61, %f61, 0f3F800000; $L__BB0_21: setp.eq.s32 %p28, %r5, 2; @%p28 bra $L__BB0_24; add.s32 %r43, %r18, 2; setp.ge.s32 %p30, %r43, %r21; or.pred %p31, %p21, %p30; @%p31 bra $L__BB0_24; ld.global.nc.f32 %f52, [%rd3+8]; add.f32 %f60, %f60, %f52; add.f32 %f61, %f61, 0f3F800000; $L__BB0_24: add.s32 %r45, %r45, 1; setp.lt.s32 %p32, %r45, %r25; @%p32 bra $L__BB0_4; $L__BB0_25: mad.lo.s32 %r44, %r2, %r20, %r1; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd8, %rd9; div.rn.f32 %f53, %f60, %f61; st.global.f32 [%rd10], %f53; $L__BB0_26: ret; } ` resize_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, 
.param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<80>; .reg .b32 %r<49>; .reg .b64 %rd<11>; ld.param.u64 %rd4, [resize_param_0]; ld.param.u32 %r20, [resize_param_1]; ld.param.u32 %r26, [resize_param_2]; ld.param.u64 %rd5, [resize_param_4]; ld.param.u32 %r21, [resize_param_5]; ld.param.u32 %r22, [resize_param_6]; ld.param.u32 %r23, [resize_param_8]; ld.param.u32 %r24, [resize_param_9]; ld.param.u32 %r25, [resize_param_10]; cvta.to.global.u64 %rd1, %rd5; mov.u32 %r27, %ntid.x; mov.u32 %r28, %ctaid.x; mov.u32 %r29, %tid.x; mad.lo.s32 %r1, %r28, %r27, %r29; mov.u32 %r30, %ntid.y; mov.u32 %r31, %ctaid.y; mov.u32 %r32, %tid.y; mad.lo.s32 %r2, %r31, %r30, %r32; setp.ge.s32 %p1, %r1, %r20; setp.ge.s32 %p2, %r2, %r26; or.pred %p3, %p1, %p2; mov.f32 %f60, 0f00000000; mov.f32 %f61, 0f00000000; @%p3 bra $L__BB0_26; setp.lt.s32 %p4, %r25, 1; @%p4 bra $L__BB0_25; mul.lo.s32 %r3, %r1, %r24; setp.lt.s32 %p5, %r24, 1; @%p5 bra $L__BB0_25; add.s32 %r4, %r24, -1; and.b32 %r5, %r24, 3; sub.s32 %r6, %r24, %r5; mul.lo.s32 %r7, %r23, %r22; mul.lo.s32 %r8, %r2, %r25; mov.f32 %f61, 0f00000000; mov.u32 %r33, 0; mov.u32 %r45, %r33; mov.f32 %f60, %f61; $L__BB0_4: add.s32 %r10, %r45, %r8; add.s32 %r35, %r10, %r7; mul.lo.s32 %r11, %r35, %r21; setp.lt.u32 %p6, %r4, 3; mov.u32 %r48, %r33; @%p6 bra $L__BB0_15; mov.u32 %r48, 0; mov.u32 %r47, %r6; $L__BB0_6: add.s32 %r14, %r48, %r3; setp.ge.s32 %p7, %r14, %r21; setp.ge.s32 %p8, %r10, %r22; add.s32 %r37, %r14, %r11; mul.wide.s32 %rd6, %r37, 4; add.s64 %rd2, %rd1, %rd6; or.pred %p9, %p8, %p7; @%p9 bra $L__BB0_8; ld.global.nc.f32 %f46, [%rd2]; add.f32 %f60, %f60, %f46; add.f32 %f61, %f61, 0f3F800000; $L__BB0_8: add.s32 %r38, %r14, 1; setp.ge.s32 %p10, %r38, %r21; or.pred %p12, %p8, %p10; @%p12 bra $L__BB0_10; ld.global.nc.f32 %f47, [%rd2+4]; add.f32 %f60, %f60, %f47; add.f32 %f61, %f61, 0f3F800000; $L__BB0_10: add.s32 %r39, %r14, 2; setp.ge.s32 %p13, %r39, %r21; or.pred %p15, %p8, %p13; @%p15 bra $L__BB0_12; 
ld.global.nc.f32 %f48, [%rd2+8]; add.f32 %f60, %f60, %f48; add.f32 %f61, %f61, 0f3F800000; $L__BB0_12: add.s32 %r40, %r14, 3; setp.ge.s32 %p16, %r40, %r21; or.pred %p18, %p8, %p16; @%p18 bra $L__BB0_14; ld.global.nc.f32 %f49, [%rd2+12]; add.f32 %f60, %f60, %f49; add.f32 %f61, %f61, 0f3F800000; $L__BB0_14: add.s32 %r48, %r48, 4; add.s32 %r47, %r47, -4; setp.ne.s32 %p19, %r47, 0; @%p19 bra $L__BB0_6; $L__BB0_15: setp.eq.s32 %p20, %r5, 0; @%p20 bra $L__BB0_24; setp.ge.s32 %p21, %r10, %r22; add.s32 %r18, %r48, %r3; setp.ge.s32 %p22, %r18, %r21; add.s32 %r41, %r18, %r11; mul.wide.s32 %rd7, %r41, 4; add.s64 %rd3, %rd1, %rd7; or.pred %p23, %p21, %p22; @%p23 bra $L__BB0_18; ld.global.nc.f32 %f50, [%rd3]; add.f32 %f60, %f60, %f50; add.f32 %f61, %f61, 0f3F800000; $L__BB0_18: setp.eq.s32 %p24, %r5, 1; @%p24 bra $L__BB0_24; add.s32 %r42, %r18, 1; setp.ge.s32 %p26, %r42, %r21; or.pred %p27, %p21, %p26; @%p27 bra $L__BB0_21; ld.global.nc.f32 %f51, [%rd3+4]; add.f32 %f60, %f60, %f51; add.f32 %f61, %f61, 0f3F800000; $L__BB0_21: setp.eq.s32 %p28, %r5, 2; @%p28 bra $L__BB0_24; add.s32 %r43, %r18, 2; setp.ge.s32 %p30, %r43, %r21; or.pred %p31, %p21, %p30; @%p31 bra $L__BB0_24; ld.global.nc.f32 %f52, [%rd3+8]; add.f32 %f60, %f60, %f52; add.f32 %f61, %f61, 0f3F800000; $L__BB0_24: add.s32 %r45, %r45, 1; setp.lt.s32 %p32, %r45, %r25; @%p32 bra $L__BB0_4; $L__BB0_25: mad.lo.s32 %r44, %r2, %r20, %r1; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd8, %rd9; div.rn.f32 %f53, %f60, %f61; st.global.f32 [%rd10], %f53; $L__BB0_26: ret; } ` resize_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; 
.reg .f32 %f<80>; .reg .b32 %r<49>; .reg .b64 %rd<11>; ld.param.u64 %rd4, [resize_param_0]; ld.param.u32 %r20, [resize_param_1]; ld.param.u32 %r26, [resize_param_2]; ld.param.u64 %rd5, [resize_param_4]; ld.param.u32 %r21, [resize_param_5]; ld.param.u32 %r22, [resize_param_6]; ld.param.u32 %r23, [resize_param_8]; ld.param.u32 %r24, [resize_param_9]; ld.param.u32 %r25, [resize_param_10]; cvta.to.global.u64 %rd1, %rd5; mov.u32 %r27, %ntid.x; mov.u32 %r28, %ctaid.x; mov.u32 %r29, %tid.x; mad.lo.s32 %r1, %r28, %r27, %r29; mov.u32 %r30, %ntid.y; mov.u32 %r31, %ctaid.y; mov.u32 %r32, %tid.y; mad.lo.s32 %r2, %r31, %r30, %r32; setp.ge.s32 %p1, %r1, %r20; setp.ge.s32 %p2, %r2, %r26; or.pred %p3, %p1, %p2; mov.f32 %f60, 0f00000000; mov.f32 %f61, 0f00000000; @%p3 bra $L__BB0_26; setp.lt.s32 %p4, %r25, 1; @%p4 bra $L__BB0_25; mul.lo.s32 %r3, %r1, %r24; setp.lt.s32 %p5, %r24, 1; @%p5 bra $L__BB0_25; add.s32 %r4, %r24, -1; and.b32 %r5, %r24, 3; sub.s32 %r6, %r24, %r5; mul.lo.s32 %r7, %r23, %r22; mul.lo.s32 %r8, %r2, %r25; mov.f32 %f61, 0f00000000; mov.u32 %r33, 0; mov.u32 %r45, %r33; mov.f32 %f60, %f61; $L__BB0_4: add.s32 %r10, %r45, %r8; add.s32 %r35, %r10, %r7; mul.lo.s32 %r11, %r35, %r21; setp.lt.u32 %p6, %r4, 3; mov.u32 %r48, %r33; @%p6 bra $L__BB0_15; mov.u32 %r48, 0; mov.u32 %r47, %r6; $L__BB0_6: add.s32 %r14, %r48, %r3; setp.ge.s32 %p7, %r14, %r21; setp.ge.s32 %p8, %r10, %r22; add.s32 %r37, %r14, %r11; mul.wide.s32 %rd6, %r37, 4; add.s64 %rd2, %rd1, %rd6; or.pred %p9, %p8, %p7; @%p9 bra $L__BB0_8; ld.global.nc.f32 %f46, [%rd2]; add.f32 %f60, %f60, %f46; add.f32 %f61, %f61, 0f3F800000; $L__BB0_8: add.s32 %r38, %r14, 1; setp.ge.s32 %p10, %r38, %r21; or.pred %p12, %p8, %p10; @%p12 bra $L__BB0_10; ld.global.nc.f32 %f47, [%rd2+4]; add.f32 %f60, %f60, %f47; add.f32 %f61, %f61, 0f3F800000; $L__BB0_10: add.s32 %r39, %r14, 2; setp.ge.s32 %p13, %r39, %r21; or.pred %p15, %p8, %p13; @%p15 bra $L__BB0_12; ld.global.nc.f32 %f48, [%rd2+8]; add.f32 %f60, %f60, %f48; add.f32 %f61, %f61, 
0f3F800000; $L__BB0_12: add.s32 %r40, %r14, 3; setp.ge.s32 %p16, %r40, %r21; or.pred %p18, %p8, %p16; @%p18 bra $L__BB0_14; ld.global.nc.f32 %f49, [%rd2+12]; add.f32 %f60, %f60, %f49; add.f32 %f61, %f61, 0f3F800000; $L__BB0_14: add.s32 %r48, %r48, 4; add.s32 %r47, %r47, -4; setp.ne.s32 %p19, %r47, 0; @%p19 bra $L__BB0_6; $L__BB0_15: setp.eq.s32 %p20, %r5, 0; @%p20 bra $L__BB0_24; setp.ge.s32 %p21, %r10, %r22; add.s32 %r18, %r48, %r3; setp.ge.s32 %p22, %r18, %r21; add.s32 %r41, %r18, %r11; mul.wide.s32 %rd7, %r41, 4; add.s64 %rd3, %rd1, %rd7; or.pred %p23, %p21, %p22; @%p23 bra $L__BB0_18; ld.global.nc.f32 %f50, [%rd3]; add.f32 %f60, %f60, %f50; add.f32 %f61, %f61, 0f3F800000; $L__BB0_18: setp.eq.s32 %p24, %r5, 1; @%p24 bra $L__BB0_24; add.s32 %r42, %r18, 1; setp.ge.s32 %p26, %r42, %r21; or.pred %p27, %p21, %p26; @%p27 bra $L__BB0_21; ld.global.nc.f32 %f51, [%rd3+4]; add.f32 %f60, %f60, %f51; add.f32 %f61, %f61, 0f3F800000; $L__BB0_21: setp.eq.s32 %p28, %r5, 2; @%p28 bra $L__BB0_24; add.s32 %r43, %r18, 2; setp.ge.s32 %p30, %r43, %r21; or.pred %p31, %p21, %p30; @%p31 bra $L__BB0_24; ld.global.nc.f32 %f52, [%rd3+8]; add.f32 %f60, %f60, %f52; add.f32 %f61, %f61, 0f3F800000; $L__BB0_24: add.s32 %r45, %r45, 1; setp.lt.s32 %p32, %r45, %r25; @%p32 bra $L__BB0_4; $L__BB0_25: mad.lo.s32 %r44, %r2, %r20, %r1; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd8, %rd9; div.rn.f32 %f53, %f60, %f61; st.global.f32 [%rd10], %f53; $L__BB0_26: ret; } ` resize_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<80>; .reg .b32 %r<49>; .reg .b64 %rd<11>; ld.param.u64 %rd4, 
[resize_param_0]; ld.param.u32 %r20, [resize_param_1]; ld.param.u32 %r26, [resize_param_2]; ld.param.u64 %rd5, [resize_param_4]; ld.param.u32 %r21, [resize_param_5]; ld.param.u32 %r22, [resize_param_6]; ld.param.u32 %r23, [resize_param_8]; ld.param.u32 %r24, [resize_param_9]; ld.param.u32 %r25, [resize_param_10]; cvta.to.global.u64 %rd1, %rd5; mov.u32 %r27, %ntid.x; mov.u32 %r28, %ctaid.x; mov.u32 %r29, %tid.x; mad.lo.s32 %r1, %r28, %r27, %r29; mov.u32 %r30, %ntid.y; mov.u32 %r31, %ctaid.y; mov.u32 %r32, %tid.y; mad.lo.s32 %r2, %r31, %r30, %r32; setp.ge.s32 %p1, %r1, %r20; setp.ge.s32 %p2, %r2, %r26; or.pred %p3, %p1, %p2; mov.f32 %f60, 0f00000000; mov.f32 %f61, 0f00000000; @%p3 bra $L__BB0_26; setp.lt.s32 %p4, %r25, 1; @%p4 bra $L__BB0_25; mul.lo.s32 %r3, %r1, %r24; setp.lt.s32 %p5, %r24, 1; @%p5 bra $L__BB0_25; add.s32 %r4, %r24, -1; and.b32 %r5, %r24, 3; sub.s32 %r6, %r24, %r5; mul.lo.s32 %r7, %r23, %r22; mul.lo.s32 %r8, %r2, %r25; mov.f32 %f61, 0f00000000; mov.u32 %r33, 0; mov.u32 %r45, %r33; mov.f32 %f60, %f61; $L__BB0_4: add.s32 %r10, %r45, %r8; add.s32 %r35, %r10, %r7; mul.lo.s32 %r11, %r35, %r21; setp.lt.u32 %p6, %r4, 3; mov.u32 %r48, %r33; @%p6 bra $L__BB0_15; mov.u32 %r48, 0; mov.u32 %r47, %r6; $L__BB0_6: add.s32 %r14, %r48, %r3; setp.ge.s32 %p7, %r14, %r21; setp.ge.s32 %p8, %r10, %r22; add.s32 %r37, %r14, %r11; mul.wide.s32 %rd6, %r37, 4; add.s64 %rd2, %rd1, %rd6; or.pred %p9, %p8, %p7; @%p9 bra $L__BB0_8; ld.global.nc.f32 %f46, [%rd2]; add.f32 %f60, %f60, %f46; add.f32 %f61, %f61, 0f3F800000; $L__BB0_8: add.s32 %r38, %r14, 1; setp.ge.s32 %p10, %r38, %r21; or.pred %p12, %p8, %p10; @%p12 bra $L__BB0_10; ld.global.nc.f32 %f47, [%rd2+4]; add.f32 %f60, %f60, %f47; add.f32 %f61, %f61, 0f3F800000; $L__BB0_10: add.s32 %r39, %r14, 2; setp.ge.s32 %p13, %r39, %r21; or.pred %p15, %p8, %p13; @%p15 bra $L__BB0_12; ld.global.nc.f32 %f48, [%rd2+8]; add.f32 %f60, %f60, %f48; add.f32 %f61, %f61, 0f3F800000; $L__BB0_12: add.s32 %r40, %r14, 3; setp.ge.s32 %p16, %r40, %r21; 
or.pred %p18, %p8, %p16; @%p18 bra $L__BB0_14; ld.global.nc.f32 %f49, [%rd2+12]; add.f32 %f60, %f60, %f49; add.f32 %f61, %f61, 0f3F800000; $L__BB0_14: add.s32 %r48, %r48, 4; add.s32 %r47, %r47, -4; setp.ne.s32 %p19, %r47, 0; @%p19 bra $L__BB0_6; $L__BB0_15: setp.eq.s32 %p20, %r5, 0; @%p20 bra $L__BB0_24; setp.ge.s32 %p21, %r10, %r22; add.s32 %r18, %r48, %r3; setp.ge.s32 %p22, %r18, %r21; add.s32 %r41, %r18, %r11; mul.wide.s32 %rd7, %r41, 4; add.s64 %rd3, %rd1, %rd7; or.pred %p23, %p21, %p22; @%p23 bra $L__BB0_18; ld.global.nc.f32 %f50, [%rd3]; add.f32 %f60, %f60, %f50; add.f32 %f61, %f61, 0f3F800000; $L__BB0_18: setp.eq.s32 %p24, %r5, 1; @%p24 bra $L__BB0_24; add.s32 %r42, %r18, 1; setp.ge.s32 %p26, %r42, %r21; or.pred %p27, %p21, %p26; @%p27 bra $L__BB0_21; ld.global.nc.f32 %f51, [%rd3+4]; add.f32 %f60, %f60, %f51; add.f32 %f61, %f61, 0f3F800000; $L__BB0_21: setp.eq.s32 %p28, %r5, 2; @%p28 bra $L__BB0_24; add.s32 %r43, %r18, 2; setp.ge.s32 %p30, %r43, %r21; or.pred %p31, %p21, %p30; @%p31 bra $L__BB0_24; ld.global.nc.f32 %f52, [%rd3+8]; add.f32 %f60, %f60, %f52; add.f32 %f61, %f61, 0f3F800000; $L__BB0_24: add.s32 %r45, %r45, 1; setp.lt.s32 %p32, %r45, %r25; @%p32 bra $L__BB0_4; $L__BB0_25: mad.lo.s32 %r44, %r2, %r20, %r1; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd8, %rd9; div.rn.f32 %f53, %f60, %f61; st.global.f32 [%rd10], %f53; $L__BB0_26: ret; } ` resize_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<80>; .reg .b32 %r<49>; .reg .b64 %rd<11>; ld.param.u64 %rd4, [resize_param_0]; ld.param.u32 %r20, [resize_param_1]; ld.param.u32 %r26, 
[resize_param_2]; ld.param.u64 %rd5, [resize_param_4]; ld.param.u32 %r21, [resize_param_5]; ld.param.u32 %r22, [resize_param_6]; ld.param.u32 %r23, [resize_param_8]; ld.param.u32 %r24, [resize_param_9]; ld.param.u32 %r25, [resize_param_10]; cvta.to.global.u64 %rd1, %rd5; mov.u32 %r27, %ntid.x; mov.u32 %r28, %ctaid.x; mov.u32 %r29, %tid.x; mad.lo.s32 %r1, %r28, %r27, %r29; mov.u32 %r30, %ntid.y; mov.u32 %r31, %ctaid.y; mov.u32 %r32, %tid.y; mad.lo.s32 %r2, %r31, %r30, %r32; setp.ge.s32 %p1, %r1, %r20; setp.ge.s32 %p2, %r2, %r26; or.pred %p3, %p1, %p2; mov.f32 %f60, 0f00000000; mov.f32 %f61, 0f00000000; @%p3 bra $L__BB0_26; setp.lt.s32 %p4, %r25, 1; @%p4 bra $L__BB0_25; mul.lo.s32 %r3, %r1, %r24; setp.lt.s32 %p5, %r24, 1; @%p5 bra $L__BB0_25; add.s32 %r4, %r24, -1; and.b32 %r5, %r24, 3; sub.s32 %r6, %r24, %r5; mul.lo.s32 %r7, %r23, %r22; mul.lo.s32 %r8, %r2, %r25; mov.f32 %f61, 0f00000000; mov.u32 %r33, 0; mov.u32 %r45, %r33; mov.f32 %f60, %f61; $L__BB0_4: add.s32 %r10, %r45, %r8; add.s32 %r35, %r10, %r7; mul.lo.s32 %r11, %r35, %r21; setp.lt.u32 %p6, %r4, 3; mov.u32 %r48, %r33; @%p6 bra $L__BB0_15; mov.u32 %r48, 0; mov.u32 %r47, %r6; $L__BB0_6: add.s32 %r14, %r48, %r3; setp.ge.s32 %p7, %r14, %r21; setp.ge.s32 %p8, %r10, %r22; add.s32 %r37, %r14, %r11; mul.wide.s32 %rd6, %r37, 4; add.s64 %rd2, %rd1, %rd6; or.pred %p9, %p8, %p7; @%p9 bra $L__BB0_8; ld.global.nc.f32 %f46, [%rd2]; add.f32 %f60, %f60, %f46; add.f32 %f61, %f61, 0f3F800000; $L__BB0_8: add.s32 %r38, %r14, 1; setp.ge.s32 %p10, %r38, %r21; or.pred %p12, %p8, %p10; @%p12 bra $L__BB0_10; ld.global.nc.f32 %f47, [%rd2+4]; add.f32 %f60, %f60, %f47; add.f32 %f61, %f61, 0f3F800000; $L__BB0_10: add.s32 %r39, %r14, 2; setp.ge.s32 %p13, %r39, %r21; or.pred %p15, %p8, %p13; @%p15 bra $L__BB0_12; ld.global.nc.f32 %f48, [%rd2+8]; add.f32 %f60, %f60, %f48; add.f32 %f61, %f61, 0f3F800000; $L__BB0_12: add.s32 %r40, %r14, 3; setp.ge.s32 %p16, %r40, %r21; or.pred %p18, %p8, %p16; @%p18 bra $L__BB0_14; ld.global.nc.f32 %f49, 
[%rd2+12]; add.f32 %f60, %f60, %f49; add.f32 %f61, %f61, 0f3F800000; $L__BB0_14: add.s32 %r48, %r48, 4; add.s32 %r47, %r47, -4; setp.ne.s32 %p19, %r47, 0; @%p19 bra $L__BB0_6; $L__BB0_15: setp.eq.s32 %p20, %r5, 0; @%p20 bra $L__BB0_24; setp.ge.s32 %p21, %r10, %r22; add.s32 %r18, %r48, %r3; setp.ge.s32 %p22, %r18, %r21; add.s32 %r41, %r18, %r11; mul.wide.s32 %rd7, %r41, 4; add.s64 %rd3, %rd1, %rd7; or.pred %p23, %p21, %p22; @%p23 bra $L__BB0_18; ld.global.nc.f32 %f50, [%rd3]; add.f32 %f60, %f60, %f50; add.f32 %f61, %f61, 0f3F800000; $L__BB0_18: setp.eq.s32 %p24, %r5, 1; @%p24 bra $L__BB0_24; add.s32 %r42, %r18, 1; setp.ge.s32 %p26, %r42, %r21; or.pred %p27, %p21, %p26; @%p27 bra $L__BB0_21; ld.global.nc.f32 %f51, [%rd3+4]; add.f32 %f60, %f60, %f51; add.f32 %f61, %f61, 0f3F800000; $L__BB0_21: setp.eq.s32 %p28, %r5, 2; @%p28 bra $L__BB0_24; add.s32 %r43, %r18, 2; setp.ge.s32 %p30, %r43, %r21; or.pred %p31, %p21, %p30; @%p31 bra $L__BB0_24; ld.global.nc.f32 %f52, [%rd3+8]; add.f32 %f60, %f60, %f52; add.f32 %f61, %f61, 0f3F800000; $L__BB0_24: add.s32 %r45, %r45, 1; setp.lt.s32 %p32, %r45, %r25; @%p32 bra $L__BB0_4; $L__BB0_25: mad.lo.s32 %r44, %r2, %r20, %r1; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd8, %rd9; div.rn.f32 %f53, %f60, %f61; st.global.f32 [%rd10], %f53; $L__BB0_26: ret; } ` resize_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<80>; .reg .b32 %r<49>; .reg .b64 %rd<11>; ld.param.u64 %rd4, [resize_param_0]; ld.param.u32 %r20, [resize_param_1]; ld.param.u32 %r26, [resize_param_2]; ld.param.u64 %rd5, [resize_param_4]; ld.param.u32 %r21, 
[resize_param_5]; ld.param.u32 %r22, [resize_param_6]; ld.param.u32 %r23, [resize_param_8]; ld.param.u32 %r24, [resize_param_9]; ld.param.u32 %r25, [resize_param_10]; cvta.to.global.u64 %rd1, %rd5; mov.u32 %r27, %ntid.x; mov.u32 %r28, %ctaid.x; mov.u32 %r29, %tid.x; mad.lo.s32 %r1, %r28, %r27, %r29; mov.u32 %r30, %ntid.y; mov.u32 %r31, %ctaid.y; mov.u32 %r32, %tid.y; mad.lo.s32 %r2, %r31, %r30, %r32; setp.ge.s32 %p1, %r1, %r20; setp.ge.s32 %p2, %r2, %r26; or.pred %p3, %p1, %p2; mov.f32 %f60, 0f00000000; mov.f32 %f61, 0f00000000; @%p3 bra $L__BB0_26; setp.lt.s32 %p4, %r25, 1; @%p4 bra $L__BB0_25; mul.lo.s32 %r3, %r1, %r24; setp.lt.s32 %p5, %r24, 1; @%p5 bra $L__BB0_25; add.s32 %r4, %r24, -1; and.b32 %r5, %r24, 3; sub.s32 %r6, %r24, %r5; mul.lo.s32 %r7, %r23, %r22; mul.lo.s32 %r8, %r2, %r25; mov.f32 %f61, 0f00000000; mov.u32 %r33, 0; mov.u32 %r45, %r33; mov.f32 %f60, %f61; $L__BB0_4: add.s32 %r10, %r45, %r8; add.s32 %r35, %r10, %r7; mul.lo.s32 %r11, %r35, %r21; setp.lt.u32 %p6, %r4, 3; mov.u32 %r48, %r33; @%p6 bra $L__BB0_15; mov.u32 %r48, 0; mov.u32 %r47, %r6; $L__BB0_6: add.s32 %r14, %r48, %r3; setp.ge.s32 %p7, %r14, %r21; setp.ge.s32 %p8, %r10, %r22; add.s32 %r37, %r14, %r11; mul.wide.s32 %rd6, %r37, 4; add.s64 %rd2, %rd1, %rd6; or.pred %p9, %p8, %p7; @%p9 bra $L__BB0_8; ld.global.nc.f32 %f46, [%rd2]; add.f32 %f60, %f60, %f46; add.f32 %f61, %f61, 0f3F800000; $L__BB0_8: add.s32 %r38, %r14, 1; setp.ge.s32 %p10, %r38, %r21; or.pred %p12, %p8, %p10; @%p12 bra $L__BB0_10; ld.global.nc.f32 %f47, [%rd2+4]; add.f32 %f60, %f60, %f47; add.f32 %f61, %f61, 0f3F800000; $L__BB0_10: add.s32 %r39, %r14, 2; setp.ge.s32 %p13, %r39, %r21; or.pred %p15, %p8, %p13; @%p15 bra $L__BB0_12; ld.global.nc.f32 %f48, [%rd2+8]; add.f32 %f60, %f60, %f48; add.f32 %f61, %f61, 0f3F800000; $L__BB0_12: add.s32 %r40, %r14, 3; setp.ge.s32 %p16, %r40, %r21; or.pred %p18, %p8, %p16; @%p18 bra $L__BB0_14; ld.global.nc.f32 %f49, [%rd2+12]; add.f32 %f60, %f60, %f49; add.f32 %f61, %f61, 0f3F800000; 
$L__BB0_14: add.s32 %r48, %r48, 4; add.s32 %r47, %r47, -4; setp.ne.s32 %p19, %r47, 0; @%p19 bra $L__BB0_6; $L__BB0_15: setp.eq.s32 %p20, %r5, 0; @%p20 bra $L__BB0_24; setp.ge.s32 %p21, %r10, %r22; add.s32 %r18, %r48, %r3; setp.ge.s32 %p22, %r18, %r21; add.s32 %r41, %r18, %r11; mul.wide.s32 %rd7, %r41, 4; add.s64 %rd3, %rd1, %rd7; or.pred %p23, %p21, %p22; @%p23 bra $L__BB0_18; ld.global.nc.f32 %f50, [%rd3]; add.f32 %f60, %f60, %f50; add.f32 %f61, %f61, 0f3F800000; $L__BB0_18: setp.eq.s32 %p24, %r5, 1; @%p24 bra $L__BB0_24; add.s32 %r42, %r18, 1; setp.ge.s32 %p26, %r42, %r21; or.pred %p27, %p21, %p26; @%p27 bra $L__BB0_21; ld.global.nc.f32 %f51, [%rd3+4]; add.f32 %f60, %f60, %f51; add.f32 %f61, %f61, 0f3F800000; $L__BB0_21: setp.eq.s32 %p28, %r5, 2; @%p28 bra $L__BB0_24; add.s32 %r43, %r18, 2; setp.ge.s32 %p30, %r43, %r21; or.pred %p31, %p21, %p30; @%p31 bra $L__BB0_24; ld.global.nc.f32 %f52, [%rd3+8]; add.f32 %f60, %f60, %f52; add.f32 %f61, %f61, 0f3F800000; $L__BB0_24: add.s32 %r45, %r45, 1; setp.lt.s32 %p32, %r45, %r25; @%p32 bra $L__BB0_4; $L__BB0_25: mad.lo.s32 %r44, %r2, %r20, %r1; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd8, %rd9; div.rn.f32 %f53, %f60, %f61; st.global.f32 [%rd10], %f53; $L__BB0_26: ret; } ` resize_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<80>; .reg .b32 %r<49>; .reg .b64 %rd<11>; ld.param.u64 %rd4, [resize_param_0]; ld.param.u32 %r20, [resize_param_1]; ld.param.u32 %r26, [resize_param_2]; ld.param.u64 %rd5, [resize_param_4]; ld.param.u32 %r21, [resize_param_5]; ld.param.u32 %r22, [resize_param_6]; ld.param.u32 %r23, 
[resize_param_8]; ld.param.u32 %r24, [resize_param_9]; ld.param.u32 %r25, [resize_param_10]; cvta.to.global.u64 %rd1, %rd5; mov.u32 %r27, %ntid.x; mov.u32 %r28, %ctaid.x; mov.u32 %r29, %tid.x; mad.lo.s32 %r1, %r28, %r27, %r29; mov.u32 %r30, %ntid.y; mov.u32 %r31, %ctaid.y; mov.u32 %r32, %tid.y; mad.lo.s32 %r2, %r31, %r30, %r32; setp.ge.s32 %p1, %r1, %r20; setp.ge.s32 %p2, %r2, %r26; or.pred %p3, %p1, %p2; mov.f32 %f60, 0f00000000; mov.f32 %f61, 0f00000000; @%p3 bra $L__BB0_26; setp.lt.s32 %p4, %r25, 1; @%p4 bra $L__BB0_25; mul.lo.s32 %r3, %r1, %r24; setp.lt.s32 %p5, %r24, 1; @%p5 bra $L__BB0_25; add.s32 %r4, %r24, -1; and.b32 %r5, %r24, 3; sub.s32 %r6, %r24, %r5; mul.lo.s32 %r7, %r23, %r22; mul.lo.s32 %r8, %r2, %r25; mov.f32 %f61, 0f00000000; mov.u32 %r33, 0; mov.u32 %r45, %r33; mov.f32 %f60, %f61; $L__BB0_4: add.s32 %r10, %r45, %r8; add.s32 %r35, %r10, %r7; mul.lo.s32 %r11, %r35, %r21; setp.lt.u32 %p6, %r4, 3; mov.u32 %r48, %r33; @%p6 bra $L__BB0_15; mov.u32 %r48, 0; mov.u32 %r47, %r6; $L__BB0_6: add.s32 %r14, %r48, %r3; setp.ge.s32 %p7, %r14, %r21; setp.ge.s32 %p8, %r10, %r22; add.s32 %r37, %r14, %r11; mul.wide.s32 %rd6, %r37, 4; add.s64 %rd2, %rd1, %rd6; or.pred %p9, %p8, %p7; @%p9 bra $L__BB0_8; ld.global.nc.f32 %f46, [%rd2]; add.f32 %f60, %f60, %f46; add.f32 %f61, %f61, 0f3F800000; $L__BB0_8: add.s32 %r38, %r14, 1; setp.ge.s32 %p10, %r38, %r21; or.pred %p12, %p8, %p10; @%p12 bra $L__BB0_10; ld.global.nc.f32 %f47, [%rd2+4]; add.f32 %f60, %f60, %f47; add.f32 %f61, %f61, 0f3F800000; $L__BB0_10: add.s32 %r39, %r14, 2; setp.ge.s32 %p13, %r39, %r21; or.pred %p15, %p8, %p13; @%p15 bra $L__BB0_12; ld.global.nc.f32 %f48, [%rd2+8]; add.f32 %f60, %f60, %f48; add.f32 %f61, %f61, 0f3F800000; $L__BB0_12: add.s32 %r40, %r14, 3; setp.ge.s32 %p16, %r40, %r21; or.pred %p18, %p8, %p16; @%p18 bra $L__BB0_14; ld.global.nc.f32 %f49, [%rd2+12]; add.f32 %f60, %f60, %f49; add.f32 %f61, %f61, 0f3F800000; $L__BB0_14: add.s32 %r48, %r48, 4; add.s32 %r47, %r47, -4; setp.ne.s32 %p19, %r47, 
0; @%p19 bra $L__BB0_6; $L__BB0_15: setp.eq.s32 %p20, %r5, 0; @%p20 bra $L__BB0_24; setp.ge.s32 %p21, %r10, %r22; add.s32 %r18, %r48, %r3; setp.ge.s32 %p22, %r18, %r21; add.s32 %r41, %r18, %r11; mul.wide.s32 %rd7, %r41, 4; add.s64 %rd3, %rd1, %rd7; or.pred %p23, %p21, %p22; @%p23 bra $L__BB0_18; ld.global.nc.f32 %f50, [%rd3]; add.f32 %f60, %f60, %f50; add.f32 %f61, %f61, 0f3F800000; $L__BB0_18: setp.eq.s32 %p24, %r5, 1; @%p24 bra $L__BB0_24; add.s32 %r42, %r18, 1; setp.ge.s32 %p26, %r42, %r21; or.pred %p27, %p21, %p26; @%p27 bra $L__BB0_21; ld.global.nc.f32 %f51, [%rd3+4]; add.f32 %f60, %f60, %f51; add.f32 %f61, %f61, 0f3F800000; $L__BB0_21: setp.eq.s32 %p28, %r5, 2; @%p28 bra $L__BB0_24; add.s32 %r43, %r18, 2; setp.ge.s32 %p30, %r43, %r21; or.pred %p31, %p21, %p30; @%p31 bra $L__BB0_24; ld.global.nc.f32 %f52, [%rd3+8]; add.f32 %f60, %f60, %f52; add.f32 %f61, %f61, 0f3F800000; $L__BB0_24: add.s32 %r45, %r45, 1; setp.lt.s32 %p32, %r45, %r25; @%p32 bra $L__BB0_4; $L__BB0_25: mad.lo.s32 %r44, %r2, %r20, %r1; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd8, %rd9; div.rn.f32 %f53, %f60, %f61; st.global.f32 [%rd10], %f53; $L__BB0_26: ret; } ` resize_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<80>; .reg .b32 %r<49>; .reg .b64 %rd<11>; ld.param.u64 %rd4, [resize_param_0]; ld.param.u32 %r20, [resize_param_1]; ld.param.u32 %r26, [resize_param_2]; ld.param.u64 %rd5, [resize_param_4]; ld.param.u32 %r21, [resize_param_5]; ld.param.u32 %r22, [resize_param_6]; ld.param.u32 %r23, [resize_param_8]; ld.param.u32 %r24, [resize_param_9]; ld.param.u32 %r25, 
[resize_param_10]; cvta.to.global.u64 %rd1, %rd5; mov.u32 %r27, %ntid.x; mov.u32 %r28, %ctaid.x; mov.u32 %r29, %tid.x; mad.lo.s32 %r1, %r28, %r27, %r29; mov.u32 %r30, %ntid.y; mov.u32 %r31, %ctaid.y; mov.u32 %r32, %tid.y; mad.lo.s32 %r2, %r31, %r30, %r32; setp.ge.s32 %p1, %r1, %r20; setp.ge.s32 %p2, %r2, %r26; or.pred %p3, %p1, %p2; mov.f32 %f60, 0f00000000; mov.f32 %f61, 0f00000000; @%p3 bra $L__BB0_26; setp.lt.s32 %p4, %r25, 1; @%p4 bra $L__BB0_25; mul.lo.s32 %r3, %r1, %r24; setp.lt.s32 %p5, %r24, 1; @%p5 bra $L__BB0_25; add.s32 %r4, %r24, -1; and.b32 %r5, %r24, 3; sub.s32 %r6, %r24, %r5; mul.lo.s32 %r7, %r23, %r22; mul.lo.s32 %r8, %r2, %r25; mov.f32 %f61, 0f00000000; mov.u32 %r33, 0; mov.u32 %r45, %r33; mov.f32 %f60, %f61; $L__BB0_4: add.s32 %r10, %r45, %r8; add.s32 %r35, %r10, %r7; mul.lo.s32 %r11, %r35, %r21; setp.lt.u32 %p6, %r4, 3; mov.u32 %r48, %r33; @%p6 bra $L__BB0_15; mov.u32 %r48, 0; mov.u32 %r47, %r6; $L__BB0_6: add.s32 %r14, %r48, %r3; setp.ge.s32 %p7, %r14, %r21; setp.ge.s32 %p8, %r10, %r22; add.s32 %r37, %r14, %r11; mul.wide.s32 %rd6, %r37, 4; add.s64 %rd2, %rd1, %rd6; or.pred %p9, %p8, %p7; @%p9 bra $L__BB0_8; ld.global.nc.f32 %f46, [%rd2]; add.f32 %f60, %f60, %f46; add.f32 %f61, %f61, 0f3F800000; $L__BB0_8: add.s32 %r38, %r14, 1; setp.ge.s32 %p10, %r38, %r21; or.pred %p12, %p8, %p10; @%p12 bra $L__BB0_10; ld.global.nc.f32 %f47, [%rd2+4]; add.f32 %f60, %f60, %f47; add.f32 %f61, %f61, 0f3F800000; $L__BB0_10: add.s32 %r39, %r14, 2; setp.ge.s32 %p13, %r39, %r21; or.pred %p15, %p8, %p13; @%p15 bra $L__BB0_12; ld.global.nc.f32 %f48, [%rd2+8]; add.f32 %f60, %f60, %f48; add.f32 %f61, %f61, 0f3F800000; $L__BB0_12: add.s32 %r40, %r14, 3; setp.ge.s32 %p16, %r40, %r21; or.pred %p18, %p8, %p16; @%p18 bra $L__BB0_14; ld.global.nc.f32 %f49, [%rd2+12]; add.f32 %f60, %f60, %f49; add.f32 %f61, %f61, 0f3F800000; $L__BB0_14: add.s32 %r48, %r48, 4; add.s32 %r47, %r47, -4; setp.ne.s32 %p19, %r47, 0; @%p19 bra $L__BB0_6; $L__BB0_15: setp.eq.s32 %p20, %r5, 0; @%p20 bra 
$L__BB0_24; setp.ge.s32 %p21, %r10, %r22; add.s32 %r18, %r48, %r3; setp.ge.s32 %p22, %r18, %r21; add.s32 %r41, %r18, %r11; mul.wide.s32 %rd7, %r41, 4; add.s64 %rd3, %rd1, %rd7; or.pred %p23, %p21, %p22; @%p23 bra $L__BB0_18; ld.global.nc.f32 %f50, [%rd3]; add.f32 %f60, %f60, %f50; add.f32 %f61, %f61, 0f3F800000; $L__BB0_18: setp.eq.s32 %p24, %r5, 1; @%p24 bra $L__BB0_24; add.s32 %r42, %r18, 1; setp.ge.s32 %p26, %r42, %r21; or.pred %p27, %p21, %p26; @%p27 bra $L__BB0_21; ld.global.nc.f32 %f51, [%rd3+4]; add.f32 %f60, %f60, %f51; add.f32 %f61, %f61, 0f3F800000; $L__BB0_21: setp.eq.s32 %p28, %r5, 2; @%p28 bra $L__BB0_24; add.s32 %r43, %r18, 2; setp.ge.s32 %p30, %r43, %r21; or.pred %p31, %p21, %p30; @%p31 bra $L__BB0_24; ld.global.nc.f32 %f52, [%rd3+8]; add.f32 %f60, %f60, %f52; add.f32 %f61, %f61, 0f3F800000; $L__BB0_24: add.s32 %r45, %r45, 1; setp.lt.s32 %p32, %r45, %r25; @%p32 bra $L__BB0_4; $L__BB0_25: mad.lo.s32 %r44, %r2, %r20, %r1; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd8, %rd9; div.rn.f32 %f53, %f60, %f61; st.global.f32 [%rd10], %f53; $L__BB0_26: ret; } ` resize_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<80>; .reg .b32 %r<49>; .reg .b64 %rd<11>; ld.param.u64 %rd4, [resize_param_0]; ld.param.u32 %r20, [resize_param_1]; ld.param.u32 %r26, [resize_param_2]; ld.param.u64 %rd5, [resize_param_4]; ld.param.u32 %r21, [resize_param_5]; ld.param.u32 %r22, [resize_param_6]; ld.param.u32 %r23, [resize_param_8]; ld.param.u32 %r24, [resize_param_9]; ld.param.u32 %r25, [resize_param_10]; cvta.to.global.u64 %rd1, %rd5; mov.u32 %r27, %ntid.x; mov.u32 
%r28, %ctaid.x; mov.u32 %r29, %tid.x; mad.lo.s32 %r1, %r28, %r27, %r29; mov.u32 %r30, %ntid.y; mov.u32 %r31, %ctaid.y; mov.u32 %r32, %tid.y; mad.lo.s32 %r2, %r31, %r30, %r32; setp.ge.s32 %p1, %r1, %r20; setp.ge.s32 %p2, %r2, %r26; or.pred %p3, %p1, %p2; mov.f32 %f60, 0f00000000; mov.f32 %f61, 0f00000000; @%p3 bra $L__BB0_26; setp.lt.s32 %p4, %r25, 1; @%p4 bra $L__BB0_25; mul.lo.s32 %r3, %r1, %r24; setp.lt.s32 %p5, %r24, 1; @%p5 bra $L__BB0_25; add.s32 %r4, %r24, -1; and.b32 %r5, %r24, 3; sub.s32 %r6, %r24, %r5; mul.lo.s32 %r7, %r23, %r22; mul.lo.s32 %r8, %r2, %r25; mov.f32 %f61, 0f00000000; mov.u32 %r33, 0; mov.u32 %r45, %r33; mov.f32 %f60, %f61; $L__BB0_4: add.s32 %r10, %r45, %r8; add.s32 %r35, %r10, %r7; mul.lo.s32 %r11, %r35, %r21; setp.lt.u32 %p6, %r4, 3; mov.u32 %r48, %r33; @%p6 bra $L__BB0_15; mov.u32 %r48, 0; mov.u32 %r47, %r6; $L__BB0_6: add.s32 %r14, %r48, %r3; setp.ge.s32 %p7, %r14, %r21; setp.ge.s32 %p8, %r10, %r22; add.s32 %r37, %r14, %r11; mul.wide.s32 %rd6, %r37, 4; add.s64 %rd2, %rd1, %rd6; or.pred %p9, %p8, %p7; @%p9 bra $L__BB0_8; ld.global.nc.f32 %f46, [%rd2]; add.f32 %f60, %f60, %f46; add.f32 %f61, %f61, 0f3F800000; $L__BB0_8: add.s32 %r38, %r14, 1; setp.ge.s32 %p10, %r38, %r21; or.pred %p12, %p8, %p10; @%p12 bra $L__BB0_10; ld.global.nc.f32 %f47, [%rd2+4]; add.f32 %f60, %f60, %f47; add.f32 %f61, %f61, 0f3F800000; $L__BB0_10: add.s32 %r39, %r14, 2; setp.ge.s32 %p13, %r39, %r21; or.pred %p15, %p8, %p13; @%p15 bra $L__BB0_12; ld.global.nc.f32 %f48, [%rd2+8]; add.f32 %f60, %f60, %f48; add.f32 %f61, %f61, 0f3F800000; $L__BB0_12: add.s32 %r40, %r14, 3; setp.ge.s32 %p16, %r40, %r21; or.pred %p18, %p8, %p16; @%p18 bra $L__BB0_14; ld.global.nc.f32 %f49, [%rd2+12]; add.f32 %f60, %f60, %f49; add.f32 %f61, %f61, 0f3F800000; $L__BB0_14: add.s32 %r48, %r48, 4; add.s32 %r47, %r47, -4; setp.ne.s32 %p19, %r47, 0; @%p19 bra $L__BB0_6; $L__BB0_15: setp.eq.s32 %p20, %r5, 0; @%p20 bra $L__BB0_24; setp.ge.s32 %p21, %r10, %r22; add.s32 %r18, %r48, %r3; setp.ge.s32 
%p22, %r18, %r21; add.s32 %r41, %r18, %r11; mul.wide.s32 %rd7, %r41, 4; add.s64 %rd3, %rd1, %rd7; or.pred %p23, %p21, %p22; @%p23 bra $L__BB0_18; ld.global.nc.f32 %f50, [%rd3]; add.f32 %f60, %f60, %f50; add.f32 %f61, %f61, 0f3F800000; $L__BB0_18: setp.eq.s32 %p24, %r5, 1; @%p24 bra $L__BB0_24; add.s32 %r42, %r18, 1; setp.ge.s32 %p26, %r42, %r21; or.pred %p27, %p21, %p26; @%p27 bra $L__BB0_21; ld.global.nc.f32 %f51, [%rd3+4]; add.f32 %f60, %f60, %f51; add.f32 %f61, %f61, 0f3F800000; $L__BB0_21: setp.eq.s32 %p28, %r5, 2; @%p28 bra $L__BB0_24; add.s32 %r43, %r18, 2; setp.ge.s32 %p30, %r43, %r21; or.pred %p31, %p21, %p30; @%p31 bra $L__BB0_24; ld.global.nc.f32 %f52, [%rd3+8]; add.f32 %f60, %f60, %f52; add.f32 %f61, %f61, 0f3F800000; $L__BB0_24: add.s32 %r45, %r45, 1; setp.lt.s32 %p32, %r45, %r25; @%p32 bra $L__BB0_4; $L__BB0_25: mad.lo.s32 %r44, %r2, %r20, %r1; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd8, %rd9; div.rn.f32 %f53, %f60, %f61; st.global.f32 [%rd10], %f53; $L__BB0_26: ret; } ` resize_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<80>; .reg .b32 %r<49>; .reg .b64 %rd<11>; ld.param.u64 %rd4, [resize_param_0]; ld.param.u32 %r20, [resize_param_1]; ld.param.u32 %r26, [resize_param_2]; ld.param.u64 %rd5, [resize_param_4]; ld.param.u32 %r21, [resize_param_5]; ld.param.u32 %r22, [resize_param_6]; ld.param.u32 %r23, [resize_param_8]; ld.param.u32 %r24, [resize_param_9]; ld.param.u32 %r25, [resize_param_10]; cvta.to.global.u64 %rd1, %rd5; mov.u32 %r27, %ntid.x; mov.u32 %r28, %ctaid.x; mov.u32 %r29, %tid.x; mad.lo.s32 %r1, %r28, %r27, %r29; mov.u32 
%r30, %ntid.y; mov.u32 %r31, %ctaid.y; mov.u32 %r32, %tid.y; mad.lo.s32 %r2, %r31, %r30, %r32; setp.ge.s32 %p1, %r1, %r20; setp.ge.s32 %p2, %r2, %r26; or.pred %p3, %p1, %p2; mov.f32 %f60, 0f00000000; mov.f32 %f61, 0f00000000; @%p3 bra $L__BB0_26; setp.lt.s32 %p4, %r25, 1; @%p4 bra $L__BB0_25; mul.lo.s32 %r3, %r1, %r24; setp.lt.s32 %p5, %r24, 1; @%p5 bra $L__BB0_25; add.s32 %r4, %r24, -1; and.b32 %r5, %r24, 3; sub.s32 %r6, %r24, %r5; mul.lo.s32 %r7, %r23, %r22; mul.lo.s32 %r8, %r2, %r25; mov.f32 %f61, 0f00000000; mov.u32 %r33, 0; mov.u32 %r45, %r33; mov.f32 %f60, %f61; $L__BB0_4: add.s32 %r10, %r45, %r8; add.s32 %r35, %r10, %r7; mul.lo.s32 %r11, %r35, %r21; setp.lt.u32 %p6, %r4, 3; mov.u32 %r48, %r33; @%p6 bra $L__BB0_15; mov.u32 %r48, 0; mov.u32 %r47, %r6; $L__BB0_6: add.s32 %r14, %r48, %r3; setp.ge.s32 %p7, %r14, %r21; setp.ge.s32 %p8, %r10, %r22; add.s32 %r37, %r14, %r11; mul.wide.s32 %rd6, %r37, 4; add.s64 %rd2, %rd1, %rd6; or.pred %p9, %p8, %p7; @%p9 bra $L__BB0_8; ld.global.nc.f32 %f46, [%rd2]; add.f32 %f60, %f60, %f46; add.f32 %f61, %f61, 0f3F800000; $L__BB0_8: add.s32 %r38, %r14, 1; setp.ge.s32 %p10, %r38, %r21; or.pred %p12, %p8, %p10; @%p12 bra $L__BB0_10; ld.global.nc.f32 %f47, [%rd2+4]; add.f32 %f60, %f60, %f47; add.f32 %f61, %f61, 0f3F800000; $L__BB0_10: add.s32 %r39, %r14, 2; setp.ge.s32 %p13, %r39, %r21; or.pred %p15, %p8, %p13; @%p15 bra $L__BB0_12; ld.global.nc.f32 %f48, [%rd2+8]; add.f32 %f60, %f60, %f48; add.f32 %f61, %f61, 0f3F800000; $L__BB0_12: add.s32 %r40, %r14, 3; setp.ge.s32 %p16, %r40, %r21; or.pred %p18, %p8, %p16; @%p18 bra $L__BB0_14; ld.global.nc.f32 %f49, [%rd2+12]; add.f32 %f60, %f60, %f49; add.f32 %f61, %f61, 0f3F800000; $L__BB0_14: add.s32 %r48, %r48, 4; add.s32 %r47, %r47, -4; setp.ne.s32 %p19, %r47, 0; @%p19 bra $L__BB0_6; $L__BB0_15: setp.eq.s32 %p20, %r5, 0; @%p20 bra $L__BB0_24; setp.ge.s32 %p21, %r10, %r22; add.s32 %r18, %r48, %r3; setp.ge.s32 %p22, %r18, %r21; add.s32 %r41, %r18, %r11; mul.wide.s32 %rd7, %r41, 4; add.s64 
%rd3, %rd1, %rd7; or.pred %p23, %p21, %p22; @%p23 bra $L__BB0_18; ld.global.nc.f32 %f50, [%rd3]; add.f32 %f60, %f60, %f50; add.f32 %f61, %f61, 0f3F800000; $L__BB0_18: setp.eq.s32 %p24, %r5, 1; @%p24 bra $L__BB0_24; add.s32 %r42, %r18, 1; setp.ge.s32 %p26, %r42, %r21; or.pred %p27, %p21, %p26; @%p27 bra $L__BB0_21; ld.global.nc.f32 %f51, [%rd3+4]; add.f32 %f60, %f60, %f51; add.f32 %f61, %f61, 0f3F800000; $L__BB0_21: setp.eq.s32 %p28, %r5, 2; @%p28 bra $L__BB0_24; add.s32 %r43, %r18, 2; setp.ge.s32 %p30, %r43, %r21; or.pred %p31, %p21, %p30; @%p31 bra $L__BB0_24; ld.global.nc.f32 %f52, [%rd3+8]; add.f32 %f60, %f60, %f52; add.f32 %f61, %f61, 0f3F800000; $L__BB0_24: add.s32 %r45, %r45, 1; setp.lt.s32 %p32, %r45, %r25; @%p32 bra $L__BB0_4; $L__BB0_25: mad.lo.s32 %r44, %r2, %r20, %r1; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd8, %rd9; div.rn.f32 %f53, %f60, %f61; st.global.f32 [%rd10], %f53; $L__BB0_26: ret; } ` ) 3-3.11.1/cuda/shift.go000066400000000000000000000071731503346766200143430ustar00rootroot00000000000000package cuda

import (
	"github.com/mumax/3/data"
	"github.com/mumax/3/util"
)

// ShiftX shifts dst by shiftX cells (positive or negative) along the X-axis.
// The new edge value is clampL at the left edge (-X) or clampR at the right edge (+X).
// dst and src must both be single-component slices of equal length; this is
// checked through util.Argument and util.Assert before the kernel is launched.
func ShiftX(dst, src *data.Slice, shiftX int, clampL, clampR float32) {
	util.Argument(dst.NComp() == 1 && src.NComp() == 1)
	util.Assert(dst.Len() == src.Len())

	// The launch configuration is derived from the grid size of dst.
	size := dst.Size()
	launch := make3DConf(size)
	k_shiftx_async(
		dst.DevPtr(0), src.DevPtr(0),
		size[X], size[Y], size[Z],
		shiftX, clampL, clampR,
		launch)
}

// ShiftEdgeCarryX shifts a component `src` of a vector field by `shiftX` cells along the X-axis.
// Unlike the normal `shift()`, the new edge value is the current edge value.
//
// To avoid the situation where the magnetization could be set to (0,0,0) within the geometry, it is
// also required to pass the two other vector components `othercomp` and `anothercomp` to this function.
// In cells where the vector (`src`, `othercomp`, `anothercomp`) is the zero-vector, // `clampL` or `clampR` is used for the component `src` instead. func ShiftEdgeCarryX(dst, src, othercomp, anothercomp *data.Slice, shiftX int, clampL, clampR float32) { util.Argument(dst.NComp() == 1 && src.NComp() == 1 && othercomp.NComp() == 1 && anothercomp.NComp() == 1) util.Assert(dst.Len() == src.Len()) N := dst.Size() cfg := make3DConf(N) k_shiftedgecarryX_async(dst.DevPtr(0), src.DevPtr(0), othercomp.DevPtr(0), anothercomp.DevPtr(0), N[X], N[Y], N[Z], shiftX, clampL, clampR, cfg) } // shift dst by shy cells (positive or negative) along Y-axis. // new edge value is clampD at bottom edge (-Y) or clampU at top edge (+Y) func ShiftY(dst, src *data.Slice, shiftY int, clampD, clampU float32) { util.Argument(dst.NComp() == 1 && src.NComp() == 1) util.Assert(dst.Len() == src.Len()) N := dst.Size() cfg := make3DConf(N) k_shifty_async(dst.DevPtr(0), src.DevPtr(0), N[X], N[Y], N[Z], shiftY, clampD, clampU, cfg) } // Shifts a component `src` of a vector field by `shiftY` cells along the Y-axis. // Unlike the normal `shift()`, the new edge value is the current edge value. // // To avoid the situation where the magnetization could be set to (0,0,0) within the geometry, it is // also required to pass the two other vector components `othercomp` and `anothercomp` to this function. // In cells where the vector (`src`, `othercomp`, `anothercomp`) is the zero-vector, // `clampD` or `clampU` is used for the component `src` instead. 
func ShiftEdgeCarryY(dst, src, othercomp, anothercomp *data.Slice, shiftY int, clampD, clampU float32) { util.Argument(dst.NComp() == 1 && src.NComp() == 1 && othercomp.NComp() == 1 && anothercomp.NComp() == 1) util.Assert(dst.Len() == src.Len()) N := dst.Size() cfg := make3DConf(N) k_shiftedgecarryY_async(dst.DevPtr(0), src.DevPtr(0), othercomp.DevPtr(0), anothercomp.DevPtr(0), N[X], N[Y], N[Z], shiftY, clampD, clampU, cfg) } // shift dst by shz cells (positive or negative) along Z-axis. // new edge value is clampB at back edge (-Z) or clampF at front edge (+Z). func ShiftZ(dst, src *data.Slice, shiftZ int, clampB, clampF float32) { util.Argument(dst.NComp() == 1 && src.NComp() == 1) util.Assert(dst.Len() == src.Len()) N := dst.Size() cfg := make3DConf(N) k_shiftz_async(dst.DevPtr(0), src.DevPtr(0), N[X], N[Y], N[Z], shiftZ, clampB, clampF, cfg) } // Like Shift, but for bytes func ShiftBytes(dst, src *Bytes, m *data.Mesh, shiftX int, clamp byte) { N := m.Size() cfg := make3DConf(N) k_shiftbytes_async(dst.Ptr, src.Ptr, N[X], N[Y], N[Z], shiftX, clamp, cfg) } func ShiftBytesY(dst, src *Bytes, m *data.Mesh, shiftY int, clamp byte) { N := m.Size() cfg := make3DConf(N) k_shiftbytesy_async(dst.Ptr, src.Ptr, N[X], N[Y], N[Z], shiftY, clamp, cfg) } 3-3.11.1/cuda/shiftbytes.cu000066400000000000000000000013211503346766200154010ustar00rootroot00000000000000#include #include "stencil.h" // shift dst by shx cells (positive or negative) along X-axis. // new edge value is clamp. 
//
// Each thread handles one cell (ix, iy, iz), computed from its block and thread
// indices; threads that fall outside the Nx x Ny x Nz grid do nothing.
// The value written is dst[ix] = src[ix - shx], so a positive shx moves the
// data towards +X.
//
// idx is not defined in this file (presumably a macro from stencil.h); the PTX
// generated for this kernel computes the linear offset as (iz*Ny + iy)*Nx + ix.
extern "C" __global__ void shiftbytes(uint8_t* __restrict__ dst, uint8_t* __restrict__ src, int Nx, int Ny, int Nz, int shx, uint8_t clamp) {
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    if(ix < Nx && iy < Ny && iz < Nz) {
        int ix2 = ix-shx; // X index of the source cell
        uint8_t newval;
        if (ix2 < 0 || ix2 >= Nx) {
            // the source cell lies outside the grid: use the clamp value
            newval = clamp;
        } else {
            newval = src[idx(ix2, iy, iz)];
        }
        dst[idx(ix, iy, iz)] = newval;
    }
}
3-3.11.1/cuda/shiftbytes_wrapper.go000066400000000000000000000644271503346766200171570ustar00rootroot00000000000000package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for shiftbytes kernel.
// It stays at its zero value until k_shiftbytes_async loads the kernel on first use.
var shiftbytes_code cu.Function

// Stores the arguments for shiftbytes kernel invocation.
// The arg_* fields mirror the kernel parameters in declaration order.
//
// NOTE(review): the kernel declares Nx, Ny, Nz and shx as C int (.u32 parameters
// in the PTX) while the fields here are Go int; presumably only the low 32 bits
// are consumed on a little-endian 64-bit host -- confirm.
type shiftbytes_args_t struct {
	arg_dst   unsafe.Pointer
	arg_src   unsafe.Pointer
	arg_Nx    int
	arg_Ny    int
	arg_Nz    int
	arg_shx   int
	arg_clamp byte
	argptr    [7]unsafe.Pointer // addresses of the seven arg_* fields above, filled in by init
	sync.Mutex                  // held by k_shiftbytes_async while the arguments are set and the kernel is launched
}

// Stores the arguments for shiftbytes kernel invocation
// (single package-level instance shared by all calls).
var shiftbytes_args shiftbytes_args_t

func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	shiftbytes_args.argptr[0] = unsafe.Pointer(&shiftbytes_args.arg_dst)
	shiftbytes_args.argptr[1] = unsafe.Pointer(&shiftbytes_args.arg_src)
	shiftbytes_args.argptr[2] = unsafe.Pointer(&shiftbytes_args.arg_Nx)
	shiftbytes_args.argptr[3] = unsafe.Pointer(&shiftbytes_args.arg_Ny)
	shiftbytes_args.argptr[4] = unsafe.Pointer(&shiftbytes_args.arg_Nz)
	shiftbytes_args.argptr[5] = unsafe.Pointer(&shiftbytes_args.arg_shx)
	shiftbytes_args.argptr[6] = unsafe.Pointer(&shiftbytes_args.arg_clamp)
}

// Wrapper for shiftbytes CUDA kernel, asynchronous.
//
// The call stores its arguments in the shared shiftbytes_args struct (guarded by
// its mutex), loads the kernel on first use and launches it with the grid and
// block dimensions from cfg. When the package-level Synchronous flag is set
// (debug), it calls Sync() before and after the launch and times the kernel
// under the name "shiftbytes".
func k_shiftbytes_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shx int, clamp byte, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("shiftbytes")
	}

	// The argument struct is shared by all callers; the lock is held until this
	// function returns, i.e. also across the launch below.
	shiftbytes_args.Lock()
	defer shiftbytes_args.Unlock()

	// Load the kernel the first time it is needed.
	if shiftbytes_code == 0 {
		shiftbytes_code = fatbinLoad(shiftbytes_map, "shiftbytes")
	}

	shiftbytes_args.arg_dst = dst
	shiftbytes_args.arg_src = src
	shiftbytes_args.arg_Nx = Nx
	shiftbytes_args.arg_Ny = Ny
	shiftbytes_args.arg_Nz = Nz
	shiftbytes_args.arg_shx = shx
	shiftbytes_args.arg_clamp = clamp

	// argptr already points at the fields set above (see init).
	args := shiftbytes_args.argptr[:]
	// NOTE(review): the literal 0 is presumably the dynamic shared-memory size -- confirm against cu.LaunchKernel.
	cu.LaunchKernel(shiftbytes_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("shiftbytes")
	}
}

// maps compute capability on PTX code for shiftbytes kernel.
// Key 0 maps to an empty string; every other key is a compute capability
// (e.g. 50 for sm_50) mapping to the PTX constant built for that target.
var shiftbytes_map = map[int]string{0: "", 50: shiftbytes_ptx_50, 52: shiftbytes_ptx_52, 53: shiftbytes_ptx_53, 60: shiftbytes_ptx_60, 61: shiftbytes_ptx_61, 62: shiftbytes_ptx_62, 70: shiftbytes_ptx_70, 72: shiftbytes_ptx_72, 75: shiftbytes_ptx_75, 80: shiftbytes_ptx_80, 86: shiftbytes_ptx_86, 87: shiftbytes_ptx_87, 89: shiftbytes_ptx_89, 90: shiftbytes_ptx_90}

// shiftbytes PTX code for various compute capabilities.
const ( shiftbytes_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytes_param_6]; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra $L__BB0_3; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd6, %r21; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytes_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg 
.b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytes_param_6]; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra $L__BB0_3; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd6, %r21; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytes_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytes_param_6]; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, 
%ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra $L__BB0_3; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd6, %r21; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytes_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytes_param_6]; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, 
%r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra $L__BB0_3; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd6, %r21; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytes_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytes_param_6]; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra $L__BB0_3; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: 
add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd6, %r21; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytes_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytes_param_6]; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra $L__BB0_3; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd6, %r21; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytes_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param 
.u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytes_param_6]; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra $L__BB0_3; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd6, %r21; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytes_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytes_param_6]; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 
%r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra $L__BB0_3; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd6, %r21; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytes_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytes_param_6]; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, 
%tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra $L__BB0_3; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd6, %r21; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytes_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytes_param_6]; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra $L__BB0_3; add.s32 
%r20, %r5, %r4; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd6, %r21; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytes_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytes_param_6]; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra $L__BB0_3; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd6, %r21; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytes_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl 
shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytes_param_6]; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra $L__BB0_3; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd6, %r21; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytes_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u8 %rs4, 
[shiftbytes_param_6]; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra $L__BB0_3; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd6, %r21; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytes_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytes_param_6]; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; 
mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra $L__BB0_3; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd6, %r21; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } `
)
3-3.11.1/cuda/shiftbytesy.cu000066400000000000000000000013231503346766200155740ustar00rootroot00000000000000#include <stdint.h>
#include "stencil.h"

// shift dst by shy cells (positive or negative) along Y-axis.
// new edge value is clamp.
//
// Each thread handles one cell (ix, iy, iz), computed from its block and thread
// indices; threads that fall outside the Nx x Ny x Nz grid do nothing.
// The value written is dst[iy] = src[iy - shy], so a positive shy moves the
// data towards +Y.
//
// idx is not defined in this file (presumably a macro from stencil.h) and maps
// the three cell indices to a linear offset into src and dst.
extern "C" __global__ void shiftbytesy(uint8_t* __restrict__ dst, uint8_t* __restrict__ src, int Nx, int Ny, int Nz, int shy, uint8_t clamp) {
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    if(ix < Nx && iy < Ny && iz < Nz) {
        int iy2 = iy-shy; // Y index of the source cell
        uint8_t newval;
        if (iy2 < 0 || iy2 >= Ny) {
            // the source cell lies outside the grid: use the clamp value
            newval = clamp;
        } else {
            newval = src[idx(ix, iy2, iz)];
        }
        dst[idx(ix, iy, iz)] = newval;
    }
}
3-3.11.1/cuda/shiftbytesy_wrapper.go000066400000000000000000000660501503346766200173420ustar00rootroot00000000000000package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for shiftbytesy kernel.
// It stays at its zero value until k_shiftbytesy_async loads the kernel on first use.
var shiftbytesy_code cu.Function

// Stores the arguments for shiftbytesy kernel invocation.
// The arg_* fields mirror the kernel parameters in declaration order.
//
// NOTE(review): the kernel declares Nx, Ny, Nz and shy as C int while the
// fields here are Go int; presumably only the low 32 bits are consumed on a
// little-endian 64-bit host -- confirm.
type shiftbytesy_args_t struct {
	arg_dst   unsafe.Pointer
	arg_src   unsafe.Pointer
	arg_Nx    int
	arg_Ny    int
	arg_Nz    int
	arg_shy   int
	arg_clamp byte
	argptr    [7]unsafe.Pointer // addresses of the seven arg_* fields above, filled in by init
	sync.Mutex                  // held by k_shiftbytesy_async while the arguments are set and the kernel is launched
}

// Stores the arguments for shiftbytesy kernel invocation
// (single package-level instance shared by all calls).
var shiftbytesy_args shiftbytesy_args_t

func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	shiftbytesy_args.argptr[0] = unsafe.Pointer(&shiftbytesy_args.arg_dst)
	shiftbytesy_args.argptr[1] = unsafe.Pointer(&shiftbytesy_args.arg_src)
	shiftbytesy_args.argptr[2] = unsafe.Pointer(&shiftbytesy_args.arg_Nx)
	shiftbytesy_args.argptr[3] = unsafe.Pointer(&shiftbytesy_args.arg_Ny)
	shiftbytesy_args.argptr[4] = unsafe.Pointer(&shiftbytesy_args.arg_Nz)
	shiftbytesy_args.argptr[5] = unsafe.Pointer(&shiftbytesy_args.arg_shy)
	shiftbytesy_args.argptr[6] = unsafe.Pointer(&shiftbytesy_args.arg_clamp)
}

// Wrapper for shiftbytesy CUDA kernel, asynchronous.
//
// The call stores its arguments in the shared shiftbytesy_args struct (guarded
// by its mutex), loads the kernel on first use and launches it with the grid
// and block dimensions from cfg. When the package-level Synchronous flag is set
// (debug), it calls Sync() before and after the launch and times the kernel
// under the name "shiftbytesy".
func k_shiftbytesy_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shy int, clamp byte, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("shiftbytesy")
	}

	// The argument struct is shared by all callers; the lock is held until this
	// function returns, i.e. also across the launch below.
	shiftbytesy_args.Lock()
	defer shiftbytesy_args.Unlock()

	// Load the kernel the first time it is needed.
	if shiftbytesy_code == 0 {
		shiftbytesy_code = fatbinLoad(shiftbytesy_map, "shiftbytesy")
	}

	shiftbytesy_args.arg_dst = dst
	shiftbytesy_args.arg_src = src
	shiftbytesy_args.arg_Nx = Nx
	shiftbytesy_args.arg_Ny = Ny
	shiftbytesy_args.arg_Nz = Nz
	shiftbytesy_args.arg_shy = shy
	shiftbytesy_args.arg_clamp = clamp

	// argptr already points at the fields set above (see init).
	args := shiftbytesy_args.argptr[:]
	// NOTE(review): the literal 0 is presumably the dynamic shared-memory size -- confirm against cu.LaunchKernel.
	cu.LaunchKernel(shiftbytesy_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("shiftbytesy")
	}
}

// maps compute capability on PTX code for shiftbytesy kernel.
var shiftbytesy_map = map[int]string{0: "", 50: shiftbytesy_ptx_50, 52: shiftbytesy_ptx_52, 53: shiftbytesy_ptx_53, 60: shiftbytesy_ptx_60, 61: shiftbytesy_ptx_61, 62: shiftbytesy_ptx_62, 70: shiftbytesy_ptx_70, 72: shiftbytesy_ptx_72, 75: shiftbytesy_ptx_75, 80: shiftbytesy_ptx_80, 86: shiftbytesy_ptx_86, 87: shiftbytesy_ptx_87, 89: shiftbytesy_ptx_89, 90: shiftbytesy_ptx_90} // shiftbytesy PTX code for various compute capabilities. const ( shiftbytesy_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytesy_param_6]; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra $L__BB0_3; add.s32 %r19, %r4, %r5; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; 
cvt.s64.s32 %rd6, %r22; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytesy_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytesy_param_6]; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra $L__BB0_3; add.s32 %r19, %r4, %r5; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd6, %r22; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytesy_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 
shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytesy_param_6]; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra $L__BB0_3; add.s32 %r19, %r4, %r5; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd6, %r22; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytesy_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytesy_param_6]; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 
%rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra $L__BB0_3; add.s32 %r19, %r4, %r5; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd6, %r22; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytesy_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytesy_param_6]; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; 
mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra $L__BB0_3; add.s32 %r19, %r4, %r5; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd6, %r22; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytesy_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytesy_param_6]; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra 
$L__BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra $L__BB0_3; add.s32 %r19, %r4, %r5; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd6, %r22; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytesy_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytesy_param_6]; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra $L__BB0_3; add.s32 %r19, %r4, %r5; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; 
$L__BB0_3: add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd6, %r22; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytesy_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytesy_param_6]; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra $L__BB0_3; add.s32 %r19, %r4, %r5; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd6, %r22; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytesy_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param 
.u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytesy_param_6]; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra $L__BB0_3; add.s32 %r19, %r4, %r5; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd6, %r22; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytesy_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u8 %rs4, 
[shiftbytesy_param_6]; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra $L__BB0_3; add.s32 %r19, %r4, %r5; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd6, %r22; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytesy_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytesy_param_6]; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; 
mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra $L__BB0_3; add.s32 %r19, %r4, %r5; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd6, %r22; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytesy_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytesy_param_6]; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred 
%p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra $L__BB0_3; add.s32 %r19, %r4, %r5; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd6, %r22; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytesy_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytesy_param_6]; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra $L__BB0_3; add.s32 %r19, %r4, %r5; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd3, %r20; 
cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd6, %r22; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` shiftbytesy_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u8 %rs4, [shiftbytesy_param_6]; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra $L__BB0_3; add.s32 %r19, %r4, %r5; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd3, %r20; cvta.to.global.u64 %rd4, %rd2; add.s64 %rd5, %rd4, %rd3; ld.global.nc.u8 %rs4, [%rd5]; $L__BB0_3: add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd6, %r22; cvta.to.global.u64 %rd7, %rd1; add.s64 %rd8, %rd7, %rd6; st.global.u8 [%rd8], %rs4; $L__BB0_4: ret; } ` ) 
3-3.11.1/cuda/shiftedgecarryx.cu000066400000000000000000000033111503346766200164110ustar00rootroot00000000000000#include "stencil.h" // Shifts a component `src` of a vector field by `shx` cells along the X-axis. // Unlike the normal `shiftx()`, the new edge value is the current edge value. // // To avoid the situation where the magnetization could be set to (0,0,0) within the geometry, it is // also required to pass the two other vector components `othercomp` and `anothercomp` to this function. // In cells where the vector (`src`, `othercomp`, `anothercomp`) is the zero-vector, // `clampL` or `clampR` is used for the component `src` instead. extern "C" __global__ void shiftedgecarryX(float* __restrict__ dst, float* __restrict__ src, float* __restrict__ othercomp, float* __restrict__ anothercomp, int Nx, int Ny, int Nz, int shx, float clampL, float clampR) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if(ix < Nx && iy < Ny && iz < Nz) { int ix2 = ix-shx; // old X-index float newval; if (ix2 < 0) { // left edge (shifting right) newval = src[idx(0, iy, iz)]; if (newval == 0 && othercomp[idx(0, iy, iz)] == 0 && anothercomp[idx(0, iy, iz)] == 0) { // If zero-vector newval = clampL; } } else if (ix2 >= Nx) { // right edge (shifting left) newval = src[idx(Nx-1, iy, iz)]; if (newval == 0 && othercomp[idx(Nx-1, iy, iz)] == 0 && anothercomp[idx(Nx-1, iy, iz)] == 0) { // If zero-vector newval = clampR; } } else { // bulk, doesn't matter which way the shift is newval = src[idx(ix2, iy, iz)]; } dst[idx(ix, iy, iz)] = newval; } } 3-3.11.1/cuda/shiftedgecarryx_wrapper.go000066400000000000000000001427711503346766200201650ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for shiftedgecarryX kernel var shiftedgecarryX_code cu.Function // Stores the arguments for shiftedgecarryX kernel invocation type shiftedgecarryX_args_t struct { arg_dst unsafe.Pointer arg_src unsafe.Pointer arg_othercomp unsafe.Pointer arg_anothercomp unsafe.Pointer arg_Nx int arg_Ny int arg_Nz int arg_shx int arg_clampL float32 arg_clampR float32 argptr [10]unsafe.Pointer sync.Mutex } // Stores the arguments for shiftedgecarryX kernel invocation var shiftedgecarryX_args shiftedgecarryX_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. shiftedgecarryX_args.argptr[0] = unsafe.Pointer(&shiftedgecarryX_args.arg_dst) shiftedgecarryX_args.argptr[1] = unsafe.Pointer(&shiftedgecarryX_args.arg_src) shiftedgecarryX_args.argptr[2] = unsafe.Pointer(&shiftedgecarryX_args.arg_othercomp) shiftedgecarryX_args.argptr[3] = unsafe.Pointer(&shiftedgecarryX_args.arg_anothercomp) shiftedgecarryX_args.argptr[4] = unsafe.Pointer(&shiftedgecarryX_args.arg_Nx) shiftedgecarryX_args.argptr[5] = unsafe.Pointer(&shiftedgecarryX_args.arg_Ny) shiftedgecarryX_args.argptr[6] = unsafe.Pointer(&shiftedgecarryX_args.arg_Nz) shiftedgecarryX_args.argptr[7] = unsafe.Pointer(&shiftedgecarryX_args.arg_shx) shiftedgecarryX_args.argptr[8] = unsafe.Pointer(&shiftedgecarryX_args.arg_clampL) shiftedgecarryX_args.argptr[9] = unsafe.Pointer(&shiftedgecarryX_args.arg_clampR) } // Wrapper for shiftedgecarryX CUDA kernel, asynchronous. 
func k_shiftedgecarryX_async(dst unsafe.Pointer, src unsafe.Pointer, othercomp unsafe.Pointer, anothercomp unsafe.Pointer, Nx int, Ny int, Nz int, shx int, clampL float32, clampR float32, cfg *config) { if Synchronous { // debug Sync() timer.Start("shiftedgecarryX") } shiftedgecarryX_args.Lock() defer shiftedgecarryX_args.Unlock() if shiftedgecarryX_code == 0 { shiftedgecarryX_code = fatbinLoad(shiftedgecarryX_map, "shiftedgecarryX") } shiftedgecarryX_args.arg_dst = dst shiftedgecarryX_args.arg_src = src shiftedgecarryX_args.arg_othercomp = othercomp shiftedgecarryX_args.arg_anothercomp = anothercomp shiftedgecarryX_args.arg_Nx = Nx shiftedgecarryX_args.arg_Ny = Ny shiftedgecarryX_args.arg_Nz = Nz shiftedgecarryX_args.arg_shx = shx shiftedgecarryX_args.arg_clampL = clampL shiftedgecarryX_args.arg_clampR = clampR args := shiftedgecarryX_args.argptr[:] cu.LaunchKernel(shiftedgecarryX_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("shiftedgecarryX") } } // maps compute capability on PTX code for shiftedgecarryX kernel. var shiftedgecarryX_map = map[int]string{0: "", 50: shiftedgecarryX_ptx_50, 52: shiftedgecarryX_ptx_52, 53: shiftedgecarryX_ptx_53, 60: shiftedgecarryX_ptx_60, 61: shiftedgecarryX_ptx_61, 62: shiftedgecarryX_ptx_62, 70: shiftedgecarryX_ptx_70, 72: shiftedgecarryX_ptx_72, 75: shiftedgecarryX_ptx_75, 80: shiftedgecarryX_ptx_80, 86: shiftedgecarryX_ptx_86, 87: shiftedgecarryX_ptx_87, 89: shiftedgecarryX_ptx_89, 90: shiftedgecarryX_ptx_90} // shiftedgecarryX PTX code for various compute capabilities. 
const ( shiftedgecarryX_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl shiftedgecarryX .visible .entry shiftedgecarryX( .param .u64 shiftedgecarryX_param_0, .param .u64 shiftedgecarryX_param_1, .param .u64 shiftedgecarryX_param_2, .param .u64 shiftedgecarryX_param_3, .param .u32 shiftedgecarryX_param_4, .param .u32 shiftedgecarryX_param_5, .param .u32 shiftedgecarryX_param_6, .param .u32 shiftedgecarryX_param_7, .param .f32 shiftedgecarryX_param_8, .param .f32 shiftedgecarryX_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<27>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryX_param_0]; ld.param.u64 %rd7, [shiftedgecarryX_param_1]; ld.param.u64 %rd8, [shiftedgecarryX_param_2]; ld.param.u64 %rd9, [shiftedgecarryX_param_3]; ld.param.u32 %r6, [shiftedgecarryX_param_4]; ld.param.u32 %r7, [shiftedgecarryX_param_5]; ld.param.u32 %r9, [shiftedgecarryX_param_6]; ld.param.u32 %r8, [shiftedgecarryX_param_7]; ld.param.f32 %f7, [shiftedgecarryX_param_8]; ld.param.f32 %f8, [shiftedgecarryX_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r6; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r5, %r4; mul.wide.s32 %rd16, %r22, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mad.lo.s32 %r23, %r3, %r7, %r2; mul.lo.s32 %r24, %r23, %r6; 
cvt.s64.s32 %rd5, %r24; mul.wide.s32 %rd18, %r24, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r20, %r6, %r5; add.s32 %r21, %r20, -1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r25, %r3, %r7, %r2; mad.lo.s32 %r26, %r25, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r26, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryX_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl shiftedgecarryX .visible .entry shiftedgecarryX( .param .u64 shiftedgecarryX_param_0, .param .u64 shiftedgecarryX_param_1, .param .u64 shiftedgecarryX_param_2, .param .u64 shiftedgecarryX_param_3, .param .u32 shiftedgecarryX_param_4, .param .u32 shiftedgecarryX_param_5, .param .u32 shiftedgecarryX_param_6, .param .u32 shiftedgecarryX_param_7, .param .f32 shiftedgecarryX_param_8, .param .f32 shiftedgecarryX_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<27>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryX_param_0]; ld.param.u64 %rd7, [shiftedgecarryX_param_1]; ld.param.u64 %rd8, [shiftedgecarryX_param_2]; ld.param.u64 %rd9, [shiftedgecarryX_param_3]; ld.param.u32 %r6, [shiftedgecarryX_param_4]; ld.param.u32 %r7, 
[shiftedgecarryX_param_5]; ld.param.u32 %r9, [shiftedgecarryX_param_6]; ld.param.u32 %r8, [shiftedgecarryX_param_7]; ld.param.f32 %f7, [shiftedgecarryX_param_8]; ld.param.f32 %f8, [shiftedgecarryX_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r6; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r5, %r4; mul.wide.s32 %rd16, %r22, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mad.lo.s32 %r23, %r3, %r7, %r2; mul.lo.s32 %r24, %r23, %r6; cvt.s64.s32 %rd5, %r24; mul.wide.s32 %rd18, %r24, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r20, %r6, %r5; add.s32 %r21, %r20, -1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; 
add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r25, %r3, %r7, %r2; mad.lo.s32 %r26, %r25, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r26, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryX_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl shiftedgecarryX .visible .entry shiftedgecarryX( .param .u64 shiftedgecarryX_param_0, .param .u64 shiftedgecarryX_param_1, .param .u64 shiftedgecarryX_param_2, .param .u64 shiftedgecarryX_param_3, .param .u32 shiftedgecarryX_param_4, .param .u32 shiftedgecarryX_param_5, .param .u32 shiftedgecarryX_param_6, .param .u32 shiftedgecarryX_param_7, .param .f32 shiftedgecarryX_param_8, .param .f32 shiftedgecarryX_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<27>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryX_param_0]; ld.param.u64 %rd7, [shiftedgecarryX_param_1]; ld.param.u64 %rd8, [shiftedgecarryX_param_2]; ld.param.u64 %rd9, [shiftedgecarryX_param_3]; ld.param.u32 %r6, [shiftedgecarryX_param_4]; ld.param.u32 %r7, [shiftedgecarryX_param_5]; ld.param.u32 %r9, [shiftedgecarryX_param_6]; ld.param.u32 %r8, [shiftedgecarryX_param_7]; ld.param.f32 %f7, [shiftedgecarryX_param_8]; ld.param.f32 %f8, [shiftedgecarryX_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; 
setp.lt.s32 %p7, %r4, %r6; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r5, %r4; mul.wide.s32 %rd16, %r22, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mad.lo.s32 %r23, %r3, %r7, %r2; mul.lo.s32 %r24, %r23, %r6; cvt.s64.s32 %rd5, %r24; mul.wide.s32 %rd18, %r24, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r20, %r6, %r5; add.s32 %r21, %r20, -1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r25, %r3, %r7, %r2; mad.lo.s32 %r26, %r25, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r26, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryX_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl shiftedgecarryX .visible .entry shiftedgecarryX( .param .u64 shiftedgecarryX_param_0, .param .u64 shiftedgecarryX_param_1, .param .u64 shiftedgecarryX_param_2, .param .u64 shiftedgecarryX_param_3, .param .u32 shiftedgecarryX_param_4, .param .u32 shiftedgecarryX_param_5, .param .u32 shiftedgecarryX_param_6, .param .u32 shiftedgecarryX_param_7, .param .f32 shiftedgecarryX_param_8, .param .f32 
shiftedgecarryX_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<27>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryX_param_0]; ld.param.u64 %rd7, [shiftedgecarryX_param_1]; ld.param.u64 %rd8, [shiftedgecarryX_param_2]; ld.param.u64 %rd9, [shiftedgecarryX_param_3]; ld.param.u32 %r6, [shiftedgecarryX_param_4]; ld.param.u32 %r7, [shiftedgecarryX_param_5]; ld.param.u32 %r9, [shiftedgecarryX_param_6]; ld.param.u32 %r8, [shiftedgecarryX_param_7]; ld.param.f32 %f7, [shiftedgecarryX_param_8]; ld.param.f32 %f8, [shiftedgecarryX_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r6; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r5, %r4; mul.wide.s32 %rd16, %r22, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mad.lo.s32 %r23, %r3, %r7, %r2; mul.lo.s32 %r24, %r23, %r6; cvt.s64.s32 %rd5, %r24; mul.wide.s32 %rd18, %r24, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 
%r20, %r6, %r5; add.s32 %r21, %r20, -1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r25, %r3, %r7, %r2; mad.lo.s32 %r26, %r25, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r26, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryX_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl shiftedgecarryX .visible .entry shiftedgecarryX( .param .u64 shiftedgecarryX_param_0, .param .u64 shiftedgecarryX_param_1, .param .u64 shiftedgecarryX_param_2, .param .u64 shiftedgecarryX_param_3, .param .u32 shiftedgecarryX_param_4, .param .u32 shiftedgecarryX_param_5, .param .u32 shiftedgecarryX_param_6, .param .u32 shiftedgecarryX_param_7, .param .f32 shiftedgecarryX_param_8, .param .f32 shiftedgecarryX_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<27>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryX_param_0]; ld.param.u64 %rd7, [shiftedgecarryX_param_1]; ld.param.u64 %rd8, [shiftedgecarryX_param_2]; ld.param.u64 %rd9, [shiftedgecarryX_param_3]; ld.param.u32 %r6, [shiftedgecarryX_param_4]; ld.param.u32 %r7, [shiftedgecarryX_param_5]; ld.param.u32 %r9, [shiftedgecarryX_param_6]; ld.param.u32 %r8, [shiftedgecarryX_param_7]; ld.param.f32 %f7, [shiftedgecarryX_param_8]; ld.param.f32 %f8, [shiftedgecarryX_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; 
mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r6; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r5, %r4; mul.wide.s32 %rd16, %r22, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mad.lo.s32 %r23, %r3, %r7, %r2; mul.lo.s32 %r24, %r23, %r6; cvt.s64.s32 %rd5, %r24; mul.wide.s32 %rd18, %r24, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r20, %r6, %r5; add.s32 %r21, %r20, -1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r25, %r3, %r7, %r2; mad.lo.s32 %r26, %r25, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r26, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryX_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl shiftedgecarryX .visible .entry shiftedgecarryX( 
.param .u64 shiftedgecarryX_param_0, .param .u64 shiftedgecarryX_param_1, .param .u64 shiftedgecarryX_param_2, .param .u64 shiftedgecarryX_param_3, .param .u32 shiftedgecarryX_param_4, .param .u32 shiftedgecarryX_param_5, .param .u32 shiftedgecarryX_param_6, .param .u32 shiftedgecarryX_param_7, .param .f32 shiftedgecarryX_param_8, .param .f32 shiftedgecarryX_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<27>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryX_param_0]; ld.param.u64 %rd7, [shiftedgecarryX_param_1]; ld.param.u64 %rd8, [shiftedgecarryX_param_2]; ld.param.u64 %rd9, [shiftedgecarryX_param_3]; ld.param.u32 %r6, [shiftedgecarryX_param_4]; ld.param.u32 %r7, [shiftedgecarryX_param_5]; ld.param.u32 %r9, [shiftedgecarryX_param_6]; ld.param.u32 %r8, [shiftedgecarryX_param_7]; ld.param.f32 %f7, [shiftedgecarryX_param_8]; ld.param.f32 %f8, [shiftedgecarryX_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r6; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r5, %r4; mul.wide.s32 %rd16, %r22, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mad.lo.s32 %r23, %r3, %r7, %r2; mul.lo.s32 %r24, %r23, %r6; cvt.s64.s32 %rd5, %r24; mul.wide.s32 %rd18, %r24, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 
0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r20, %r6, %r5; add.s32 %r21, %r20, -1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r25, %r3, %r7, %r2; mad.lo.s32 %r26, %r25, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r26, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryX_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl shiftedgecarryX .visible .entry shiftedgecarryX( .param .u64 shiftedgecarryX_param_0, .param .u64 shiftedgecarryX_param_1, .param .u64 shiftedgecarryX_param_2, .param .u64 shiftedgecarryX_param_3, .param .u32 shiftedgecarryX_param_4, .param .u32 shiftedgecarryX_param_5, .param .u32 shiftedgecarryX_param_6, .param .u32 shiftedgecarryX_param_7, .param .f32 shiftedgecarryX_param_8, .param .f32 shiftedgecarryX_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<27>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryX_param_0]; ld.param.u64 %rd7, [shiftedgecarryX_param_1]; ld.param.u64 %rd8, [shiftedgecarryX_param_2]; ld.param.u64 %rd9, [shiftedgecarryX_param_3]; ld.param.u32 %r6, [shiftedgecarryX_param_4]; ld.param.u32 %r7, [shiftedgecarryX_param_5]; ld.param.u32 %r9, [shiftedgecarryX_param_6]; ld.param.u32 %r8, [shiftedgecarryX_param_7]; ld.param.f32 %f7, 
[shiftedgecarryX_param_8]; ld.param.f32 %f8, [shiftedgecarryX_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r6; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r5, %r4; mul.wide.s32 %rd16, %r22, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mad.lo.s32 %r23, %r3, %r7, %r2; mul.lo.s32 %r24, %r23, %r6; cvt.s64.s32 %rd5, %r24; mul.wide.s32 %rd18, %r24, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r20, %r6, %r5; add.s32 %r21, %r20, -1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; 
$L__BB0_10: mad.lo.s32 %r25, %r3, %r7, %r2; mad.lo.s32 %r26, %r25, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r26, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryX_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl shiftedgecarryX .visible .entry shiftedgecarryX( .param .u64 shiftedgecarryX_param_0, .param .u64 shiftedgecarryX_param_1, .param .u64 shiftedgecarryX_param_2, .param .u64 shiftedgecarryX_param_3, .param .u32 shiftedgecarryX_param_4, .param .u32 shiftedgecarryX_param_5, .param .u32 shiftedgecarryX_param_6, .param .u32 shiftedgecarryX_param_7, .param .f32 shiftedgecarryX_param_8, .param .f32 shiftedgecarryX_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<27>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryX_param_0]; ld.param.u64 %rd7, [shiftedgecarryX_param_1]; ld.param.u64 %rd8, [shiftedgecarryX_param_2]; ld.param.u64 %rd9, [shiftedgecarryX_param_3]; ld.param.u32 %r6, [shiftedgecarryX_param_4]; ld.param.u32 %r7, [shiftedgecarryX_param_5]; ld.param.u32 %r9, [shiftedgecarryX_param_6]; ld.param.u32 %r8, [shiftedgecarryX_param_7]; ld.param.f32 %f7, [shiftedgecarryX_param_8]; ld.param.f32 %f8, [shiftedgecarryX_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r6; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; 
$L__BB0_6: add.s32 %r22, %r5, %r4; mul.wide.s32 %rd16, %r22, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mad.lo.s32 %r23, %r3, %r7, %r2; mul.lo.s32 %r24, %r23, %r6; cvt.s64.s32 %rd5, %r24; mul.wide.s32 %rd18, %r24, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r20, %r6, %r5; add.s32 %r21, %r20, -1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r25, %r3, %r7, %r2; mad.lo.s32 %r26, %r25, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r26, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryX_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl shiftedgecarryX .visible .entry shiftedgecarryX( .param .u64 shiftedgecarryX_param_0, .param .u64 shiftedgecarryX_param_1, .param .u64 shiftedgecarryX_param_2, .param .u64 shiftedgecarryX_param_3, .param .u32 shiftedgecarryX_param_4, .param .u32 shiftedgecarryX_param_5, .param .u32 shiftedgecarryX_param_6, .param .u32 shiftedgecarryX_param_7, .param .f32 shiftedgecarryX_param_8, .param .f32 shiftedgecarryX_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<27>; .reg .b64 %rd<27>; ld.param.u64 %rd6, 
[shiftedgecarryX_param_0]; ld.param.u64 %rd7, [shiftedgecarryX_param_1]; ld.param.u64 %rd8, [shiftedgecarryX_param_2]; ld.param.u64 %rd9, [shiftedgecarryX_param_3]; ld.param.u32 %r6, [shiftedgecarryX_param_4]; ld.param.u32 %r7, [shiftedgecarryX_param_5]; ld.param.u32 %r9, [shiftedgecarryX_param_6]; ld.param.u32 %r8, [shiftedgecarryX_param_7]; ld.param.f32 %f7, [shiftedgecarryX_param_8]; ld.param.f32 %f8, [shiftedgecarryX_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r6; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r5, %r4; mul.wide.s32 %rd16, %r22, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mad.lo.s32 %r23, %r3, %r7, %r2; mul.lo.s32 %r24, %r23, %r6; cvt.s64.s32 %rd5, %r24; mul.wide.s32 %rd18, %r24, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r20, %r6, %r5; add.s32 %r21, %r20, -1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; 
ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r25, %r3, %r7, %r2; mad.lo.s32 %r26, %r25, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r26, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryX_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl shiftedgecarryX .visible .entry shiftedgecarryX( .param .u64 shiftedgecarryX_param_0, .param .u64 shiftedgecarryX_param_1, .param .u64 shiftedgecarryX_param_2, .param .u64 shiftedgecarryX_param_3, .param .u32 shiftedgecarryX_param_4, .param .u32 shiftedgecarryX_param_5, .param .u32 shiftedgecarryX_param_6, .param .u32 shiftedgecarryX_param_7, .param .f32 shiftedgecarryX_param_8, .param .f32 shiftedgecarryX_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<27>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryX_param_0]; ld.param.u64 %rd7, [shiftedgecarryX_param_1]; ld.param.u64 %rd8, [shiftedgecarryX_param_2]; ld.param.u64 %rd9, [shiftedgecarryX_param_3]; ld.param.u32 %r6, [shiftedgecarryX_param_4]; ld.param.u32 %r7, [shiftedgecarryX_param_5]; ld.param.u32 %r9, [shiftedgecarryX_param_6]; ld.param.u32 %r8, [shiftedgecarryX_param_7]; ld.param.f32 %f7, [shiftedgecarryX_param_8]; ld.param.f32 %f8, [shiftedgecarryX_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, 
%r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r6; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r5, %r4; mul.wide.s32 %rd16, %r22, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mad.lo.s32 %r23, %r3, %r7, %r2; mul.lo.s32 %r24, %r23, %r6; cvt.s64.s32 %rd5, %r24; mul.wide.s32 %rd18, %r24, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r20, %r6, %r5; add.s32 %r21, %r20, -1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r25, %r3, %r7, %r2; mad.lo.s32 %r26, %r25, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r26, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryX_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl shiftedgecarryX .visible .entry shiftedgecarryX( .param .u64 shiftedgecarryX_param_0, .param .u64 shiftedgecarryX_param_1, .param .u64 shiftedgecarryX_param_2, .param .u64 
shiftedgecarryX_param_3, .param .u32 shiftedgecarryX_param_4, .param .u32 shiftedgecarryX_param_5, .param .u32 shiftedgecarryX_param_6, .param .u32 shiftedgecarryX_param_7, .param .f32 shiftedgecarryX_param_8, .param .f32 shiftedgecarryX_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<27>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryX_param_0]; ld.param.u64 %rd7, [shiftedgecarryX_param_1]; ld.param.u64 %rd8, [shiftedgecarryX_param_2]; ld.param.u64 %rd9, [shiftedgecarryX_param_3]; ld.param.u32 %r6, [shiftedgecarryX_param_4]; ld.param.u32 %r7, [shiftedgecarryX_param_5]; ld.param.u32 %r9, [shiftedgecarryX_param_6]; ld.param.u32 %r8, [shiftedgecarryX_param_7]; ld.param.f32 %f7, [shiftedgecarryX_param_8]; ld.param.f32 %f8, [shiftedgecarryX_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r6; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r5, %r4; mul.wide.s32 %rd16, %r22, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mad.lo.s32 %r23, %r3, %r7, %r2; mul.lo.s32 %r24, %r23, %r6; cvt.s64.s32 %rd5, %r24; mul.wide.s32 %rd18, %r24, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; 
setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r20, %r6, %r5; add.s32 %r21, %r20, -1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r25, %r3, %r7, %r2; mad.lo.s32 %r26, %r25, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r26, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryX_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl shiftedgecarryX .visible .entry shiftedgecarryX( .param .u64 shiftedgecarryX_param_0, .param .u64 shiftedgecarryX_param_1, .param .u64 shiftedgecarryX_param_2, .param .u64 shiftedgecarryX_param_3, .param .u32 shiftedgecarryX_param_4, .param .u32 shiftedgecarryX_param_5, .param .u32 shiftedgecarryX_param_6, .param .u32 shiftedgecarryX_param_7, .param .f32 shiftedgecarryX_param_8, .param .f32 shiftedgecarryX_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<27>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryX_param_0]; ld.param.u64 %rd7, [shiftedgecarryX_param_1]; ld.param.u64 %rd8, [shiftedgecarryX_param_2]; ld.param.u64 %rd9, [shiftedgecarryX_param_3]; ld.param.u32 %r6, [shiftedgecarryX_param_4]; ld.param.u32 %r7, [shiftedgecarryX_param_5]; ld.param.u32 %r9, [shiftedgecarryX_param_6]; ld.param.u32 %r8, [shiftedgecarryX_param_7]; ld.param.f32 %f7, [shiftedgecarryX_param_8]; ld.param.f32 %f8, [shiftedgecarryX_param_9]; cvta.to.global.u64 %rd1, %rd9; 
cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r6; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r5, %r4; mul.wide.s32 %rd16, %r22, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mad.lo.s32 %r23, %r3, %r7, %r2; mul.lo.s32 %r24, %r23, %r6; cvt.s64.s32 %rd5, %r24; mul.wide.s32 %rd18, %r24, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r20, %r6, %r5; add.s32 %r21, %r20, -1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r25, %r3, %r7, %r2; mad.lo.s32 %r26, %r25, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; 
mul.wide.s32 %rd25, %r26, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryX_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl shiftedgecarryX .visible .entry shiftedgecarryX( .param .u64 shiftedgecarryX_param_0, .param .u64 shiftedgecarryX_param_1, .param .u64 shiftedgecarryX_param_2, .param .u64 shiftedgecarryX_param_3, .param .u32 shiftedgecarryX_param_4, .param .u32 shiftedgecarryX_param_5, .param .u32 shiftedgecarryX_param_6, .param .u32 shiftedgecarryX_param_7, .param .f32 shiftedgecarryX_param_8, .param .f32 shiftedgecarryX_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<27>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryX_param_0]; ld.param.u64 %rd7, [shiftedgecarryX_param_1]; ld.param.u64 %rd8, [shiftedgecarryX_param_2]; ld.param.u64 %rd9, [shiftedgecarryX_param_3]; ld.param.u32 %r6, [shiftedgecarryX_param_4]; ld.param.u32 %r7, [shiftedgecarryX_param_5]; ld.param.u32 %r9, [shiftedgecarryX_param_6]; ld.param.u32 %r8, [shiftedgecarryX_param_7]; ld.param.f32 %f7, [shiftedgecarryX_param_8]; ld.param.f32 %f8, [shiftedgecarryX_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r6; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r5, %r4; mul.wide.s32 %rd16, %r22, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, 
[%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mad.lo.s32 %r23, %r3, %r7, %r2; mul.lo.s32 %r24, %r23, %r6; cvt.s64.s32 %rd5, %r24; mul.wide.s32 %rd18, %r24, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r20, %r6, %r5; add.s32 %r21, %r20, -1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r25, %r3, %r7, %r2; mad.lo.s32 %r26, %r25, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r26, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryX_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl shiftedgecarryX .visible .entry shiftedgecarryX( .param .u64 shiftedgecarryX_param_0, .param .u64 shiftedgecarryX_param_1, .param .u64 shiftedgecarryX_param_2, .param .u64 shiftedgecarryX_param_3, .param .u32 shiftedgecarryX_param_4, .param .u32 shiftedgecarryX_param_5, .param .u32 shiftedgecarryX_param_6, .param .u32 shiftedgecarryX_param_7, .param .f32 shiftedgecarryX_param_8, .param .f32 shiftedgecarryX_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<27>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryX_param_0]; ld.param.u64 %rd7, [shiftedgecarryX_param_1]; ld.param.u64 %rd8, [shiftedgecarryX_param_2]; 
ld.param.u64 %rd9, [shiftedgecarryX_param_3]; ld.param.u32 %r6, [shiftedgecarryX_param_4]; ld.param.u32 %r7, [shiftedgecarryX_param_5]; ld.param.u32 %r9, [shiftedgecarryX_param_6]; ld.param.u32 %r8, [shiftedgecarryX_param_7]; ld.param.f32 %f7, [shiftedgecarryX_param_8]; ld.param.f32 %f8, [shiftedgecarryX_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r6; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r5, %r4; mul.wide.s32 %rd16, %r22, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mad.lo.s32 %r23, %r3, %r7, %r2; mul.lo.s32 %r24, %r23, %r6; cvt.s64.s32 %rd5, %r24; mul.wide.s32 %rd18, %r24, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r20, %r6, %r5; add.s32 %r21, %r20, -1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 
%rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r25, %r3, %r7, %r2; mad.lo.s32 %r26, %r25, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r26, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` ) 3-3.11.1/cuda/shiftedgecarryy.cu000066400000000000000000000033061503346766200164160ustar00rootroot00000000000000#include "stencil.h" // Shifts a component `src` of a vector field by `shy` cells along the Y-axis. // Unlike the normal `shifty()`, the new edge value is the current edge value. // // To avoid the situation where the magnetization could be set to (0,0,0) within the geometry, it is // also required to pass the two other vector components `othercomp` and `anothercomp` to this function. // In cells where the vector (`src`, `othercomp`, `anothercomp`) is the zero-vector, // `clampD` or `clampU` is used for the component `src` instead. 
extern "C" __global__ void shiftedgecarryY(float* __restrict__ dst, float* __restrict__ src, float* __restrict__ othercomp, float* __restrict__ anothercomp, int Nx, int Ny, int Nz, int shy, float clampD, float clampU) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if(ix < Nx && iy < Ny && iz < Nz) { int iy2 = iy-shy; // old Y-index float newval; if (iy2 < 0) { // bottom edge (shifting up) newval = src[idx(ix, 0, iz)]; if (newval == 0 && othercomp[idx(ix, 0, iz)] == 0 && anothercomp[idx(ix, 0, iz)] == 0) { // If zero-vector newval = clampD; } } else if (iy2 >= Ny) { // top edge (shifting down) newval = src[idx(ix, Ny-1, iz)]; if (newval == 0 && othercomp[idx(ix, Ny-1, iz)] == 0 && anothercomp[idx(ix, Ny-1, iz)] == 0) { // If zero-vector newval = clampU; } } else { // bulk, doesn't matter which way the shift is newval = src[idx(ix, iy2, iz)]; } dst[idx(ix, iy, iz)] = newval; } } 3-3.11.1/cuda/shiftedgecarryy_wrapper.go000066400000000000000000001437431503346766200201660ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for shiftedgecarryY kernel var shiftedgecarryY_code cu.Function // Stores the arguments for shiftedgecarryY kernel invocation type shiftedgecarryY_args_t struct { arg_dst unsafe.Pointer arg_src unsafe.Pointer arg_othercomp unsafe.Pointer arg_anothercomp unsafe.Pointer arg_Nx int arg_Ny int arg_Nz int arg_shy int arg_clampD float32 arg_clampU float32 argptr [10]unsafe.Pointer sync.Mutex } // Stores the arguments for shiftedgecarryY kernel invocation var shiftedgecarryY_args shiftedgecarryY_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
shiftedgecarryY_args.argptr[0] = unsafe.Pointer(&shiftedgecarryY_args.arg_dst) shiftedgecarryY_args.argptr[1] = unsafe.Pointer(&shiftedgecarryY_args.arg_src) shiftedgecarryY_args.argptr[2] = unsafe.Pointer(&shiftedgecarryY_args.arg_othercomp) shiftedgecarryY_args.argptr[3] = unsafe.Pointer(&shiftedgecarryY_args.arg_anothercomp) shiftedgecarryY_args.argptr[4] = unsafe.Pointer(&shiftedgecarryY_args.arg_Nx) shiftedgecarryY_args.argptr[5] = unsafe.Pointer(&shiftedgecarryY_args.arg_Ny) shiftedgecarryY_args.argptr[6] = unsafe.Pointer(&shiftedgecarryY_args.arg_Nz) shiftedgecarryY_args.argptr[7] = unsafe.Pointer(&shiftedgecarryY_args.arg_shy) shiftedgecarryY_args.argptr[8] = unsafe.Pointer(&shiftedgecarryY_args.arg_clampD) shiftedgecarryY_args.argptr[9] = unsafe.Pointer(&shiftedgecarryY_args.arg_clampU) } // Wrapper for shiftedgecarryY CUDA kernel, asynchronous. func k_shiftedgecarryY_async(dst unsafe.Pointer, src unsafe.Pointer, othercomp unsafe.Pointer, anothercomp unsafe.Pointer, Nx int, Ny int, Nz int, shy int, clampD float32, clampU float32, cfg *config) { if Synchronous { // debug Sync() timer.Start("shiftedgecarryY") } shiftedgecarryY_args.Lock() defer shiftedgecarryY_args.Unlock() if shiftedgecarryY_code == 0 { shiftedgecarryY_code = fatbinLoad(shiftedgecarryY_map, "shiftedgecarryY") } shiftedgecarryY_args.arg_dst = dst shiftedgecarryY_args.arg_src = src shiftedgecarryY_args.arg_othercomp = othercomp shiftedgecarryY_args.arg_anothercomp = anothercomp shiftedgecarryY_args.arg_Nx = Nx shiftedgecarryY_args.arg_Ny = Ny shiftedgecarryY_args.arg_Nz = Nz shiftedgecarryY_args.arg_shy = shy shiftedgecarryY_args.arg_clampD = clampD shiftedgecarryY_args.arg_clampU = clampU args := shiftedgecarryY_args.argptr[:] cu.LaunchKernel(shiftedgecarryY_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("shiftedgecarryY") } } // maps compute capability on PTX code for shiftedgecarryY kernel. 
var shiftedgecarryY_map = map[int]string{0: "", 50: shiftedgecarryY_ptx_50, 52: shiftedgecarryY_ptx_52, 53: shiftedgecarryY_ptx_53, 60: shiftedgecarryY_ptx_60, 61: shiftedgecarryY_ptx_61, 62: shiftedgecarryY_ptx_62, 70: shiftedgecarryY_ptx_70, 72: shiftedgecarryY_ptx_72, 75: shiftedgecarryY_ptx_75, 80: shiftedgecarryY_ptx_80, 86: shiftedgecarryY_ptx_86, 87: shiftedgecarryY_ptx_87, 89: shiftedgecarryY_ptx_89, 90: shiftedgecarryY_ptx_90} // shiftedgecarryY PTX code for various compute capabilities. const ( shiftedgecarryY_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl shiftedgecarryY .visible .entry shiftedgecarryY( .param .u64 shiftedgecarryY_param_0, .param .u64 shiftedgecarryY_param_1, .param .u64 shiftedgecarryY_param_2, .param .u64 shiftedgecarryY_param_3, .param .u32 shiftedgecarryY_param_4, .param .u32 shiftedgecarryY_param_5, .param .u32 shiftedgecarryY_param_6, .param .u32 shiftedgecarryY_param_7, .param .f32 shiftedgecarryY_param_8, .param .f32 shiftedgecarryY_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<28>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryY_param_0]; ld.param.u64 %rd7, [shiftedgecarryY_param_1]; ld.param.u64 %rd8, [shiftedgecarryY_param_2]; ld.param.u64 %rd9, [shiftedgecarryY_param_3]; ld.param.u32 %r6, [shiftedgecarryY_param_4]; ld.param.u32 %r7, [shiftedgecarryY_param_5]; ld.param.u32 %r9, [shiftedgecarryY_param_6]; ld.param.u32 %r8, [shiftedgecarryY_param_7]; ld.param.f32 %f7, [shiftedgecarryY_param_8]; ld.param.f32 %f8, [shiftedgecarryY_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; 
or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r7; mul.lo.s32 %r5, %r3, %r7; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r4, %r5; mad.lo.s32 %r23, %r22, %r6, %r1; mul.wide.s32 %rd16, %r23, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mul.lo.s32 %r24, %r3, %r6; mad.lo.s32 %r25, %r24, %r7, %r1; cvt.s64.s32 %rd5, %r25; mul.wide.s32 %rd18, %r25, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r19, %r7, %r5; add.s32 %r20, %r19, -1; mad.lo.s32 %r21, %r20, %r6, %r1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r26, %r3, %r7, %r2; mad.lo.s32 %r27, %r26, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r27, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryY_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl shiftedgecarryY .visible .entry shiftedgecarryY( .param .u64 shiftedgecarryY_param_0, .param .u64 shiftedgecarryY_param_1, .param .u64 shiftedgecarryY_param_2, .param .u64 shiftedgecarryY_param_3, .param 
.u32 shiftedgecarryY_param_4, .param .u32 shiftedgecarryY_param_5, .param .u32 shiftedgecarryY_param_6, .param .u32 shiftedgecarryY_param_7, .param .f32 shiftedgecarryY_param_8, .param .f32 shiftedgecarryY_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<28>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryY_param_0]; ld.param.u64 %rd7, [shiftedgecarryY_param_1]; ld.param.u64 %rd8, [shiftedgecarryY_param_2]; ld.param.u64 %rd9, [shiftedgecarryY_param_3]; ld.param.u32 %r6, [shiftedgecarryY_param_4]; ld.param.u32 %r7, [shiftedgecarryY_param_5]; ld.param.u32 %r9, [shiftedgecarryY_param_6]; ld.param.u32 %r8, [shiftedgecarryY_param_7]; ld.param.f32 %f7, [shiftedgecarryY_param_8]; ld.param.f32 %f8, [shiftedgecarryY_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r7; mul.lo.s32 %r5, %r3, %r7; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r4, %r5; mad.lo.s32 %r23, %r22, %r6, %r1; mul.wide.s32 %rd16, %r23, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mul.lo.s32 %r24, %r3, %r6; mad.lo.s32 %r25, %r24, %r7, %r1; cvt.s64.s32 %rd5, %r25; mul.wide.s32 %rd18, %r25, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; 
@%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r19, %r7, %r5; add.s32 %r20, %r19, -1; mad.lo.s32 %r21, %r20, %r6, %r1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r26, %r3, %r7, %r2; mad.lo.s32 %r27, %r26, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r27, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryY_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl shiftedgecarryY .visible .entry shiftedgecarryY( .param .u64 shiftedgecarryY_param_0, .param .u64 shiftedgecarryY_param_1, .param .u64 shiftedgecarryY_param_2, .param .u64 shiftedgecarryY_param_3, .param .u32 shiftedgecarryY_param_4, .param .u32 shiftedgecarryY_param_5, .param .u32 shiftedgecarryY_param_6, .param .u32 shiftedgecarryY_param_7, .param .f32 shiftedgecarryY_param_8, .param .f32 shiftedgecarryY_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<28>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryY_param_0]; ld.param.u64 %rd7, [shiftedgecarryY_param_1]; ld.param.u64 %rd8, [shiftedgecarryY_param_2]; ld.param.u64 %rd9, [shiftedgecarryY_param_3]; ld.param.u32 %r6, [shiftedgecarryY_param_4]; ld.param.u32 %r7, [shiftedgecarryY_param_5]; ld.param.u32 %r9, [shiftedgecarryY_param_6]; ld.param.u32 %r8, [shiftedgecarryY_param_7]; ld.param.f32 %f7, [shiftedgecarryY_param_8]; ld.param.f32 %f8, [shiftedgecarryY_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 
%rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r7; mul.lo.s32 %r5, %r3, %r7; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r4, %r5; mad.lo.s32 %r23, %r22, %r6, %r1; mul.wide.s32 %rd16, %r23, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mul.lo.s32 %r24, %r3, %r6; mad.lo.s32 %r25, %r24, %r7, %r1; cvt.s64.s32 %rd5, %r25; mul.wide.s32 %rd18, %r25, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r19, %r7, %r5; add.s32 %r20, %r19, -1; mad.lo.s32 %r21, %r20, %r6, %r1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r26, %r3, %r7, %r2; mad.lo.s32 %r27, %r26, %r6, %r1; cvta.to.global.u64 
%rd24, %rd6; mul.wide.s32 %rd25, %r27, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryY_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl shiftedgecarryY .visible .entry shiftedgecarryY( .param .u64 shiftedgecarryY_param_0, .param .u64 shiftedgecarryY_param_1, .param .u64 shiftedgecarryY_param_2, .param .u64 shiftedgecarryY_param_3, .param .u32 shiftedgecarryY_param_4, .param .u32 shiftedgecarryY_param_5, .param .u32 shiftedgecarryY_param_6, .param .u32 shiftedgecarryY_param_7, .param .f32 shiftedgecarryY_param_8, .param .f32 shiftedgecarryY_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<28>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryY_param_0]; ld.param.u64 %rd7, [shiftedgecarryY_param_1]; ld.param.u64 %rd8, [shiftedgecarryY_param_2]; ld.param.u64 %rd9, [shiftedgecarryY_param_3]; ld.param.u32 %r6, [shiftedgecarryY_param_4]; ld.param.u32 %r7, [shiftedgecarryY_param_5]; ld.param.u32 %r9, [shiftedgecarryY_param_6]; ld.param.u32 %r8, [shiftedgecarryY_param_7]; ld.param.f32 %f7, [shiftedgecarryY_param_8]; ld.param.f32 %f8, [shiftedgecarryY_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r7; mul.lo.s32 %r5, %r3, %r7; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r4, %r5; mad.lo.s32 %r23, %r22, %r6, %r1; mul.wide.s32 %rd16, %r23, 4; add.s64 %rd17, %rd3, %rd16; 
ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mul.lo.s32 %r24, %r3, %r6; mad.lo.s32 %r25, %r24, %r7, %r1; cvt.s64.s32 %rd5, %r25; mul.wide.s32 %rd18, %r25, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r19, %r7, %r5; add.s32 %r20, %r19, -1; mad.lo.s32 %r21, %r20, %r6, %r1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r26, %r3, %r7, %r2; mad.lo.s32 %r27, %r26, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r27, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryY_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl shiftedgecarryY .visible .entry shiftedgecarryY( .param .u64 shiftedgecarryY_param_0, .param .u64 shiftedgecarryY_param_1, .param .u64 shiftedgecarryY_param_2, .param .u64 shiftedgecarryY_param_3, .param .u32 shiftedgecarryY_param_4, .param .u32 shiftedgecarryY_param_5, .param .u32 shiftedgecarryY_param_6, .param .u32 shiftedgecarryY_param_7, .param .f32 shiftedgecarryY_param_8, .param .f32 shiftedgecarryY_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<28>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryY_param_0]; ld.param.u64 %rd7, 
[shiftedgecarryY_param_1]; ld.param.u64 %rd8, [shiftedgecarryY_param_2]; ld.param.u64 %rd9, [shiftedgecarryY_param_3]; ld.param.u32 %r6, [shiftedgecarryY_param_4]; ld.param.u32 %r7, [shiftedgecarryY_param_5]; ld.param.u32 %r9, [shiftedgecarryY_param_6]; ld.param.u32 %r8, [shiftedgecarryY_param_7]; ld.param.f32 %f7, [shiftedgecarryY_param_8]; ld.param.f32 %f8, [shiftedgecarryY_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r7; mul.lo.s32 %r5, %r3, %r7; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r4, %r5; mad.lo.s32 %r23, %r22, %r6, %r1; mul.wide.s32 %rd16, %r23, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mul.lo.s32 %r24, %r3, %r6; mad.lo.s32 %r25, %r24, %r7, %r1; cvt.s64.s32 %rd5, %r25; mul.wide.s32 %rd18, %r25, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r19, %r7, %r5; add.s32 %r20, %r19, -1; mad.lo.s32 %r21, %r20, %r6, %r1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 
%f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r26, %r3, %r7, %r2; mad.lo.s32 %r27, %r26, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r27, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryY_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl shiftedgecarryY .visible .entry shiftedgecarryY( .param .u64 shiftedgecarryY_param_0, .param .u64 shiftedgecarryY_param_1, .param .u64 shiftedgecarryY_param_2, .param .u64 shiftedgecarryY_param_3, .param .u32 shiftedgecarryY_param_4, .param .u32 shiftedgecarryY_param_5, .param .u32 shiftedgecarryY_param_6, .param .u32 shiftedgecarryY_param_7, .param .f32 shiftedgecarryY_param_8, .param .f32 shiftedgecarryY_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<28>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryY_param_0]; ld.param.u64 %rd7, [shiftedgecarryY_param_1]; ld.param.u64 %rd8, [shiftedgecarryY_param_2]; ld.param.u64 %rd9, [shiftedgecarryY_param_3]; ld.param.u32 %r6, [shiftedgecarryY_param_4]; ld.param.u32 %r7, [shiftedgecarryY_param_5]; ld.param.u32 %r9, [shiftedgecarryY_param_6]; ld.param.u32 %r8, [shiftedgecarryY_param_7]; ld.param.f32 %f7, [shiftedgecarryY_param_8]; ld.param.f32 %f8, [shiftedgecarryY_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; 
setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r7; mul.lo.s32 %r5, %r3, %r7; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r4, %r5; mad.lo.s32 %r23, %r22, %r6, %r1; mul.wide.s32 %rd16, %r23, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mul.lo.s32 %r24, %r3, %r6; mad.lo.s32 %r25, %r24, %r7, %r1; cvt.s64.s32 %rd5, %r25; mul.wide.s32 %rd18, %r25, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r19, %r7, %r5; add.s32 %r20, %r19, -1; mad.lo.s32 %r21, %r20, %r6, %r1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r26, %r3, %r7, %r2; mad.lo.s32 %r27, %r26, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r27, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryY_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl shiftedgecarryY .visible .entry shiftedgecarryY( .param .u64 shiftedgecarryY_param_0, .param .u64 shiftedgecarryY_param_1, .param .u64 
shiftedgecarryY_param_2, .param .u64 shiftedgecarryY_param_3, .param .u32 shiftedgecarryY_param_4, .param .u32 shiftedgecarryY_param_5, .param .u32 shiftedgecarryY_param_6, .param .u32 shiftedgecarryY_param_7, .param .f32 shiftedgecarryY_param_8, .param .f32 shiftedgecarryY_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<28>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryY_param_0]; ld.param.u64 %rd7, [shiftedgecarryY_param_1]; ld.param.u64 %rd8, [shiftedgecarryY_param_2]; ld.param.u64 %rd9, [shiftedgecarryY_param_3]; ld.param.u32 %r6, [shiftedgecarryY_param_4]; ld.param.u32 %r7, [shiftedgecarryY_param_5]; ld.param.u32 %r9, [shiftedgecarryY_param_6]; ld.param.u32 %r8, [shiftedgecarryY_param_7]; ld.param.f32 %f7, [shiftedgecarryY_param_8]; ld.param.f32 %f8, [shiftedgecarryY_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r7; mul.lo.s32 %r5, %r3, %r7; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r4, %r5; mad.lo.s32 %r23, %r22, %r6, %r1; mul.wide.s32 %rd16, %r23, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mul.lo.s32 %r24, %r3, %r6; mad.lo.s32 %r25, %r24, %r7, %r1; cvt.s64.s32 %rd5, %r25; mul.wide.s32 %rd18, %r25, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; 
ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r19, %r7, %r5; add.s32 %r20, %r19, -1; mad.lo.s32 %r21, %r20, %r6, %r1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r26, %r3, %r7, %r2; mad.lo.s32 %r27, %r26, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r27, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryY_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl shiftedgecarryY .visible .entry shiftedgecarryY( .param .u64 shiftedgecarryY_param_0, .param .u64 shiftedgecarryY_param_1, .param .u64 shiftedgecarryY_param_2, .param .u64 shiftedgecarryY_param_3, .param .u32 shiftedgecarryY_param_4, .param .u32 shiftedgecarryY_param_5, .param .u32 shiftedgecarryY_param_6, .param .u32 shiftedgecarryY_param_7, .param .f32 shiftedgecarryY_param_8, .param .f32 shiftedgecarryY_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<28>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryY_param_0]; ld.param.u64 %rd7, [shiftedgecarryY_param_1]; ld.param.u64 %rd8, [shiftedgecarryY_param_2]; ld.param.u64 %rd9, [shiftedgecarryY_param_3]; ld.param.u32 %r6, [shiftedgecarryY_param_4]; ld.param.u32 %r7, [shiftedgecarryY_param_5]; ld.param.u32 %r9, [shiftedgecarryY_param_6]; ld.param.u32 %r8, [shiftedgecarryY_param_7]; ld.param.f32 %f7, [shiftedgecarryY_param_8]; ld.param.f32 %f8, 
[shiftedgecarryY_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r7; mul.lo.s32 %r5, %r3, %r7; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r4, %r5; mad.lo.s32 %r23, %r22, %r6, %r1; mul.wide.s32 %rd16, %r23, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mul.lo.s32 %r24, %r3, %r6; mad.lo.s32 %r25, %r24, %r7, %r1; cvt.s64.s32 %rd5, %r25; mul.wide.s32 %rd18, %r25, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r19, %r7, %r5; add.s32 %r20, %r19, -1; mad.lo.s32 %r21, %r20, %r6, %r1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 
%r26, %r3, %r7, %r2; mad.lo.s32 %r27, %r26, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r27, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryY_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl shiftedgecarryY .visible .entry shiftedgecarryY( .param .u64 shiftedgecarryY_param_0, .param .u64 shiftedgecarryY_param_1, .param .u64 shiftedgecarryY_param_2, .param .u64 shiftedgecarryY_param_3, .param .u32 shiftedgecarryY_param_4, .param .u32 shiftedgecarryY_param_5, .param .u32 shiftedgecarryY_param_6, .param .u32 shiftedgecarryY_param_7, .param .f32 shiftedgecarryY_param_8, .param .f32 shiftedgecarryY_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<28>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryY_param_0]; ld.param.u64 %rd7, [shiftedgecarryY_param_1]; ld.param.u64 %rd8, [shiftedgecarryY_param_2]; ld.param.u64 %rd9, [shiftedgecarryY_param_3]; ld.param.u32 %r6, [shiftedgecarryY_param_4]; ld.param.u32 %r7, [shiftedgecarryY_param_5]; ld.param.u32 %r9, [shiftedgecarryY_param_6]; ld.param.u32 %r8, [shiftedgecarryY_param_7]; ld.param.f32 %f7, [shiftedgecarryY_param_8]; ld.param.f32 %f8, [shiftedgecarryY_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r7; mul.lo.s32 %r5, %r3, %r7; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r4, %r5; mad.lo.s32 %r23, %r22, %r6, 
%r1; mul.wide.s32 %rd16, %r23, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mul.lo.s32 %r24, %r3, %r6; mad.lo.s32 %r25, %r24, %r7, %r1; cvt.s64.s32 %rd5, %r25; mul.wide.s32 %rd18, %r25, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r19, %r7, %r5; add.s32 %r20, %r19, -1; mad.lo.s32 %r21, %r20, %r6, %r1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r26, %r3, %r7, %r2; mad.lo.s32 %r27, %r26, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r27, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryY_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl shiftedgecarryY .visible .entry shiftedgecarryY( .param .u64 shiftedgecarryY_param_0, .param .u64 shiftedgecarryY_param_1, .param .u64 shiftedgecarryY_param_2, .param .u64 shiftedgecarryY_param_3, .param .u32 shiftedgecarryY_param_4, .param .u32 shiftedgecarryY_param_5, .param .u32 shiftedgecarryY_param_6, .param .u32 shiftedgecarryY_param_7, .param .f32 shiftedgecarryY_param_8, .param .f32 shiftedgecarryY_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<28>; .reg .b64 %rd<27>; ld.param.u64 %rd6, 
[shiftedgecarryY_param_0]; ld.param.u64 %rd7, [shiftedgecarryY_param_1]; ld.param.u64 %rd8, [shiftedgecarryY_param_2]; ld.param.u64 %rd9, [shiftedgecarryY_param_3]; ld.param.u32 %r6, [shiftedgecarryY_param_4]; ld.param.u32 %r7, [shiftedgecarryY_param_5]; ld.param.u32 %r9, [shiftedgecarryY_param_6]; ld.param.u32 %r8, [shiftedgecarryY_param_7]; ld.param.f32 %f7, [shiftedgecarryY_param_8]; ld.param.f32 %f8, [shiftedgecarryY_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r7; mul.lo.s32 %r5, %r3, %r7; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r4, %r5; mad.lo.s32 %r23, %r22, %r6, %r1; mul.wide.s32 %rd16, %r23, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mul.lo.s32 %r24, %r3, %r6; mad.lo.s32 %r25, %r24, %r7, %r1; cvt.s64.s32 %rd5, %r25; mul.wide.s32 %rd18, %r25, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r19, %r7, %r5; add.s32 %r20, %r19, -1; mad.lo.s32 %r21, %r20, %r6, %r1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 
4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r26, %r3, %r7, %r2; mad.lo.s32 %r27, %r26, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r27, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryY_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl shiftedgecarryY .visible .entry shiftedgecarryY( .param .u64 shiftedgecarryY_param_0, .param .u64 shiftedgecarryY_param_1, .param .u64 shiftedgecarryY_param_2, .param .u64 shiftedgecarryY_param_3, .param .u32 shiftedgecarryY_param_4, .param .u32 shiftedgecarryY_param_5, .param .u32 shiftedgecarryY_param_6, .param .u32 shiftedgecarryY_param_7, .param .f32 shiftedgecarryY_param_8, .param .f32 shiftedgecarryY_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<28>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryY_param_0]; ld.param.u64 %rd7, [shiftedgecarryY_param_1]; ld.param.u64 %rd8, [shiftedgecarryY_param_2]; ld.param.u64 %rd9, [shiftedgecarryY_param_3]; ld.param.u32 %r6, [shiftedgecarryY_param_4]; ld.param.u32 %r7, [shiftedgecarryY_param_5]; ld.param.u32 %r9, [shiftedgecarryY_param_6]; ld.param.u32 %r8, [shiftedgecarryY_param_7]; ld.param.f32 %f7, [shiftedgecarryY_param_8]; ld.param.f32 %f8, [shiftedgecarryY_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, 
%tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r7; mul.lo.s32 %r5, %r3, %r7; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r4, %r5; mad.lo.s32 %r23, %r22, %r6, %r1; mul.wide.s32 %rd16, %r23, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mul.lo.s32 %r24, %r3, %r6; mad.lo.s32 %r25, %r24, %r7, %r1; cvt.s64.s32 %rd5, %r25; mul.wide.s32 %rd18, %r25, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r19, %r7, %r5; add.s32 %r20, %r19, -1; mad.lo.s32 %r21, %r20, %r6, %r1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r26, %r3, %r7, %r2; mad.lo.s32 %r27, %r26, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r27, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryY_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl shiftedgecarryY .visible .entry shiftedgecarryY( .param .u64 shiftedgecarryY_param_0, .param .u64 
shiftedgecarryY_param_1, .param .u64 shiftedgecarryY_param_2, .param .u64 shiftedgecarryY_param_3, .param .u32 shiftedgecarryY_param_4, .param .u32 shiftedgecarryY_param_5, .param .u32 shiftedgecarryY_param_6, .param .u32 shiftedgecarryY_param_7, .param .f32 shiftedgecarryY_param_8, .param .f32 shiftedgecarryY_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<28>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryY_param_0]; ld.param.u64 %rd7, [shiftedgecarryY_param_1]; ld.param.u64 %rd8, [shiftedgecarryY_param_2]; ld.param.u64 %rd9, [shiftedgecarryY_param_3]; ld.param.u32 %r6, [shiftedgecarryY_param_4]; ld.param.u32 %r7, [shiftedgecarryY_param_5]; ld.param.u32 %r9, [shiftedgecarryY_param_6]; ld.param.u32 %r8, [shiftedgecarryY_param_7]; ld.param.f32 %f7, [shiftedgecarryY_param_8]; ld.param.f32 %f8, [shiftedgecarryY_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r7; mul.lo.s32 %r5, %r3, %r7; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r4, %r5; mad.lo.s32 %r23, %r22, %r6, %r1; mul.wide.s32 %rd16, %r23, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mul.lo.s32 %r24, %r3, %r6; mad.lo.s32 %r25, %r24, %r7, %r1; cvt.s64.s32 %rd5, %r25; mul.wide.s32 %rd18, %r25, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, 
%rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r19, %r7, %r5; add.s32 %r20, %r19, -1; mad.lo.s32 %r21, %r20, %r6, %r1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r26, %r3, %r7, %r2; mad.lo.s32 %r27, %r26, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r27, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryY_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl shiftedgecarryY .visible .entry shiftedgecarryY( .param .u64 shiftedgecarryY_param_0, .param .u64 shiftedgecarryY_param_1, .param .u64 shiftedgecarryY_param_2, .param .u64 shiftedgecarryY_param_3, .param .u32 shiftedgecarryY_param_4, .param .u32 shiftedgecarryY_param_5, .param .u32 shiftedgecarryY_param_6, .param .u32 shiftedgecarryY_param_7, .param .f32 shiftedgecarryY_param_8, .param .f32 shiftedgecarryY_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<28>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryY_param_0]; ld.param.u64 %rd7, [shiftedgecarryY_param_1]; ld.param.u64 %rd8, [shiftedgecarryY_param_2]; ld.param.u64 %rd9, [shiftedgecarryY_param_3]; ld.param.u32 %r6, [shiftedgecarryY_param_4]; ld.param.u32 %r7, [shiftedgecarryY_param_5]; ld.param.u32 %r9, [shiftedgecarryY_param_6]; ld.param.u32 %r8, [shiftedgecarryY_param_7]; ld.param.f32 %f7, 
[shiftedgecarryY_param_8]; ld.param.f32 %f8, [shiftedgecarryY_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r7; mul.lo.s32 %r5, %r3, %r7; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 %r22, %r4, %r5; mad.lo.s32 %r23, %r22, %r6, %r1; mul.wide.s32 %rd16, %r23, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mul.lo.s32 %r24, %r3, %r6; mad.lo.s32 %r25, %r24, %r7, %r1; cvt.s64.s32 %rd5, %r25; mul.wide.s32 %rd18, %r25, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r19, %r7, %r5; add.s32 %r20, %r19, -1; mad.lo.s32 %r21, %r20, %r6, %r1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 
%f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r26, %r3, %r7, %r2; mad.lo.s32 %r27, %r26, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r27, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` shiftedgecarryY_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl shiftedgecarryY .visible .entry shiftedgecarryY( .param .u64 shiftedgecarryY_param_0, .param .u64 shiftedgecarryY_param_1, .param .u64 shiftedgecarryY_param_2, .param .u64 shiftedgecarryY_param_3, .param .u32 shiftedgecarryY_param_4, .param .u32 shiftedgecarryY_param_5, .param .u32 shiftedgecarryY_param_6, .param .u32 shiftedgecarryY_param_7, .param .f32 shiftedgecarryY_param_8, .param .f32 shiftedgecarryY_param_9 ) { .reg .pred %p<14>; .reg .f32 %f<14>; .reg .b32 %r<28>; .reg .b64 %rd<27>; ld.param.u64 %rd6, [shiftedgecarryY_param_0]; ld.param.u64 %rd7, [shiftedgecarryY_param_1]; ld.param.u64 %rd8, [shiftedgecarryY_param_2]; ld.param.u64 %rd9, [shiftedgecarryY_param_3]; ld.param.u32 %r6, [shiftedgecarryY_param_4]; ld.param.u32 %r7, [shiftedgecarryY_param_5]; ld.param.u32 %r9, [shiftedgecarryY_param_6]; ld.param.u32 %r8, [shiftedgecarryY_param_7]; ld.param.f32 %f7, [shiftedgecarryY_param_8]; ld.param.f32 %f8, [shiftedgecarryY_param_9]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r14, %r13, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r17, %r16, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_11; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_7; setp.lt.s32 %p7, %r4, %r7; mul.lo.s32 %r5, %r3, %r7; @%p7 bra $L__BB0_6; bra.uni $L__BB0_3; $L__BB0_6: add.s32 
%r22, %r4, %r5; mad.lo.s32 %r23, %r22, %r6, %r1; mul.wide.s32 %rd16, %r23, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f13, [%rd17]; bra.uni $L__BB0_10; $L__BB0_7: mul.lo.s32 %r24, %r3, %r6; mad.lo.s32 %r25, %r24, %r7, %r1; cvt.s64.s32 %rd5, %r25; mul.wide.s32 %rd18, %r25, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; setp.neu.f32 %p11, %f13, 0f00000000; @%p11 bra $L__BB0_10; shl.b64 %rd20, %rd5, 2; add.s64 %rd21, %rd2, %rd20; ld.global.nc.f32 %f11, [%rd21]; setp.neu.f32 %p12, %f11, 0f00000000; @%p12 bra $L__BB0_10; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f12, [%rd23]; setp.eq.f32 %p13, %f12, 0f00000000; selp.f32 %f13, %f7, %f13, %p13; bra.uni $L__BB0_10; $L__BB0_3: add.s32 %r19, %r7, %r5; add.s32 %r20, %r19, -1; mad.lo.s32 %r21, %r20, %r6, %r1; cvt.s64.s32 %rd4, %r21; mul.wide.s32 %rd10, %r21, 4; add.s64 %rd11, %rd3, %rd10; ld.global.nc.f32 %f13, [%rd11]; setp.neu.f32 %p8, %f13, 0f00000000; @%p8 bra $L__BB0_10; shl.b64 %rd12, %rd4, 2; add.s64 %rd13, %rd2, %rd12; ld.global.nc.f32 %f9, [%rd13]; setp.neu.f32 %p9, %f9, 0f00000000; @%p9 bra $L__BB0_10; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f10, [%rd15]; setp.eq.f32 %p10, %f10, 0f00000000; selp.f32 %f13, %f8, %f13, %p10; $L__BB0_10: mad.lo.s32 %r26, %r3, %r7, %r2; mad.lo.s32 %r27, %r26, %r6, %r1; cvta.to.global.u64 %rd24, %rd6; mul.wide.s32 %rd25, %r27, 4; add.s64 %rd26, %rd24, %rd25; st.global.f32 [%rd26], %f13; $L__BB0_11: ret; } ` ) 3-3.11.1/cuda/shiftx.cu000066400000000000000000000014351503346766200145300ustar00rootroot00000000000000#include "stencil.h" // shift dst by shx cells (positive or negative) along X-axis. // new edge value is clampL at left edge (-X) or clampR at right edge (+X). 
extern "C" __global__ void shiftx(float* __restrict__ dst, float* __restrict__ src, int Nx, int Ny, int Nz, int shx, float clampL, float clampR) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if(ix < Nx && iy < Ny && iz < Nz) { int ix2 = ix-shx; float newval; if (ix2 < 0) { newval = clampL; } else if (ix2 >= Nx) { newval = clampR; } else { newval = src[idx(ix2, iy, iz)]; } dst[idx(ix, iy, iz)] = newval; } } 3-3.11.1/cuda/shiftx_wrapper.go000066400000000000000000000655011503346766200162720ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for shiftx kernel var shiftx_code cu.Function // Stores the arguments for shiftx kernel invocation type shiftx_args_t struct { arg_dst unsafe.Pointer arg_src unsafe.Pointer arg_Nx int arg_Ny int arg_Nz int arg_shx int arg_clampL float32 arg_clampR float32 argptr [8]unsafe.Pointer sync.Mutex } // Stores the arguments for shiftx kernel invocation var shiftx_args shiftx_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. shiftx_args.argptr[0] = unsafe.Pointer(&shiftx_args.arg_dst) shiftx_args.argptr[1] = unsafe.Pointer(&shiftx_args.arg_src) shiftx_args.argptr[2] = unsafe.Pointer(&shiftx_args.arg_Nx) shiftx_args.argptr[3] = unsafe.Pointer(&shiftx_args.arg_Ny) shiftx_args.argptr[4] = unsafe.Pointer(&shiftx_args.arg_Nz) shiftx_args.argptr[5] = unsafe.Pointer(&shiftx_args.arg_shx) shiftx_args.argptr[6] = unsafe.Pointer(&shiftx_args.arg_clampL) shiftx_args.argptr[7] = unsafe.Pointer(&shiftx_args.arg_clampR) } // Wrapper for shiftx CUDA kernel, asynchronous. 
func k_shiftx_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shx int, clampL float32, clampR float32, cfg *config) { if Synchronous { // debug Sync() timer.Start("shiftx") } shiftx_args.Lock() defer shiftx_args.Unlock() if shiftx_code == 0 { shiftx_code = fatbinLoad(shiftx_map, "shiftx") } shiftx_args.arg_dst = dst shiftx_args.arg_src = src shiftx_args.arg_Nx = Nx shiftx_args.arg_Ny = Ny shiftx_args.arg_Nz = Nz shiftx_args.arg_shx = shx shiftx_args.arg_clampL = clampL shiftx_args.arg_clampR = clampR args := shiftx_args.argptr[:] cu.LaunchKernel(shiftx_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("shiftx") } } // maps compute capability on PTX code for shiftx kernel. var shiftx_map = map[int]string{0: "", 50: shiftx_ptx_50, 52: shiftx_ptx_52, 53: shiftx_ptx_53, 60: shiftx_ptx_60, 61: shiftx_ptx_61, 62: shiftx_ptx_62, 70: shiftx_ptx_70, 72: shiftx_ptx_72, 75: shiftx_ptx_75, 80: shiftx_ptx_80, 86: shiftx_ptx_86, 87: shiftx_ptx_87, 89: shiftx_ptx_89, 90: shiftx_ptx_90} // shiftx PTX code for various compute capabilities. 
const ( shiftx_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftx_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 
) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftx_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; 
ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftx_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, 
%r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftx_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r2; 
mad.lo.s32 %r19, %r18, %r5, %r4; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftx_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], 
%f5; $L__BB0_5: ret; } ` shiftx_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftx_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param 
.f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftx_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, 
[shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftx_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; 
mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftx_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, 
%r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftx_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; 
st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftx_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftx_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 
shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` ) 3-3.11.1/cuda/shifty.cu000066400000000000000000000014351503346766200145310ustar00rootroot00000000000000#include "stencil.h" // shift dst by shy cells (positive or negative) along Y-axis. // new edge value is clampD at bottom edge (-Y) or clampU at top edge (+Y). 
extern "C" __global__ void shifty(float* __restrict__ dst, float* __restrict__ src, int Nx, int Ny, int Nz, int shy, float clampD, float clampU) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if(ix < Nx && iy < Ny && iz < Nz) { int iy2 = iy-shy; float newval; if (iy2 < 0) { newval = clampD; } else if (iy2 >= Ny) { newval = clampU; } else { newval = src[idx(ix, iy2, iz)]; } dst[idx(ix, iy, iz)] = newval; } } 3-3.11.1/cuda/shifty_wrapper.go000066400000000000000000000655011503346766200162730ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for shifty kernel var shifty_code cu.Function // Stores the arguments for shifty kernel invocation type shifty_args_t struct { arg_dst unsafe.Pointer arg_src unsafe.Pointer arg_Nx int arg_Ny int arg_Nz int arg_shy int arg_clampD float32 arg_clampU float32 argptr [8]unsafe.Pointer sync.Mutex } // Stores the arguments for shifty kernel invocation var shifty_args shifty_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. shifty_args.argptr[0] = unsafe.Pointer(&shifty_args.arg_dst) shifty_args.argptr[1] = unsafe.Pointer(&shifty_args.arg_src) shifty_args.argptr[2] = unsafe.Pointer(&shifty_args.arg_Nx) shifty_args.argptr[3] = unsafe.Pointer(&shifty_args.arg_Ny) shifty_args.argptr[4] = unsafe.Pointer(&shifty_args.arg_Nz) shifty_args.argptr[5] = unsafe.Pointer(&shifty_args.arg_shy) shifty_args.argptr[6] = unsafe.Pointer(&shifty_args.arg_clampD) shifty_args.argptr[7] = unsafe.Pointer(&shifty_args.arg_clampU) } // Wrapper for shifty CUDA kernel, asynchronous. 
func k_shifty_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shy int, clampD float32, clampU float32, cfg *config) { if Synchronous { // debug Sync() timer.Start("shifty") } shifty_args.Lock() defer shifty_args.Unlock() if shifty_code == 0 { shifty_code = fatbinLoad(shifty_map, "shifty") } shifty_args.arg_dst = dst shifty_args.arg_src = src shifty_args.arg_Nx = Nx shifty_args.arg_Ny = Ny shifty_args.arg_Nz = Nz shifty_args.arg_shy = shy shifty_args.arg_clampD = clampD shifty_args.arg_clampU = clampU args := shifty_args.argptr[:] cu.LaunchKernel(shifty_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("shifty") } } // maps compute capability on PTX code for shifty kernel. var shifty_map = map[int]string{0: "", 50: shifty_ptx_50, 52: shifty_ptx_52, 53: shifty_ptx_53, 60: shifty_ptx_60, 61: shifty_ptx_61, 62: shifty_ptx_62, 70: shifty_ptx_70, 72: shifty_ptx_72, 75: shifty_ptx_75, 80: shifty_ptx_80, 86: shifty_ptx_86, 87: shifty_ptx_87, 89: shifty_ptx_89, 90: shifty_ptx_90} // shifty PTX code for various compute capabilities. 
const ( shifty_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shifty_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 
) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shifty_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; 
ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shifty_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, 
%r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shifty_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r4; 
mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shifty_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], 
%f5; $L__BB0_5: ret; } ` shifty_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shifty_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param 
.f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shifty_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, 
[shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shifty_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; 
mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shifty_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, 
%r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shifty_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; 
st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shifty_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shifty_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 
shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` ) 3-3.11.1/cuda/shiftz.cu000066400000000000000000000014351503346766200145320ustar00rootroot00000000000000#include "stencil.h" // shift dst by shz cells (positive or negative) along Z-axis. // new edge value is clampB at back edge (-Z) or clampF at front edge (+Z). 
extern "C" __global__ void shiftz(float* __restrict__ dst, float* __restrict__ src, int Nx, int Ny, int Nz, int shz, float clampB, float clampF) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if(ix < Nx && iy < Ny && iz < Nz) { int iz2 = iz-shz; float newval; if (iz2 < 0) { newval = clampB; } else if (iz2 >= Nz) { newval = clampF; } else { newval = src[idx(ix, iy, iz2)]; } dst[idx(ix, iy, iz)] = newval; } } 3-3.11.1/cuda/shiftz_wrapper.go000066400000000000000000000655011503346766200162740ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for shiftz kernel var shiftz_code cu.Function // Stores the arguments for shiftz kernel invocation type shiftz_args_t struct { arg_dst unsafe.Pointer arg_src unsafe.Pointer arg_Nx int arg_Ny int arg_Nz int arg_shz int arg_clampB float32 arg_clampF float32 argptr [8]unsafe.Pointer sync.Mutex } // Stores the arguments for shiftz kernel invocation var shiftz_args shiftz_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. shiftz_args.argptr[0] = unsafe.Pointer(&shiftz_args.arg_dst) shiftz_args.argptr[1] = unsafe.Pointer(&shiftz_args.arg_src) shiftz_args.argptr[2] = unsafe.Pointer(&shiftz_args.arg_Nx) shiftz_args.argptr[3] = unsafe.Pointer(&shiftz_args.arg_Ny) shiftz_args.argptr[4] = unsafe.Pointer(&shiftz_args.arg_Nz) shiftz_args.argptr[5] = unsafe.Pointer(&shiftz_args.arg_shz) shiftz_args.argptr[6] = unsafe.Pointer(&shiftz_args.arg_clampB) shiftz_args.argptr[7] = unsafe.Pointer(&shiftz_args.arg_clampF) } // Wrapper for shiftz CUDA kernel, asynchronous. 
func k_shiftz_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shz int, clampB float32, clampF float32, cfg *config) { if Synchronous { // debug Sync() timer.Start("shiftz") } shiftz_args.Lock() defer shiftz_args.Unlock() if shiftz_code == 0 { shiftz_code = fatbinLoad(shiftz_map, "shiftz") } shiftz_args.arg_dst = dst shiftz_args.arg_src = src shiftz_args.arg_Nx = Nx shiftz_args.arg_Ny = Ny shiftz_args.arg_Nz = Nz shiftz_args.arg_shz = shz shiftz_args.arg_clampB = clampB shiftz_args.arg_clampF = clampF args := shiftz_args.argptr[:] cu.LaunchKernel(shiftz_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("shiftz") } } // maps compute capability on PTX code for shiftz kernel. var shiftz_map = map[int]string{0: "", 50: shiftz_ptx_50, 52: shiftz_ptx_52, 53: shiftz_ptx_53, 60: shiftz_ptx_60, 61: shiftz_ptx_61, 62: shiftz_ptx_62, 70: shiftz_ptx_70, 72: shiftz_ptx_72, 75: shiftz_ptx_75, 80: shiftz_ptx_80, 86: shiftz_ptx_86, 87: shiftz_ptx_87, 89: shiftz_ptx_89, 90: shiftz_ptx_90} // shiftz PTX code for various compute capabilities. 
const ( shiftz_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftz_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 
) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftz_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; 
ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftz_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, 
%r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftz_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r4, %r6, %r2; 
mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftz_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], 
%f5; $L__BB0_5: ret; } ` shiftz_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftz_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param 
.f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftz_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, 
[shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftz_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; 
mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftz_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, 
%r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftz_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; 
st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftz_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` shiftz_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 
shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r13, %r12, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r16, %r15, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra $L__BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra $L__BB0_4; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; cvta.to.global.u64 %rd3, %rd2; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; $L__BB0_4: mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; $L__BB0_5: ret; } ` ) 3-3.11.1/cuda/slice.go000066400000000000000000000056721503346766200143270ustar00rootroot00000000000000package cuda import ( "math" "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/data" "github.com/mumax/3/timer" "github.com/mumax/3/util" ) // Make a GPU Slice with nComp components each of size length. func NewSlice(nComp int, size [3]int) *data.Slice { return newSlice(nComp, size, MemAlloc, data.GPUMemory) } // Make a GPU Slice with nComp components each of size length. 
//func NewUnifiedSlice(nComp int, m *data.Mesh) *data.Slice {
//	return newSlice(nComp, m, cu.MemAllocHost, data.UnifiedMemory)
//}

// newSlice allocates nComp buffers of prod(size) float32 elements each with
// alloc, fills every buffer with zeros, and wraps the buffers in a data.Slice
// of the given size tagged with memType.
// data.EnableGPU is called on every invocation to hand the free and copy
// wrappers below to package data.
func newSlice(nComp int, size [3]int, alloc func(int64) unsafe.Pointer, memType int8) *data.Slice {
	data.EnableGPU(memFree, cu.MemFreeHost, MemCpy, MemCpyDtoH, MemCpyHtoD)
	length := prod(size)
	bytes := int64(length) * cu.SIZEOF_FLOAT32
	ptrs := make([]unsafe.Pointer, nComp)
	for c := range ptrs {
		ptrs[c] = unsafe.Pointer(alloc(bytes))
		// alloc takes a size in bytes, MemsetD32 a count of 32-bit words.
		cu.MemsetD32(cu.DevicePtr(uintptr(ptrs[c])), 0, int64(length))
	}
	return data.SliceFromPtrs(size, memType, ptrs)
}

// wrappers for data.EnableGPU arguments

// memFree releases the device allocation at ptr with cu.MemFree.
func memFree(ptr unsafe.Pointer) { cu.MemFree(cu.DevicePtr(uintptr(ptr))) }

// MemCpyDtoH copies bytes bytes from the device pointer src to dst with
// cu.MemcpyDtoH. Sync is called before and after the copy, and the copy is
// timed under "memcpyDtoH".
func MemCpyDtoH(dst, src unsafe.Pointer, bytes int64) {
	Sync() // sync previous kernels
	timer.Start("memcpyDtoH")
	cu.MemcpyDtoH(dst, cu.DevicePtr(uintptr(src)), bytes)
	Sync() // sync copy
	timer.Stop("memcpyDtoH")
}

// MemCpyHtoD copies bytes bytes from src to the device pointer dst with
// cu.MemcpyHtoD. Sync is called before and after the copy, and the copy is
// timed under "memcpyHtoD".
func MemCpyHtoD(dst, src unsafe.Pointer, bytes int64) {
	Sync() // sync previous kernels
	timer.Start("memcpyHtoD")
	cu.MemcpyHtoD(cu.DevicePtr(uintptr(dst)), src, bytes)
	Sync() // sync copy
	timer.Stop("memcpyHtoD")
}

// MemCpy copies bytes bytes between two device pointers. The copy is issued
// with cu.MemcpyAsync on stream0 and is bracketed by Sync calls; it is timed
// under "memcpy".
func MemCpy(dst, src unsafe.Pointer, bytes int64) {
	Sync()
	timer.Start("memcpy")
	cu.MemcpyAsync(cu.DevicePtr(uintptr(dst)), cu.DevicePtr(uintptr(src)), bytes, stream0)
	Sync()
	timer.Stop("memcpy")
}

// Memset sets the Slice's components to the specified values:
// component c is filled with val[c], so exactly one value per component
// must be given (checked with util.Argument).
// The fills are issued with cu.MemsetD32Async on stream0; Sync is only
// called when Synchronous is set.
// Use with care on unified slices (a sync is needed).
func Memset(s *data.Slice, val ...float32) {
	if Synchronous { // debug
		Sync()
		timer.Start("memset")
	}
	util.Argument(len(val) == s.NComp())
	for c, v := range val {
		// MemsetD32Async takes the raw 32-bit pattern of the float value.
		cu.MemsetD32Async(cu.DevicePtr(uintptr(s.DevPtr(c))), math.Float32bits(v), int64(s.Len()), stream0)
	}
	if Synchronous { //debug
		Sync()
		timer.Stop("memset")
	}
}

// Set all elements of all components to zero.
func Zero(s *data.Slice) {
	Memset(s, make([]float32, s.NComp())...)
}

// SetCell writes value to component comp of s at cell (ix, iy, iz).
// The cell coordinates are converted to an element index with s.Index.
func SetCell(s *data.Slice, comp int, ix, iy, iz int, value float32) {
	SetElem(s, comp, s.Index(ix, iy, iz), value)
}

// SetElem writes value to element index of component comp of s by copying a
// single float32 with MemCpyHtoD (which calls Sync before and after).
func SetElem(s *data.Slice, comp int, index int, value float32) {
	f := value
	// Address of the element: component base plus index float32s.
	dst := unsafe.Pointer(uintptr(s.DevPtr(comp)) + uintptr(index)*cu.SIZEOF_FLOAT32)
	MemCpyHtoD(dst, unsafe.Pointer(&f), cu.SIZEOF_FLOAT32)
}

// GetElem returns element index of component comp of s by copying a single
// float32 with MemCpyDtoH (which calls Sync before and after).
func GetElem(s *data.Slice, comp int, index int) float32 {
	var f float32
	// Address of the element: component base plus index float32s.
	src := unsafe.Pointer(uintptr(s.DevPtr(comp)) + uintptr(index)*cu.SIZEOF_FLOAT32)
	MemCpyDtoH(unsafe.Pointer(&f), src, cu.SIZEOF_FLOAT32)
	return f
}

// GetCell returns the value of component comp of s at cell (ix, iy, iz).
// The cell coordinates are converted to an element index with s.Index.
func GetCell(s *data.Slice, comp, ix, iy, iz int) float32 {
	return GetElem(s, comp, s.Index(ix, iy, iz))
}
3-3.11.1/cuda/slice_test.go000066400000000000000000000031751503346766200153620ustar00rootroot00000000000000package cuda

import (
	"testing"

	"github.com/mumax/3/data"
)

// TestSlice checks GPUAccess, Len and NComp of a 3-component slice and of
// the single-component view returned by Comp(1), and that the view reports
// the same Size as the slice it was taken from.
func TestSlice(t *testing.T) {
	N0, N1, N2 := 2, 4, 8
	m := [3]int{N0, N1, N2}
	N := N0 * N1 * N2
	a := NewSlice(3, m)
	defer a.Free()
	Memset(a, 1, 2, 3)

	if a.GPUAccess() == false {
		t.Fail()
	}
	if a.Len() != N {
		t.Fail()
	}
	if a.NComp() != 3 {
		t.Fail()
	}

	b := a.Comp(1)
	if b.GPUAccess() == false {
		t.Error("b.GPUAccess", b.GPUAccess())
	}
	if b.Len() != N {
		t.Error("b.Len", b.Len())
	}
	if b.NComp() != 1 {
		t.Error("b.NComp", b.NComp())
	}
	if b.Size() != a.Size() {
		t.Fail()
	}
}

// TestCpy copies a ramp of values through a chain of slices with data.Copy
// (hs -> d -> d2 -> h2) and checks that the values read back from h2 equal
// the original ramp.
// NOTE(review): d and d2 are never freed in this test.
func TestCpy(t *testing.T) {
	N0, N1, N2 := 2, 4, 32
	N := N0 * N1 * N2
	mesh := [3]int{N0, N1, N2}

	h1 := make([]float32, N)
	for i := range h1 {
		h1[i] = float32(i)
	}
	// sliceFromList is defined outside this chunk; presumably it wraps h1 in a host slice.
	hs := sliceFromList([][]float32{h1}, mesh)

	d := NewSlice(1, mesh)
	data.Copy(d, hs)

	d2 := NewSlice(1, mesh)
	data.Copy(d2, d)

	h2 := data.NewSlice(1, mesh)
	data.Copy(h2, d2)

	res := h2.Host()[0]
	for i := range res {
		if res[i] != h1[i] {
			t.Fail()
		}
	}
}

// TestSliceFree repeatedly allocates and frees a 2-component slice of
// 128x1024x1024 cells, then frees one slice twice to exercise a double Free.
func TestSliceFree(t *testing.T) {
	N0, N1, N2 := 128, 1024, 1024
	m := [3]int{N0, N1, N2}
	N := 17 // not freeing would attempt to allocate 17GB.
	for i := 0; i < N; i++ {
		a := NewSlice(2, m)
		a.Free()
	}
	a := NewSlice(2, m)
	a.Free()
	a.Free() // test double-free
}

// TestSliceHost checks that a freshly made slice reads back as zeros through
// HostCopy, and that after Memset(a, 1, 2, 3) each component reads back its
// own value.
func TestSliceHost(t *testing.T) {
	N0, N1, N2 := 1, 10, 10
	m := [3]int{N0, N1, N2}
	a := NewSlice(3, m)
	defer a.Free()

	b := a.HostCopy().Host()
	if b[0][0] != 0 || b[1][42] != 0 || b[2][99] != 0 {
		t.Error("slice not inited to zero")
	}

	Memset(a, 1, 2, 3)
	b = a.HostCopy().Host()
	if b[0][0] != 1 || b[1][42] != 2 || b[2][99] != 3 {
		t.Error("slice memset")
	}
}
3-3.11.1/cuda/slonczewski.go000066400000000000000000000015431503346766200155740ustar00rootroot00000000000000package cuda

import (
	"github.com/mumax/3/data"
)

// Add Slonczewski ST torque to torque (Tesla).
// see slonczewski2.cu
//
// The kernel is launched over torque.Len() elements with the configuration
// returned by make1DConf. Every material parameter is passed as a
// (device pointer, multiplier) pair. Only the Z component of J is passed on.
// flp is forwarded as the kernel's freeLayerPosition argument, and the Z
// extent of mesh.WorldSize() as its meshThickness argument.
func AddSlonczewskiTorque2(torque, m *data.Slice, Msat, J, fixedP, alpha, pol, λ, ε_prime MSlice, thickness MSlice, flp float64, mesh *data.Mesh) {
	N := torque.Len()
	cfg := make1DConf(N)
	meshThickness := mesh.WorldSize()[Z]

	k_addslonczewskitorque2_async(
		torque.DevPtr(X), torque.DevPtr(Y), torque.DevPtr(Z),
		m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z),
		Msat.DevPtr(0), Msat.Mul(0),
		J.DevPtr(Z), J.Mul(Z),
		fixedP.DevPtr(X), fixedP.Mul(X),
		fixedP.DevPtr(Y), fixedP.Mul(Y),
		fixedP.DevPtr(Z), fixedP.Mul(Z),
		alpha.DevPtr(0), alpha.Mul(0),
		pol.DevPtr(0), pol.Mul(0),
		λ.DevPtr(0), λ.Mul(0),
		ε_prime.DevPtr(0), ε_prime.Mul(0),
		thickness.DevPtr(0), thickness.Mul(0),
		float32(meshThickness),
		float32(flp),
		N, cfg)
}
3-3.11.1/cuda/slonczewski2.cu000066400000000000000000000054341503346766200156630ustar00rootroot00000000000000// Original implementation by Mykola Dvornik for mumax2
// Modified for mumax3 by Arne Vansteenkiste, 2013, 2016

// NOTE(review): the first #include below has no header name in this copy
// (possibly an angle-bracket include lost in extraction) — confirm against upstream.
#include
#include "float3.h"
#include "constants.h"
#include "amul.h"

// addslonczewskitorque2 adds, for each of the N cells, a torque term built
// from p x m and m x (p x m) to (tx, ty, tz), where m is the cell's
// magnetization and p the normalized fixed-layer polarization.
// Every material parameter arrives as a (pointer, multiplier) pair that is
// combined through amul/vmul from amul.h (not visible in this chunk;
// presumably array value times multiplier — confirm).
// Cells where the current density J or Ms evaluates to zero are left
// untouched.
extern "C" __global__ void
addslonczewskitorque2(float* __restrict__ tx, float* __restrict__ ty, float* __restrict__ tz,
                      float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
                      float* __restrict__ Ms_, float Ms_mul,
                      float* __restrict__ jz_, float jz_mul,
                      float* __restrict__ px_, float px_mul,
                      float* __restrict__ py_, float py_mul,
                      float* __restrict__ pz_, float pz_mul,
                      float* __restrict__ alpha_, float alpha_mul,
                      float* __restrict__ pol_, float pol_mul,
                      float* __restrict__ lambda_, float lambda_mul,
                      float* __restrict__ epsPrime_, float epsPrime_mul,
                      float* __restrict__ thickness_, float thickness_mul,
                      float meshThickness,
                      float freeLayerPosition,
                      int N) {

    // Linear cell index: the (x, y) block grid is flattened, blocks are 1D.
    int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x;
    if (i < N) {

        float3 m = make_float3(mx[i], my[i], mz[i]);
        float J = amul(jz_, jz_mul, i);
        float3 p = normalized(vmul(px_, py_, pz_, px_mul, py_mul, pz_mul, i));
        float Ms = amul(Ms_, Ms_mul, i);
        float alpha = amul(alpha_, alpha_mul, i);
        float pol = amul(pol_, pol_mul, i);
        float lambda = amul(lambda_, lambda_mul, i);
        float epsilonPrime = amul(epsPrime_, epsPrime_mul, i);

        float thickness = amul(thickness_, thickness_mul, i);

        if (thickness == 0.0) { // if thickness is not set, use the thickness of the mesh instead
            thickness = meshThickness;
        }
        thickness *= freeLayerPosition; // switch sign if fixedlayer is at the bottom

        // Nothing to add without current; Ms == 0 would also divide by zero below.
        if (J == 0.0f || Ms == 0.0f) {
            return;
        }

        // HBAR and QE come from constants.h (presumably the reduced Planck
        // constant and the electron charge — confirm).
        float beta = (HBAR / QE) * (J / (thickness*Ms) );
        float lambda2 = lambda * lambda;
        // epsilon = pol*lambda^2 / ((lambda^2 + 1) + (lambda^2 - 1) * (p . m))
        float epsilon = pol * lambda2 / ((lambda2 + 1.0f) + (lambda2 - 1.0f) * dot(p, m));

        float A = beta * epsilon;
        float B = beta * epsilonPrime;

        // Prefactors of the two torque terms, both scaled by 1/(1 + alpha^2).
        float gilb = 1.0f / (1.0f + alpha * alpha);
        float mxpxmFac = gilb * (A + alpha * B);
        float pxmFac = gilb * (B - alpha * A);

        float3 pxm = cross(p, m);
        float3 mxpxm = cross(m, pxm);

        // Accumulate into the existing torque rather than overwriting it.
        tx[i] += mxpxmFac * mxpxm.x + pxmFac * pxm.x;
        ty[i] += mxpxmFac * mxpxm.y + pxmFac * pxm.y;
        tz[i] += mxpxmFac * mxpxm.z + pxmFac * pxm.z;
    }
}
3-3.11.1/cuda/slonczewski2_wrapper.go000066400000000000000000003674621503346766200174350ustar00rootroot00000000000000package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// Handle of the loaded addslonczewskitorque2 kernel.
// It stays zero until the first launch loads the kernel code.
var addslonczewskitorque2_code cu.Function

// addslonczewskitorque2_args_t holds one value per kernel argument, plus the
// array of pointers to those values that is handed to the kernel launch.
// The embedded mutex serializes use of the single shared instance.
type addslonczewskitorque2_args_t struct {
	// output torque and input magnetization components
	arg_tx, arg_ty, arg_tz unsafe.Pointer
	arg_mx, arg_my, arg_mz unsafe.Pointer

	// per-quantity pointer + multiplier pairs
	arg_Ms_            unsafe.Pointer
	arg_Ms_mul         float32
	arg_jz_            unsafe.Pointer
	arg_jz_mul         float32
	arg_px_            unsafe.Pointer
	arg_px_mul         float32
	arg_py_            unsafe.Pointer
	arg_py_mul         float32
	arg_pz_            unsafe.Pointer
	arg_pz_mul         float32
	arg_alpha_         unsafe.Pointer
	arg_alpha_mul      float32
	arg_pol_           unsafe.Pointer
	arg_pol_mul        float32
	arg_lambda_        unsafe.Pointer
	arg_lambda_mul     float32
	arg_epsPrime_      unsafe.Pointer
	arg_epsPrime_mul   float32
	arg_thickness_     unsafe.Pointer
	arg_thickness_mul  float32

	// scalar arguments
	arg_meshThickness     float32
	arg_freeLayerPosition float32
	arg_N                 int

	// addresses of the 29 fields above, in declaration order
	argptr [29]unsafe.Pointer
	sync.Mutex
}

// The single, shared argument buffer used for every addslonczewskitorque2 launch.
var addslonczewskitorque2_args addslonczewskitorque2_args_t

func init() {
	// The kernel launch takes pointers to the argument values. The values live
	// in a package-level struct, so the pointers are filled in only once here.
	a := &addslonczewskitorque2_args
	a.argptr = [29]unsafe.Pointer{
		0:  unsafe.Pointer(&a.arg_tx),
		1:  unsafe.Pointer(&a.arg_ty),
		2:  unsafe.Pointer(&a.arg_tz),
		3:  unsafe.Pointer(&a.arg_mx),
		4:  unsafe.Pointer(&a.arg_my),
		5:  unsafe.Pointer(&a.arg_mz),
		6:  unsafe.Pointer(&a.arg_Ms_),
		7:  unsafe.Pointer(&a.arg_Ms_mul),
		8:  unsafe.Pointer(&a.arg_jz_),
		9:  unsafe.Pointer(&a.arg_jz_mul),
		10: unsafe.Pointer(&a.arg_px_),
		11: unsafe.Pointer(&a.arg_px_mul),
		12: unsafe.Pointer(&a.arg_py_),
		13: unsafe.Pointer(&a.arg_py_mul),
		14: unsafe.Pointer(&a.arg_pz_),
		15: unsafe.Pointer(&a.arg_pz_mul),
		16: unsafe.Pointer(&a.arg_alpha_),
		17: unsafe.Pointer(&a.arg_alpha_mul),
		18: unsafe.Pointer(&a.arg_pol_),
		19: unsafe.Pointer(&a.arg_pol_mul),
		20: unsafe.Pointer(&a.arg_lambda_),
		21: unsafe.Pointer(&a.arg_lambda_mul),
		22: unsafe.Pointer(&a.arg_epsPrime_),
		23: unsafe.Pointer(&a.arg_epsPrime_mul),
		24: unsafe.Pointer(&a.arg_thickness_),
		25: unsafe.Pointer(&a.arg_thickness_mul),
		26: unsafe.Pointer(&a.arg_meshThickness),
		27: unsafe.Pointer(&a.arg_freeLayerPosition),
		28: unsafe.Pointer(&a.arg_N),
	}
}

// k_addslonczewskitorque2_async launches the addslonczewskitorque2 CUDA kernel
// on stream0 with the grid and block dimensions taken from cfg.
// The arguments are copied into the shared argument buffer under its lock,
// and the kernel code is loaded on first use. When Synchronous is set (debug),
// the launch is bracketed by Sync() calls and timed under the kernel's name.
func k_addslonczewskitorque2_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, jz_ unsafe.Pointer, jz_mul float32, px_ unsafe.Pointer, px_mul float32, py_ unsafe.Pointer, py_mul float32, pz_ unsafe.Pointer, pz_mul float32, alpha_ unsafe.Pointer, alpha_mul float32, pol_ unsafe.Pointer, pol_mul float32, lambda_ unsafe.Pointer, lambda_mul float32, epsPrime_ unsafe.Pointer, epsPrime_mul float32, thickness_ unsafe.Pointer, thickness_mul float32, meshThickness float32, freeLayerPosition float32, N int, cfg *config) {
	const kernel = "addslonczewskitorque2"

	if Synchronous { // debug
		Sync()
		timer.Start(kernel)
	}

	a := &addslonczewskitorque2_args
	a.Lock()
	defer a.Unlock()

	// load the kernel code the first time it is needed
	if addslonczewskitorque2_code == 0 {
		addslonczewskitorque2_code = fatbinLoad(addslonczewskitorque2_map, kernel)
	}

	// torque and magnetization
	a.arg_tx, a.arg_ty, a.arg_tz = tx, ty, tz
	a.arg_mx, a.arg_my, a.arg_mz = mx, my, mz

	// pointer + multiplier pairs
	a.arg_Ms_, a.arg_Ms_mul = Ms_, Ms_mul
	a.arg_jz_, a.arg_jz_mul = jz_, jz_mul
	a.arg_px_, a.arg_px_mul = px_, px_mul
	a.arg_py_, a.arg_py_mul = py_, py_mul
	a.arg_pz_, a.arg_pz_mul = pz_, pz_mul
	a.arg_alpha_, a.arg_alpha_mul = alpha_, alpha_mul
	a.arg_pol_, a.arg_pol_mul = pol_, pol_mul
	a.arg_lambda_, a.arg_lambda_mul = lambda_, lambda_mul
	a.arg_epsPrime_, a.arg_epsPrime_mul = epsPrime_, epsPrime_mul
	a.arg_thickness_, a.arg_thickness_mul = thickness_, thickness_mul

	// scalars
	a.arg_meshThickness = meshThickness
	a.arg_freeLayerPosition = freeLayerPosition
	a.arg_N = N

	cu.LaunchKernel(addslonczewskitorque2_code,
		cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z,
		cfg.Block.X, cfg.Block.Y, cfg.Block.Z,
		0, stream0, a.argptr[:])

	if Synchronous { // debug
		Sync()
		timer.Stop(kernel)
	}
}

// maps compute capability on PTX code for addslonczewskitorque2 kernel.
var addslonczewskitorque2_map = map[int]string{0: "", 50: addslonczewskitorque2_ptx_50, 52: addslonczewskitorque2_ptx_52, 53: addslonczewskitorque2_ptx_53, 60: addslonczewskitorque2_ptx_60, 61: addslonczewskitorque2_ptx_61, 62: addslonczewskitorque2_ptx_62, 70: addslonczewskitorque2_ptx_70, 72: addslonczewskitorque2_ptx_72, 75: addslonczewskitorque2_ptx_75, 80: addslonczewskitorque2_ptx_80, 86: addslonczewskitorque2_ptx_86, 87: addslonczewskitorque2_ptx_87, 89: addslonczewskitorque2_ptx_89, 90: addslonczewskitorque2_ptx_90} // addslonczewskitorque2 PTX code for various compute capabilities. const ( addslonczewskitorque2_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 
addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<9>; .reg .f64 %fd<3>; .reg .b64 %rd<62>; ld.param.u64 %rd2, [addslonczewskitorque2_param_0]; ld.param.u64 %rd3, [addslonczewskitorque2_param_1]; ld.param.u64 %rd4, [addslonczewskitorque2_param_2]; ld.param.u64 %rd5, [addslonczewskitorque2_param_3]; ld.param.u64 %rd6, [addslonczewskitorque2_param_4]; ld.param.u64 %rd7, [addslonczewskitorque2_param_5]; ld.param.u64 %rd8, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd9, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd10, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd11, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd12, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd13, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd14, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd15, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd16, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd17, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_25; cvta.to.global.u64 %rd18, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd19, %r1, 4; add.s64 %rd20, %rd18, 
%rd19; ld.global.nc.f32 %f1, [%rd20]; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd19; ld.global.nc.f32 %f2, [%rd22]; cvta.to.global.u64 %rd23, %rd7; add.s64 %rd24, %rd23, %rd19; ld.global.nc.f32 %f3, [%rd24]; setp.eq.s64 %p2, %rd9, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd9; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f42, [%rd27]; mul.f32 %f109, %f42, %f109; $L__BB0_3: setp.eq.s64 %p3, %rd10, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd28, %rd10; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f43, [%rd30]; mul.f32 %f110, %f43, %f110; $L__BB0_5: setp.eq.s64 %p4, %rd11, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd31, %rd11; shl.b64 %rd32, %rd1, 2; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f44, [%rd33]; mul.f32 %f111, %f44, %f111; $L__BB0_7: setp.eq.s64 %p5, %rd12, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd34, %rd12; shl.b64 %rd35, %rd1, 2; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f45, [%rd36]; mul.f32 %f112, %f45, %f112; $L__BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; setp.eq.f32 %p6, %f12, 0f00000000; mov.f32 %f113, 0f00000000; @%p6 bra $L__BB0_11; rcp.rn.f32 %f113, %f12; $L__BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd8, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd37, %rd8; shl.b64 %rd38, %rd1, 2; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f50, [%rd39]; mul.f32 %f114, %f50, %f114; $L__BB0_13: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd40, %rd13; shl.b64 %rd41, %rd1, 2; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f51, [%rd42]; mul.f32 %f115, %f51, %f115; $L__BB0_15: setp.eq.s64 %p9, %rd14, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd43, %rd14; shl.b64 %rd44, %rd1, 2; add.s64 %rd45, %rd43, %rd44; ld.global.nc.f32 %f52, [%rd45]; mul.f32 %f116, %f52, %f116; $L__BB0_17: setp.eq.s64 %p10, 
%rd15, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd46, %rd15; shl.b64 %rd47, %rd1, 2; add.s64 %rd48, %rd46, %rd47; ld.global.nc.f32 %f53, [%rd48]; mul.f32 %f117, %f53, %f117; $L__BB0_19: setp.eq.s64 %p11, %rd16, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd49, %rd16; shl.b64 %rd50, %rd1, 2; add.s64 %rd51, %rd49, %rd50; ld.global.nc.f32 %f54, [%rd51]; mul.f32 %f118, %f54, %f118; $L__BB0_21: setp.eq.s64 %p12, %rd17, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd52, %rd17; shl.b64 %rd53, %rd1, 2; add.s64 %rd54, %rd52, %rd53; ld.global.nc.f32 %f55, [%rd54]; mul.f32 %f119, %f55, %f119; $L__BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra $L__BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; 
fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd55, %rd2; shl.b64 %rd56, %rd1, 2; add.s64 %rd57, %rd55, %rd56; ld.global.f32 %f99, [%rd57]; add.f32 %f100, %f98, %f99; st.global.f32 [%rd57], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd56; ld.global.f32 %f103, [%rd59]; add.f32 %f104, %f102, %f103; st.global.f32 [%rd59], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd60, %rd4; add.s64 %rd61, %rd60, %rd56; ld.global.f32 %f107, [%rd61]; add.f32 %f108, %f106, %f107; st.global.f32 [%rd61], %f108; $L__BB0_25: ret; } ` addslonczewskitorque2_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 
addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<9>; .reg .f64 %fd<3>; .reg .b64 %rd<62>; ld.param.u64 %rd2, [addslonczewskitorque2_param_0]; ld.param.u64 %rd3, [addslonczewskitorque2_param_1]; ld.param.u64 %rd4, [addslonczewskitorque2_param_2]; ld.param.u64 %rd5, [addslonczewskitorque2_param_3]; ld.param.u64 %rd6, [addslonczewskitorque2_param_4]; ld.param.u64 %rd7, [addslonczewskitorque2_param_5]; ld.param.u64 %rd8, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd9, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd10, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd11, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd12, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd13, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd14, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd15, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd16, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd17, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_25; cvta.to.global.u64 %rd18, %rd5; cvt.s64.s32 %rd1, %r1; 
mul.wide.s32 %rd19, %r1, 4; add.s64 %rd20, %rd18, %rd19; ld.global.nc.f32 %f1, [%rd20]; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd19; ld.global.nc.f32 %f2, [%rd22]; cvta.to.global.u64 %rd23, %rd7; add.s64 %rd24, %rd23, %rd19; ld.global.nc.f32 %f3, [%rd24]; setp.eq.s64 %p2, %rd9, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd9; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f42, [%rd27]; mul.f32 %f109, %f42, %f109; $L__BB0_3: setp.eq.s64 %p3, %rd10, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd28, %rd10; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f43, [%rd30]; mul.f32 %f110, %f43, %f110; $L__BB0_5: setp.eq.s64 %p4, %rd11, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd31, %rd11; shl.b64 %rd32, %rd1, 2; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f44, [%rd33]; mul.f32 %f111, %f44, %f111; $L__BB0_7: setp.eq.s64 %p5, %rd12, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd34, %rd12; shl.b64 %rd35, %rd1, 2; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f45, [%rd36]; mul.f32 %f112, %f45, %f112; $L__BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; setp.eq.f32 %p6, %f12, 0f00000000; mov.f32 %f113, 0f00000000; @%p6 bra $L__BB0_11; rcp.rn.f32 %f113, %f12; $L__BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd8, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd37, %rd8; shl.b64 %rd38, %rd1, 2; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f50, [%rd39]; mul.f32 %f114, %f50, %f114; $L__BB0_13: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd40, %rd13; shl.b64 %rd41, %rd1, 2; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f51, [%rd42]; mul.f32 %f115, %f51, %f115; $L__BB0_15: setp.eq.s64 %p9, %rd14, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd43, %rd14; shl.b64 %rd44, %rd1, 2; add.s64 %rd45, %rd43, %rd44; ld.global.nc.f32 %f52, [%rd45]; mul.f32 
%f116, %f52, %f116; $L__BB0_17: setp.eq.s64 %p10, %rd15, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd46, %rd15; shl.b64 %rd47, %rd1, 2; add.s64 %rd48, %rd46, %rd47; ld.global.nc.f32 %f53, [%rd48]; mul.f32 %f117, %f53, %f117; $L__BB0_19: setp.eq.s64 %p11, %rd16, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd49, %rd16; shl.b64 %rd50, %rd1, 2; add.s64 %rd51, %rd49, %rd50; ld.global.nc.f32 %f54, [%rd51]; mul.f32 %f118, %f54, %f118; $L__BB0_21: setp.eq.s64 %p12, %rd17, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd52, %rd17; shl.b64 %rd53, %rd1, 2; add.s64 %rd54, %rd52, %rd53; ld.global.nc.f32 %f55, [%rd54]; mul.f32 %f119, %f55, %f119; $L__BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra $L__BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 
%f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd55, %rd2; shl.b64 %rd56, %rd1, 2; add.s64 %rd57, %rd55, %rd56; ld.global.f32 %f99, [%rd57]; add.f32 %f100, %f98, %f99; st.global.f32 [%rd57], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd56; ld.global.f32 %f103, [%rd59]; add.f32 %f104, %f102, %f103; st.global.f32 [%rd59], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd60, %rd4; add.s64 %rd61, %rd60, %rd56; ld.global.f32 %f107, [%rd61]; add.f32 %f108, %f106, %f107; st.global.f32 [%rd61], %f108; $L__BB0_25: ret; } ` addslonczewskitorque2_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 
addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<9>; .reg .f64 %fd<3>; .reg .b64 %rd<62>; ld.param.u64 %rd2, [addslonczewskitorque2_param_0]; ld.param.u64 %rd3, [addslonczewskitorque2_param_1]; ld.param.u64 %rd4, [addslonczewskitorque2_param_2]; ld.param.u64 %rd5, [addslonczewskitorque2_param_3]; ld.param.u64 %rd6, [addslonczewskitorque2_param_4]; ld.param.u64 %rd7, [addslonczewskitorque2_param_5]; ld.param.u64 %rd8, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd9, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd10, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd11, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd12, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd13, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd14, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd15, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd16, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd17, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_25; 
cvta.to.global.u64 %rd18, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd19, %r1, 4; add.s64 %rd20, %rd18, %rd19; ld.global.nc.f32 %f1, [%rd20]; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd19; ld.global.nc.f32 %f2, [%rd22]; cvta.to.global.u64 %rd23, %rd7; add.s64 %rd24, %rd23, %rd19; ld.global.nc.f32 %f3, [%rd24]; setp.eq.s64 %p2, %rd9, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd9; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f42, [%rd27]; mul.f32 %f109, %f42, %f109; $L__BB0_3: setp.eq.s64 %p3, %rd10, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd28, %rd10; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f43, [%rd30]; mul.f32 %f110, %f43, %f110; $L__BB0_5: setp.eq.s64 %p4, %rd11, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd31, %rd11; shl.b64 %rd32, %rd1, 2; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f44, [%rd33]; mul.f32 %f111, %f44, %f111; $L__BB0_7: setp.eq.s64 %p5, %rd12, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd34, %rd12; shl.b64 %rd35, %rd1, 2; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f45, [%rd36]; mul.f32 %f112, %f45, %f112; $L__BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; setp.eq.f32 %p6, %f12, 0f00000000; mov.f32 %f113, 0f00000000; @%p6 bra $L__BB0_11; rcp.rn.f32 %f113, %f12; $L__BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd8, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd37, %rd8; shl.b64 %rd38, %rd1, 2; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f50, [%rd39]; mul.f32 %f114, %f50, %f114; $L__BB0_13: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd40, %rd13; shl.b64 %rd41, %rd1, 2; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f51, [%rd42]; mul.f32 %f115, %f51, %f115; $L__BB0_15: setp.eq.s64 %p9, %rd14, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd43, %rd14; shl.b64 %rd44, %rd1, 2; add.s64 %rd45, 
%rd43, %rd44; ld.global.nc.f32 %f52, [%rd45]; mul.f32 %f116, %f52, %f116; $L__BB0_17: setp.eq.s64 %p10, %rd15, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd46, %rd15; shl.b64 %rd47, %rd1, 2; add.s64 %rd48, %rd46, %rd47; ld.global.nc.f32 %f53, [%rd48]; mul.f32 %f117, %f53, %f117; $L__BB0_19: setp.eq.s64 %p11, %rd16, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd49, %rd16; shl.b64 %rd50, %rd1, 2; add.s64 %rd51, %rd49, %rd50; ld.global.nc.f32 %f54, [%rd51]; mul.f32 %f118, %f54, %f118; $L__BB0_21: setp.eq.s64 %p12, %rd17, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd52, %rd17; shl.b64 %rd53, %rd1, 2; add.s64 %rd54, %rd52, %rd53; ld.global.nc.f32 %f55, [%rd54]; mul.f32 %f119, %f55, %f119; $L__BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra $L__BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; 
mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd55, %rd2; shl.b64 %rd56, %rd1, 2; add.s64 %rd57, %rd55, %rd56; ld.global.f32 %f99, [%rd57]; add.f32 %f100, %f98, %f99; st.global.f32 [%rd57], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd56; ld.global.f32 %f103, [%rd59]; add.f32 %f104, %f102, %f103; st.global.f32 [%rd59], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd60, %rd4; add.s64 %rd61, %rd60, %rd56; ld.global.f32 %f107, [%rd61]; add.f32 %f108, %f106, %f107; st.global.f32 [%rd61], %f108; $L__BB0_25: ret; } ` addslonczewskitorque2_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, 
.param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<9>; .reg .f64 %fd<3>; .reg .b64 %rd<62>; ld.param.u64 %rd2, [addslonczewskitorque2_param_0]; ld.param.u64 %rd3, [addslonczewskitorque2_param_1]; ld.param.u64 %rd4, [addslonczewskitorque2_param_2]; ld.param.u64 %rd5, [addslonczewskitorque2_param_3]; ld.param.u64 %rd6, [addslonczewskitorque2_param_4]; ld.param.u64 %rd7, [addslonczewskitorque2_param_5]; ld.param.u64 %rd8, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd9, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd10, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd11, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd12, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd13, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd14, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd15, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd16, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd17, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; 
setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_25; cvta.to.global.u64 %rd18, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd19, %r1, 4; add.s64 %rd20, %rd18, %rd19; ld.global.nc.f32 %f1, [%rd20]; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd19; ld.global.nc.f32 %f2, [%rd22]; cvta.to.global.u64 %rd23, %rd7; add.s64 %rd24, %rd23, %rd19; ld.global.nc.f32 %f3, [%rd24]; setp.eq.s64 %p2, %rd9, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd9; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f42, [%rd27]; mul.f32 %f109, %f42, %f109; $L__BB0_3: setp.eq.s64 %p3, %rd10, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd28, %rd10; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f43, [%rd30]; mul.f32 %f110, %f43, %f110; $L__BB0_5: setp.eq.s64 %p4, %rd11, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd31, %rd11; shl.b64 %rd32, %rd1, 2; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f44, [%rd33]; mul.f32 %f111, %f44, %f111; $L__BB0_7: setp.eq.s64 %p5, %rd12, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd34, %rd12; shl.b64 %rd35, %rd1, 2; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f45, [%rd36]; mul.f32 %f112, %f45, %f112; $L__BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; setp.eq.f32 %p6, %f12, 0f00000000; mov.f32 %f113, 0f00000000; @%p6 bra $L__BB0_11; rcp.rn.f32 %f113, %f12; $L__BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd8, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd37, %rd8; shl.b64 %rd38, %rd1, 2; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f50, [%rd39]; mul.f32 %f114, %f50, %f114; $L__BB0_13: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd40, %rd13; shl.b64 %rd41, %rd1, 2; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f51, [%rd42]; mul.f32 %f115, %f51, %f115; $L__BB0_15: setp.eq.s64 %p9, %rd14, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 
%rd43, %rd14; shl.b64 %rd44, %rd1, 2; add.s64 %rd45, %rd43, %rd44; ld.global.nc.f32 %f52, [%rd45]; mul.f32 %f116, %f52, %f116; $L__BB0_17: setp.eq.s64 %p10, %rd15, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd46, %rd15; shl.b64 %rd47, %rd1, 2; add.s64 %rd48, %rd46, %rd47; ld.global.nc.f32 %f53, [%rd48]; mul.f32 %f117, %f53, %f117; $L__BB0_19: setp.eq.s64 %p11, %rd16, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd49, %rd16; shl.b64 %rd50, %rd1, 2; add.s64 %rd51, %rd49, %rd50; ld.global.nc.f32 %f54, [%rd51]; mul.f32 %f118, %f54, %f118; $L__BB0_21: setp.eq.s64 %p12, %rd17, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd52, %rd17; shl.b64 %rd53, %rd1, 2; add.s64 %rd54, %rd52, %rd53; ld.global.nc.f32 %f55, [%rd54]; mul.f32 %f119, %f55, %f119; $L__BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra $L__BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; 
mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd55, %rd2; shl.b64 %rd56, %rd1, 2; add.s64 %rd57, %rd55, %rd56; ld.global.f32 %f99, [%rd57]; add.f32 %f100, %f98, %f99; st.global.f32 [%rd57], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd56; ld.global.f32 %f103, [%rd59]; add.f32 %f104, %f102, %f103; st.global.f32 [%rd59], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd60, %rd4; add.s64 %rd61, %rd60, %rd56; ld.global.f32 %f107, [%rd61]; add.f32 %f108, %f106, %f107; st.global.f32 [%rd61], %f108; $L__BB0_25: ret; } ` addslonczewskitorque2_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 
addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<9>; .reg .f64 %fd<3>; .reg .b64 %rd<62>; ld.param.u64 %rd2, [addslonczewskitorque2_param_0]; ld.param.u64 %rd3, [addslonczewskitorque2_param_1]; ld.param.u64 %rd4, [addslonczewskitorque2_param_2]; ld.param.u64 %rd5, [addslonczewskitorque2_param_3]; ld.param.u64 %rd6, [addslonczewskitorque2_param_4]; ld.param.u64 %rd7, [addslonczewskitorque2_param_5]; ld.param.u64 %rd8, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd9, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd10, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd11, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd12, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd13, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd14, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd15, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd16, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd17, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 
%r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_25; cvta.to.global.u64 %rd18, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd19, %r1, 4; add.s64 %rd20, %rd18, %rd19; ld.global.nc.f32 %f1, [%rd20]; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd19; ld.global.nc.f32 %f2, [%rd22]; cvta.to.global.u64 %rd23, %rd7; add.s64 %rd24, %rd23, %rd19; ld.global.nc.f32 %f3, [%rd24]; setp.eq.s64 %p2, %rd9, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd9; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f42, [%rd27]; mul.f32 %f109, %f42, %f109; $L__BB0_3: setp.eq.s64 %p3, %rd10, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd28, %rd10; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f43, [%rd30]; mul.f32 %f110, %f43, %f110; $L__BB0_5: setp.eq.s64 %p4, %rd11, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd31, %rd11; shl.b64 %rd32, %rd1, 2; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f44, [%rd33]; mul.f32 %f111, %f44, %f111; $L__BB0_7: setp.eq.s64 %p5, %rd12, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd34, %rd12; shl.b64 %rd35, %rd1, 2; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f45, [%rd36]; mul.f32 %f112, %f45, %f112; $L__BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; setp.eq.f32 %p6, %f12, 0f00000000; mov.f32 %f113, 0f00000000; @%p6 bra $L__BB0_11; rcp.rn.f32 %f113, %f12; $L__BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd8, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd37, %rd8; shl.b64 %rd38, %rd1, 2; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f50, [%rd39]; mul.f32 %f114, %f50, %f114; $L__BB0_13: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd40, %rd13; shl.b64 %rd41, %rd1, 2; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f51, [%rd42]; mul.f32 %f115, %f51, %f115; $L__BB0_15: 
setp.eq.s64 %p9, %rd14, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd43, %rd14; shl.b64 %rd44, %rd1, 2; add.s64 %rd45, %rd43, %rd44; ld.global.nc.f32 %f52, [%rd45]; mul.f32 %f116, %f52, %f116; $L__BB0_17: setp.eq.s64 %p10, %rd15, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd46, %rd15; shl.b64 %rd47, %rd1, 2; add.s64 %rd48, %rd46, %rd47; ld.global.nc.f32 %f53, [%rd48]; mul.f32 %f117, %f53, %f117; $L__BB0_19: setp.eq.s64 %p11, %rd16, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd49, %rd16; shl.b64 %rd50, %rd1, 2; add.s64 %rd51, %rd49, %rd50; ld.global.nc.f32 %f54, [%rd51]; mul.f32 %f118, %f54, %f118; $L__BB0_21: setp.eq.s64 %p12, %rd17, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd52, %rd17; shl.b64 %rd53, %rd1, 2; add.s64 %rd54, %rd52, %rd53; ld.global.nc.f32 %f55, [%rd54]; mul.f32 %f119, %f55, %f119; $L__BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra $L__BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, 
%f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd55, %rd2; shl.b64 %rd56, %rd1, 2; add.s64 %rd57, %rd55, %rd56; ld.global.f32 %f99, [%rd57]; add.f32 %f100, %f98, %f99; st.global.f32 [%rd57], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd56; ld.global.f32 %f103, [%rd59]; add.f32 %f104, %f102, %f103; st.global.f32 [%rd59], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd60, %rd4; add.s64 %rd61, %rd60, %rd56; ld.global.f32 %f107, [%rd61]; add.f32 %f108, %f106, %f107; st.global.f32 [%rd61], %f108; $L__BB0_25: ret; } ` addslonczewskitorque2_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 
addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<9>; .reg .f64 %fd<3>; .reg .b64 %rd<62>; ld.param.u64 %rd2, [addslonczewskitorque2_param_0]; ld.param.u64 %rd3, [addslonczewskitorque2_param_1]; ld.param.u64 %rd4, [addslonczewskitorque2_param_2]; ld.param.u64 %rd5, [addslonczewskitorque2_param_3]; ld.param.u64 %rd6, [addslonczewskitorque2_param_4]; ld.param.u64 %rd7, [addslonczewskitorque2_param_5]; ld.param.u64 %rd8, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd9, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd10, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd11, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd12, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd13, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd14, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd15, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd16, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd17, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, 
%ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_25; cvta.to.global.u64 %rd18, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd19, %r1, 4; add.s64 %rd20, %rd18, %rd19; ld.global.nc.f32 %f1, [%rd20]; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd19; ld.global.nc.f32 %f2, [%rd22]; cvta.to.global.u64 %rd23, %rd7; add.s64 %rd24, %rd23, %rd19; ld.global.nc.f32 %f3, [%rd24]; setp.eq.s64 %p2, %rd9, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd9; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f42, [%rd27]; mul.f32 %f109, %f42, %f109; $L__BB0_3: setp.eq.s64 %p3, %rd10, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd28, %rd10; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f43, [%rd30]; mul.f32 %f110, %f43, %f110; $L__BB0_5: setp.eq.s64 %p4, %rd11, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd31, %rd11; shl.b64 %rd32, %rd1, 2; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f44, [%rd33]; mul.f32 %f111, %f44, %f111; $L__BB0_7: setp.eq.s64 %p5, %rd12, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd34, %rd12; shl.b64 %rd35, %rd1, 2; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f45, [%rd36]; mul.f32 %f112, %f45, %f112; $L__BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; setp.eq.f32 %p6, %f12, 0f00000000; mov.f32 %f113, 0f00000000; @%p6 bra $L__BB0_11; rcp.rn.f32 %f113, %f12; $L__BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd8, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd37, %rd8; shl.b64 %rd38, %rd1, 2; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f50, [%rd39]; mul.f32 %f114, %f50, %f114; $L__BB0_13: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd40, %rd13; shl.b64 %rd41, %rd1, 2; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f51, 
[%rd42]; mul.f32 %f115, %f51, %f115; $L__BB0_15: setp.eq.s64 %p9, %rd14, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd43, %rd14; shl.b64 %rd44, %rd1, 2; add.s64 %rd45, %rd43, %rd44; ld.global.nc.f32 %f52, [%rd45]; mul.f32 %f116, %f52, %f116; $L__BB0_17: setp.eq.s64 %p10, %rd15, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd46, %rd15; shl.b64 %rd47, %rd1, 2; add.s64 %rd48, %rd46, %rd47; ld.global.nc.f32 %f53, [%rd48]; mul.f32 %f117, %f53, %f117; $L__BB0_19: setp.eq.s64 %p11, %rd16, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd49, %rd16; shl.b64 %rd50, %rd1, 2; add.s64 %rd51, %rd49, %rd50; ld.global.nc.f32 %f54, [%rd51]; mul.f32 %f118, %f54, %f118; $L__BB0_21: setp.eq.s64 %p12, %rd17, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd52, %rd17; shl.b64 %rd53, %rd1, 2; add.s64 %rd54, %rd52, %rd53; ld.global.nc.f32 %f55, [%rd54]; mul.f32 %f119, %f55, %f119; $L__BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra $L__BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, 
%f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd55, %rd2; shl.b64 %rd56, %rd1, 2; add.s64 %rd57, %rd55, %rd56; ld.global.f32 %f99, [%rd57]; add.f32 %f100, %f98, %f99; st.global.f32 [%rd57], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd56; ld.global.f32 %f103, [%rd59]; add.f32 %f104, %f102, %f103; st.global.f32 [%rd59], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd60, %rd4; add.s64 %rd61, %rd60, %rd56; ld.global.f32 %f107, [%rd61]; add.f32 %f108, %f106, %f107; st.global.f32 [%rd61], %f108; $L__BB0_25: ret; } ` addslonczewskitorque2_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 
addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<9>; .reg .f64 %fd<3>; .reg .b64 %rd<62>; ld.param.u64 %rd2, [addslonczewskitorque2_param_0]; ld.param.u64 %rd3, [addslonczewskitorque2_param_1]; ld.param.u64 %rd4, [addslonczewskitorque2_param_2]; ld.param.u64 %rd5, [addslonczewskitorque2_param_3]; ld.param.u64 %rd6, [addslonczewskitorque2_param_4]; ld.param.u64 %rd7, [addslonczewskitorque2_param_5]; ld.param.u64 %rd8, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd9, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd10, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd11, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd12, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd13, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd14, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd15, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd16, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd17, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, 
%nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_25; cvta.to.global.u64 %rd18, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd19, %r1, 4; add.s64 %rd20, %rd18, %rd19; ld.global.nc.f32 %f1, [%rd20]; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd19; ld.global.nc.f32 %f2, [%rd22]; cvta.to.global.u64 %rd23, %rd7; add.s64 %rd24, %rd23, %rd19; ld.global.nc.f32 %f3, [%rd24]; setp.eq.s64 %p2, %rd9, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd9; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f42, [%rd27]; mul.f32 %f109, %f42, %f109; $L__BB0_3: setp.eq.s64 %p3, %rd10, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd28, %rd10; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f43, [%rd30]; mul.f32 %f110, %f43, %f110; $L__BB0_5: setp.eq.s64 %p4, %rd11, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd31, %rd11; shl.b64 %rd32, %rd1, 2; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f44, [%rd33]; mul.f32 %f111, %f44, %f111; $L__BB0_7: setp.eq.s64 %p5, %rd12, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd34, %rd12; shl.b64 %rd35, %rd1, 2; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f45, [%rd36]; mul.f32 %f112, %f45, %f112; $L__BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; setp.eq.f32 %p6, %f12, 0f00000000; mov.f32 %f113, 0f00000000; @%p6 bra $L__BB0_11; rcp.rn.f32 %f113, %f12; $L__BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd8, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd37, %rd8; shl.b64 %rd38, %rd1, 2; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f50, [%rd39]; mul.f32 %f114, %f50, %f114; $L__BB0_13: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd40, %rd13; shl.b64 %rd41, %rd1, 2; add.s64 
%rd42, %rd40, %rd41; ld.global.nc.f32 %f51, [%rd42]; mul.f32 %f115, %f51, %f115; $L__BB0_15: setp.eq.s64 %p9, %rd14, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd43, %rd14; shl.b64 %rd44, %rd1, 2; add.s64 %rd45, %rd43, %rd44; ld.global.nc.f32 %f52, [%rd45]; mul.f32 %f116, %f52, %f116; $L__BB0_17: setp.eq.s64 %p10, %rd15, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd46, %rd15; shl.b64 %rd47, %rd1, 2; add.s64 %rd48, %rd46, %rd47; ld.global.nc.f32 %f53, [%rd48]; mul.f32 %f117, %f53, %f117; $L__BB0_19: setp.eq.s64 %p11, %rd16, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd49, %rd16; shl.b64 %rd50, %rd1, 2; add.s64 %rd51, %rd49, %rd50; ld.global.nc.f32 %f54, [%rd51]; mul.f32 %f118, %f54, %f118; $L__BB0_21: setp.eq.s64 %p12, %rd17, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd52, %rd17; shl.b64 %rd53, %rd1, 2; add.s64 %rd54, %rd52, %rd53; ld.global.nc.f32 %f55, [%rd54]; mul.f32 %f119, %f55, %f119; $L__BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra $L__BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, 
%f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd55, %rd2; shl.b64 %rd56, %rd1, 2; add.s64 %rd57, %rd55, %rd56; ld.global.f32 %f99, [%rd57]; add.f32 %f100, %f98, %f99; st.global.f32 [%rd57], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd56; ld.global.f32 %f103, [%rd59]; add.f32 %f104, %f102, %f103; st.global.f32 [%rd59], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd60, %rd4; add.s64 %rd61, %rd60, %rd56; ld.global.f32 %f107, [%rd61]; add.f32 %f108, %f106, %f107; st.global.f32 [%rd61], %f108; $L__BB0_25: ret; } ` addslonczewskitorque2_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 
addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<9>; .reg .f64 %fd<3>; .reg .b64 %rd<62>; ld.param.u64 %rd2, [addslonczewskitorque2_param_0]; ld.param.u64 %rd3, [addslonczewskitorque2_param_1]; ld.param.u64 %rd4, [addslonczewskitorque2_param_2]; ld.param.u64 %rd5, [addslonczewskitorque2_param_3]; ld.param.u64 %rd6, [addslonczewskitorque2_param_4]; ld.param.u64 %rd7, [addslonczewskitorque2_param_5]; ld.param.u64 %rd8, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd9, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd10, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd11, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd12, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd13, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd14, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd15, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd16, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd17, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, 
[addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_25; cvta.to.global.u64 %rd18, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd19, %r1, 4; add.s64 %rd20, %rd18, %rd19; ld.global.nc.f32 %f1, [%rd20]; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd19; ld.global.nc.f32 %f2, [%rd22]; cvta.to.global.u64 %rd23, %rd7; add.s64 %rd24, %rd23, %rd19; ld.global.nc.f32 %f3, [%rd24]; setp.eq.s64 %p2, %rd9, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd9; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f42, [%rd27]; mul.f32 %f109, %f42, %f109; $L__BB0_3: setp.eq.s64 %p3, %rd10, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd28, %rd10; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f43, [%rd30]; mul.f32 %f110, %f43, %f110; $L__BB0_5: setp.eq.s64 %p4, %rd11, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd31, %rd11; shl.b64 %rd32, %rd1, 2; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f44, [%rd33]; mul.f32 %f111, %f44, %f111; $L__BB0_7: setp.eq.s64 %p5, %rd12, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd34, %rd12; shl.b64 %rd35, %rd1, 2; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f45, [%rd36]; mul.f32 %f112, %f45, %f112; $L__BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; setp.eq.f32 %p6, %f12, 0f00000000; mov.f32 %f113, 0f00000000; @%p6 bra $L__BB0_11; rcp.rn.f32 %f113, %f12; $L__BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd8, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd37, %rd8; shl.b64 %rd38, %rd1, 2; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f50, [%rd39]; mul.f32 %f114, %f50, %f114; $L__BB0_13: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 
%rd40, %rd13; shl.b64 %rd41, %rd1, 2; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f51, [%rd42]; mul.f32 %f115, %f51, %f115; $L__BB0_15: setp.eq.s64 %p9, %rd14, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd43, %rd14; shl.b64 %rd44, %rd1, 2; add.s64 %rd45, %rd43, %rd44; ld.global.nc.f32 %f52, [%rd45]; mul.f32 %f116, %f52, %f116; $L__BB0_17: setp.eq.s64 %p10, %rd15, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd46, %rd15; shl.b64 %rd47, %rd1, 2; add.s64 %rd48, %rd46, %rd47; ld.global.nc.f32 %f53, [%rd48]; mul.f32 %f117, %f53, %f117; $L__BB0_19: setp.eq.s64 %p11, %rd16, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd49, %rd16; shl.b64 %rd50, %rd1, 2; add.s64 %rd51, %rd49, %rd50; ld.global.nc.f32 %f54, [%rd51]; mul.f32 %f118, %f54, %f118; $L__BB0_21: setp.eq.s64 %p12, %rd17, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd52, %rd17; shl.b64 %rd53, %rd1, 2; add.s64 %rd54, %rd52, %rd53; ld.global.nc.f32 %f55, [%rd54]; mul.f32 %f119, %f55, %f119; $L__BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra $L__BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; 
sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd55, %rd2; shl.b64 %rd56, %rd1, 2; add.s64 %rd57, %rd55, %rd56; ld.global.f32 %f99, [%rd57]; add.f32 %f100, %f98, %f99; st.global.f32 [%rd57], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd56; ld.global.f32 %f103, [%rd59]; add.f32 %f104, %f102, %f103; st.global.f32 [%rd59], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd60, %rd4; add.s64 %rd61, %rd60, %rd56; ld.global.f32 %f107, [%rd61]; add.f32 %f108, %f106, %f107; st.global.f32 [%rd61], %f108; $L__BB0_25: ret; } ` addslonczewskitorque2_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 
addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<9>; .reg .f64 %fd<3>; .reg .b64 %rd<62>; ld.param.u64 %rd2, [addslonczewskitorque2_param_0]; ld.param.u64 %rd3, [addslonczewskitorque2_param_1]; ld.param.u64 %rd4, [addslonczewskitorque2_param_2]; ld.param.u64 %rd5, [addslonczewskitorque2_param_3]; ld.param.u64 %rd6, [addslonczewskitorque2_param_4]; ld.param.u64 %rd7, [addslonczewskitorque2_param_5]; ld.param.u64 %rd8, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd9, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd10, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd11, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd12, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd13, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd14, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd15, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd16, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd17, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, 
[addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_25; cvta.to.global.u64 %rd18, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd19, %r1, 4; add.s64 %rd20, %rd18, %rd19; ld.global.nc.f32 %f1, [%rd20]; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd19; ld.global.nc.f32 %f2, [%rd22]; cvta.to.global.u64 %rd23, %rd7; add.s64 %rd24, %rd23, %rd19; ld.global.nc.f32 %f3, [%rd24]; setp.eq.s64 %p2, %rd9, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd9; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f42, [%rd27]; mul.f32 %f109, %f42, %f109; $L__BB0_3: setp.eq.s64 %p3, %rd10, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd28, %rd10; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f43, [%rd30]; mul.f32 %f110, %f43, %f110; $L__BB0_5: setp.eq.s64 %p4, %rd11, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd31, %rd11; shl.b64 %rd32, %rd1, 2; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f44, [%rd33]; mul.f32 %f111, %f44, %f111; $L__BB0_7: setp.eq.s64 %p5, %rd12, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd34, %rd12; shl.b64 %rd35, %rd1, 2; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f45, [%rd36]; mul.f32 %f112, %f45, %f112; $L__BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; setp.eq.f32 %p6, %f12, 0f00000000; mov.f32 %f113, 0f00000000; @%p6 bra $L__BB0_11; rcp.rn.f32 %f113, %f12; $L__BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd8, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd37, %rd8; shl.b64 %rd38, %rd1, 2; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f50, [%rd39]; mul.f32 %f114, %f50, %f114; $L__BB0_13: setp.eq.s64 %p8, 
%rd13, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd40, %rd13; shl.b64 %rd41, %rd1, 2; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f51, [%rd42]; mul.f32 %f115, %f51, %f115; $L__BB0_15: setp.eq.s64 %p9, %rd14, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd43, %rd14; shl.b64 %rd44, %rd1, 2; add.s64 %rd45, %rd43, %rd44; ld.global.nc.f32 %f52, [%rd45]; mul.f32 %f116, %f52, %f116; $L__BB0_17: setp.eq.s64 %p10, %rd15, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd46, %rd15; shl.b64 %rd47, %rd1, 2; add.s64 %rd48, %rd46, %rd47; ld.global.nc.f32 %f53, [%rd48]; mul.f32 %f117, %f53, %f117; $L__BB0_19: setp.eq.s64 %p11, %rd16, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd49, %rd16; shl.b64 %rd50, %rd1, 2; add.s64 %rd51, %rd49, %rd50; ld.global.nc.f32 %f54, [%rd51]; mul.f32 %f118, %f54, %f118; $L__BB0_21: setp.eq.s64 %p12, %rd17, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd52, %rd17; shl.b64 %rd53, %rd1, 2; add.s64 %rd54, %rd52, %rd53; ld.global.nc.f32 %f55, [%rd54]; mul.f32 %f119, %f55, %f119; $L__BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra $L__BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; 
mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd55, %rd2; shl.b64 %rd56, %rd1, 2; add.s64 %rd57, %rd55, %rd56; ld.global.f32 %f99, [%rd57]; add.f32 %f100, %f98, %f99; st.global.f32 [%rd57], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd56; ld.global.f32 %f103, [%rd59]; add.f32 %f104, %f102, %f103; st.global.f32 [%rd59], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd60, %rd4; add.s64 %rd61, %rd60, %rd56; ld.global.f32 %f107, [%rd61]; add.f32 %f108, %f106, %f107; st.global.f32 [%rd61], %f108; $L__BB0_25: ret; } ` addslonczewskitorque2_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param 
.u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<9>; .reg .f64 %fd<3>; .reg .b64 %rd<62>; ld.param.u64 %rd2, [addslonczewskitorque2_param_0]; ld.param.u64 %rd3, [addslonczewskitorque2_param_1]; ld.param.u64 %rd4, [addslonczewskitorque2_param_2]; ld.param.u64 %rd5, [addslonczewskitorque2_param_3]; ld.param.u64 %rd6, [addslonczewskitorque2_param_4]; ld.param.u64 %rd7, [addslonczewskitorque2_param_5]; ld.param.u64 %rd8, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd9, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd10, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd11, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd12, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd13, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd14, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd15, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd16, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd17, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, 
[addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_25; cvta.to.global.u64 %rd18, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd19, %r1, 4; add.s64 %rd20, %rd18, %rd19; ld.global.nc.f32 %f1, [%rd20]; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd19; ld.global.nc.f32 %f2, [%rd22]; cvta.to.global.u64 %rd23, %rd7; add.s64 %rd24, %rd23, %rd19; ld.global.nc.f32 %f3, [%rd24]; setp.eq.s64 %p2, %rd9, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd9; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f42, [%rd27]; mul.f32 %f109, %f42, %f109; $L__BB0_3: setp.eq.s64 %p3, %rd10, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd28, %rd10; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f43, [%rd30]; mul.f32 %f110, %f43, %f110; $L__BB0_5: setp.eq.s64 %p4, %rd11, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd31, %rd11; shl.b64 %rd32, %rd1, 2; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f44, [%rd33]; mul.f32 %f111, %f44, %f111; $L__BB0_7: setp.eq.s64 %p5, %rd12, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd34, %rd12; shl.b64 %rd35, %rd1, 2; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f45, [%rd36]; mul.f32 %f112, %f45, %f112; $L__BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; setp.eq.f32 %p6, %f12, 0f00000000; mov.f32 %f113, 0f00000000; @%p6 bra $L__BB0_11; rcp.rn.f32 %f113, %f12; $L__BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd8, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd37, %rd8; shl.b64 %rd38, %rd1, 2; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f50, [%rd39]; 
mul.f32 %f114, %f50, %f114; $L__BB0_13: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd40, %rd13; shl.b64 %rd41, %rd1, 2; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f51, [%rd42]; mul.f32 %f115, %f51, %f115; $L__BB0_15: setp.eq.s64 %p9, %rd14, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd43, %rd14; shl.b64 %rd44, %rd1, 2; add.s64 %rd45, %rd43, %rd44; ld.global.nc.f32 %f52, [%rd45]; mul.f32 %f116, %f52, %f116; $L__BB0_17: setp.eq.s64 %p10, %rd15, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd46, %rd15; shl.b64 %rd47, %rd1, 2; add.s64 %rd48, %rd46, %rd47; ld.global.nc.f32 %f53, [%rd48]; mul.f32 %f117, %f53, %f117; $L__BB0_19: setp.eq.s64 %p11, %rd16, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd49, %rd16; shl.b64 %rd50, %rd1, 2; add.s64 %rd51, %rd49, %rd50; ld.global.nc.f32 %f54, [%rd51]; mul.f32 %f118, %f54, %f118; $L__BB0_21: setp.eq.s64 %p12, %rd17, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd52, %rd17; shl.b64 %rd53, %rd1, 2; add.s64 %rd54, %rd52, %rd53; ld.global.nc.f32 %f55, [%rd54]; mul.f32 %f119, %f55, %f119; $L__BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra $L__BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, 
%f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd55, %rd2; shl.b64 %rd56, %rd1, 2; add.s64 %rd57, %rd55, %rd56; ld.global.f32 %f99, [%rd57]; add.f32 %f100, %f98, %f99; st.global.f32 [%rd57], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd56; ld.global.f32 %f103, [%rd59]; add.f32 %f104, %f102, %f103; st.global.f32 [%rd59], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd60, %rd4; add.s64 %rd61, %rd60, %rd56; ld.global.f32 %f107, [%rd61]; add.f32 %f108, %f106, %f107; st.global.f32 [%rd61], %f108; $L__BB0_25: ret; } ` addslonczewskitorque2_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 
addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<9>; .reg .f64 %fd<3>; .reg .b64 %rd<62>; ld.param.u64 %rd2, [addslonczewskitorque2_param_0]; ld.param.u64 %rd3, [addslonczewskitorque2_param_1]; ld.param.u64 %rd4, [addslonczewskitorque2_param_2]; ld.param.u64 %rd5, [addslonczewskitorque2_param_3]; ld.param.u64 %rd6, [addslonczewskitorque2_param_4]; ld.param.u64 %rd7, [addslonczewskitorque2_param_5]; ld.param.u64 %rd8, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd9, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd10, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd11, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd12, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd13, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd14, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd15, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd16, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd17, [addslonczewskitorque2_param_24]; ld.param.f32 
%f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_25; cvta.to.global.u64 %rd18, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd19, %r1, 4; add.s64 %rd20, %rd18, %rd19; ld.global.nc.f32 %f1, [%rd20]; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd19; ld.global.nc.f32 %f2, [%rd22]; cvta.to.global.u64 %rd23, %rd7; add.s64 %rd24, %rd23, %rd19; ld.global.nc.f32 %f3, [%rd24]; setp.eq.s64 %p2, %rd9, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd9; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f42, [%rd27]; mul.f32 %f109, %f42, %f109; $L__BB0_3: setp.eq.s64 %p3, %rd10, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd28, %rd10; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f43, [%rd30]; mul.f32 %f110, %f43, %f110; $L__BB0_5: setp.eq.s64 %p4, %rd11, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd31, %rd11; shl.b64 %rd32, %rd1, 2; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f44, [%rd33]; mul.f32 %f111, %f44, %f111; $L__BB0_7: setp.eq.s64 %p5, %rd12, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd34, %rd12; shl.b64 %rd35, %rd1, 2; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f45, [%rd36]; mul.f32 %f112, %f45, %f112; $L__BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; setp.eq.f32 %p6, %f12, 0f00000000; mov.f32 %f113, 0f00000000; @%p6 bra $L__BB0_11; rcp.rn.f32 %f113, %f12; $L__BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd8, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd37, %rd8; shl.b64 %rd38, %rd1, 2; 
add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f50, [%rd39]; mul.f32 %f114, %f50, %f114; $L__BB0_13: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd40, %rd13; shl.b64 %rd41, %rd1, 2; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f51, [%rd42]; mul.f32 %f115, %f51, %f115; $L__BB0_15: setp.eq.s64 %p9, %rd14, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd43, %rd14; shl.b64 %rd44, %rd1, 2; add.s64 %rd45, %rd43, %rd44; ld.global.nc.f32 %f52, [%rd45]; mul.f32 %f116, %f52, %f116; $L__BB0_17: setp.eq.s64 %p10, %rd15, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd46, %rd15; shl.b64 %rd47, %rd1, 2; add.s64 %rd48, %rd46, %rd47; ld.global.nc.f32 %f53, [%rd48]; mul.f32 %f117, %f53, %f117; $L__BB0_19: setp.eq.s64 %p11, %rd16, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd49, %rd16; shl.b64 %rd50, %rd1, 2; add.s64 %rd51, %rd49, %rd50; ld.global.nc.f32 %f54, [%rd51]; mul.f32 %f118, %f54, %f118; $L__BB0_21: setp.eq.s64 %p12, %rd17, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd52, %rd17; shl.b64 %rd53, %rd1, 2; add.s64 %rd54, %rd52, %rd53; ld.global.nc.f32 %f55, [%rd54]; mul.f32 %f119, %f55, %f119; $L__BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra $L__BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 
%f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd55, %rd2; shl.b64 %rd56, %rd1, 2; add.s64 %rd57, %rd55, %rd56; ld.global.f32 %f99, [%rd57]; add.f32 %f100, %f98, %f99; st.global.f32 [%rd57], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd56; ld.global.f32 %f103, [%rd59]; add.f32 %f104, %f102, %f103; st.global.f32 [%rd59], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd60, %rd4; add.s64 %rd61, %rd60, %rd56; ld.global.f32 %f107, [%rd61]; add.f32 %f108, %f106, %f107; st.global.f32 [%rd61], %f108; $L__BB0_25: ret; } ` addslonczewskitorque2_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 
addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<9>; .reg .f64 %fd<3>; .reg .b64 %rd<62>; ld.param.u64 %rd2, [addslonczewskitorque2_param_0]; ld.param.u64 %rd3, [addslonczewskitorque2_param_1]; ld.param.u64 %rd4, [addslonczewskitorque2_param_2]; ld.param.u64 %rd5, [addslonczewskitorque2_param_3]; ld.param.u64 %rd6, [addslonczewskitorque2_param_4]; ld.param.u64 %rd7, [addslonczewskitorque2_param_5]; ld.param.u64 %rd8, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd9, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd10, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd11, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd12, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd13, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd14, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd15, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd16, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd17, 
[addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_25; cvta.to.global.u64 %rd18, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd19, %r1, 4; add.s64 %rd20, %rd18, %rd19; ld.global.nc.f32 %f1, [%rd20]; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd19; ld.global.nc.f32 %f2, [%rd22]; cvta.to.global.u64 %rd23, %rd7; add.s64 %rd24, %rd23, %rd19; ld.global.nc.f32 %f3, [%rd24]; setp.eq.s64 %p2, %rd9, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd9; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f42, [%rd27]; mul.f32 %f109, %f42, %f109; $L__BB0_3: setp.eq.s64 %p3, %rd10, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd28, %rd10; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f43, [%rd30]; mul.f32 %f110, %f43, %f110; $L__BB0_5: setp.eq.s64 %p4, %rd11, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd31, %rd11; shl.b64 %rd32, %rd1, 2; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f44, [%rd33]; mul.f32 %f111, %f44, %f111; $L__BB0_7: setp.eq.s64 %p5, %rd12, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd34, %rd12; shl.b64 %rd35, %rd1, 2; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f45, [%rd36]; mul.f32 %f112, %f45, %f112; $L__BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; setp.eq.f32 %p6, %f12, 0f00000000; mov.f32 %f113, 0f00000000; @%p6 bra $L__BB0_11; rcp.rn.f32 %f113, %f12; $L__BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd8, 0; @%p7 bra $L__BB0_13; 
cvta.to.global.u64 %rd37, %rd8; shl.b64 %rd38, %rd1, 2; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f50, [%rd39]; mul.f32 %f114, %f50, %f114; $L__BB0_13: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd40, %rd13; shl.b64 %rd41, %rd1, 2; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f51, [%rd42]; mul.f32 %f115, %f51, %f115; $L__BB0_15: setp.eq.s64 %p9, %rd14, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd43, %rd14; shl.b64 %rd44, %rd1, 2; add.s64 %rd45, %rd43, %rd44; ld.global.nc.f32 %f52, [%rd45]; mul.f32 %f116, %f52, %f116; $L__BB0_17: setp.eq.s64 %p10, %rd15, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd46, %rd15; shl.b64 %rd47, %rd1, 2; add.s64 %rd48, %rd46, %rd47; ld.global.nc.f32 %f53, [%rd48]; mul.f32 %f117, %f53, %f117; $L__BB0_19: setp.eq.s64 %p11, %rd16, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd49, %rd16; shl.b64 %rd50, %rd1, 2; add.s64 %rd51, %rd49, %rd50; ld.global.nc.f32 %f54, [%rd51]; mul.f32 %f118, %f54, %f118; $L__BB0_21: setp.eq.s64 %p12, %rd17, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd52, %rd17; shl.b64 %rd53, %rd1, 2; add.s64 %rd54, %rd52, %rd53; ld.global.nc.f32 %f55, [%rd54]; mul.f32 %f119, %f55, %f119; $L__BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra $L__BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 
%f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd55, %rd2; shl.b64 %rd56, %rd1, 2; add.s64 %rd57, %rd55, %rd56; ld.global.f32 %f99, [%rd57]; add.f32 %f100, %f98, %f99; st.global.f32 [%rd57], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd56; ld.global.f32 %f103, [%rd59]; add.f32 %f104, %f102, %f103; st.global.f32 [%rd59], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd60, %rd4; add.s64 %rd61, %rd60, %rd56; ld.global.f32 %f107, [%rd61]; add.f32 %f108, %f106, %f107; st.global.f32 [%rd61], %f108; $L__BB0_25: ret; } ` addslonczewskitorque2_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 
addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<9>; .reg .f64 %fd<3>; .reg .b64 %rd<62>; ld.param.u64 %rd2, [addslonczewskitorque2_param_0]; ld.param.u64 %rd3, [addslonczewskitorque2_param_1]; ld.param.u64 %rd4, [addslonczewskitorque2_param_2]; ld.param.u64 %rd5, [addslonczewskitorque2_param_3]; ld.param.u64 %rd6, [addslonczewskitorque2_param_4]; ld.param.u64 %rd7, [addslonczewskitorque2_param_5]; ld.param.u64 %rd8, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd9, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd10, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd11, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd12, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd13, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd14, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd15, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd16, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, 
[addslonczewskitorque2_param_23]; ld.param.u64 %rd17, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_25; cvta.to.global.u64 %rd18, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd19, %r1, 4; add.s64 %rd20, %rd18, %rd19; ld.global.nc.f32 %f1, [%rd20]; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd19; ld.global.nc.f32 %f2, [%rd22]; cvta.to.global.u64 %rd23, %rd7; add.s64 %rd24, %rd23, %rd19; ld.global.nc.f32 %f3, [%rd24]; setp.eq.s64 %p2, %rd9, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd9; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f42, [%rd27]; mul.f32 %f109, %f42, %f109; $L__BB0_3: setp.eq.s64 %p3, %rd10, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd28, %rd10; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f43, [%rd30]; mul.f32 %f110, %f43, %f110; $L__BB0_5: setp.eq.s64 %p4, %rd11, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd31, %rd11; shl.b64 %rd32, %rd1, 2; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f44, [%rd33]; mul.f32 %f111, %f44, %f111; $L__BB0_7: setp.eq.s64 %p5, %rd12, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd34, %rd12; shl.b64 %rd35, %rd1, 2; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f45, [%rd36]; mul.f32 %f112, %f45, %f112; $L__BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; setp.eq.f32 %p6, %f12, 0f00000000; mov.f32 %f113, 0f00000000; @%p6 bra $L__BB0_11; rcp.rn.f32 %f113, %f12; $L__BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; 
setp.eq.s64 %p7, %rd8, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd37, %rd8; shl.b64 %rd38, %rd1, 2; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f50, [%rd39]; mul.f32 %f114, %f50, %f114; $L__BB0_13: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd40, %rd13; shl.b64 %rd41, %rd1, 2; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f51, [%rd42]; mul.f32 %f115, %f51, %f115; $L__BB0_15: setp.eq.s64 %p9, %rd14, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd43, %rd14; shl.b64 %rd44, %rd1, 2; add.s64 %rd45, %rd43, %rd44; ld.global.nc.f32 %f52, [%rd45]; mul.f32 %f116, %f52, %f116; $L__BB0_17: setp.eq.s64 %p10, %rd15, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd46, %rd15; shl.b64 %rd47, %rd1, 2; add.s64 %rd48, %rd46, %rd47; ld.global.nc.f32 %f53, [%rd48]; mul.f32 %f117, %f53, %f117; $L__BB0_19: setp.eq.s64 %p11, %rd16, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd49, %rd16; shl.b64 %rd50, %rd1, 2; add.s64 %rd51, %rd49, %rd50; ld.global.nc.f32 %f54, [%rd51]; mul.f32 %f118, %f54, %f118; $L__BB0_21: setp.eq.s64 %p12, %rd17, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd52, %rd17; shl.b64 %rd53, %rd1, 2; add.s64 %rd54, %rd52, %rd53; ld.global.nc.f32 %f55, [%rd54]; mul.f32 %f119, %f55, %f119; $L__BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra $L__BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, 
%f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd55, %rd2; shl.b64 %rd56, %rd1, 2; add.s64 %rd57, %rd55, %rd56; ld.global.f32 %f99, [%rd57]; add.f32 %f100, %f98, %f99; st.global.f32 [%rd57], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd56; ld.global.f32 %f103, [%rd59]; add.f32 %f104, %f102, %f103; st.global.f32 [%rd59], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd60, %rd4; add.s64 %rd61, %rd60, %rd56; ld.global.f32 %f107, [%rd61]; add.f32 %f108, %f106, %f107; st.global.f32 [%rd61], %f108; $L__BB0_25: ret; } ` addslonczewskitorque2_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, 
.param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<9>; .reg .f64 %fd<3>; .reg .b64 %rd<62>; ld.param.u64 %rd2, [addslonczewskitorque2_param_0]; ld.param.u64 %rd3, [addslonczewskitorque2_param_1]; ld.param.u64 %rd4, [addslonczewskitorque2_param_2]; ld.param.u64 %rd5, [addslonczewskitorque2_param_3]; ld.param.u64 %rd6, [addslonczewskitorque2_param_4]; ld.param.u64 %rd7, [addslonczewskitorque2_param_5]; ld.param.u64 %rd8, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd9, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd10, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd11, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd12, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd13, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd14, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd15, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd16, 
[addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd17, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_25; cvta.to.global.u64 %rd18, %rd5; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd19, %r1, 4; add.s64 %rd20, %rd18, %rd19; ld.global.nc.f32 %f1, [%rd20]; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd19; ld.global.nc.f32 %f2, [%rd22]; cvta.to.global.u64 %rd23, %rd7; add.s64 %rd24, %rd23, %rd19; ld.global.nc.f32 %f3, [%rd24]; setp.eq.s64 %p2, %rd9, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd25, %rd9; shl.b64 %rd26, %rd1, 2; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f42, [%rd27]; mul.f32 %f109, %f42, %f109; $L__BB0_3: setp.eq.s64 %p3, %rd10, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd28, %rd10; shl.b64 %rd29, %rd1, 2; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f43, [%rd30]; mul.f32 %f110, %f43, %f110; $L__BB0_5: setp.eq.s64 %p4, %rd11, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd31, %rd11; shl.b64 %rd32, %rd1, 2; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f44, [%rd33]; mul.f32 %f111, %f44, %f111; $L__BB0_7: setp.eq.s64 %p5, %rd12, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd34, %rd12; shl.b64 %rd35, %rd1, 2; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f45, [%rd36]; mul.f32 %f112, %f45, %f112; $L__BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; setp.eq.f32 %p6, %f12, 0f00000000; mov.f32 %f113, 0f00000000; @%p6 bra $L__BB0_11; rcp.rn.f32 %f113, %f12; $L__BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 
%f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd8, 0; @%p7 bra $L__BB0_13; cvta.to.global.u64 %rd37, %rd8; shl.b64 %rd38, %rd1, 2; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f50, [%rd39]; mul.f32 %f114, %f50, %f114; $L__BB0_13: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd40, %rd13; shl.b64 %rd41, %rd1, 2; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f51, [%rd42]; mul.f32 %f115, %f51, %f115; $L__BB0_15: setp.eq.s64 %p9, %rd14, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd43, %rd14; shl.b64 %rd44, %rd1, 2; add.s64 %rd45, %rd43, %rd44; ld.global.nc.f32 %f52, [%rd45]; mul.f32 %f116, %f52, %f116; $L__BB0_17: setp.eq.s64 %p10, %rd15, 0; @%p10 bra $L__BB0_19; cvta.to.global.u64 %rd46, %rd15; shl.b64 %rd47, %rd1, 2; add.s64 %rd48, %rd46, %rd47; ld.global.nc.f32 %f53, [%rd48]; mul.f32 %f117, %f53, %f117; $L__BB0_19: setp.eq.s64 %p11, %rd16, 0; @%p11 bra $L__BB0_21; cvta.to.global.u64 %rd49, %rd16; shl.b64 %rd50, %rd1, 2; add.s64 %rd51, %rd49, %rd50; ld.global.nc.f32 %f54, [%rd51]; mul.f32 %f118, %f54, %f118; $L__BB0_21: setp.eq.s64 %p12, %rd17, 0; @%p12 bra $L__BB0_23; cvta.to.global.u64 %rd52, %rd17; shl.b64 %rd53, %rd1, 2; add.s64 %rd54, %rd52, %rd53; ld.global.nc.f32 %f55, [%rd54]; mul.f32 %f119, %f55, %f119; $L__BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra $L__BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 
%f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd55, %rd2; shl.b64 %rd56, %rd1, 2; add.s64 %rd57, %rd55, %rd56; ld.global.f32 %f99, [%rd57]; add.f32 %f100, %f98, %f99; st.global.f32 [%rd57], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd56; ld.global.f32 %f103, [%rd59]; add.f32 %f104, %f102, %f103; st.global.f32 [%rd59], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd60, %rd4; add.s64 %rd61, %rd60, %rd56; ld.global.f32 %f107, [%rd61]; add.f32 %f108, %f106, %f107; st.global.f32 [%rd61], %f108; $L__BB0_25: ret; } ` ) 3-3.11.1/cuda/stencil.h000066400000000000000000000016131503346766200145020ustar00rootroot00000000000000#ifndef _STENCIL_H_ #define _STENCIL_H_ // 3D array indexing #define index(ix,iy,iz,Nx,Ny,Nz) ( ( (iz)*(Ny) + (iy) ) * (Nx) + (ix) ) #define idx(ix,iy,iz) ( index((ix),(iy),(iz),(Nx),(Ny),(Nz)) ) // modulo used for PBC wrap around #define MOD(n, M) ( (( (n) % (M) ) + (M) ) % (M) ) // have PBC in x, y or z? #define PBCx (PBC & 1) #define PBCy (PBC & 2) #define PBCz (PBC & 4) // clamp or wrap index at boundary, depending on PBC // hclamp*: clamps on upper side (index+1) // lclamp*: clamps on lower side (index-1) // *clampx: clamps along x // ... 
#define hclampx(ix) (PBCx? MOD(ix, Nx) : min((ix), Nx-1)) #define lclampx(ix) (PBCx? MOD(ix, Nx) : max((ix), 0)) #define hclampy(iy) (PBCy? MOD(iy, Ny) : min((iy), Ny-1)) #define lclampy(iy) (PBCy? MOD(iy, Ny) : max((iy), 0)) #define hclampz(iz) (PBCz? MOD(iz, Nz) : min((iz), Nz-1)) #define lclampz(iz) (PBCz? MOD(iz, Nz) : max((iz), 0)) #endif 3-3.11.1/cuda/sum.h000066400000000000000000000001521503346766200136420ustar00rootroot00000000000000#ifndef _SUM_H_ #define _SUM_H_ inline __device__ float sum(float a, float b){ return a + b; } #endif 3-3.11.1/cuda/temperature.go000066400000000000000000000010161503346766200155510ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Set Bth to thermal noise (Brown). // see temperature2.cu func SetTemperature(Bth, noise *data.Slice, k2mu0_Mu0VgammaDt float64, Msat, Temp, Alpha MSlice) { util.Argument(Bth.NComp() == 1 && noise.NComp() == 1) N := Bth.Len() cfg := make1DConf(N) k_settemperature2_async(Bth.DevPtr(0), noise.DevPtr(0), float32(k2mu0_Mu0VgammaDt), Msat.DevPtr(0), Msat.Mul(0), Temp.DevPtr(0), Temp.Mul(0), Alpha.DevPtr(0), Alpha.Mul(0), N, cfg) } 3-3.11.1/cuda/temperature2.cu000066400000000000000000000013621503346766200156410ustar00rootroot00000000000000#include #include "amul.h" // TODO: this could act on x,y,z, so that we need to call it only once. 
// Scale pre-generated noise into a thermal field, one thread per cell:
//
//     B[i] = noise[i] * sqrt(kB2_VgammaDt * alpha[i] * temp[i] / Ms[i])
//
// Ms, temp and alpha are each given as an array pointer plus a scalar
// multiplier and are resolved through the helpers from amul.h.
// The launch is expected to use a 2D grid of 1D blocks; threads whose
// flattened index falls at or beyond N do nothing.
extern "C" __global__ void
settemperature2(float* __restrict__ B, float* __restrict__ noise, float kB2_VgammaDt,
                float* __restrict__ Ms_, float Ms_mul,
                float* __restrict__ temp_, float temp_mul,
                float* __restrict__ alpha_, float alpha_mul,
                int N) {

    // Flatten (blockIdx.y, blockIdx.x, threadIdx.x) into a linear cell index.
    const int block = blockIdx.y * gridDim.x + blockIdx.x;
    const int cell  = block * blockDim.x + threadIdx.x;
    if (cell >= N) {
        return;
    }

    const float invMs = inv_Msat(Ms_, Ms_mul, cell);
    const float temp  = amul(temp_, temp_mul, cell);
    const float alpha = amul(alpha_, alpha_mul, cell);

    // Keep the left-to-right multiplication order so rounding is unchanged.
    const float variance = kB2_VgammaDt * alpha * temp * invMs;
    B[cell] = noise[cell] * sqrtf(variance);
}
settemperature2_args.argptr[0] = unsafe.Pointer(&settemperature2_args.arg_B) settemperature2_args.argptr[1] = unsafe.Pointer(&settemperature2_args.arg_noise) settemperature2_args.argptr[2] = unsafe.Pointer(&settemperature2_args.arg_kB2_VgammaDt) settemperature2_args.argptr[3] = unsafe.Pointer(&settemperature2_args.arg_Ms_) settemperature2_args.argptr[4] = unsafe.Pointer(&settemperature2_args.arg_Ms_mul) settemperature2_args.argptr[5] = unsafe.Pointer(&settemperature2_args.arg_temp_) settemperature2_args.argptr[6] = unsafe.Pointer(&settemperature2_args.arg_temp_mul) settemperature2_args.argptr[7] = unsafe.Pointer(&settemperature2_args.arg_alpha_) settemperature2_args.argptr[8] = unsafe.Pointer(&settemperature2_args.arg_alpha_mul) settemperature2_args.argptr[9] = unsafe.Pointer(&settemperature2_args.arg_N) } // Wrapper for settemperature2 CUDA kernel, asynchronous. func k_settemperature2_async(B unsafe.Pointer, noise unsafe.Pointer, kB2_VgammaDt float32, Ms_ unsafe.Pointer, Ms_mul float32, temp_ unsafe.Pointer, temp_mul float32, alpha_ unsafe.Pointer, alpha_mul float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("settemperature2") } settemperature2_args.Lock() defer settemperature2_args.Unlock() if settemperature2_code == 0 { settemperature2_code = fatbinLoad(settemperature2_map, "settemperature2") } settemperature2_args.arg_B = B settemperature2_args.arg_noise = noise settemperature2_args.arg_kB2_VgammaDt = kB2_VgammaDt settemperature2_args.arg_Ms_ = Ms_ settemperature2_args.arg_Ms_mul = Ms_mul settemperature2_args.arg_temp_ = temp_ settemperature2_args.arg_temp_mul = temp_mul settemperature2_args.arg_alpha_ = alpha_ settemperature2_args.arg_alpha_mul = alpha_mul settemperature2_args.arg_N = N args := settemperature2_args.argptr[:] cu.LaunchKernel(settemperature2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("settemperature2") } } // maps compute 
capability on PTX code for settemperature2 kernel. var settemperature2_map = map[int]string{0: "", 50: settemperature2_ptx_50, 52: settemperature2_ptx_52, 53: settemperature2_ptx_53, 60: settemperature2_ptx_60, 61: settemperature2_ptx_61, 62: settemperature2_ptx_62, 70: settemperature2_ptx_70, 72: settemperature2_ptx_72, 75: settemperature2_ptx_75, 80: settemperature2_ptx_80, 86: settemperature2_ptx_86, 87: settemperature2_ptx_87, 89: settemperature2_ptx_89, 90: settemperature2_ptx_90} // settemperature2 PTX code for various compute capabilities. const ( settemperature2_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, 
%f23; $L__BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f24, %f23; $L__BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; $L__BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; $L__BB0_9: cvta.to.global.u64 %rd15, %rd2; mul.wide.s32 %rd16, %r1, 4; add.s64 %rd17, %rd15, %rd16; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd17]; mul.f32 %f22, %f21, %f20; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd16; st.global.f32 [%rd19], %f22; $L__BB0_10: ret; } ` settemperature2_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; 
mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; $L__BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f24, %f23; $L__BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; $L__BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; $L__BB0_9: cvta.to.global.u64 %rd15, %rd2; mul.wide.s32 %rd16, %r1, 4; add.s64 %rd17, %rd15, %rd16; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd17]; mul.f32 %f22, %f21, %f20; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd16; st.global.f32 [%rd19], %f22; $L__BB0_10: ret; } ` settemperature2_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; 
ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; $L__BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f24, %f23; $L__BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; $L__BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; $L__BB0_9: cvta.to.global.u64 %rd15, %rd2; mul.wide.s32 %rd16, %r1, 4; add.s64 %rd17, %rd15, %rd16; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd17]; mul.f32 %f22, %f21, %f20; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd16; st.global.f32 [%rd19], %f22; $L__BB0_10: ret; } ` settemperature2_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 
settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; $L__BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f24, %f23; $L__BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; $L__BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; $L__BB0_9: cvta.to.global.u64 %rd15, %rd2; mul.wide.s32 %rd16, %r1, 4; add.s64 %rd17, %rd15, %rd16; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd17]; mul.f32 %f22, %f21, %f20; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd16; st.global.f32 [%rd19], %f22; $L__BB0_10: ret; } ` settemperature2_ptx_61 = ` .version 8.5 .target sm_61 
.address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; $L__BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f24, %f23; $L__BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; $L__BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; $L__BB0_9: cvta.to.global.u64 %rd15, %rd2; mul.wide.s32 %rd16, %r1, 4; add.s64 %rd17, 
%rd15, %rd16; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd17]; mul.f32 %f22, %f21, %f20; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd16; st.global.f32 [%rd19], %f22; $L__BB0_10: ret; } ` settemperature2_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; $L__BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f24, %f23; $L__BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; 
ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; $L__BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; $L__BB0_9: cvta.to.global.u64 %rd15, %rd2; mul.wide.s32 %rd16, %r1, 4; add.s64 %rd17, %rd15, %rd16; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd17]; mul.f32 %f22, %f21, %f20; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd16; st.global.f32 [%rd19], %f22; $L__BB0_10: ret; } ` settemperature2_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 
%rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; $L__BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f24, %f23; $L__BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; $L__BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; $L__BB0_9: cvta.to.global.u64 %rd15, %rd2; mul.wide.s32 %rd16, %r1, 4; add.s64 %rd17, %rd15, %rd16; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd17]; mul.f32 %f22, %f21, %f20; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd16; st.global.f32 [%rd19], %f22; $L__BB0_10: ret; } ` settemperature2_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; 
mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; $L__BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f24, %f23; $L__BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; $L__BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; $L__BB0_9: cvta.to.global.u64 %rd15, %rd2; mul.wide.s32 %rd16, %r1, 4; add.s64 %rd17, %rd15, %rd16; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd17]; mul.f32 %f22, %f21, %f20; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd16; st.global.f32 [%rd19], %f22; $L__BB0_10: ret; } ` settemperature2_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, 
[settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; $L__BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f24, %f23; $L__BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; $L__BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; $L__BB0_9: cvta.to.global.u64 %rd15, %rd2; mul.wide.s32 %rd16, %r1, 4; add.s64 %rd17, %rd15, %rd16; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd17]; mul.f32 %f22, %f21, %f20; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd16; st.global.f32 [%rd19], %f22; $L__BB0_10: ret; } ` settemperature2_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 
settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; $L__BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f24, %f23; $L__BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; $L__BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; $L__BB0_9: cvta.to.global.u64 %rd15, %rd2; mul.wide.s32 %rd16, %r1, 4; add.s64 %rd17, %rd15, %rd16; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd17]; mul.f32 %f22, %f21, %f20; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd16; st.global.f32 [%rd19], %f22; $L__BB0_10: 
ret; } ` settemperature2_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; $L__BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f24, %f23; $L__BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; $L__BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; $L__BB0_9: 
cvta.to.global.u64 %rd15, %rd2; mul.wide.s32 %rd16, %r1, 4; add.s64 %rd17, %rd15, %rd16; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd17]; mul.f32 %f22, %f21, %f20; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd16; st.global.f32 [%rd19], %f22; $L__BB0_10: ret; } ` settemperature2_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; $L__BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f24, %f23; $L__BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd9, 
%rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; $L__BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; $L__BB0_9: cvta.to.global.u64 %rd15, %rd2; mul.wide.s32 %rd16, %r1, 4; add.s64 %rd17, %rd15, %rd16; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd17]; mul.f32 %f22, %f21, %f20; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd16; st.global.f32 [%rd19], %f22; $L__BB0_10: ret; } ` settemperature2_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra $L__BB0_3; 
cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; $L__BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra $L__BB0_5; rcp.rn.f32 %f24, %f23; $L__BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; $L__BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra $L__BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; $L__BB0_9: cvta.to.global.u64 %rd15, %rd2; mul.wide.s32 %rd16, %r1, 4; add.s64 %rd17, %rd15, %rd16; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd17]; mul.f32 %f22, %f21, %f20; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd16; st.global.f32 [%rd19], %f22; $L__BB0_10: ret; } ` settemperature2_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, 
// setTheta stores acosf(mz[i]) in theta[i] for every cell of an
// Nx x Ny x Nz grid.
//
// Each thread derives its own (ix, iy, iz) cell coordinate from its block and
// thread indices and handles exactly that one cell. Threads whose coordinate
// falls outside the grid do nothing, so the launch grid may be larger than
// the data.
//
// Note: acosf yields NaN for arguments outside [-1, 1]; the input is used
// as-is, without clamping.
extern "C" __global__ void
setTheta(float* __restrict__ theta, float* __restrict__ mz, int Nx, int Ny, int Nz)
{
    int ix = threadIdx.x + blockDim.x * blockIdx.x;
    int iy = threadIdx.y + blockDim.y * blockIdx.y;
    int iz = threadIdx.z + blockDim.z * blockIdx.z;

    // Only threads that map onto a real cell read or write memory.
    if (ix < Nx && iy < Ny && iz < Nz)
    {
        int cell = idx(ix, iy, iz); // linear index of this cell (macro from stencil.h)
        theta[cell] = acosf(mz[cell]);
    }
}
cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for setTheta kernel var setTheta_code cu.Function // Stores the arguments for setTheta kernel invocation type setTheta_args_t struct { arg_theta unsafe.Pointer arg_mz unsafe.Pointer arg_Nx int arg_Ny int arg_Nz int argptr [5]unsafe.Pointer sync.Mutex } // Stores the arguments for setTheta kernel invocation var setTheta_args setTheta_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. setTheta_args.argptr[0] = unsafe.Pointer(&setTheta_args.arg_theta) setTheta_args.argptr[1] = unsafe.Pointer(&setTheta_args.arg_mz) setTheta_args.argptr[2] = unsafe.Pointer(&setTheta_args.arg_Nx) setTheta_args.argptr[3] = unsafe.Pointer(&setTheta_args.arg_Ny) setTheta_args.argptr[4] = unsafe.Pointer(&setTheta_args.arg_Nz) } // Wrapper for setTheta CUDA kernel, asynchronous. func k_setTheta_async(theta unsafe.Pointer, mz unsafe.Pointer, Nx int, Ny int, Nz int, cfg *config) { if Synchronous { // debug Sync() timer.Start("setTheta") } setTheta_args.Lock() defer setTheta_args.Unlock() if setTheta_code == 0 { setTheta_code = fatbinLoad(setTheta_map, "setTheta") } setTheta_args.arg_theta = theta setTheta_args.arg_mz = mz setTheta_args.arg_Nx = Nx setTheta_args.arg_Ny = Ny setTheta_args.arg_Nz = Nz args := setTheta_args.argptr[:] cu.LaunchKernel(setTheta_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("setTheta") } } // maps compute capability on PTX code for setTheta kernel. 
// setTheta_map maps a compute capability (e.g. 75 for sm_75) to the PTX
// source of the setTheta kernel built for that target. Key 0 maps to the
// empty string.
var setTheta_map = map[int]string{
	0:  "",
	50: setTheta_ptx_50,
	52: setTheta_ptx_52,
	53: setTheta_ptx_53,
	60: setTheta_ptx_60,
	61: setTheta_ptx_61,
	62: setTheta_ptx_62,
	70: setTheta_ptx_70,
	72: setTheta_ptx_72,
	75: setTheta_ptx_75,
	80: setTheta_ptx_80,
	86: setTheta_ptx_86,
	87: setTheta_ptx_87,
	89: setTheta_ptx_89,
	90: setTheta_ptx_90,
}
%r20, %r19, -2147483648; or.b32 %r21, %r20, %r18; mov.b32 %f14, %r21; mul.f32 %f15, %f14, %f14; mov.f32 %f16, 0f3C8B1ABB; mov.f32 %f17, 0f3D10ECEF; fma.rn.f32 %f18, %f17, %f15, %f16; mov.f32 %f19, 0f3CFC028C; fma.rn.f32 %f20, %f18, %f15, %f19; mov.f32 %f21, 0f3D372139; fma.rn.f32 %f22, %f20, %f15, %f21; mov.f32 %f23, 0f3D9993DB; fma.rn.f32 %f24, %f22, %f15, %f23; mov.f32 %f25, 0f3E2AAAC6; fma.rn.f32 %f26, %f24, %f15, %f25; mul.f32 %f27, %f26, %f15; fma.rn.f32 %f28, %f27, %f14, %f14; neg.f32 %f29, %f28; selp.f32 %f30, %f28, %f29, %p7; mov.f32 %f31, 0f3FD774EB; mov.f32 %f32, 0f3F6EE581; fma.rn.f32 %f33, %f32, %f31, %f30; setp.gt.f32 %p8, %f1, 0f3F0F5C29; selp.f32 %f34, %f28, %f33, %p8; add.f32 %f35, %f34, %f34; selp.f32 %f36, %f35, %f34, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f36; $L__BB0_2: ret; } ` setTheta_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<9>; .reg .f32 %f<37>; .reg .b32 %r<22>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 
%f1, [%rd5]; abs.f32 %f2, %f1; neg.f32 %f3, %f2; mov.f32 %f4, 0f3F000000; fma.rn.f32 %f5, %f4, %f3, %f4; rsqrt.approx.ftz.f32 %f6, %f5; mul.f32 %f7, %f5, %f6; mul.f32 %f8, %f6, 0f3F000000; neg.f32 %f9, %f7; fma.rn.f32 %f10, %f9, %f8, %f4; fma.rn.f32 %f11, %f7, %f10, %f7; setp.eq.f32 %p6, %f2, 0f3F800000; selp.f32 %f12, 0f00000000, %f11, %p6; setp.gt.f32 %p7, %f2, 0f3F0F5C29; selp.f32 %f13, %f12, %f2, %p7; mov.b32 %r18, %f13; mov.b32 %r19, %f1; and.b32 %r20, %r19, -2147483648; or.b32 %r21, %r20, %r18; mov.b32 %f14, %r21; mul.f32 %f15, %f14, %f14; mov.f32 %f16, 0f3C8B1ABB; mov.f32 %f17, 0f3D10ECEF; fma.rn.f32 %f18, %f17, %f15, %f16; mov.f32 %f19, 0f3CFC028C; fma.rn.f32 %f20, %f18, %f15, %f19; mov.f32 %f21, 0f3D372139; fma.rn.f32 %f22, %f20, %f15, %f21; mov.f32 %f23, 0f3D9993DB; fma.rn.f32 %f24, %f22, %f15, %f23; mov.f32 %f25, 0f3E2AAAC6; fma.rn.f32 %f26, %f24, %f15, %f25; mul.f32 %f27, %f26, %f15; fma.rn.f32 %f28, %f27, %f14, %f14; neg.f32 %f29, %f28; selp.f32 %f30, %f28, %f29, %p7; mov.f32 %f31, 0f3FD774EB; mov.f32 %f32, 0f3F6EE581; fma.rn.f32 %f33, %f32, %f31, %f30; setp.gt.f32 %p8, %f1, 0f3F0F5C29; selp.f32 %f34, %f28, %f33, %p8; add.f32 %f35, %f34, %f34; selp.f32 %f36, %f35, %f34, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f36; $L__BB0_2: ret; } ` setTheta_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<9>; .reg .f32 %f<37>; .reg .b32 %r<22>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, 
%tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; neg.f32 %f3, %f2; mov.f32 %f4, 0f3F000000; fma.rn.f32 %f5, %f4, %f3, %f4; rsqrt.approx.ftz.f32 %f6, %f5; mul.f32 %f7, %f5, %f6; mul.f32 %f8, %f6, 0f3F000000; neg.f32 %f9, %f7; fma.rn.f32 %f10, %f9, %f8, %f4; fma.rn.f32 %f11, %f7, %f10, %f7; setp.eq.f32 %p6, %f2, 0f3F800000; selp.f32 %f12, 0f00000000, %f11, %p6; setp.gt.f32 %p7, %f2, 0f3F0F5C29; selp.f32 %f13, %f12, %f2, %p7; mov.b32 %r18, %f13; mov.b32 %r19, %f1; and.b32 %r20, %r19, -2147483648; or.b32 %r21, %r20, %r18; mov.b32 %f14, %r21; mul.f32 %f15, %f14, %f14; mov.f32 %f16, 0f3C8B1ABB; mov.f32 %f17, 0f3D10ECEF; fma.rn.f32 %f18, %f17, %f15, %f16; mov.f32 %f19, 0f3CFC028C; fma.rn.f32 %f20, %f18, %f15, %f19; mov.f32 %f21, 0f3D372139; fma.rn.f32 %f22, %f20, %f15, %f21; mov.f32 %f23, 0f3D9993DB; fma.rn.f32 %f24, %f22, %f15, %f23; mov.f32 %f25, 0f3E2AAAC6; fma.rn.f32 %f26, %f24, %f15, %f25; mul.f32 %f27, %f26, %f15; fma.rn.f32 %f28, %f27, %f14, %f14; neg.f32 %f29, %f28; selp.f32 %f30, %f28, %f29, %p7; mov.f32 %f31, 0f3FD774EB; mov.f32 %f32, 0f3F6EE581; fma.rn.f32 %f33, %f32, %f31, %f30; setp.gt.f32 %p8, %f1, 0f3F0F5C29; selp.f32 %f34, %f28, %f33, %p8; add.f32 %f35, %f34, %f34; selp.f32 %f36, %f35, %f34, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f36; $L__BB0_2: ret; } ` setTheta_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, 
.param .u32 setTheta_param_4 ) { .reg .pred %p<9>; .reg .f32 %f<37>; .reg .b32 %r<22>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; neg.f32 %f3, %f2; mov.f32 %f4, 0f3F000000; fma.rn.f32 %f5, %f4, %f3, %f4; rsqrt.approx.ftz.f32 %f6, %f5; mul.f32 %f7, %f5, %f6; mul.f32 %f8, %f6, 0f3F000000; neg.f32 %f9, %f7; fma.rn.f32 %f10, %f9, %f8, %f4; fma.rn.f32 %f11, %f7, %f10, %f7; setp.eq.f32 %p6, %f2, 0f3F800000; selp.f32 %f12, 0f00000000, %f11, %p6; setp.gt.f32 %p7, %f2, 0f3F0F5C29; selp.f32 %f13, %f12, %f2, %p7; mov.b32 %r18, %f13; mov.b32 %r19, %f1; and.b32 %r20, %r19, -2147483648; or.b32 %r21, %r20, %r18; mov.b32 %f14, %r21; mul.f32 %f15, %f14, %f14; mov.f32 %f16, 0f3C8B1ABB; mov.f32 %f17, 0f3D10ECEF; fma.rn.f32 %f18, %f17, %f15, %f16; mov.f32 %f19, 0f3CFC028C; fma.rn.f32 %f20, %f18, %f15, %f19; mov.f32 %f21, 0f3D372139; fma.rn.f32 %f22, %f20, %f15, %f21; mov.f32 %f23, 0f3D9993DB; fma.rn.f32 %f24, %f22, %f15, %f23; mov.f32 %f25, 0f3E2AAAC6; fma.rn.f32 %f26, %f24, %f15, %f25; mul.f32 %f27, %f26, %f15; fma.rn.f32 %f28, %f27, %f14, %f14; neg.f32 %f29, %f28; selp.f32 %f30, %f28, %f29, %p7; mov.f32 %f31, 0f3FD774EB; mov.f32 %f32, 0f3F6EE581; fma.rn.f32 %f33, %f32, %f31, %f30; 
setp.gt.f32 %p8, %f1, 0f3F0F5C29; selp.f32 %f34, %f28, %f33, %p8; add.f32 %f35, %f34, %f34; selp.f32 %f36, %f35, %f34, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f36; $L__BB0_2: ret; } ` setTheta_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<9>; .reg .f32 %f<37>; .reg .b32 %r<22>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; neg.f32 %f3, %f2; mov.f32 %f4, 0f3F000000; fma.rn.f32 %f5, %f4, %f3, %f4; rsqrt.approx.ftz.f32 %f6, %f5; mul.f32 %f7, %f5, %f6; mul.f32 %f8, %f6, 0f3F000000; neg.f32 %f9, %f7; fma.rn.f32 %f10, %f9, %f8, %f4; fma.rn.f32 %f11, %f7, %f10, %f7; setp.eq.f32 %p6, %f2, 0f3F800000; selp.f32 %f12, 0f00000000, %f11, %p6; setp.gt.f32 %p7, %f2, 0f3F0F5C29; selp.f32 %f13, %f12, %f2, %p7; mov.b32 %r18, %f13; mov.b32 %r19, %f1; and.b32 %r20, %r19, -2147483648; or.b32 %r21, %r20, %r18; mov.b32 %f14, %r21; mul.f32 %f15, %f14, %f14; mov.f32 %f16, 0f3C8B1ABB; mov.f32 %f17, 0f3D10ECEF; fma.rn.f32 %f18, %f17, 
%f15, %f16; mov.f32 %f19, 0f3CFC028C; fma.rn.f32 %f20, %f18, %f15, %f19; mov.f32 %f21, 0f3D372139; fma.rn.f32 %f22, %f20, %f15, %f21; mov.f32 %f23, 0f3D9993DB; fma.rn.f32 %f24, %f22, %f15, %f23; mov.f32 %f25, 0f3E2AAAC6; fma.rn.f32 %f26, %f24, %f15, %f25; mul.f32 %f27, %f26, %f15; fma.rn.f32 %f28, %f27, %f14, %f14; neg.f32 %f29, %f28; selp.f32 %f30, %f28, %f29, %p7; mov.f32 %f31, 0f3FD774EB; mov.f32 %f32, 0f3F6EE581; fma.rn.f32 %f33, %f32, %f31, %f30; setp.gt.f32 %p8, %f1, 0f3F0F5C29; selp.f32 %f34, %f28, %f33, %p8; add.f32 %f35, %f34, %f34; selp.f32 %f36, %f35, %f34, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f36; $L__BB0_2: ret; } ` setTheta_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<9>; .reg .f32 %f<37>; .reg .b32 %r<22>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; neg.f32 %f3, %f2; mov.f32 %f4, 0f3F000000; fma.rn.f32 %f5, %f4, %f3, %f4; rsqrt.approx.ftz.f32 %f6, %f5; mul.f32 %f7, %f5, %f6; mul.f32 %f8, 
%f6, 0f3F000000; neg.f32 %f9, %f7; fma.rn.f32 %f10, %f9, %f8, %f4; fma.rn.f32 %f11, %f7, %f10, %f7; setp.eq.f32 %p6, %f2, 0f3F800000; selp.f32 %f12, 0f00000000, %f11, %p6; setp.gt.f32 %p7, %f2, 0f3F0F5C29; selp.f32 %f13, %f12, %f2, %p7; mov.b32 %r18, %f13; mov.b32 %r19, %f1; and.b32 %r20, %r19, -2147483648; or.b32 %r21, %r20, %r18; mov.b32 %f14, %r21; mul.f32 %f15, %f14, %f14; mov.f32 %f16, 0f3C8B1ABB; mov.f32 %f17, 0f3D10ECEF; fma.rn.f32 %f18, %f17, %f15, %f16; mov.f32 %f19, 0f3CFC028C; fma.rn.f32 %f20, %f18, %f15, %f19; mov.f32 %f21, 0f3D372139; fma.rn.f32 %f22, %f20, %f15, %f21; mov.f32 %f23, 0f3D9993DB; fma.rn.f32 %f24, %f22, %f15, %f23; mov.f32 %f25, 0f3E2AAAC6; fma.rn.f32 %f26, %f24, %f15, %f25; mul.f32 %f27, %f26, %f15; fma.rn.f32 %f28, %f27, %f14, %f14; neg.f32 %f29, %f28; selp.f32 %f30, %f28, %f29, %p7; mov.f32 %f31, 0f3FD774EB; mov.f32 %f32, 0f3F6EE581; fma.rn.f32 %f33, %f32, %f31, %f30; setp.gt.f32 %p8, %f1, 0f3F0F5C29; selp.f32 %f34, %f28, %f33, %p8; add.f32 %f35, %f34, %f34; selp.f32 %f36, %f35, %f34, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f36; $L__BB0_2: ret; } ` setTheta_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<9>; .reg .f32 %f<37>; .reg .b32 %r<22>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; 
setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; neg.f32 %f3, %f2; mov.f32 %f4, 0f3F000000; fma.rn.f32 %f5, %f4, %f3, %f4; rsqrt.approx.ftz.f32 %f6, %f5; mul.f32 %f7, %f5, %f6; mul.f32 %f8, %f6, 0f3F000000; neg.f32 %f9, %f7; fma.rn.f32 %f10, %f9, %f8, %f4; fma.rn.f32 %f11, %f7, %f10, %f7; setp.eq.f32 %p6, %f2, 0f3F800000; selp.f32 %f12, 0f00000000, %f11, %p6; setp.gt.f32 %p7, %f2, 0f3F0F5C29; selp.f32 %f13, %f12, %f2, %p7; mov.b32 %r18, %f13; mov.b32 %r19, %f1; and.b32 %r20, %r19, -2147483648; or.b32 %r21, %r20, %r18; mov.b32 %f14, %r21; mul.f32 %f15, %f14, %f14; mov.f32 %f16, 0f3C8B1ABB; mov.f32 %f17, 0f3D10ECEF; fma.rn.f32 %f18, %f17, %f15, %f16; mov.f32 %f19, 0f3CFC028C; fma.rn.f32 %f20, %f18, %f15, %f19; mov.f32 %f21, 0f3D372139; fma.rn.f32 %f22, %f20, %f15, %f21; mov.f32 %f23, 0f3D9993DB; fma.rn.f32 %f24, %f22, %f15, %f23; mov.f32 %f25, 0f3E2AAAC6; fma.rn.f32 %f26, %f24, %f15, %f25; mul.f32 %f27, %f26, %f15; fma.rn.f32 %f28, %f27, %f14, %f14; neg.f32 %f29, %f28; selp.f32 %f30, %f28, %f29, %p7; mov.f32 %f31, 0f3FD774EB; mov.f32 %f32, 0f3F6EE581; fma.rn.f32 %f33, %f32, %f31, %f30; setp.gt.f32 %p8, %f1, 0f3F0F5C29; selp.f32 %f34, %f28, %f33, %p8; add.f32 %f35, %f34, %f34; selp.f32 %f36, %f35, %f34, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f36; $L__BB0_2: ret; } ` setTheta_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<9>; .reg .f32 %f<37>; .reg .b32 %r<22>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, 
[setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; neg.f32 %f3, %f2; mov.f32 %f4, 0f3F000000; fma.rn.f32 %f5, %f4, %f3, %f4; rsqrt.approx.ftz.f32 %f6, %f5; mul.f32 %f7, %f5, %f6; mul.f32 %f8, %f6, 0f3F000000; neg.f32 %f9, %f7; fma.rn.f32 %f10, %f9, %f8, %f4; fma.rn.f32 %f11, %f7, %f10, %f7; setp.eq.f32 %p6, %f2, 0f3F800000; selp.f32 %f12, 0f00000000, %f11, %p6; setp.gt.f32 %p7, %f2, 0f3F0F5C29; selp.f32 %f13, %f12, %f2, %p7; mov.b32 %r18, %f13; mov.b32 %r19, %f1; and.b32 %r20, %r19, -2147483648; or.b32 %r21, %r20, %r18; mov.b32 %f14, %r21; mul.f32 %f15, %f14, %f14; mov.f32 %f16, 0f3C8B1ABB; mov.f32 %f17, 0f3D10ECEF; fma.rn.f32 %f18, %f17, %f15, %f16; mov.f32 %f19, 0f3CFC028C; fma.rn.f32 %f20, %f18, %f15, %f19; mov.f32 %f21, 0f3D372139; fma.rn.f32 %f22, %f20, %f15, %f21; mov.f32 %f23, 0f3D9993DB; fma.rn.f32 %f24, %f22, %f15, %f23; mov.f32 %f25, 0f3E2AAAC6; fma.rn.f32 %f26, %f24, %f15, %f25; mul.f32 %f27, %f26, %f15; fma.rn.f32 %f28, %f27, %f14, %f14; neg.f32 %f29, %f28; selp.f32 %f30, %f28, %f29, %p7; mov.f32 %f31, 0f3FD774EB; mov.f32 %f32, 0f3F6EE581; fma.rn.f32 %f33, %f32, %f31, %f30; setp.gt.f32 %p8, %f1, 0f3F0F5C29; selp.f32 %f34, %f28, %f33, %p8; add.f32 %f35, %f34, %f34; selp.f32 %f36, %f35, %f34, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 
%rd7, %rd6, %rd4; st.global.f32 [%rd7], %f36; $L__BB0_2: ret; } ` setTheta_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<9>; .reg .f32 %f<37>; .reg .b32 %r<22>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; neg.f32 %f3, %f2; mov.f32 %f4, 0f3F000000; fma.rn.f32 %f5, %f4, %f3, %f4; rsqrt.approx.ftz.f32 %f6, %f5; mul.f32 %f7, %f5, %f6; mul.f32 %f8, %f6, 0f3F000000; neg.f32 %f9, %f7; fma.rn.f32 %f10, %f9, %f8, %f4; fma.rn.f32 %f11, %f7, %f10, %f7; setp.eq.f32 %p6, %f2, 0f3F800000; selp.f32 %f12, 0f00000000, %f11, %p6; setp.gt.f32 %p7, %f2, 0f3F0F5C29; selp.f32 %f13, %f12, %f2, %p7; mov.b32 %r18, %f13; mov.b32 %r19, %f1; and.b32 %r20, %r19, -2147483648; or.b32 %r21, %r20, %r18; mov.b32 %f14, %r21; mul.f32 %f15, %f14, %f14; mov.f32 %f16, 0f3C8B1ABB; mov.f32 %f17, 0f3D10ECEF; fma.rn.f32 %f18, %f17, %f15, %f16; mov.f32 %f19, 0f3CFC028C; fma.rn.f32 %f20, %f18, %f15, %f19; mov.f32 %f21, 0f3D372139; fma.rn.f32 %f22, %f20, %f15, %f21; mov.f32 %f23, 0f3D9993DB; 
fma.rn.f32 %f24, %f22, %f15, %f23; mov.f32 %f25, 0f3E2AAAC6; fma.rn.f32 %f26, %f24, %f15, %f25; mul.f32 %f27, %f26, %f15; fma.rn.f32 %f28, %f27, %f14, %f14; neg.f32 %f29, %f28; selp.f32 %f30, %f28, %f29, %p7; mov.f32 %f31, 0f3FD774EB; mov.f32 %f32, 0f3F6EE581; fma.rn.f32 %f33, %f32, %f31, %f30; setp.gt.f32 %p8, %f1, 0f3F0F5C29; selp.f32 %f34, %f28, %f33, %p8; add.f32 %f35, %f34, %f34; selp.f32 %f36, %f35, %f34, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f36; $L__BB0_2: ret; } ` setTheta_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<9>; .reg .f32 %f<37>; .reg .b32 %r<22>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; neg.f32 %f3, %f2; mov.f32 %f4, 0f3F000000; fma.rn.f32 %f5, %f4, %f3, %f4; rsqrt.approx.ftz.f32 %f6, %f5; mul.f32 %f7, %f5, %f6; mul.f32 %f8, %f6, 0f3F000000; neg.f32 %f9, %f7; fma.rn.f32 %f10, %f9, %f8, %f4; fma.rn.f32 %f11, %f7, %f10, %f7; setp.eq.f32 %p6, %f2, 0f3F800000; selp.f32 %f12, 
0f00000000, %f11, %p6; setp.gt.f32 %p7, %f2, 0f3F0F5C29; selp.f32 %f13, %f12, %f2, %p7; mov.b32 %r18, %f13; mov.b32 %r19, %f1; and.b32 %r20, %r19, -2147483648; or.b32 %r21, %r20, %r18; mov.b32 %f14, %r21; mul.f32 %f15, %f14, %f14; mov.f32 %f16, 0f3C8B1ABB; mov.f32 %f17, 0f3D10ECEF; fma.rn.f32 %f18, %f17, %f15, %f16; mov.f32 %f19, 0f3CFC028C; fma.rn.f32 %f20, %f18, %f15, %f19; mov.f32 %f21, 0f3D372139; fma.rn.f32 %f22, %f20, %f15, %f21; mov.f32 %f23, 0f3D9993DB; fma.rn.f32 %f24, %f22, %f15, %f23; mov.f32 %f25, 0f3E2AAAC6; fma.rn.f32 %f26, %f24, %f15, %f25; mul.f32 %f27, %f26, %f15; fma.rn.f32 %f28, %f27, %f14, %f14; neg.f32 %f29, %f28; selp.f32 %f30, %f28, %f29, %p7; mov.f32 %f31, 0f3FD774EB; mov.f32 %f32, 0f3F6EE581; fma.rn.f32 %f33, %f32, %f31, %f30; setp.gt.f32 %p8, %f1, 0f3F0F5C29; selp.f32 %f34, %f28, %f33, %p8; add.f32 %f35, %f34, %f34; selp.f32 %f36, %f35, %f34, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f36; $L__BB0_2: ret; } ` setTheta_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<9>; .reg .f32 %f<37>; .reg .b32 %r<22>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; 
mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; neg.f32 %f3, %f2; mov.f32 %f4, 0f3F000000; fma.rn.f32 %f5, %f4, %f3, %f4; rsqrt.approx.ftz.f32 %f6, %f5; mul.f32 %f7, %f5, %f6; mul.f32 %f8, %f6, 0f3F000000; neg.f32 %f9, %f7; fma.rn.f32 %f10, %f9, %f8, %f4; fma.rn.f32 %f11, %f7, %f10, %f7; setp.eq.f32 %p6, %f2, 0f3F800000; selp.f32 %f12, 0f00000000, %f11, %p6; setp.gt.f32 %p7, %f2, 0f3F0F5C29; selp.f32 %f13, %f12, %f2, %p7; mov.b32 %r18, %f13; mov.b32 %r19, %f1; and.b32 %r20, %r19, -2147483648; or.b32 %r21, %r20, %r18; mov.b32 %f14, %r21; mul.f32 %f15, %f14, %f14; mov.f32 %f16, 0f3C8B1ABB; mov.f32 %f17, 0f3D10ECEF; fma.rn.f32 %f18, %f17, %f15, %f16; mov.f32 %f19, 0f3CFC028C; fma.rn.f32 %f20, %f18, %f15, %f19; mov.f32 %f21, 0f3D372139; fma.rn.f32 %f22, %f20, %f15, %f21; mov.f32 %f23, 0f3D9993DB; fma.rn.f32 %f24, %f22, %f15, %f23; mov.f32 %f25, 0f3E2AAAC6; fma.rn.f32 %f26, %f24, %f15, %f25; mul.f32 %f27, %f26, %f15; fma.rn.f32 %f28, %f27, %f14, %f14; neg.f32 %f29, %f28; selp.f32 %f30, %f28, %f29, %p7; mov.f32 %f31, 0f3FD774EB; mov.f32 %f32, 0f3F6EE581; fma.rn.f32 %f33, %f32, %f31, %f30; setp.gt.f32 %p8, %f1, 0f3F0F5C29; selp.f32 %f34, %f28, %f33, %p8; add.f32 %f35, %f34, %f34; selp.f32 %f36, %f35, %f34, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f36; $L__BB0_2: ret; } ` setTheta_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<9>; .reg .f32 %f<37>; .reg .b32 %r<22>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; 
mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; neg.f32 %f3, %f2; mov.f32 %f4, 0f3F000000; fma.rn.f32 %f5, %f4, %f3, %f4; rsqrt.approx.ftz.f32 %f6, %f5; mul.f32 %f7, %f5, %f6; mul.f32 %f8, %f6, 0f3F000000; neg.f32 %f9, %f7; fma.rn.f32 %f10, %f9, %f8, %f4; fma.rn.f32 %f11, %f7, %f10, %f7; setp.eq.f32 %p6, %f2, 0f3F800000; selp.f32 %f12, 0f00000000, %f11, %p6; setp.gt.f32 %p7, %f2, 0f3F0F5C29; selp.f32 %f13, %f12, %f2, %p7; mov.b32 %r18, %f13; mov.b32 %r19, %f1; and.b32 %r20, %r19, -2147483648; or.b32 %r21, %r20, %r18; mov.b32 %f14, %r21; mul.f32 %f15, %f14, %f14; mov.f32 %f16, 0f3C8B1ABB; mov.f32 %f17, 0f3D10ECEF; fma.rn.f32 %f18, %f17, %f15, %f16; mov.f32 %f19, 0f3CFC028C; fma.rn.f32 %f20, %f18, %f15, %f19; mov.f32 %f21, 0f3D372139; fma.rn.f32 %f22, %f20, %f15, %f21; mov.f32 %f23, 0f3D9993DB; fma.rn.f32 %f24, %f22, %f15, %f23; mov.f32 %f25, 0f3E2AAAC6; fma.rn.f32 %f26, %f24, %f15, %f25; mul.f32 %f27, %f26, %f15; fma.rn.f32 %f28, %f27, %f14, %f14; neg.f32 %f29, %f28; selp.f32 %f30, %f28, %f29, %p7; mov.f32 %f31, 0f3FD774EB; mov.f32 %f32, 0f3F6EE581; fma.rn.f32 %f33, %f32, %f31, %f30; setp.gt.f32 %p8, %f1, 0f3F0F5C29; selp.f32 %f34, %f28, %f33, %p8; add.f32 %f35, %f34, %f34; selp.f32 %f36, %f35, %f34, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f36; $L__BB0_2: ret; } ` setTheta_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl setTheta .visible 
.entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<9>; .reg .f32 %f<37>; .reg .b32 %r<22>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; neg.f32 %f3, %f2; mov.f32 %f4, 0f3F000000; fma.rn.f32 %f5, %f4, %f3, %f4; rsqrt.approx.ftz.f32 %f6, %f5; mul.f32 %f7, %f5, %f6; mul.f32 %f8, %f6, 0f3F000000; neg.f32 %f9, %f7; fma.rn.f32 %f10, %f9, %f8, %f4; fma.rn.f32 %f11, %f7, %f10, %f7; setp.eq.f32 %p6, %f2, 0f3F800000; selp.f32 %f12, 0f00000000, %f11, %p6; setp.gt.f32 %p7, %f2, 0f3F0F5C29; selp.f32 %f13, %f12, %f2, %p7; mov.b32 %r18, %f13; mov.b32 %r19, %f1; and.b32 %r20, %r19, -2147483648; or.b32 %r21, %r20, %r18; mov.b32 %f14, %r21; mul.f32 %f15, %f14, %f14; mov.f32 %f16, 0f3C8B1ABB; mov.f32 %f17, 0f3D10ECEF; fma.rn.f32 %f18, %f17, %f15, %f16; mov.f32 %f19, 0f3CFC028C; fma.rn.f32 %f20, %f18, %f15, %f19; mov.f32 %f21, 0f3D372139; fma.rn.f32 %f22, %f20, %f15, %f21; mov.f32 %f23, 0f3D9993DB; fma.rn.f32 %f24, %f22, %f15, %f23; mov.f32 %f25, 0f3E2AAAC6; fma.rn.f32 %f26, %f24, %f15, %f25; mul.f32 %f27, %f26, %f15; fma.rn.f32 %f28, %f27, %f14, %f14; 
neg.f32 %f29, %f28; selp.f32 %f30, %f28, %f29, %p7; mov.f32 %f31, 0f3FD774EB; mov.f32 %f32, 0f3F6EE581; fma.rn.f32 %f33, %f32, %f31, %f30; setp.gt.f32 %p8, %f1, 0f3F0F5C29; selp.f32 %f34, %f28, %f33, %p8; add.f32 %f35, %f34, %f34; selp.f32 %f36, %f35, %f34, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f36; $L__BB0_2: ret; } ` setTheta_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<9>; .reg .f32 %f<37>; .reg .b32 %r<22>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r11, %r10, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r14, %r13, %r15; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; neg.f32 %f3, %f2; mov.f32 %f4, 0f3F000000; fma.rn.f32 %f5, %f4, %f3, %f4; rsqrt.approx.ftz.f32 %f6, %f5; mul.f32 %f7, %f5, %f6; mul.f32 %f8, %f6, 0f3F000000; neg.f32 %f9, %f7; fma.rn.f32 %f10, %f9, %f8, %f4; fma.rn.f32 %f11, %f7, %f10, %f7; setp.eq.f32 %p6, %f2, 0f3F800000; selp.f32 %f12, 0f00000000, %f11, %p6; setp.gt.f32 %p7, %f2, 0f3F0F5C29; selp.f32 %f13, %f12, %f2, %p7; mov.b32 %r18, %f13; mov.b32 %r19, %f1; and.b32 %r20, %r19, -2147483648; or.b32 
%r21, %r20, %r18; mov.b32 %f14, %r21; mul.f32 %f15, %f14, %f14; mov.f32 %f16, 0f3C8B1ABB; mov.f32 %f17, 0f3D10ECEF; fma.rn.f32 %f18, %f17, %f15, %f16; mov.f32 %f19, 0f3CFC028C; fma.rn.f32 %f20, %f18, %f15, %f19; mov.f32 %f21, 0f3D372139; fma.rn.f32 %f22, %f20, %f15, %f21; mov.f32 %f23, 0f3D9993DB; fma.rn.f32 %f24, %f22, %f15, %f23; mov.f32 %f25, 0f3E2AAAC6; fma.rn.f32 %f26, %f24, %f15, %f25; mul.f32 %f27, %f26, %f15; fma.rn.f32 %f28, %f27, %f14, %f14; neg.f32 %f29, %f28; selp.f32 %f30, %f28, %f29, %p7; mov.f32 %f31, 0f3FD774EB; mov.f32 %f32, 0f3F6EE581; fma.rn.f32 %f33, %f32, %f31, %f30; setp.gt.f32 %p8, %f1, 0f3F0F5C29; selp.f32 %f34, %f28, %f33, %p8; add.f32 %f35, %f34, %f34; selp.f32 %f36, %f35, %f34, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f36; $L__BB0_2: ret; } ` ) 3-3.11.1/cuda/topologicalcharge.cu000066400000000000000000000125541503346766200167150ustar00rootroot00000000000000#include #include #include "exchange.h" #include "float3.h" #include "stencil.h" // Set s to the topological charge density. // See topologicalcharge.go. 
// Kernel: writes the topological charge density of the magnetization
// (mx, my, mz) into s, one thread per cell:
//
//     s[I] = icxcy * m0 · (∂m/∂x ❌ ∂m/∂y)
//
// Parameters
//   s          output, one float per cell
//   mx,my,mz   magnetization components, read-only here
//   icxcy      scale factor applied to the result; the Go caller
//              (SetTopologicalCharge in topologicalcharge.go) passes
//              1/(cellsize_x * cellsize_y)
//   Nx,Ny,Nz   grid size; threads with an index outside it return at once
//   PBC        periodic-boundary code; it is only consumed through the
//              PBCx / PBCy and l/hclampx, l/hclampy macros, which are not
//              defined in this file (presumably stencil.h -- confirm)
//
// The derivatives are taken in units of the cell index (there is no division
// by the cell size inside the stencils); the physical scaling is applied once
// at the end through icxcy.
//
// Only x and y neighbors are visited; iz is used solely to address the layer.
//
// A neighbor whose m is the zero vector is treated as "missing". That covers
// neighbors that were not loaded because they fall outside a non-periodic
// grid, and any loaded cell whose m happens to be zero (presumably cells
// outside the geometry -- confirm). The stencil is then chosen from the set of
// available neighbors. In the pattern strings of the branch comments, the
// five characters stand for the cells -2,-1,0,+1,+2; '1' means non-zero m and
// '-' means missing.
extern "C" __global__ void
settopologicalcharge(float* __restrict__ s,
                     float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
                     float icxcy, int Nx, int Ny, int Nz, uint8_t PBC)
{
    // Global 3D cell coordinates of this thread.
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    // The launch grid may overshoot the mesh; surplus threads do nothing.
    if (ix >= Nx || iy >= Ny || iz >= Nz)
    {
        return;
    }

    int I = idx(ix, iy, iz);                            // central cell index
    float3 m0 = make_float3(mx[I], my[I], mz[I]);       // +0
    float3 dmdx = make_float3(0.0f, 0.0f, 0.0f);        // ∂m/∂x
    float3 dmdy = make_float3(0.0f, 0.0f, 0.0f);        // ∂m/∂y
    // NOTE(review): the literals below are double (0.0), unlike the 0.0f used
    // everywhere else; the value is overwritten before use, so it is harmless.
    float3 dmdx_x_dmdy = make_float3(0.0, 0.0, 0.0);    // ∂m/∂x ❌ ∂m/∂y
    int i_;                                             // neighbor index

    // A cell without magnetization carries no charge.
    if(is0(m0))
    {
        s[I] = 0.0f;
        return;
    }

    // x derivatives (along length)
    {
        // Each neighbor index is clamped/wrapped first so that i_ is always a
        // valid address; the value is only loaded when the neighbor really
        // exists (inside the grid, or periodic in x).
        float3 m_m2 = make_float3(0.0f, 0.0f, 0.0f);    // -2
        i_ = idx(lclampx(ix-2), iy, iz);                // load neighbor m if inside grid, keep 0 otherwise
        if (ix-2 >= 0 || PBCx)
        {
            m_m2 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_m1 = make_float3(0.0f, 0.0f, 0.0f);    // -1
        i_ = idx(lclampx(ix-1), iy, iz);                // load neighbor m if inside grid, keep 0 otherwise
        if (ix-1 >= 0 || PBCx)
        {
            m_m1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_p1 = make_float3(0.0f, 0.0f, 0.0f);    // +1
        i_ = idx(hclampx(ix+1), iy, iz);
        if (ix+1 < Nx || PBCx)
        {
            m_p1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_p2 = make_float3(0.0f, 0.0f, 0.0f);    // +2
        i_ = idx(hclampx(ix+2), iy, iz);
        if (ix+2 < Nx || PBCx)
        {
            m_p2 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        // Pick the stencil. The branches are order-dependent: each one relies
        // on the earlier ones having been rejected.
        if (is0(m_p1) && is0(m_m1))                                         // +0
        {
            dmdx = make_float3(0.0f, 0.0f, 0.0f);                           // --1-- zero
        }
        // NOTE(review): '|' is a bitwise OR of the two is0() results. It gives
        // the same truth value as '||' provided is0() returns bool/0-1 (its
        // definition is not in this file -- confirm).
        else if ((is0(m_m2) | is0(m_p2)) && !is0(m_p1) && !is0(m_m1))
        {
            dmdx = 0.5f * (m_p1 - m_m1);                                    // -111-, 1111-, -1111 central difference, ε ~ h^2
        }
        else if (is0(m_p1) && is0(m_m2))
        {
            dmdx = m0 - m_m1;                                               // -11-- backward difference, ε ~ h^1
        }
        else if (is0(m_m1) && is0(m_p2))
        {
            dmdx = -m0 + m_p1;                                              // --11- forward difference, ε ~ h^1
        }
        else if (!is0(m_m2) && is0(m_p1))
        {
            dmdx = 0.5f * m_m2 - 2.0f * m_m1 + 1.5f * m0;                   // 111-- backward difference, ε ~ h^2
        }
        else if (!is0(m_p2) && is0(m_m1))
        {
            dmdx = -0.5f * m_p2 + 2.0f * m_p1 - 1.5f * m0;                  // --111 forward difference, ε ~ h^2
        }
        else
        {
            dmdx = (2.0f/3.0f)*(m_p1 - m_m1) + (1.0f/12.0f)*(m_m2 - m_p2);  // 11111 central difference, ε ~ h^4
        }
    }

    // y derivatives (along height)
    // Same neighbor loading and stencil selection as for x, applied to iy.
    {
        float3 m_m2 = make_float3(0.0f, 0.0f, 0.0f);    // -2
        i_ = idx(ix, lclampy(iy-2), iz);
        if (iy-2 >= 0 || PBCy)
        {
            m_m2 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_m1 = make_float3(0.0f, 0.0f, 0.0f);    // -1
        i_ = idx(ix, lclampy(iy-1), iz);
        if (iy-1 >= 0 || PBCy)
        {
            m_m1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_p1 = make_float3(0.0f, 0.0f, 0.0f);    // +1
        i_ = idx(ix, hclampy(iy+1), iz);
        if (iy+1 < Ny || PBCy)
        {
            m_p1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_p2 = make_float3(0.0f, 0.0f, 0.0f);    // +2
        i_ = idx(ix, hclampy(iy+2), iz);
        if (iy+2 < Ny || PBCy)
        {
            m_p2 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        if (is0(m_p1) && is0(m_m1))                                         // +0
        {
            dmdy = make_float3(0.0f, 0.0f, 0.0f);                           // --1-- zero
        }
        else if ((is0(m_m2) | is0(m_p2)) && !is0(m_p1) && !is0(m_m1))
        {
            dmdy = 0.5f * (m_p1 - m_m1);                                    // -111-, 1111-, -1111 central difference, ε ~ h^2
        }
        else if (is0(m_p1) && is0(m_m2))
        {
            dmdy = m0 - m_m1;                                               // -11-- backward difference, ε ~ h^1
        }
        else if (is0(m_m1) && is0(m_p2))
        {
            dmdy = -m0 + m_p1;                                              // --11- forward difference, ε ~ h^1
        }
        else if (!is0(m_m2) && is0(m_p1))
        {
            dmdy = 0.5f * m_m2 - 2.0f * m_m1 + 1.5f * m0;                   // 111-- backward difference, ε ~ h^2
        }
        else if (!is0(m_p2) && is0(m_m1))
        {
            dmdy = -0.5f * m_p2 + 2.0f * m_p1 - 1.5f * m0;                  // --111 forward difference, ε ~ h^2
        }
        else
        {
            dmdy = (2.0f/3.0f)*(m_p1 - m_m1) + (1.0f/12.0f)*(m_m2 - m_p2);  // 11111 central difference, ε ~ h^4
        }
    }

    // Triple product m · (∂m/∂x ❌ ∂m/∂y), scaled by icxcy.
    dmdx_x_dmdy = cross(dmdx, dmdy);

    s[I] = icxcy * dot(m0, dmdx_x_dmdy);
}
3-3.11.1/cuda/topologicalcharge.go000066400000000000000000000010431503346766200167020ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Set s to the topological charge density s = m · (∂m/∂x ❌
∂m/∂y)
// See topologicalcharge.cu
//
// SetTopologicalCharge launches the settopologicalcharge kernel over the whole
// grid of s. The grid size N is taken from s, and m must have the same size
// (checked with util.Argument). The kernel receives the single device pointer
// of s (component X), the three component pointers of m, the factor
// 1/(cellsize[X]*cellsize[Y]) converted to float32, the grid size and the
// PBC code of mesh.
//
// The launch goes through the asynchronous wrapper, so unless the package-level
// Synchronous flag is set the call does not wait for the kernel to finish.
func SetTopologicalCharge(s *data.Slice, m *data.Slice, mesh *data.Mesh) {
	cellsize := mesh.CellSize()
	N := s.Size()
	util.Argument(m.Size() == N)
	cfg := make3DConf(N)
	// Inverse cell area in the xy plane; the kernel multiplies its result by this.
	icxcy := float32(1.0 / (cellsize[X] * cellsize[Y]))

	k_settopologicalcharge_async(s.DevPtr(X), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), icxcy, N[X], N[Y], N[Z], mesh.PBC_code(), cfg)
}
3-3.11.1/cuda/topologicalcharge_wrapper.go000066400000000000000000006623711503346766200204630ustar00rootroot00000000000000package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for settopologicalcharge kernel
// It stays zero until the first launch, which loads it (see
// k_settopologicalcharge_async).
var settopologicalcharge_code cu.Function

// Stores the arguments for settopologicalcharge kernel invocation
// The arg_* fields hold the values of one launch; argptr holds the addresses
// of those fields in kernel-parameter order. The embedded mutex guards the
// struct, since a single package-level instance is shared by all callers.
//
// NOTE(review): the kernel declares Nx, Ny, Nz as 32-bit parameters (int in the
// .cu file, .param .u32 in the generated PTX) while the fields here are Go int.
// Presumably only the low 4 bytes behind each pointer are read at launch, which
// is correct on little-endian hosts only -- confirm.
type settopologicalcharge_args_t struct {
	arg_s     unsafe.Pointer
	arg_mx    unsafe.Pointer
	arg_my    unsafe.Pointer
	arg_mz    unsafe.Pointer
	arg_icxcy float32
	arg_Nx    int
	arg_Ny    int
	arg_Nz    int
	arg_PBC   byte
	argptr    [9]unsafe.Pointer
	sync.Mutex
}

// Stores the arguments for settopologicalcharge kernel invocation
var settopologicalcharge_args settopologicalcharge_args_t

func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	// The order must match the kernel's parameter list (s, mx, my, mz, icxcy,
	// Nx, Ny, Nz, PBC).
	settopologicalcharge_args.argptr[0] = unsafe.Pointer(&settopologicalcharge_args.arg_s)
	settopologicalcharge_args.argptr[1] = unsafe.Pointer(&settopologicalcharge_args.arg_mx)
	settopologicalcharge_args.argptr[2] = unsafe.Pointer(&settopologicalcharge_args.arg_my)
	settopologicalcharge_args.argptr[3] = unsafe.Pointer(&settopologicalcharge_args.arg_mz)
	settopologicalcharge_args.argptr[4] = unsafe.Pointer(&settopologicalcharge_args.arg_icxcy)
	settopologicalcharge_args.argptr[5] = unsafe.Pointer(&settopologicalcharge_args.arg_Nx)
	settopologicalcharge_args.argptr[6] = unsafe.Pointer(&settopologicalcharge_args.arg_Ny)
	settopologicalcharge_args.argptr[7] = unsafe.Pointer(&settopologicalcharge_args.arg_Nz)
	settopologicalcharge_args.argptr[8] = unsafe.Pointer(&settopologicalcharge_args.arg_PBC)
}

// Wrapper for settopologicalcharge CUDA kernel, asynchronous.
// It copies the arguments into the shared argument struct and launches the
// kernel on stream0 with the grid and block dimensions from cfg. The struct's
// mutex is held for the duration of the call. When the package-level
// Synchronous flag is set, the call additionally syncs before and after the
// launch and brackets it with timer.Start / timer.Stop.
func k_settopologicalcharge_async(s unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, icxcy float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("settopologicalcharge")
	}

	settopologicalcharge_args.Lock()
	defer settopologicalcharge_args.Unlock()

	// Load the kernel on first use.
	if settopologicalcharge_code == 0 {
		settopologicalcharge_code = fatbinLoad(settopologicalcharge_map, "settopologicalcharge")
	}

	// Store this call's values where argptr points.
	settopologicalcharge_args.arg_s = s
	settopologicalcharge_args.arg_mx = mx
	settopologicalcharge_args.arg_my = my
	settopologicalcharge_args.arg_mz = mz
	settopologicalcharge_args.arg_icxcy = icxcy
	settopologicalcharge_args.arg_Nx = Nx
	settopologicalcharge_args.arg_Ny = Ny
	settopologicalcharge_args.arg_Nz = Nz
	settopologicalcharge_args.arg_PBC = PBC

	args := settopologicalcharge_args.argptr[:]
	cu.LaunchKernel(settopologicalcharge_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("settopologicalcharge")
	}
}

// maps compute capability on PTX code for settopologicalcharge kernel.
var settopologicalcharge_map = map[int]string{0: "", 50: settopologicalcharge_ptx_50, 52: settopologicalcharge_ptx_52, 53: settopologicalcharge_ptx_53, 60: settopologicalcharge_ptx_60, 61: settopologicalcharge_ptx_61, 62: settopologicalcharge_ptx_62, 70: settopologicalcharge_ptx_70, 72: settopologicalcharge_ptx_72, 75: settopologicalcharge_ptx_75, 80: settopologicalcharge_ptx_80, 86: settopologicalcharge_ptx_86, 87: settopologicalcharge_ptx_87, 89: settopologicalcharge_ptx_89, 90: settopologicalcharge_ptx_90} // settopologicalcharge PTX code for various compute capabilities. const ( settopologicalcharge_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<79>; .reg .b16 %rs<4>; .reg .f32 %f<315>; .reg .b32 %r<93>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalcharge_param_8]; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f138, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r44, %r43, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r47, %r46, %r48; mov.u32 %r49, %ntid.z; mov.u32 
%r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r50, %r49, %r51; setp.ge.s32 %p1, %r1, %r40; setp.ge.s32 %p2, %r2, %r41; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r42; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_72; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd9, %r53, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f139, %f2, %f2; fma.rn.f32 %f140, %f1, %f1, %f139; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f141, %f3, %f3, %f140; setp.eq.f32 %p6, %f141, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p6 bra $L__BB0_71; bra.uni $L__BB0_2; $L__BB0_71: mov.u32 %r84, 0; st.global.u32 [%rd4], %r84; bra.uni $L__BB0_72; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r85, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r85, %r55, %r40; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r56, %r85, %r5; mul.wide.s32 %rd14, %r56, 4; add.s64 %rd15, %rd3, %rd14; add.s64 %rd16, %rd2, %rd14; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; ld.global.nc.f32 %f8, [%rd16]; ld.global.nc.f32 %f7, [%rd15]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r86, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r40; add.s32 %r58, %r57, %r40; rem.s32 %r86, %r58, %r40; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r59, %r86, %r5; mul.wide.s32 %rd18, %r59, 4; add.s64 %rd19, %rd3, %rd18; add.s64 %rd20, %rd2, %rd18; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f38, 
[%rd21]; ld.global.nc.f32 %f39, [%rd20]; ld.global.nc.f32 %f40, [%rd19]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r62, %r40, -1; min.s32 %r87, %r14, %r62; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r60, %r14, %r40; add.s32 %r61, %r60, %r40; rem.s32 %r87, %r61, %r40; $L__BB0_15: setp.ge.s32 %p16, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r63, %r87, %r5; mul.wide.s32 %rd22, %r63, 4; add.s64 %rd23, %rd3, %rd22; add.s64 %rd24, %rd2, %rd22; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; ld.global.nc.f32 %f20, [%rd24]; ld.global.nc.f32 %f19, [%rd23]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r66, %r40, -1; min.s32 %r88, %r18, %r66; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r64, %r18, %r40; add.s32 %r65, %r64, %r40; rem.s32 %r88, %r65, %r40; $L__BB0_20: add.s32 %r22, %r88, %r5; setp.ge.s32 %p20, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; add.s64 %rd28, %rd2, %rd26; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; ld.global.nc.f32 %f26, [%rd28]; ld.global.nc.f32 %f25, [%rd27]; $L__BB0_22: mul.f32 %f154, %f20, %f20; fma.rn.f32 %f155, %f19, %f19, %f154; fma.rn.f32 %f31, %f21, %f21, %f155; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f159, %f39, %f39; fma.rn.f32 %f160, %f40, %f40, %f159; fma.rn.f32 %f161, %f38, %f38, %f160; setp.eq.f32 %p24, %f161, 0f00000000; mov.f32 %f294, 0f00000000; mov.f32 %f295, %f294; mov.f32 %f296, %f294; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f162, %f8, %f8; fma.rn.f32 %f163, %f7, %f7, %f162; fma.rn.f32 %f44, %f9, %f9, %f163; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f164, %f26, %f26; fma.rn.f32 %f165, %f25, %f25, %f164; fma.rn.f32 
%f48, %f27, %f27, %f165; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f166, %f39, %f39; fma.rn.f32 %f167, %f40, %f40, %f166; fma.rn.f32 %f168, %f38, %f38, %f167; setp.neu.f32 %p30, %f168, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f201, %f19, %f40; sub.f32 %f202, %f20, %f39; sub.f32 %f203, %f21, %f38; mul.f32 %f296, %f203, 0f3F000000; mul.f32 %f295, %f202, 0f3F000000; mul.f32 %f294, %f201, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f296, %f3, %f38; sub.f32 %f295, %f2, %f39; sub.f32 %f294, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f169, %f39, %f39; fma.rn.f32 %f170, %f40, %f40, %f169; fma.rn.f32 %f49, %f38, %f38, %f170; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f296, %f21, %f3; sub.f32 %f295, %f20, %f2; sub.f32 %f294, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f192, %f19, %f40; sub.f32 %f193, %f20, %f39; sub.f32 %f194, %f21, %f38; sub.f32 %f195, %f7, %f25; mul.f32 %f196, %f195, 0f3DAAAAAB; sub.f32 %f197, %f8, %f26; mul.f32 %f198, %f197, 0f3DAAAAAB; sub.f32 %f199, %f9, %f27; mul.f32 %f200, %f199, 0f3DAAAAAB; fma.rn.f32 %f296, %f194, 0f3F2AAAAB, %f200; fma.rn.f32 %f295, %f193, 0f3F2AAAAB, %f198; fma.rn.f32 %f294, %f192, 0f3F2AAAAB, %f196; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f171, %f7, 0f3F000000; add.f32 %f172, %f40, %f40; sub.f32 %f173, %f171, %f172; add.f32 %f174, %f39, %f39; mul.f32 %f175, %f8, 0f3F000000; sub.f32 %f176, %f175, %f174; add.f32 %f177, %f38, %f38; 
mul.f32 %f178, %f9, 0f3F000000; sub.f32 %f179, %f178, %f177; fma.rn.f32 %f296, %f3, 0f3FC00000, %f179; fma.rn.f32 %f295, %f2, 0f3FC00000, %f176; fma.rn.f32 %f294, %f1, 0f3FC00000, %f173; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f180, %f25, 0f3F000000; add.f32 %f181, %f19, %f19; sub.f32 %f182, %f181, %f180; add.f32 %f183, %f20, %f20; mul.f32 %f184, %f26, 0f3F000000; sub.f32 %f185, %f183, %f184; add.f32 %f186, %f21, %f21; mul.f32 %f187, %f27, 0f3F000000; sub.f32 %f188, %f186, %f187; mul.f32 %f189, %f1, 0f3FC00000; mul.f32 %f190, %f2, 0f3FC00000; mul.f32 %f191, %f3, 0f3FC00000; sub.f32 %f296, %f188, %f191; sub.f32 %f295, %f185, %f190; sub.f32 %f294, %f182, %f189; $L__BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r89, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r67, %r23, %r41; add.s32 %r68, %r67, %r41; rem.s32 %r89, %r68, %r41; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r69, %r89, %r4; mad.lo.s32 %r70, %r69, %r40, %r1; mul.wide.s32 %rd30, %r70, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f76, [%rd33]; ld.global.nc.f32 %f75, [%rd32]; ld.global.nc.f32 %f74, [%rd31]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r90, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r71, %r27, %r41; add.s32 %r72, %r71, %r41; rem.s32 %r90, %r72, %r41; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r73, %r90, %r4; mad.lo.s32 %r74, %r73, %r40, %r1; mul.wide.s32 %rd34, %r74, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f105, [%rd37]; ld.global.nc.f32 %f106, [%rd36]; ld.global.nc.f32 
%f107, [%rd35]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r77, %r41, -1; min.s32 %r91, %r31, %r77; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r75, %r31, %r41; add.s32 %r76, %r75, %r41; rem.s32 %r91, %r76, %r41; $L__BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r78, %r91, %r4; mad.lo.s32 %r79, %r78, %r40, %r1; mul.wide.s32 %rd38, %r79, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f88, [%rd41]; ld.global.nc.f32 %f87, [%rd40]; ld.global.nc.f32 %f86, [%rd39]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r82, %r41, -1; min.s32 %r92, %r35, %r82; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r80, %r35, %r41; add.s32 %r81, %r80, %r41; rem.s32 %r92, %r81, %r41; $L__BB0_54: add.s32 %r83, %r92, %r4; mad.lo.s32 %r39, %r83, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f94, [%rd45]; ld.global.nc.f32 %f93, [%rd44]; ld.global.nc.f32 %f92, [%rd43]; $L__BB0_56: mul.f32 %f216, %f87, %f87; fma.rn.f32 %f217, %f86, %f86, %f216; fma.rn.f32 %f98, %f88, %f88, %f217; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f221, %f106, %f106; fma.rn.f32 %f222, %f107, %f107, %f221; fma.rn.f32 %f223, %f105, %f105, %f222; setp.eq.f32 %p60, %f223, 0f00000000; mov.f32 %f312, 0f00000000; mov.f32 %f313, %f312; mov.f32 %f314, %f312; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f224, %f75, %f75; fma.rn.f32 %f225, %f74, %f74, %f224; fma.rn.f32 %f111, %f76, %f76, %f225; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f226, %f93, %f93; fma.rn.f32 %f227, 
%f92, %f92, %f226; fma.rn.f32 %f115, %f94, %f94, %f227; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f228, %f106, %f106; fma.rn.f32 %f229, %f107, %f107, %f228; fma.rn.f32 %f230, %f105, %f105, %f229; setp.neu.f32 %p66, %f230, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f263, %f86, %f107; sub.f32 %f264, %f87, %f106; sub.f32 %f265, %f88, %f105; mul.f32 %f314, %f265, 0f3F000000; mul.f32 %f313, %f264, 0f3F000000; mul.f32 %f312, %f263, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f314, %f3, %f105; sub.f32 %f313, %f2, %f106; sub.f32 %f312, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f231, %f106, %f106; fma.rn.f32 %f232, %f107, %f107, %f231; fma.rn.f32 %f116, %f105, %f105, %f232; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f314, %f88, %f3; sub.f32 %f313, %f87, %f2; sub.f32 %f312, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f254, %f86, %f107; sub.f32 %f255, %f87, %f106; sub.f32 %f256, %f88, %f105; sub.f32 %f257, %f74, %f92; mul.f32 %f258, %f257, 0f3DAAAAAB; sub.f32 %f259, %f75, %f93; mul.f32 %f260, %f259, 0f3DAAAAAB; sub.f32 %f261, %f76, %f94; mul.f32 %f262, %f261, 0f3DAAAAAB; fma.rn.f32 %f314, %f256, 0f3F2AAAAB, %f262; fma.rn.f32 %f313, %f255, 0f3F2AAAAB, %f260; fma.rn.f32 %f312, %f254, 0f3F2AAAAB, %f258; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f233, %f74, 0f3F000000; add.f32 %f234, %f107, %f107; sub.f32 %f235, %f233, %f234; add.f32 %f236, %f106, %f106; mul.f32 %f237, %f75, 
0f3F000000; sub.f32 %f238, %f237, %f236; add.f32 %f239, %f105, %f105; mul.f32 %f240, %f76, 0f3F000000; sub.f32 %f241, %f240, %f239; fma.rn.f32 %f314, %f3, 0f3FC00000, %f241; fma.rn.f32 %f313, %f2, 0f3FC00000, %f238; fma.rn.f32 %f312, %f1, 0f3FC00000, %f235; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f242, %f92, 0f3F000000; add.f32 %f243, %f86, %f86; sub.f32 %f244, %f243, %f242; add.f32 %f245, %f87, %f87; mul.f32 %f246, %f93, 0f3F000000; sub.f32 %f247, %f245, %f246; add.f32 %f248, %f88, %f88; mul.f32 %f249, %f94, 0f3F000000; sub.f32 %f250, %f248, %f249; mul.f32 %f251, %f1, 0f3FC00000; mul.f32 %f252, %f2, 0f3FC00000; mul.f32 %f253, %f3, 0f3FC00000; sub.f32 %f314, %f250, %f253; sub.f32 %f313, %f247, %f252; sub.f32 %f312, %f244, %f251; $L__BB0_70: mul.f32 %f266, %f295, %f314; mul.f32 %f267, %f296, %f313; sub.f32 %f268, %f266, %f267; mul.f32 %f269, %f296, %f312; mul.f32 %f270, %f294, %f314; sub.f32 %f271, %f269, %f270; mul.f32 %f272, %f294, %f313; mul.f32 %f273, %f295, %f312; sub.f32 %f274, %f272, %f273; mul.f32 %f275, %f2, %f271; fma.rn.f32 %f276, %f1, %f268, %f275; fma.rn.f32 %f277, %f3, %f274, %f276; mul.f32 %f278, %f277, %f138; st.global.f32 [%rd4], %f278; $L__BB0_72: ret; } ` settopologicalcharge_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<79>; .reg .b16 %rs<4>; .reg .f32 %f<315>; .reg .b32 %r<93>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalcharge_param_8]; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, 
[settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f138, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r44, %r43, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r47, %r46, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r50, %r49, %r51; setp.ge.s32 %p1, %r1, %r40; setp.ge.s32 %p2, %r2, %r41; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r42; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_72; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd9, %r53, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f139, %f2, %f2; fma.rn.f32 %f140, %f1, %f1, %f139; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f141, %f3, %f3, %f140; setp.eq.f32 %p6, %f141, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p6 bra $L__BB0_71; bra.uni $L__BB0_2; $L__BB0_71: mov.u32 %r84, 0; st.global.u32 [%rd4], %r84; bra.uni $L__BB0_72; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r85, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r85, %r55, %r40; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r56, %r85, %r5; mul.wide.s32 %rd14, %r56, 4; add.s64 %rd15, %rd3, %rd14; add.s64 %rd16, %rd2, %rd14; add.s64 %rd17, %rd1, %rd14; 
ld.global.nc.f32 %f9, [%rd17]; ld.global.nc.f32 %f8, [%rd16]; ld.global.nc.f32 %f7, [%rd15]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r86, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r40; add.s32 %r58, %r57, %r40; rem.s32 %r86, %r58, %r40; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r59, %r86, %r5; mul.wide.s32 %rd18, %r59, 4; add.s64 %rd19, %rd3, %rd18; add.s64 %rd20, %rd2, %rd18; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f38, [%rd21]; ld.global.nc.f32 %f39, [%rd20]; ld.global.nc.f32 %f40, [%rd19]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r62, %r40, -1; min.s32 %r87, %r14, %r62; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r60, %r14, %r40; add.s32 %r61, %r60, %r40; rem.s32 %r87, %r61, %r40; $L__BB0_15: setp.ge.s32 %p16, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r63, %r87, %r5; mul.wide.s32 %rd22, %r63, 4; add.s64 %rd23, %rd3, %rd22; add.s64 %rd24, %rd2, %rd22; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; ld.global.nc.f32 %f20, [%rd24]; ld.global.nc.f32 %f19, [%rd23]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r66, %r40, -1; min.s32 %r88, %r18, %r66; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r64, %r18, %r40; add.s32 %r65, %r64, %r40; rem.s32 %r88, %r65, %r40; $L__BB0_20: add.s32 %r22, %r88, %r5; setp.ge.s32 %p20, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; add.s64 %rd28, %rd2, %rd26; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; ld.global.nc.f32 %f26, [%rd28]; ld.global.nc.f32 %f25, [%rd27]; $L__BB0_22: mul.f32 %f154, 
%f20, %f20; fma.rn.f32 %f155, %f19, %f19, %f154; fma.rn.f32 %f31, %f21, %f21, %f155; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f159, %f39, %f39; fma.rn.f32 %f160, %f40, %f40, %f159; fma.rn.f32 %f161, %f38, %f38, %f160; setp.eq.f32 %p24, %f161, 0f00000000; mov.f32 %f294, 0f00000000; mov.f32 %f295, %f294; mov.f32 %f296, %f294; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f162, %f8, %f8; fma.rn.f32 %f163, %f7, %f7, %f162; fma.rn.f32 %f44, %f9, %f9, %f163; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f164, %f26, %f26; fma.rn.f32 %f165, %f25, %f25, %f164; fma.rn.f32 %f48, %f27, %f27, %f165; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f166, %f39, %f39; fma.rn.f32 %f167, %f40, %f40, %f166; fma.rn.f32 %f168, %f38, %f38, %f167; setp.neu.f32 %p30, %f168, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f201, %f19, %f40; sub.f32 %f202, %f20, %f39; sub.f32 %f203, %f21, %f38; mul.f32 %f296, %f203, 0f3F000000; mul.f32 %f295, %f202, 0f3F000000; mul.f32 %f294, %f201, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f296, %f3, %f38; sub.f32 %f295, %f2, %f39; sub.f32 %f294, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f169, %f39, %f39; fma.rn.f32 %f170, %f40, %f40, %f169; fma.rn.f32 %f49, %f38, %f38, %f170; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f296, %f21, %f3; sub.f32 %f295, %f20, %f2; sub.f32 %f294, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: 
sub.f32 %f192, %f19, %f40; sub.f32 %f193, %f20, %f39; sub.f32 %f194, %f21, %f38; sub.f32 %f195, %f7, %f25; mul.f32 %f196, %f195, 0f3DAAAAAB; sub.f32 %f197, %f8, %f26; mul.f32 %f198, %f197, 0f3DAAAAAB; sub.f32 %f199, %f9, %f27; mul.f32 %f200, %f199, 0f3DAAAAAB; fma.rn.f32 %f296, %f194, 0f3F2AAAAB, %f200; fma.rn.f32 %f295, %f193, 0f3F2AAAAB, %f198; fma.rn.f32 %f294, %f192, 0f3F2AAAAB, %f196; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f171, %f7, 0f3F000000; add.f32 %f172, %f40, %f40; sub.f32 %f173, %f171, %f172; add.f32 %f174, %f39, %f39; mul.f32 %f175, %f8, 0f3F000000; sub.f32 %f176, %f175, %f174; add.f32 %f177, %f38, %f38; mul.f32 %f178, %f9, 0f3F000000; sub.f32 %f179, %f178, %f177; fma.rn.f32 %f296, %f3, 0f3FC00000, %f179; fma.rn.f32 %f295, %f2, 0f3FC00000, %f176; fma.rn.f32 %f294, %f1, 0f3FC00000, %f173; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f180, %f25, 0f3F000000; add.f32 %f181, %f19, %f19; sub.f32 %f182, %f181, %f180; add.f32 %f183, %f20, %f20; mul.f32 %f184, %f26, 0f3F000000; sub.f32 %f185, %f183, %f184; add.f32 %f186, %f21, %f21; mul.f32 %f187, %f27, 0f3F000000; sub.f32 %f188, %f186, %f187; mul.f32 %f189, %f1, 0f3FC00000; mul.f32 %f190, %f2, 0f3FC00000; mul.f32 %f191, %f3, 0f3FC00000; sub.f32 %f296, %f188, %f191; sub.f32 %f295, %f185, %f190; sub.f32 %f294, %f182, %f189; $L__BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r89, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r67, %r23, %r41; add.s32 %r68, %r67, %r41; rem.s32 %r89, %r68, %r41; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r69, %r89, %r4; mad.lo.s32 %r70, %r69, %r40, %r1; mul.wide.s32 %rd30, %r70, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f76, [%rd33]; ld.global.nc.f32 %f75, [%rd32]; ld.global.nc.f32 %f74, [%rd31]; 
$L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r90, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r71, %r27, %r41; add.s32 %r72, %r71, %r41; rem.s32 %r90, %r72, %r41; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r73, %r90, %r4; mad.lo.s32 %r74, %r73, %r40, %r1; mul.wide.s32 %rd34, %r74, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f105, [%rd37]; ld.global.nc.f32 %f106, [%rd36]; ld.global.nc.f32 %f107, [%rd35]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r77, %r41, -1; min.s32 %r91, %r31, %r77; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r75, %r31, %r41; add.s32 %r76, %r75, %r41; rem.s32 %r91, %r76, %r41; $L__BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r78, %r91, %r4; mad.lo.s32 %r79, %r78, %r40, %r1; mul.wide.s32 %rd38, %r79, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f88, [%rd41]; ld.global.nc.f32 %f87, [%rd40]; ld.global.nc.f32 %f86, [%rd39]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r82, %r41, -1; min.s32 %r92, %r35, %r82; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r80, %r35, %r41; add.s32 %r81, %r80, %r41; rem.s32 %r92, %r81, %r41; $L__BB0_54: add.s32 %r83, %r92, %r4; mad.lo.s32 %r39, %r83, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f94, [%rd45]; ld.global.nc.f32 %f93, [%rd44]; ld.global.nc.f32 %f92, [%rd43]; 
$L__BB0_56: mul.f32 %f216, %f87, %f87; fma.rn.f32 %f217, %f86, %f86, %f216; fma.rn.f32 %f98, %f88, %f88, %f217; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f221, %f106, %f106; fma.rn.f32 %f222, %f107, %f107, %f221; fma.rn.f32 %f223, %f105, %f105, %f222; setp.eq.f32 %p60, %f223, 0f00000000; mov.f32 %f312, 0f00000000; mov.f32 %f313, %f312; mov.f32 %f314, %f312; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f224, %f75, %f75; fma.rn.f32 %f225, %f74, %f74, %f224; fma.rn.f32 %f111, %f76, %f76, %f225; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f226, %f93, %f93; fma.rn.f32 %f227, %f92, %f92, %f226; fma.rn.f32 %f115, %f94, %f94, %f227; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f228, %f106, %f106; fma.rn.f32 %f229, %f107, %f107, %f228; fma.rn.f32 %f230, %f105, %f105, %f229; setp.neu.f32 %p66, %f230, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f263, %f86, %f107; sub.f32 %f264, %f87, %f106; sub.f32 %f265, %f88, %f105; mul.f32 %f314, %f265, 0f3F000000; mul.f32 %f313, %f264, 0f3F000000; mul.f32 %f312, %f263, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f314, %f3, %f105; sub.f32 %f313, %f2, %f106; sub.f32 %f312, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f231, %f106, %f106; fma.rn.f32 %f232, %f107, %f107, %f231; fma.rn.f32 %f116, %f105, %f105, %f232; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f314, %f88, %f3; sub.f32 %f313, %f87, %f2; sub.f32 %f312, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, 
%p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f254, %f86, %f107; sub.f32 %f255, %f87, %f106; sub.f32 %f256, %f88, %f105; sub.f32 %f257, %f74, %f92; mul.f32 %f258, %f257, 0f3DAAAAAB; sub.f32 %f259, %f75, %f93; mul.f32 %f260, %f259, 0f3DAAAAAB; sub.f32 %f261, %f76, %f94; mul.f32 %f262, %f261, 0f3DAAAAAB; fma.rn.f32 %f314, %f256, 0f3F2AAAAB, %f262; fma.rn.f32 %f313, %f255, 0f3F2AAAAB, %f260; fma.rn.f32 %f312, %f254, 0f3F2AAAAB, %f258; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f233, %f74, 0f3F000000; add.f32 %f234, %f107, %f107; sub.f32 %f235, %f233, %f234; add.f32 %f236, %f106, %f106; mul.f32 %f237, %f75, 0f3F000000; sub.f32 %f238, %f237, %f236; add.f32 %f239, %f105, %f105; mul.f32 %f240, %f76, 0f3F000000; sub.f32 %f241, %f240, %f239; fma.rn.f32 %f314, %f3, 0f3FC00000, %f241; fma.rn.f32 %f313, %f2, 0f3FC00000, %f238; fma.rn.f32 %f312, %f1, 0f3FC00000, %f235; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f242, %f92, 0f3F000000; add.f32 %f243, %f86, %f86; sub.f32 %f244, %f243, %f242; add.f32 %f245, %f87, %f87; mul.f32 %f246, %f93, 0f3F000000; sub.f32 %f247, %f245, %f246; add.f32 %f248, %f88, %f88; mul.f32 %f249, %f94, 0f3F000000; sub.f32 %f250, %f248, %f249; mul.f32 %f251, %f1, 0f3FC00000; mul.f32 %f252, %f2, 0f3FC00000; mul.f32 %f253, %f3, 0f3FC00000; sub.f32 %f314, %f250, %f253; sub.f32 %f313, %f247, %f252; sub.f32 %f312, %f244, %f251; $L__BB0_70: mul.f32 %f266, %f295, %f314; mul.f32 %f267, %f296, %f313; sub.f32 %f268, %f266, %f267; mul.f32 %f269, %f296, %f312; mul.f32 %f270, %f294, %f314; sub.f32 %f271, %f269, %f270; mul.f32 %f272, %f294, %f313; mul.f32 %f273, %f295, %f312; sub.f32 %f274, %f272, %f273; mul.f32 %f275, %f2, %f271; fma.rn.f32 %f276, %f1, %f268, %f275; fma.rn.f32 %f277, %f3, %f274, %f276; mul.f32 %f278, %f277, %f138; st.global.f32 [%rd4], %f278; $L__BB0_72: ret; } ` settopologicalcharge_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 
settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<79>; .reg .b16 %rs<4>; .reg .f32 %f<315>; .reg .b32 %r<93>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalcharge_param_8]; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f138, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r44, %r43, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r47, %r46, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r50, %r49, %r51; setp.ge.s32 %p1, %r1, %r40; setp.ge.s32 %p2, %r2, %r41; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r42; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_72; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd9, %r53, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f139, %f2, %f2; fma.rn.f32 %f140, %f1, %f1, %f139; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f141, %f3, %f3, %f140; setp.eq.f32 %p6, %f141, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p6 bra $L__BB0_71; bra.uni 
$L__BB0_2; $L__BB0_71: mov.u32 %r84, 0; st.global.u32 [%rd4], %r84; bra.uni $L__BB0_72; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r85, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r85, %r55, %r40; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r56, %r85, %r5; mul.wide.s32 %rd14, %r56, 4; add.s64 %rd15, %rd3, %rd14; add.s64 %rd16, %rd2, %rd14; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; ld.global.nc.f32 %f8, [%rd16]; ld.global.nc.f32 %f7, [%rd15]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r86, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r40; add.s32 %r58, %r57, %r40; rem.s32 %r86, %r58, %r40; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r59, %r86, %r5; mul.wide.s32 %rd18, %r59, 4; add.s64 %rd19, %rd3, %rd18; add.s64 %rd20, %rd2, %rd18; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f38, [%rd21]; ld.global.nc.f32 %f39, [%rd20]; ld.global.nc.f32 %f40, [%rd19]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r62, %r40, -1; min.s32 %r87, %r14, %r62; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r60, %r14, %r40; add.s32 %r61, %r60, %r40; rem.s32 %r87, %r61, %r40; $L__BB0_15: setp.ge.s32 %p16, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r63, %r87, %r5; mul.wide.s32 %rd22, %r63, 4; add.s64 %rd23, %rd3, %rd22; add.s64 %rd24, %rd2, %rd22; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; ld.global.nc.f32 %f20, [%rd24]; ld.global.nc.f32 %f19, [%rd23]; $L__BB0_17: add.s32 %r18, %r1, 2; 
@%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r66, %r40, -1; min.s32 %r88, %r18, %r66; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r64, %r18, %r40; add.s32 %r65, %r64, %r40; rem.s32 %r88, %r65, %r40; $L__BB0_20: add.s32 %r22, %r88, %r5; setp.ge.s32 %p20, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; add.s64 %rd28, %rd2, %rd26; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; ld.global.nc.f32 %f26, [%rd28]; ld.global.nc.f32 %f25, [%rd27]; $L__BB0_22: mul.f32 %f154, %f20, %f20; fma.rn.f32 %f155, %f19, %f19, %f154; fma.rn.f32 %f31, %f21, %f21, %f155; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f159, %f39, %f39; fma.rn.f32 %f160, %f40, %f40, %f159; fma.rn.f32 %f161, %f38, %f38, %f160; setp.eq.f32 %p24, %f161, 0f00000000; mov.f32 %f294, 0f00000000; mov.f32 %f295, %f294; mov.f32 %f296, %f294; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f162, %f8, %f8; fma.rn.f32 %f163, %f7, %f7, %f162; fma.rn.f32 %f44, %f9, %f9, %f163; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f164, %f26, %f26; fma.rn.f32 %f165, %f25, %f25, %f164; fma.rn.f32 %f48, %f27, %f27, %f165; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f166, %f39, %f39; fma.rn.f32 %f167, %f40, %f40, %f166; fma.rn.f32 %f168, %f38, %f38, %f167; setp.neu.f32 %p30, %f168, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f201, %f19, %f40; sub.f32 %f202, %f20, %f39; sub.f32 %f203, %f21, %f38; mul.f32 %f296, %f203, 0f3F000000; mul.f32 %f295, %f202, 0f3F000000; mul.f32 %f294, %f201, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f296, %f3, %f38; sub.f32 %f295, %f2, %f39; sub.f32 %f294, %f1, %f40; 
bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f169, %f39, %f39; fma.rn.f32 %f170, %f40, %f40, %f169; fma.rn.f32 %f49, %f38, %f38, %f170; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f296, %f21, %f3; sub.f32 %f295, %f20, %f2; sub.f32 %f294, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f192, %f19, %f40; sub.f32 %f193, %f20, %f39; sub.f32 %f194, %f21, %f38; sub.f32 %f195, %f7, %f25; mul.f32 %f196, %f195, 0f3DAAAAAB; sub.f32 %f197, %f8, %f26; mul.f32 %f198, %f197, 0f3DAAAAAB; sub.f32 %f199, %f9, %f27; mul.f32 %f200, %f199, 0f3DAAAAAB; fma.rn.f32 %f296, %f194, 0f3F2AAAAB, %f200; fma.rn.f32 %f295, %f193, 0f3F2AAAAB, %f198; fma.rn.f32 %f294, %f192, 0f3F2AAAAB, %f196; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f171, %f7, 0f3F000000; add.f32 %f172, %f40, %f40; sub.f32 %f173, %f171, %f172; add.f32 %f174, %f39, %f39; mul.f32 %f175, %f8, 0f3F000000; sub.f32 %f176, %f175, %f174; add.f32 %f177, %f38, %f38; mul.f32 %f178, %f9, 0f3F000000; sub.f32 %f179, %f178, %f177; fma.rn.f32 %f296, %f3, 0f3FC00000, %f179; fma.rn.f32 %f295, %f2, 0f3FC00000, %f176; fma.rn.f32 %f294, %f1, 0f3FC00000, %f173; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f180, %f25, 0f3F000000; add.f32 %f181, %f19, %f19; sub.f32 %f182, %f181, %f180; add.f32 %f183, %f20, %f20; mul.f32 %f184, %f26, 0f3F000000; sub.f32 %f185, %f183, %f184; add.f32 %f186, %f21, %f21; mul.f32 %f187, %f27, 0f3F000000; sub.f32 %f188, %f186, %f187; mul.f32 %f189, %f1, 0f3FC00000; mul.f32 %f190, %f2, 0f3FC00000; mul.f32 %f191, %f3, 0f3FC00000; sub.f32 %f296, %f188, %f191; sub.f32 %f295, %f185, %f190; sub.f32 %f294, %f182, %f189; $L__BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 
%r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r89, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r67, %r23, %r41; add.s32 %r68, %r67, %r41; rem.s32 %r89, %r68, %r41; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r69, %r89, %r4; mad.lo.s32 %r70, %r69, %r40, %r1; mul.wide.s32 %rd30, %r70, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f76, [%rd33]; ld.global.nc.f32 %f75, [%rd32]; ld.global.nc.f32 %f74, [%rd31]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r90, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r71, %r27, %r41; add.s32 %r72, %r71, %r41; rem.s32 %r90, %r72, %r41; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r73, %r90, %r4; mad.lo.s32 %r74, %r73, %r40, %r1; mul.wide.s32 %rd34, %r74, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f105, [%rd37]; ld.global.nc.f32 %f106, [%rd36]; ld.global.nc.f32 %f107, [%rd35]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r77, %r41, -1; min.s32 %r91, %r31, %r77; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r75, %r31, %r41; add.s32 %r76, %r75, %r41; rem.s32 %r91, %r76, %r41; $L__BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r78, %r91, %r4; mad.lo.s32 %r79, %r78, %r40, %r1; mul.wide.s32 %rd38, %r79, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f88, [%rd41]; ld.global.nc.f32 %f87, [%rd40]; ld.global.nc.f32 %f86, [%rd39]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra 
$L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r82, %r41, -1; min.s32 %r92, %r35, %r82; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r80, %r35, %r41; add.s32 %r81, %r80, %r41; rem.s32 %r92, %r81, %r41; $L__BB0_54: add.s32 %r83, %r92, %r4; mad.lo.s32 %r39, %r83, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f94, [%rd45]; ld.global.nc.f32 %f93, [%rd44]; ld.global.nc.f32 %f92, [%rd43]; $L__BB0_56: mul.f32 %f216, %f87, %f87; fma.rn.f32 %f217, %f86, %f86, %f216; fma.rn.f32 %f98, %f88, %f88, %f217; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f221, %f106, %f106; fma.rn.f32 %f222, %f107, %f107, %f221; fma.rn.f32 %f223, %f105, %f105, %f222; setp.eq.f32 %p60, %f223, 0f00000000; mov.f32 %f312, 0f00000000; mov.f32 %f313, %f312; mov.f32 %f314, %f312; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f224, %f75, %f75; fma.rn.f32 %f225, %f74, %f74, %f224; fma.rn.f32 %f111, %f76, %f76, %f225; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f226, %f93, %f93; fma.rn.f32 %f227, %f92, %f92, %f226; fma.rn.f32 %f115, %f94, %f94, %f227; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f228, %f106, %f106; fma.rn.f32 %f229, %f107, %f107, %f228; fma.rn.f32 %f230, %f105, %f105, %f229; setp.neu.f32 %p66, %f230, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f263, %f86, %f107; sub.f32 %f264, %f87, %f106; sub.f32 %f265, %f88, %f105; mul.f32 %f314, %f265, 0f3F000000; mul.f32 %f313, %f264, 0f3F000000; mul.f32 %f312, %f263, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f314, %f3, %f105; 
sub.f32 %f313, %f2, %f106; sub.f32 %f312, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f231, %f106, %f106; fma.rn.f32 %f232, %f107, %f107, %f231; fma.rn.f32 %f116, %f105, %f105, %f232; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f314, %f88, %f3; sub.f32 %f313, %f87, %f2; sub.f32 %f312, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f254, %f86, %f107; sub.f32 %f255, %f87, %f106; sub.f32 %f256, %f88, %f105; sub.f32 %f257, %f74, %f92; mul.f32 %f258, %f257, 0f3DAAAAAB; sub.f32 %f259, %f75, %f93; mul.f32 %f260, %f259, 0f3DAAAAAB; sub.f32 %f261, %f76, %f94; mul.f32 %f262, %f261, 0f3DAAAAAB; fma.rn.f32 %f314, %f256, 0f3F2AAAAB, %f262; fma.rn.f32 %f313, %f255, 0f3F2AAAAB, %f260; fma.rn.f32 %f312, %f254, 0f3F2AAAAB, %f258; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f233, %f74, 0f3F000000; add.f32 %f234, %f107, %f107; sub.f32 %f235, %f233, %f234; add.f32 %f236, %f106, %f106; mul.f32 %f237, %f75, 0f3F000000; sub.f32 %f238, %f237, %f236; add.f32 %f239, %f105, %f105; mul.f32 %f240, %f76, 0f3F000000; sub.f32 %f241, %f240, %f239; fma.rn.f32 %f314, %f3, 0f3FC00000, %f241; fma.rn.f32 %f313, %f2, 0f3FC00000, %f238; fma.rn.f32 %f312, %f1, 0f3FC00000, %f235; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f242, %f92, 0f3F000000; add.f32 %f243, %f86, %f86; sub.f32 %f244, %f243, %f242; add.f32 %f245, %f87, %f87; mul.f32 %f246, %f93, 0f3F000000; sub.f32 %f247, %f245, %f246; add.f32 %f248, %f88, %f88; mul.f32 %f249, %f94, 0f3F000000; sub.f32 %f250, %f248, %f249; mul.f32 %f251, %f1, 0f3FC00000; mul.f32 %f252, %f2, 0f3FC00000; mul.f32 %f253, %f3, 0f3FC00000; sub.f32 %f314, %f250, %f253; sub.f32 %f313, %f247, %f252; sub.f32 %f312, %f244, 
%f251; $L__BB0_70: mul.f32 %f266, %f295, %f314; mul.f32 %f267, %f296, %f313; sub.f32 %f268, %f266, %f267; mul.f32 %f269, %f296, %f312; mul.f32 %f270, %f294, %f314; sub.f32 %f271, %f269, %f270; mul.f32 %f272, %f294, %f313; mul.f32 %f273, %f295, %f312; sub.f32 %f274, %f272, %f273; mul.f32 %f275, %f2, %f271; fma.rn.f32 %f276, %f1, %f268, %f275; fma.rn.f32 %f277, %f3, %f274, %f276; mul.f32 %f278, %f277, %f138; st.global.f32 [%rd4], %f278; $L__BB0_72: ret; } ` settopologicalcharge_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<79>; .reg .b16 %rs<4>; .reg .f32 %f<315>; .reg .b32 %r<93>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalcharge_param_8]; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f138, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r44, %r43, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r47, %r46, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r50, %r49, %r51; setp.ge.s32 %p1, %r1, %r40; setp.ge.s32 %p2, %r2, %r41; 
or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r42; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_72; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd9, %r53, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f139, %f2, %f2; fma.rn.f32 %f140, %f1, %f1, %f139; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f141, %f3, %f3, %f140; setp.eq.f32 %p6, %f141, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p6 bra $L__BB0_71; bra.uni $L__BB0_2; $L__BB0_71: mov.u32 %r84, 0; st.global.u32 [%rd4], %r84; bra.uni $L__BB0_72; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r85, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r85, %r55, %r40; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r56, %r85, %r5; mul.wide.s32 %rd14, %r56, 4; add.s64 %rd15, %rd3, %rd14; add.s64 %rd16, %rd2, %rd14; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; ld.global.nc.f32 %f8, [%rd16]; ld.global.nc.f32 %f7, [%rd15]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r86, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r40; add.s32 %r58, %r57, %r40; rem.s32 %r86, %r58, %r40; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r59, %r86, %r5; mul.wide.s32 %rd18, %r59, 4; add.s64 %rd19, %rd3, %rd18; add.s64 %rd20, %rd2, %rd18; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f38, [%rd21]; ld.global.nc.f32 %f39, [%rd20]; ld.global.nc.f32 %f40, [%rd19]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; 
bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r62, %r40, -1; min.s32 %r87, %r14, %r62; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r60, %r14, %r40; add.s32 %r61, %r60, %r40; rem.s32 %r87, %r61, %r40; $L__BB0_15: setp.ge.s32 %p16, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r63, %r87, %r5; mul.wide.s32 %rd22, %r63, 4; add.s64 %rd23, %rd3, %rd22; add.s64 %rd24, %rd2, %rd22; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; ld.global.nc.f32 %f20, [%rd24]; ld.global.nc.f32 %f19, [%rd23]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r66, %r40, -1; min.s32 %r88, %r18, %r66; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r64, %r18, %r40; add.s32 %r65, %r64, %r40; rem.s32 %r88, %r65, %r40; $L__BB0_20: add.s32 %r22, %r88, %r5; setp.ge.s32 %p20, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; add.s64 %rd28, %rd2, %rd26; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; ld.global.nc.f32 %f26, [%rd28]; ld.global.nc.f32 %f25, [%rd27]; $L__BB0_22: mul.f32 %f154, %f20, %f20; fma.rn.f32 %f155, %f19, %f19, %f154; fma.rn.f32 %f31, %f21, %f21, %f155; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f159, %f39, %f39; fma.rn.f32 %f160, %f40, %f40, %f159; fma.rn.f32 %f161, %f38, %f38, %f160; setp.eq.f32 %p24, %f161, 0f00000000; mov.f32 %f294, 0f00000000; mov.f32 %f295, %f294; mov.f32 %f296, %f294; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f162, %f8, %f8; fma.rn.f32 %f163, %f7, %f7, %f162; fma.rn.f32 %f44, %f9, %f9, %f163; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f164, %f26, %f26; fma.rn.f32 %f165, %f25, %f25, %f164; fma.rn.f32 %f48, %f27, %f27, %f165; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra 
$L__BB0_26; mul.f32 %f166, %f39, %f39; fma.rn.f32 %f167, %f40, %f40, %f166; fma.rn.f32 %f168, %f38, %f38, %f167; setp.neu.f32 %p30, %f168, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f201, %f19, %f40; sub.f32 %f202, %f20, %f39; sub.f32 %f203, %f21, %f38; mul.f32 %f296, %f203, 0f3F000000; mul.f32 %f295, %f202, 0f3F000000; mul.f32 %f294, %f201, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f296, %f3, %f38; sub.f32 %f295, %f2, %f39; sub.f32 %f294, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f169, %f39, %f39; fma.rn.f32 %f170, %f40, %f40, %f169; fma.rn.f32 %f49, %f38, %f38, %f170; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f296, %f21, %f3; sub.f32 %f295, %f20, %f2; sub.f32 %f294, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f192, %f19, %f40; sub.f32 %f193, %f20, %f39; sub.f32 %f194, %f21, %f38; sub.f32 %f195, %f7, %f25; mul.f32 %f196, %f195, 0f3DAAAAAB; sub.f32 %f197, %f8, %f26; mul.f32 %f198, %f197, 0f3DAAAAAB; sub.f32 %f199, %f9, %f27; mul.f32 %f200, %f199, 0f3DAAAAAB; fma.rn.f32 %f296, %f194, 0f3F2AAAAB, %f200; fma.rn.f32 %f295, %f193, 0f3F2AAAAB, %f198; fma.rn.f32 %f294, %f192, 0f3F2AAAAB, %f196; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f171, %f7, 0f3F000000; add.f32 %f172, %f40, %f40; sub.f32 %f173, %f171, %f172; add.f32 %f174, %f39, %f39; mul.f32 %f175, %f8, 0f3F000000; sub.f32 %f176, %f175, %f174; add.f32 %f177, %f38, %f38; mul.f32 %f178, %f9, 0f3F000000; sub.f32 %f179, %f178, %f177; fma.rn.f32 %f296, %f3, 0f3FC00000, %f179; fma.rn.f32 %f295, %f2, 
0f3FC00000, %f176; fma.rn.f32 %f294, %f1, 0f3FC00000, %f173; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f180, %f25, 0f3F000000; add.f32 %f181, %f19, %f19; sub.f32 %f182, %f181, %f180; add.f32 %f183, %f20, %f20; mul.f32 %f184, %f26, 0f3F000000; sub.f32 %f185, %f183, %f184; add.f32 %f186, %f21, %f21; mul.f32 %f187, %f27, 0f3F000000; sub.f32 %f188, %f186, %f187; mul.f32 %f189, %f1, 0f3FC00000; mul.f32 %f190, %f2, 0f3FC00000; mul.f32 %f191, %f3, 0f3FC00000; sub.f32 %f296, %f188, %f191; sub.f32 %f295, %f185, %f190; sub.f32 %f294, %f182, %f189; $L__BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r89, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r67, %r23, %r41; add.s32 %r68, %r67, %r41; rem.s32 %r89, %r68, %r41; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r69, %r89, %r4; mad.lo.s32 %r70, %r69, %r40, %r1; mul.wide.s32 %rd30, %r70, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f76, [%rd33]; ld.global.nc.f32 %f75, [%rd32]; ld.global.nc.f32 %f74, [%rd31]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r90, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r71, %r27, %r41; add.s32 %r72, %r71, %r41; rem.s32 %r90, %r72, %r41; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r73, %r90, %r4; mad.lo.s32 %r74, %r73, %r40, %r1; mul.wide.s32 %rd34, %r74, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f105, [%rd37]; ld.global.nc.f32 %f106, [%rd36]; ld.global.nc.f32 %f107, [%rd35]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r77, %r41, -1; 
min.s32 %r91, %r31, %r77; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r75, %r31, %r41; add.s32 %r76, %r75, %r41; rem.s32 %r91, %r76, %r41; $L__BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r78, %r91, %r4; mad.lo.s32 %r79, %r78, %r40, %r1; mul.wide.s32 %rd38, %r79, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f88, [%rd41]; ld.global.nc.f32 %f87, [%rd40]; ld.global.nc.f32 %f86, [%rd39]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r82, %r41, -1; min.s32 %r92, %r35, %r82; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r80, %r35, %r41; add.s32 %r81, %r80, %r41; rem.s32 %r92, %r81, %r41; $L__BB0_54: add.s32 %r83, %r92, %r4; mad.lo.s32 %r39, %r83, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f94, [%rd45]; ld.global.nc.f32 %f93, [%rd44]; ld.global.nc.f32 %f92, [%rd43]; $L__BB0_56: mul.f32 %f216, %f87, %f87; fma.rn.f32 %f217, %f86, %f86, %f216; fma.rn.f32 %f98, %f88, %f88, %f217; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f221, %f106, %f106; fma.rn.f32 %f222, %f107, %f107, %f221; fma.rn.f32 %f223, %f105, %f105, %f222; setp.eq.f32 %p60, %f223, 0f00000000; mov.f32 %f312, 0f00000000; mov.f32 %f313, %f312; mov.f32 %f314, %f312; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f224, %f75, %f75; fma.rn.f32 %f225, %f74, %f74, %f224; fma.rn.f32 %f111, %f76, %f76, %f225; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f226, %f93, %f93; fma.rn.f32 %f227, %f92, %f92, %f226; fma.rn.f32 %f115, %f94, %f94, %f227; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred 
%p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f228, %f106, %f106; fma.rn.f32 %f229, %f107, %f107, %f228; fma.rn.f32 %f230, %f105, %f105, %f229; setp.neu.f32 %p66, %f230, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f263, %f86, %f107; sub.f32 %f264, %f87, %f106; sub.f32 %f265, %f88, %f105; mul.f32 %f314, %f265, 0f3F000000; mul.f32 %f313, %f264, 0f3F000000; mul.f32 %f312, %f263, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f314, %f3, %f105; sub.f32 %f313, %f2, %f106; sub.f32 %f312, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f231, %f106, %f106; fma.rn.f32 %f232, %f107, %f107, %f231; fma.rn.f32 %f116, %f105, %f105, %f232; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f314, %f88, %f3; sub.f32 %f313, %f87, %f2; sub.f32 %f312, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f254, %f86, %f107; sub.f32 %f255, %f87, %f106; sub.f32 %f256, %f88, %f105; sub.f32 %f257, %f74, %f92; mul.f32 %f258, %f257, 0f3DAAAAAB; sub.f32 %f259, %f75, %f93; mul.f32 %f260, %f259, 0f3DAAAAAB; sub.f32 %f261, %f76, %f94; mul.f32 %f262, %f261, 0f3DAAAAAB; fma.rn.f32 %f314, %f256, 0f3F2AAAAB, %f262; fma.rn.f32 %f313, %f255, 0f3F2AAAAB, %f260; fma.rn.f32 %f312, %f254, 0f3F2AAAAB, %f258; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f233, %f74, 0f3F000000; add.f32 %f234, %f107, %f107; sub.f32 %f235, %f233, %f234; add.f32 %f236, %f106, %f106; mul.f32 %f237, %f75, 0f3F000000; sub.f32 %f238, %f237, %f236; add.f32 %f239, %f105, %f105; mul.f32 %f240, %f76, 0f3F000000; sub.f32 %f241, %f240, %f239; 
fma.rn.f32 %f314, %f3, 0f3FC00000, %f241; fma.rn.f32 %f313, %f2, 0f3FC00000, %f238; fma.rn.f32 %f312, %f1, 0f3FC00000, %f235; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f242, %f92, 0f3F000000; add.f32 %f243, %f86, %f86; sub.f32 %f244, %f243, %f242; add.f32 %f245, %f87, %f87; mul.f32 %f246, %f93, 0f3F000000; sub.f32 %f247, %f245, %f246; add.f32 %f248, %f88, %f88; mul.f32 %f249, %f94, 0f3F000000; sub.f32 %f250, %f248, %f249; mul.f32 %f251, %f1, 0f3FC00000; mul.f32 %f252, %f2, 0f3FC00000; mul.f32 %f253, %f3, 0f3FC00000; sub.f32 %f314, %f250, %f253; sub.f32 %f313, %f247, %f252; sub.f32 %f312, %f244, %f251; $L__BB0_70: mul.f32 %f266, %f295, %f314; mul.f32 %f267, %f296, %f313; sub.f32 %f268, %f266, %f267; mul.f32 %f269, %f296, %f312; mul.f32 %f270, %f294, %f314; sub.f32 %f271, %f269, %f270; mul.f32 %f272, %f294, %f313; mul.f32 %f273, %f295, %f312; sub.f32 %f274, %f272, %f273; mul.f32 %f275, %f2, %f271; fma.rn.f32 %f276, %f1, %f268, %f275; fma.rn.f32 %f277, %f3, %f274, %f276; mul.f32 %f278, %f277, %f138; st.global.f32 [%rd4], %f278; $L__BB0_72: ret; } ` settopologicalcharge_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<79>; .reg .b16 %rs<4>; .reg .f32 %f<315>; .reg .b32 %r<93>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalcharge_param_8]; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f138, [settopologicalcharge_param_4]; ld.param.u32 
%r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r44, %r43, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r47, %r46, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r50, %r49, %r51; setp.ge.s32 %p1, %r1, %r40; setp.ge.s32 %p2, %r2, %r41; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r42; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_72; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd9, %r53, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f139, %f2, %f2; fma.rn.f32 %f140, %f1, %f1, %f139; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f141, %f3, %f3, %f140; setp.eq.f32 %p6, %f141, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p6 bra $L__BB0_71; bra.uni $L__BB0_2; $L__BB0_71: mov.u32 %r84, 0; st.global.u32 [%rd4], %r84; bra.uni $L__BB0_72; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r85, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r85, %r55, %r40; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r56, %r85, %r5; mul.wide.s32 %rd14, %r56, 4; add.s64 %rd15, %rd3, %rd14; add.s64 %rd16, %rd2, %rd14; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; ld.global.nc.f32 %f8, [%rd16]; ld.global.nc.f32 %f7, [%rd15]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; 
bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r86, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r40; add.s32 %r58, %r57, %r40; rem.s32 %r86, %r58, %r40; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r59, %r86, %r5; mul.wide.s32 %rd18, %r59, 4; add.s64 %rd19, %rd3, %rd18; add.s64 %rd20, %rd2, %rd18; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f38, [%rd21]; ld.global.nc.f32 %f39, [%rd20]; ld.global.nc.f32 %f40, [%rd19]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r62, %r40, -1; min.s32 %r87, %r14, %r62; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r60, %r14, %r40; add.s32 %r61, %r60, %r40; rem.s32 %r87, %r61, %r40; $L__BB0_15: setp.ge.s32 %p16, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r63, %r87, %r5; mul.wide.s32 %rd22, %r63, 4; add.s64 %rd23, %rd3, %rd22; add.s64 %rd24, %rd2, %rd22; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; ld.global.nc.f32 %f20, [%rd24]; ld.global.nc.f32 %f19, [%rd23]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r66, %r40, -1; min.s32 %r88, %r18, %r66; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r64, %r18, %r40; add.s32 %r65, %r64, %r40; rem.s32 %r88, %r65, %r40; $L__BB0_20: add.s32 %r22, %r88, %r5; setp.ge.s32 %p20, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; add.s64 %rd28, %rd2, %rd26; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; ld.global.nc.f32 %f26, [%rd28]; ld.global.nc.f32 %f25, [%rd27]; $L__BB0_22: mul.f32 %f154, %f20, %f20; fma.rn.f32 %f155, %f19, %f19, %f154; fma.rn.f32 %f31, %f21, %f21, %f155; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni 
$L__BB0_24; $L__BB0_23: mul.f32 %f159, %f39, %f39; fma.rn.f32 %f160, %f40, %f40, %f159; fma.rn.f32 %f161, %f38, %f38, %f160; setp.eq.f32 %p24, %f161, 0f00000000; mov.f32 %f294, 0f00000000; mov.f32 %f295, %f294; mov.f32 %f296, %f294; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f162, %f8, %f8; fma.rn.f32 %f163, %f7, %f7, %f162; fma.rn.f32 %f44, %f9, %f9, %f163; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f164, %f26, %f26; fma.rn.f32 %f165, %f25, %f25, %f164; fma.rn.f32 %f48, %f27, %f27, %f165; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f166, %f39, %f39; fma.rn.f32 %f167, %f40, %f40, %f166; fma.rn.f32 %f168, %f38, %f38, %f167; setp.neu.f32 %p30, %f168, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f201, %f19, %f40; sub.f32 %f202, %f20, %f39; sub.f32 %f203, %f21, %f38; mul.f32 %f296, %f203, 0f3F000000; mul.f32 %f295, %f202, 0f3F000000; mul.f32 %f294, %f201, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f296, %f3, %f38; sub.f32 %f295, %f2, %f39; sub.f32 %f294, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f169, %f39, %f39; fma.rn.f32 %f170, %f40, %f40, %f169; fma.rn.f32 %f49, %f38, %f38, %f170; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f296, %f21, %f3; sub.f32 %f295, %f20, %f2; sub.f32 %f294, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f192, %f19, %f40; sub.f32 %f193, %f20, %f39; sub.f32 %f194, %f21, %f38; sub.f32 %f195, %f7, %f25; mul.f32 %f196, %f195, 0f3DAAAAAB; sub.f32 %f197, 
%f8, %f26; mul.f32 %f198, %f197, 0f3DAAAAAB; sub.f32 %f199, %f9, %f27; mul.f32 %f200, %f199, 0f3DAAAAAB; fma.rn.f32 %f296, %f194, 0f3F2AAAAB, %f200; fma.rn.f32 %f295, %f193, 0f3F2AAAAB, %f198; fma.rn.f32 %f294, %f192, 0f3F2AAAAB, %f196; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f171, %f7, 0f3F000000; add.f32 %f172, %f40, %f40; sub.f32 %f173, %f171, %f172; add.f32 %f174, %f39, %f39; mul.f32 %f175, %f8, 0f3F000000; sub.f32 %f176, %f175, %f174; add.f32 %f177, %f38, %f38; mul.f32 %f178, %f9, 0f3F000000; sub.f32 %f179, %f178, %f177; fma.rn.f32 %f296, %f3, 0f3FC00000, %f179; fma.rn.f32 %f295, %f2, 0f3FC00000, %f176; fma.rn.f32 %f294, %f1, 0f3FC00000, %f173; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f180, %f25, 0f3F000000; add.f32 %f181, %f19, %f19; sub.f32 %f182, %f181, %f180; add.f32 %f183, %f20, %f20; mul.f32 %f184, %f26, 0f3F000000; sub.f32 %f185, %f183, %f184; add.f32 %f186, %f21, %f21; mul.f32 %f187, %f27, 0f3F000000; sub.f32 %f188, %f186, %f187; mul.f32 %f189, %f1, 0f3FC00000; mul.f32 %f190, %f2, 0f3FC00000; mul.f32 %f191, %f3, 0f3FC00000; sub.f32 %f296, %f188, %f191; sub.f32 %f295, %f185, %f190; sub.f32 %f294, %f182, %f189; $L__BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r89, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r67, %r23, %r41; add.s32 %r68, %r67, %r41; rem.s32 %r89, %r68, %r41; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r69, %r89, %r4; mad.lo.s32 %r70, %r69, %r40, %r1; mul.wide.s32 %rd30, %r70, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f76, [%rd33]; ld.global.nc.f32 %f75, [%rd32]; ld.global.nc.f32 %f74, [%rd31]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r90, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r71, %r27, 
%r41; add.s32 %r72, %r71, %r41; rem.s32 %r90, %r72, %r41; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r73, %r90, %r4; mad.lo.s32 %r74, %r73, %r40, %r1; mul.wide.s32 %rd34, %r74, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f105, [%rd37]; ld.global.nc.f32 %f106, [%rd36]; ld.global.nc.f32 %f107, [%rd35]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r77, %r41, -1; min.s32 %r91, %r31, %r77; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r75, %r31, %r41; add.s32 %r76, %r75, %r41; rem.s32 %r91, %r76, %r41; $L__BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r78, %r91, %r4; mad.lo.s32 %r79, %r78, %r40, %r1; mul.wide.s32 %rd38, %r79, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f88, [%rd41]; ld.global.nc.f32 %f87, [%rd40]; ld.global.nc.f32 %f86, [%rd39]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r82, %r41, -1; min.s32 %r92, %r35, %r82; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r80, %r35, %r41; add.s32 %r81, %r80, %r41; rem.s32 %r92, %r81, %r41; $L__BB0_54: add.s32 %r83, %r92, %r4; mad.lo.s32 %r39, %r83, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f94, [%rd45]; ld.global.nc.f32 %f93, [%rd44]; ld.global.nc.f32 %f92, [%rd43]; $L__BB0_56: mul.f32 %f216, %f87, %f87; fma.rn.f32 %f217, %f86, %f86, %f216; fma.rn.f32 %f98, %f88, %f88, %f217; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra 
$L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f221, %f106, %f106; fma.rn.f32 %f222, %f107, %f107, %f221; fma.rn.f32 %f223, %f105, %f105, %f222; setp.eq.f32 %p60, %f223, 0f00000000; mov.f32 %f312, 0f00000000; mov.f32 %f313, %f312; mov.f32 %f314, %f312; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f224, %f75, %f75; fma.rn.f32 %f225, %f74, %f74, %f224; fma.rn.f32 %f111, %f76, %f76, %f225; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f226, %f93, %f93; fma.rn.f32 %f227, %f92, %f92, %f226; fma.rn.f32 %f115, %f94, %f94, %f227; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f228, %f106, %f106; fma.rn.f32 %f229, %f107, %f107, %f228; fma.rn.f32 %f230, %f105, %f105, %f229; setp.neu.f32 %p66, %f230, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f263, %f86, %f107; sub.f32 %f264, %f87, %f106; sub.f32 %f265, %f88, %f105; mul.f32 %f314, %f265, 0f3F000000; mul.f32 %f313, %f264, 0f3F000000; mul.f32 %f312, %f263, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f314, %f3, %f105; sub.f32 %f313, %f2, %f106; sub.f32 %f312, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f231, %f106, %f106; fma.rn.f32 %f232, %f107, %f107, %f231; fma.rn.f32 %f116, %f105, %f105, %f232; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f314, %f88, %f3; sub.f32 %f313, %f87, %f2; sub.f32 %f312, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f254, %f86, %f107; sub.f32 %f255, %f87, %f106; sub.f32 %f256, %f88, %f105; sub.f32 %f257, 
%f74, %f92; mul.f32 %f258, %f257, 0f3DAAAAAB; sub.f32 %f259, %f75, %f93; mul.f32 %f260, %f259, 0f3DAAAAAB; sub.f32 %f261, %f76, %f94; mul.f32 %f262, %f261, 0f3DAAAAAB; fma.rn.f32 %f314, %f256, 0f3F2AAAAB, %f262; fma.rn.f32 %f313, %f255, 0f3F2AAAAB, %f260; fma.rn.f32 %f312, %f254, 0f3F2AAAAB, %f258; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f233, %f74, 0f3F000000; add.f32 %f234, %f107, %f107; sub.f32 %f235, %f233, %f234; add.f32 %f236, %f106, %f106; mul.f32 %f237, %f75, 0f3F000000; sub.f32 %f238, %f237, %f236; add.f32 %f239, %f105, %f105; mul.f32 %f240, %f76, 0f3F000000; sub.f32 %f241, %f240, %f239; fma.rn.f32 %f314, %f3, 0f3FC00000, %f241; fma.rn.f32 %f313, %f2, 0f3FC00000, %f238; fma.rn.f32 %f312, %f1, 0f3FC00000, %f235; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f242, %f92, 0f3F000000; add.f32 %f243, %f86, %f86; sub.f32 %f244, %f243, %f242; add.f32 %f245, %f87, %f87; mul.f32 %f246, %f93, 0f3F000000; sub.f32 %f247, %f245, %f246; add.f32 %f248, %f88, %f88; mul.f32 %f249, %f94, 0f3F000000; sub.f32 %f250, %f248, %f249; mul.f32 %f251, %f1, 0f3FC00000; mul.f32 %f252, %f2, 0f3FC00000; mul.f32 %f253, %f3, 0f3FC00000; sub.f32 %f314, %f250, %f253; sub.f32 %f313, %f247, %f252; sub.f32 %f312, %f244, %f251; $L__BB0_70: mul.f32 %f266, %f295, %f314; mul.f32 %f267, %f296, %f313; sub.f32 %f268, %f266, %f267; mul.f32 %f269, %f296, %f312; mul.f32 %f270, %f294, %f314; sub.f32 %f271, %f269, %f270; mul.f32 %f272, %f294, %f313; mul.f32 %f273, %f295, %f312; sub.f32 %f274, %f272, %f273; mul.f32 %f275, %f2, %f271; fma.rn.f32 %f276, %f1, %f268, %f275; fma.rn.f32 %f277, %f3, %f274, %f276; mul.f32 %f278, %f277, %f138; st.global.f32 [%rd4], %f278; $L__BB0_72: ret; } ` settopologicalcharge_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 
settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<79>; .reg .b16 %rs<4>; .reg .f32 %f<315>; .reg .b32 %r<93>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalcharge_param_8]; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f138, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r44, %r43, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r47, %r46, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r50, %r49, %r51; setp.ge.s32 %p1, %r1, %r40; setp.ge.s32 %p2, %r2, %r41; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r42; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_72; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd9, %r53, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f139, %f2, %f2; fma.rn.f32 %f140, %f1, %f1, %f139; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f141, %f3, %f3, %f140; setp.eq.f32 %p6, %f141, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p6 bra $L__BB0_71; bra.uni $L__BB0_2; $L__BB0_71: mov.u32 %r84, 0; st.global.u32 [%rd4], %r84; bra.uni $L__BB0_72; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; 
@%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r85, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r85, %r55, %r40; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r56, %r85, %r5; mul.wide.s32 %rd14, %r56, 4; add.s64 %rd15, %rd3, %rd14; add.s64 %rd16, %rd2, %rd14; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; ld.global.nc.f32 %f8, [%rd16]; ld.global.nc.f32 %f7, [%rd15]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r86, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r40; add.s32 %r58, %r57, %r40; rem.s32 %r86, %r58, %r40; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r59, %r86, %r5; mul.wide.s32 %rd18, %r59, 4; add.s64 %rd19, %rd3, %rd18; add.s64 %rd20, %rd2, %rd18; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f38, [%rd21]; ld.global.nc.f32 %f39, [%rd20]; ld.global.nc.f32 %f40, [%rd19]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r62, %r40, -1; min.s32 %r87, %r14, %r62; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r60, %r14, %r40; add.s32 %r61, %r60, %r40; rem.s32 %r87, %r61, %r40; $L__BB0_15: setp.ge.s32 %p16, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r63, %r87, %r5; mul.wide.s32 %rd22, %r63, 4; add.s64 %rd23, %rd3, %rd22; add.s64 %rd24, %rd2, %rd22; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; ld.global.nc.f32 %f20, [%rd24]; ld.global.nc.f32 %f19, [%rd23]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r66, %r40, -1; min.s32 %r88, %r18, %r66; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r64, %r18, %r40; add.s32 
%r65, %r64, %r40; rem.s32 %r88, %r65, %r40; $L__BB0_20: add.s32 %r22, %r88, %r5; setp.ge.s32 %p20, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; add.s64 %rd28, %rd2, %rd26; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; ld.global.nc.f32 %f26, [%rd28]; ld.global.nc.f32 %f25, [%rd27]; $L__BB0_22: mul.f32 %f154, %f20, %f20; fma.rn.f32 %f155, %f19, %f19, %f154; fma.rn.f32 %f31, %f21, %f21, %f155; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f159, %f39, %f39; fma.rn.f32 %f160, %f40, %f40, %f159; fma.rn.f32 %f161, %f38, %f38, %f160; setp.eq.f32 %p24, %f161, 0f00000000; mov.f32 %f294, 0f00000000; mov.f32 %f295, %f294; mov.f32 %f296, %f294; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f162, %f8, %f8; fma.rn.f32 %f163, %f7, %f7, %f162; fma.rn.f32 %f44, %f9, %f9, %f163; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f164, %f26, %f26; fma.rn.f32 %f165, %f25, %f25, %f164; fma.rn.f32 %f48, %f27, %f27, %f165; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f166, %f39, %f39; fma.rn.f32 %f167, %f40, %f40, %f166; fma.rn.f32 %f168, %f38, %f38, %f167; setp.neu.f32 %p30, %f168, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f201, %f19, %f40; sub.f32 %f202, %f20, %f39; sub.f32 %f203, %f21, %f38; mul.f32 %f296, %f203, 0f3F000000; mul.f32 %f295, %f202, 0f3F000000; mul.f32 %f294, %f201, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f296, %f3, %f38; sub.f32 %f295, %f2, %f39; sub.f32 %f294, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f169, %f39, %f39; fma.rn.f32 %f170, %f40, %f40, %f169; fma.rn.f32 %f49, %f38, %f38, %f170; 
setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f296, %f21, %f3; sub.f32 %f295, %f20, %f2; sub.f32 %f294, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f192, %f19, %f40; sub.f32 %f193, %f20, %f39; sub.f32 %f194, %f21, %f38; sub.f32 %f195, %f7, %f25; mul.f32 %f196, %f195, 0f3DAAAAAB; sub.f32 %f197, %f8, %f26; mul.f32 %f198, %f197, 0f3DAAAAAB; sub.f32 %f199, %f9, %f27; mul.f32 %f200, %f199, 0f3DAAAAAB; fma.rn.f32 %f296, %f194, 0f3F2AAAAB, %f200; fma.rn.f32 %f295, %f193, 0f3F2AAAAB, %f198; fma.rn.f32 %f294, %f192, 0f3F2AAAAB, %f196; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f171, %f7, 0f3F000000; add.f32 %f172, %f40, %f40; sub.f32 %f173, %f171, %f172; add.f32 %f174, %f39, %f39; mul.f32 %f175, %f8, 0f3F000000; sub.f32 %f176, %f175, %f174; add.f32 %f177, %f38, %f38; mul.f32 %f178, %f9, 0f3F000000; sub.f32 %f179, %f178, %f177; fma.rn.f32 %f296, %f3, 0f3FC00000, %f179; fma.rn.f32 %f295, %f2, 0f3FC00000, %f176; fma.rn.f32 %f294, %f1, 0f3FC00000, %f173; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f180, %f25, 0f3F000000; add.f32 %f181, %f19, %f19; sub.f32 %f182, %f181, %f180; add.f32 %f183, %f20, %f20; mul.f32 %f184, %f26, 0f3F000000; sub.f32 %f185, %f183, %f184; add.f32 %f186, %f21, %f21; mul.f32 %f187, %f27, 0f3F000000; sub.f32 %f188, %f186, %f187; mul.f32 %f189, %f1, 0f3FC00000; mul.f32 %f190, %f2, 0f3FC00000; mul.f32 %f191, %f3, 0f3FC00000; sub.f32 %f296, %f188, %f191; sub.f32 %f295, %f185, %f190; sub.f32 %f294, %f182, %f189; $L__BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r89, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r67, %r23, %r41; add.s32 %r68, %r67, 
%r41; rem.s32 %r89, %r68, %r41; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r69, %r89, %r4; mad.lo.s32 %r70, %r69, %r40, %r1; mul.wide.s32 %rd30, %r70, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f76, [%rd33]; ld.global.nc.f32 %f75, [%rd32]; ld.global.nc.f32 %f74, [%rd31]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r90, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r71, %r27, %r41; add.s32 %r72, %r71, %r41; rem.s32 %r90, %r72, %r41; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r73, %r90, %r4; mad.lo.s32 %r74, %r73, %r40, %r1; mul.wide.s32 %rd34, %r74, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f105, [%rd37]; ld.global.nc.f32 %f106, [%rd36]; ld.global.nc.f32 %f107, [%rd35]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r77, %r41, -1; min.s32 %r91, %r31, %r77; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r75, %r31, %r41; add.s32 %r76, %r75, %r41; rem.s32 %r91, %r76, %r41; $L__BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r78, %r91, %r4; mad.lo.s32 %r79, %r78, %r40, %r1; mul.wide.s32 %rd38, %r79, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f88, [%rd41]; ld.global.nc.f32 %f87, [%rd40]; ld.global.nc.f32 %f86, [%rd39]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r82, %r41, -1; min.s32 %r92, %r35, %r82; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r80, %r35, %r41; add.s32 %r81, %r80, %r41; 
rem.s32 %r92, %r81, %r41; $L__BB0_54: add.s32 %r83, %r92, %r4; mad.lo.s32 %r39, %r83, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f94, [%rd45]; ld.global.nc.f32 %f93, [%rd44]; ld.global.nc.f32 %f92, [%rd43]; $L__BB0_56: mul.f32 %f216, %f87, %f87; fma.rn.f32 %f217, %f86, %f86, %f216; fma.rn.f32 %f98, %f88, %f88, %f217; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f221, %f106, %f106; fma.rn.f32 %f222, %f107, %f107, %f221; fma.rn.f32 %f223, %f105, %f105, %f222; setp.eq.f32 %p60, %f223, 0f00000000; mov.f32 %f312, 0f00000000; mov.f32 %f313, %f312; mov.f32 %f314, %f312; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f224, %f75, %f75; fma.rn.f32 %f225, %f74, %f74, %f224; fma.rn.f32 %f111, %f76, %f76, %f225; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f226, %f93, %f93; fma.rn.f32 %f227, %f92, %f92, %f226; fma.rn.f32 %f115, %f94, %f94, %f227; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f228, %f106, %f106; fma.rn.f32 %f229, %f107, %f107, %f228; fma.rn.f32 %f230, %f105, %f105, %f229; setp.neu.f32 %p66, %f230, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f263, %f86, %f107; sub.f32 %f264, %f87, %f106; sub.f32 %f265, %f88, %f105; mul.f32 %f314, %f265, 0f3F000000; mul.f32 %f313, %f264, 0f3F000000; mul.f32 %f312, %f263, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f314, %f3, %f105; sub.f32 %f313, %f2, %f106; sub.f32 %f312, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f231, %f106, %f106; fma.rn.f32 %f232, %f107, 
%f107, %f231; fma.rn.f32 %f116, %f105, %f105, %f232; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f314, %f88, %f3; sub.f32 %f313, %f87, %f2; sub.f32 %f312, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f254, %f86, %f107; sub.f32 %f255, %f87, %f106; sub.f32 %f256, %f88, %f105; sub.f32 %f257, %f74, %f92; mul.f32 %f258, %f257, 0f3DAAAAAB; sub.f32 %f259, %f75, %f93; mul.f32 %f260, %f259, 0f3DAAAAAB; sub.f32 %f261, %f76, %f94; mul.f32 %f262, %f261, 0f3DAAAAAB; fma.rn.f32 %f314, %f256, 0f3F2AAAAB, %f262; fma.rn.f32 %f313, %f255, 0f3F2AAAAB, %f260; fma.rn.f32 %f312, %f254, 0f3F2AAAAB, %f258; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f233, %f74, 0f3F000000; add.f32 %f234, %f107, %f107; sub.f32 %f235, %f233, %f234; add.f32 %f236, %f106, %f106; mul.f32 %f237, %f75, 0f3F000000; sub.f32 %f238, %f237, %f236; add.f32 %f239, %f105, %f105; mul.f32 %f240, %f76, 0f3F000000; sub.f32 %f241, %f240, %f239; fma.rn.f32 %f314, %f3, 0f3FC00000, %f241; fma.rn.f32 %f313, %f2, 0f3FC00000, %f238; fma.rn.f32 %f312, %f1, 0f3FC00000, %f235; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f242, %f92, 0f3F000000; add.f32 %f243, %f86, %f86; sub.f32 %f244, %f243, %f242; add.f32 %f245, %f87, %f87; mul.f32 %f246, %f93, 0f3F000000; sub.f32 %f247, %f245, %f246; add.f32 %f248, %f88, %f88; mul.f32 %f249, %f94, 0f3F000000; sub.f32 %f250, %f248, %f249; mul.f32 %f251, %f1, 0f3FC00000; mul.f32 %f252, %f2, 0f3FC00000; mul.f32 %f253, %f3, 0f3FC00000; sub.f32 %f314, %f250, %f253; sub.f32 %f313, %f247, %f252; sub.f32 %f312, %f244, %f251; $L__BB0_70: mul.f32 %f266, %f295, %f314; mul.f32 %f267, %f296, %f313; sub.f32 %f268, %f266, %f267; mul.f32 %f269, %f296, %f312; mul.f32 %f270, %f294, %f314; sub.f32 %f271, 
%f269, %f270; mul.f32 %f272, %f294, %f313; mul.f32 %f273, %f295, %f312; sub.f32 %f274, %f272, %f273; mul.f32 %f275, %f2, %f271; fma.rn.f32 %f276, %f1, %f268, %f275; fma.rn.f32 %f277, %f3, %f274, %f276; mul.f32 %f278, %f277, %f138; st.global.f32 [%rd4], %f278; $L__BB0_72: ret; } ` settopologicalcharge_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<79>; .reg .b16 %rs<4>; .reg .f32 %f<315>; .reg .b32 %r<93>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalcharge_param_8]; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f138, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r44, %r43, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r47, %r46, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r50, %r49, %r51; setp.ge.s32 %p1, %r1, %r40; setp.ge.s32 %p2, %r2, %r41; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r42; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_72; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 
%r53, %r5, %r1; mul.wide.s32 %rd9, %r53, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f139, %f2, %f2; fma.rn.f32 %f140, %f1, %f1, %f139; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f141, %f3, %f3, %f140; setp.eq.f32 %p6, %f141, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p6 bra $L__BB0_71; bra.uni $L__BB0_2; $L__BB0_71: mov.u32 %r84, 0; st.global.u32 [%rd4], %r84; bra.uni $L__BB0_72; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r85, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r85, %r55, %r40; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r56, %r85, %r5; mul.wide.s32 %rd14, %r56, 4; add.s64 %rd15, %rd3, %rd14; add.s64 %rd16, %rd2, %rd14; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; ld.global.nc.f32 %f8, [%rd16]; ld.global.nc.f32 %f7, [%rd15]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r86, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r40; add.s32 %r58, %r57, %r40; rem.s32 %r86, %r58, %r40; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r59, %r86, %r5; mul.wide.s32 %rd18, %r59, 4; add.s64 %rd19, %rd3, %rd18; add.s64 %rd20, %rd2, %rd18; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f38, [%rd21]; ld.global.nc.f32 %f39, [%rd20]; ld.global.nc.f32 %f40, [%rd19]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r62, %r40, -1; min.s32 %r87, %r14, %r62; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r60, %r14, %r40; add.s32 %r61, %r60, %r40; rem.s32 %r87, 
%r61, %r40; $L__BB0_15: setp.ge.s32 %p16, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r63, %r87, %r5; mul.wide.s32 %rd22, %r63, 4; add.s64 %rd23, %rd3, %rd22; add.s64 %rd24, %rd2, %rd22; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; ld.global.nc.f32 %f20, [%rd24]; ld.global.nc.f32 %f19, [%rd23]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r66, %r40, -1; min.s32 %r88, %r18, %r66; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r64, %r18, %r40; add.s32 %r65, %r64, %r40; rem.s32 %r88, %r65, %r40; $L__BB0_20: add.s32 %r22, %r88, %r5; setp.ge.s32 %p20, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; add.s64 %rd28, %rd2, %rd26; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; ld.global.nc.f32 %f26, [%rd28]; ld.global.nc.f32 %f25, [%rd27]; $L__BB0_22: mul.f32 %f154, %f20, %f20; fma.rn.f32 %f155, %f19, %f19, %f154; fma.rn.f32 %f31, %f21, %f21, %f155; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f159, %f39, %f39; fma.rn.f32 %f160, %f40, %f40, %f159; fma.rn.f32 %f161, %f38, %f38, %f160; setp.eq.f32 %p24, %f161, 0f00000000; mov.f32 %f294, 0f00000000; mov.f32 %f295, %f294; mov.f32 %f296, %f294; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f162, %f8, %f8; fma.rn.f32 %f163, %f7, %f7, %f162; fma.rn.f32 %f44, %f9, %f9, %f163; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f164, %f26, %f26; fma.rn.f32 %f165, %f25, %f25, %f164; fma.rn.f32 %f48, %f27, %f27, %f165; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f166, %f39, %f39; fma.rn.f32 %f167, %f40, %f40, %f166; fma.rn.f32 %f168, %f38, %f38, %f167; setp.neu.f32 %p30, %f168, 0f00000000; @%p30 bra $L__BB0_35; bra.uni 
$L__BB0_26; $L__BB0_35: sub.f32 %f201, %f19, %f40; sub.f32 %f202, %f20, %f39; sub.f32 %f203, %f21, %f38; mul.f32 %f296, %f203, 0f3F000000; mul.f32 %f295, %f202, 0f3F000000; mul.f32 %f294, %f201, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f296, %f3, %f38; sub.f32 %f295, %f2, %f39; sub.f32 %f294, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f169, %f39, %f39; fma.rn.f32 %f170, %f40, %f40, %f169; fma.rn.f32 %f49, %f38, %f38, %f170; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f296, %f21, %f3; sub.f32 %f295, %f20, %f2; sub.f32 %f294, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f192, %f19, %f40; sub.f32 %f193, %f20, %f39; sub.f32 %f194, %f21, %f38; sub.f32 %f195, %f7, %f25; mul.f32 %f196, %f195, 0f3DAAAAAB; sub.f32 %f197, %f8, %f26; mul.f32 %f198, %f197, 0f3DAAAAAB; sub.f32 %f199, %f9, %f27; mul.f32 %f200, %f199, 0f3DAAAAAB; fma.rn.f32 %f296, %f194, 0f3F2AAAAB, %f200; fma.rn.f32 %f295, %f193, 0f3F2AAAAB, %f198; fma.rn.f32 %f294, %f192, 0f3F2AAAAB, %f196; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f171, %f7, 0f3F000000; add.f32 %f172, %f40, %f40; sub.f32 %f173, %f171, %f172; add.f32 %f174, %f39, %f39; mul.f32 %f175, %f8, 0f3F000000; sub.f32 %f176, %f175, %f174; add.f32 %f177, %f38, %f38; mul.f32 %f178, %f9, 0f3F000000; sub.f32 %f179, %f178, %f177; fma.rn.f32 %f296, %f3, 0f3FC00000, %f179; fma.rn.f32 %f295, %f2, 0f3FC00000, %f176; fma.rn.f32 %f294, %f1, 0f3FC00000, %f173; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f180, %f25, 0f3F000000; add.f32 %f181, %f19, %f19; sub.f32 %f182, %f181, %f180; 
add.f32 %f183, %f20, %f20; mul.f32 %f184, %f26, 0f3F000000; sub.f32 %f185, %f183, %f184; add.f32 %f186, %f21, %f21; mul.f32 %f187, %f27, 0f3F000000; sub.f32 %f188, %f186, %f187; mul.f32 %f189, %f1, 0f3FC00000; mul.f32 %f190, %f2, 0f3FC00000; mul.f32 %f191, %f3, 0f3FC00000; sub.f32 %f296, %f188, %f191; sub.f32 %f295, %f185, %f190; sub.f32 %f294, %f182, %f189; $L__BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r89, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r67, %r23, %r41; add.s32 %r68, %r67, %r41; rem.s32 %r89, %r68, %r41; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r69, %r89, %r4; mad.lo.s32 %r70, %r69, %r40, %r1; mul.wide.s32 %rd30, %r70, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f76, [%rd33]; ld.global.nc.f32 %f75, [%rd32]; ld.global.nc.f32 %f74, [%rd31]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r90, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r71, %r27, %r41; add.s32 %r72, %r71, %r41; rem.s32 %r90, %r72, %r41; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r73, %r90, %r4; mad.lo.s32 %r74, %r73, %r40, %r1; mul.wide.s32 %rd34, %r74, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f105, [%rd37]; ld.global.nc.f32 %f106, [%rd36]; ld.global.nc.f32 %f107, [%rd35]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r77, %r41, -1; min.s32 %r91, %r31, %r77; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r75, %r31, %r41; add.s32 %r76, %r75, %r41; rem.s32 %r91, %r76, %r41; $L__BB0_49: setp.ge.s32 %p52, %r31, %r41; 
mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r78, %r91, %r4; mad.lo.s32 %r79, %r78, %r40, %r1; mul.wide.s32 %rd38, %r79, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f88, [%rd41]; ld.global.nc.f32 %f87, [%rd40]; ld.global.nc.f32 %f86, [%rd39]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r82, %r41, -1; min.s32 %r92, %r35, %r82; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r80, %r35, %r41; add.s32 %r81, %r80, %r41; rem.s32 %r92, %r81, %r41; $L__BB0_54: add.s32 %r83, %r92, %r4; mad.lo.s32 %r39, %r83, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f94, [%rd45]; ld.global.nc.f32 %f93, [%rd44]; ld.global.nc.f32 %f92, [%rd43]; $L__BB0_56: mul.f32 %f216, %f87, %f87; fma.rn.f32 %f217, %f86, %f86, %f216; fma.rn.f32 %f98, %f88, %f88, %f217; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f221, %f106, %f106; fma.rn.f32 %f222, %f107, %f107, %f221; fma.rn.f32 %f223, %f105, %f105, %f222; setp.eq.f32 %p60, %f223, 0f00000000; mov.f32 %f312, 0f00000000; mov.f32 %f313, %f312; mov.f32 %f314, %f312; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f224, %f75, %f75; fma.rn.f32 %f225, %f74, %f74, %f224; fma.rn.f32 %f111, %f76, %f76, %f225; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f226, %f93, %f93; fma.rn.f32 %f227, %f92, %f92, %f226; fma.rn.f32 %f115, %f94, %f94, %f227; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f228, %f106, %f106; fma.rn.f32 %f229, %f107, %f107, %f228; fma.rn.f32 %f230, %f105, %f105, %f229; setp.neu.f32 %p66, %f230, 
0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f263, %f86, %f107; sub.f32 %f264, %f87, %f106; sub.f32 %f265, %f88, %f105; mul.f32 %f314, %f265, 0f3F000000; mul.f32 %f313, %f264, 0f3F000000; mul.f32 %f312, %f263, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f314, %f3, %f105; sub.f32 %f313, %f2, %f106; sub.f32 %f312, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f231, %f106, %f106; fma.rn.f32 %f232, %f107, %f107, %f231; fma.rn.f32 %f116, %f105, %f105, %f232; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f314, %f88, %f3; sub.f32 %f313, %f87, %f2; sub.f32 %f312, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f254, %f86, %f107; sub.f32 %f255, %f87, %f106; sub.f32 %f256, %f88, %f105; sub.f32 %f257, %f74, %f92; mul.f32 %f258, %f257, 0f3DAAAAAB; sub.f32 %f259, %f75, %f93; mul.f32 %f260, %f259, 0f3DAAAAAB; sub.f32 %f261, %f76, %f94; mul.f32 %f262, %f261, 0f3DAAAAAB; fma.rn.f32 %f314, %f256, 0f3F2AAAAB, %f262; fma.rn.f32 %f313, %f255, 0f3F2AAAAB, %f260; fma.rn.f32 %f312, %f254, 0f3F2AAAAB, %f258; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f233, %f74, 0f3F000000; add.f32 %f234, %f107, %f107; sub.f32 %f235, %f233, %f234; add.f32 %f236, %f106, %f106; mul.f32 %f237, %f75, 0f3F000000; sub.f32 %f238, %f237, %f236; add.f32 %f239, %f105, %f105; mul.f32 %f240, %f76, 0f3F000000; sub.f32 %f241, %f240, %f239; fma.rn.f32 %f314, %f3, 0f3FC00000, %f241; fma.rn.f32 %f313, %f2, 0f3FC00000, %f238; fma.rn.f32 %f312, %f1, 0f3FC00000, %f235; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f242, 
%f92, 0f3F000000; add.f32 %f243, %f86, %f86; sub.f32 %f244, %f243, %f242; add.f32 %f245, %f87, %f87; mul.f32 %f246, %f93, 0f3F000000; sub.f32 %f247, %f245, %f246; add.f32 %f248, %f88, %f88; mul.f32 %f249, %f94, 0f3F000000; sub.f32 %f250, %f248, %f249; mul.f32 %f251, %f1, 0f3FC00000; mul.f32 %f252, %f2, 0f3FC00000; mul.f32 %f253, %f3, 0f3FC00000; sub.f32 %f314, %f250, %f253; sub.f32 %f313, %f247, %f252; sub.f32 %f312, %f244, %f251; $L__BB0_70: mul.f32 %f266, %f295, %f314; mul.f32 %f267, %f296, %f313; sub.f32 %f268, %f266, %f267; mul.f32 %f269, %f296, %f312; mul.f32 %f270, %f294, %f314; sub.f32 %f271, %f269, %f270; mul.f32 %f272, %f294, %f313; mul.f32 %f273, %f295, %f312; sub.f32 %f274, %f272, %f273; mul.f32 %f275, %f2, %f271; fma.rn.f32 %f276, %f1, %f268, %f275; fma.rn.f32 %f277, %f3, %f274, %f276; mul.f32 %f278, %f277, %f138; st.global.f32 [%rd4], %f278; $L__BB0_72: ret; } ` settopologicalcharge_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<79>; .reg .b16 %rs<4>; .reg .f32 %f<315>; .reg .b32 %r<93>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalcharge_param_8]; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f138, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; cvta.to.global.u64 %rd1, %rd8; 
cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r44, %r43, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r47, %r46, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r50, %r49, %r51; setp.ge.s32 %p1, %r1, %r40; setp.ge.s32 %p2, %r2, %r41; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r42; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_72; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd9, %r53, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f139, %f2, %f2; fma.rn.f32 %f140, %f1, %f1, %f139; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f141, %f3, %f3, %f140; setp.eq.f32 %p6, %f141, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p6 bra $L__BB0_71; bra.uni $L__BB0_2; $L__BB0_71: mov.u32 %r84, 0; st.global.u32 [%rd4], %r84; bra.uni $L__BB0_72; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r85, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r85, %r55, %r40; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r56, %r85, %r5; mul.wide.s32 %rd14, %r56, 4; add.s64 %rd15, %rd3, %rd14; add.s64 %rd16, %rd2, %rd14; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; ld.global.nc.f32 %f8, [%rd16]; ld.global.nc.f32 %f7, [%rd15]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r86, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r40; add.s32 %r58, %r57, %r40; rem.s32 %r86, %r58, %r40; $L__BB0_10: 
setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r59, %r86, %r5; mul.wide.s32 %rd18, %r59, 4; add.s64 %rd19, %rd3, %rd18; add.s64 %rd20, %rd2, %rd18; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f38, [%rd21]; ld.global.nc.f32 %f39, [%rd20]; ld.global.nc.f32 %f40, [%rd19]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r62, %r40, -1; min.s32 %r87, %r14, %r62; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r60, %r14, %r40; add.s32 %r61, %r60, %r40; rem.s32 %r87, %r61, %r40; $L__BB0_15: setp.ge.s32 %p16, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r63, %r87, %r5; mul.wide.s32 %rd22, %r63, 4; add.s64 %rd23, %rd3, %rd22; add.s64 %rd24, %rd2, %rd22; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; ld.global.nc.f32 %f20, [%rd24]; ld.global.nc.f32 %f19, [%rd23]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r66, %r40, -1; min.s32 %r88, %r18, %r66; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r64, %r18, %r40; add.s32 %r65, %r64, %r40; rem.s32 %r88, %r65, %r40; $L__BB0_20: add.s32 %r22, %r88, %r5; setp.ge.s32 %p20, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; add.s64 %rd28, %rd2, %rd26; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; ld.global.nc.f32 %f26, [%rd28]; ld.global.nc.f32 %f25, [%rd27]; $L__BB0_22: mul.f32 %f154, %f20, %f20; fma.rn.f32 %f155, %f19, %f19, %f154; fma.rn.f32 %f31, %f21, %f21, %f155; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f159, %f39, %f39; fma.rn.f32 %f160, %f40, %f40, %f159; fma.rn.f32 %f161, %f38, %f38, %f160; setp.eq.f32 %p24, %f161, 0f00000000; mov.f32 
%f294, 0f00000000; mov.f32 %f295, %f294; mov.f32 %f296, %f294; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f162, %f8, %f8; fma.rn.f32 %f163, %f7, %f7, %f162; fma.rn.f32 %f44, %f9, %f9, %f163; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f164, %f26, %f26; fma.rn.f32 %f165, %f25, %f25, %f164; fma.rn.f32 %f48, %f27, %f27, %f165; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f166, %f39, %f39; fma.rn.f32 %f167, %f40, %f40, %f166; fma.rn.f32 %f168, %f38, %f38, %f167; setp.neu.f32 %p30, %f168, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f201, %f19, %f40; sub.f32 %f202, %f20, %f39; sub.f32 %f203, %f21, %f38; mul.f32 %f296, %f203, 0f3F000000; mul.f32 %f295, %f202, 0f3F000000; mul.f32 %f294, %f201, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f296, %f3, %f38; sub.f32 %f295, %f2, %f39; sub.f32 %f294, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f169, %f39, %f39; fma.rn.f32 %f170, %f40, %f40, %f169; fma.rn.f32 %f49, %f38, %f38, %f170; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f296, %f21, %f3; sub.f32 %f295, %f20, %f2; sub.f32 %f294, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f192, %f19, %f40; sub.f32 %f193, %f20, %f39; sub.f32 %f194, %f21, %f38; sub.f32 %f195, %f7, %f25; mul.f32 %f196, %f195, 0f3DAAAAAB; sub.f32 %f197, %f8, %f26; mul.f32 %f198, %f197, 0f3DAAAAAB; sub.f32 %f199, %f9, %f27; mul.f32 %f200, %f199, 0f3DAAAAAB; fma.rn.f32 %f296, %f194, 0f3F2AAAAB, %f200; fma.rn.f32 %f295, 
%f193, 0f3F2AAAAB, %f198; fma.rn.f32 %f294, %f192, 0f3F2AAAAB, %f196; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f171, %f7, 0f3F000000; add.f32 %f172, %f40, %f40; sub.f32 %f173, %f171, %f172; add.f32 %f174, %f39, %f39; mul.f32 %f175, %f8, 0f3F000000; sub.f32 %f176, %f175, %f174; add.f32 %f177, %f38, %f38; mul.f32 %f178, %f9, 0f3F000000; sub.f32 %f179, %f178, %f177; fma.rn.f32 %f296, %f3, 0f3FC00000, %f179; fma.rn.f32 %f295, %f2, 0f3FC00000, %f176; fma.rn.f32 %f294, %f1, 0f3FC00000, %f173; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f180, %f25, 0f3F000000; add.f32 %f181, %f19, %f19; sub.f32 %f182, %f181, %f180; add.f32 %f183, %f20, %f20; mul.f32 %f184, %f26, 0f3F000000; sub.f32 %f185, %f183, %f184; add.f32 %f186, %f21, %f21; mul.f32 %f187, %f27, 0f3F000000; sub.f32 %f188, %f186, %f187; mul.f32 %f189, %f1, 0f3FC00000; mul.f32 %f190, %f2, 0f3FC00000; mul.f32 %f191, %f3, 0f3FC00000; sub.f32 %f296, %f188, %f191; sub.f32 %f295, %f185, %f190; sub.f32 %f294, %f182, %f189; $L__BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r89, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r67, %r23, %r41; add.s32 %r68, %r67, %r41; rem.s32 %r89, %r68, %r41; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r69, %r89, %r4; mad.lo.s32 %r70, %r69, %r40, %r1; mul.wide.s32 %rd30, %r70, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f76, [%rd33]; ld.global.nc.f32 %f75, [%rd32]; ld.global.nc.f32 %f74, [%rd31]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r90, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r71, %r27, %r41; add.s32 %r72, %r71, %r41; rem.s32 %r90, %r72, %r41; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, 
%f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r73, %r90, %r4; mad.lo.s32 %r74, %r73, %r40, %r1; mul.wide.s32 %rd34, %r74, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f105, [%rd37]; ld.global.nc.f32 %f106, [%rd36]; ld.global.nc.f32 %f107, [%rd35]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r77, %r41, -1; min.s32 %r91, %r31, %r77; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r75, %r31, %r41; add.s32 %r76, %r75, %r41; rem.s32 %r91, %r76, %r41; $L__BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r78, %r91, %r4; mad.lo.s32 %r79, %r78, %r40, %r1; mul.wide.s32 %rd38, %r79, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f88, [%rd41]; ld.global.nc.f32 %f87, [%rd40]; ld.global.nc.f32 %f86, [%rd39]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r82, %r41, -1; min.s32 %r92, %r35, %r82; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r80, %r35, %r41; add.s32 %r81, %r80, %r41; rem.s32 %r92, %r81, %r41; $L__BB0_54: add.s32 %r83, %r92, %r4; mad.lo.s32 %r39, %r83, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f94, [%rd45]; ld.global.nc.f32 %f93, [%rd44]; ld.global.nc.f32 %f92, [%rd43]; $L__BB0_56: mul.f32 %f216, %f87, %f87; fma.rn.f32 %f217, %f86, %f86, %f216; fma.rn.f32 %f98, %f88, %f88, %f217; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f221, %f106, %f106; fma.rn.f32 %f222, %f107, %f107, %f221; fma.rn.f32 %f223, %f105, %f105, %f222; setp.eq.f32 %p60, 
%f223, 0f00000000; mov.f32 %f312, 0f00000000; mov.f32 %f313, %f312; mov.f32 %f314, %f312; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f224, %f75, %f75; fma.rn.f32 %f225, %f74, %f74, %f224; fma.rn.f32 %f111, %f76, %f76, %f225; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f226, %f93, %f93; fma.rn.f32 %f227, %f92, %f92, %f226; fma.rn.f32 %f115, %f94, %f94, %f227; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f228, %f106, %f106; fma.rn.f32 %f229, %f107, %f107, %f228; fma.rn.f32 %f230, %f105, %f105, %f229; setp.neu.f32 %p66, %f230, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f263, %f86, %f107; sub.f32 %f264, %f87, %f106; sub.f32 %f265, %f88, %f105; mul.f32 %f314, %f265, 0f3F000000; mul.f32 %f313, %f264, 0f3F000000; mul.f32 %f312, %f263, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f314, %f3, %f105; sub.f32 %f313, %f2, %f106; sub.f32 %f312, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f231, %f106, %f106; fma.rn.f32 %f232, %f107, %f107, %f231; fma.rn.f32 %f116, %f105, %f105, %f232; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f314, %f88, %f3; sub.f32 %f313, %f87, %f2; sub.f32 %f312, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f254, %f86, %f107; sub.f32 %f255, %f87, %f106; sub.f32 %f256, %f88, %f105; sub.f32 %f257, %f74, %f92; mul.f32 %f258, %f257, 0f3DAAAAAB; sub.f32 %f259, %f75, %f93; mul.f32 %f260, %f259, 0f3DAAAAAB; sub.f32 %f261, %f76, %f94; mul.f32 %f262, %f261, 0f3DAAAAAB; 
fma.rn.f32 %f314, %f256, 0f3F2AAAAB, %f262; fma.rn.f32 %f313, %f255, 0f3F2AAAAB, %f260; fma.rn.f32 %f312, %f254, 0f3F2AAAAB, %f258; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f233, %f74, 0f3F000000; add.f32 %f234, %f107, %f107; sub.f32 %f235, %f233, %f234; add.f32 %f236, %f106, %f106; mul.f32 %f237, %f75, 0f3F000000; sub.f32 %f238, %f237, %f236; add.f32 %f239, %f105, %f105; mul.f32 %f240, %f76, 0f3F000000; sub.f32 %f241, %f240, %f239; fma.rn.f32 %f314, %f3, 0f3FC00000, %f241; fma.rn.f32 %f313, %f2, 0f3FC00000, %f238; fma.rn.f32 %f312, %f1, 0f3FC00000, %f235; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f242, %f92, 0f3F000000; add.f32 %f243, %f86, %f86; sub.f32 %f244, %f243, %f242; add.f32 %f245, %f87, %f87; mul.f32 %f246, %f93, 0f3F000000; sub.f32 %f247, %f245, %f246; add.f32 %f248, %f88, %f88; mul.f32 %f249, %f94, 0f3F000000; sub.f32 %f250, %f248, %f249; mul.f32 %f251, %f1, 0f3FC00000; mul.f32 %f252, %f2, 0f3FC00000; mul.f32 %f253, %f3, 0f3FC00000; sub.f32 %f314, %f250, %f253; sub.f32 %f313, %f247, %f252; sub.f32 %f312, %f244, %f251; $L__BB0_70: mul.f32 %f266, %f295, %f314; mul.f32 %f267, %f296, %f313; sub.f32 %f268, %f266, %f267; mul.f32 %f269, %f296, %f312; mul.f32 %f270, %f294, %f314; sub.f32 %f271, %f269, %f270; mul.f32 %f272, %f294, %f313; mul.f32 %f273, %f295, %f312; sub.f32 %f274, %f272, %f273; mul.f32 %f275, %f2, %f271; fma.rn.f32 %f276, %f1, %f268, %f275; fma.rn.f32 %f277, %f3, %f274, %f276; mul.f32 %f278, %f277, %f138; st.global.f32 [%rd4], %f278; $L__BB0_72: ret; } ` settopologicalcharge_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 
settopologicalcharge_param_8 ) { .reg .pred %p<79>; .reg .b16 %rs<4>; .reg .f32 %f<315>; .reg .b32 %r<93>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalcharge_param_8]; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f138, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r44, %r43, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r47, %r46, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r50, %r49, %r51; setp.ge.s32 %p1, %r1, %r40; setp.ge.s32 %p2, %r2, %r41; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r42; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_72; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd9, %r53, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f139, %f2, %f2; fma.rn.f32 %f140, %f1, %f1, %f139; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f141, %f3, %f3, %f140; setp.eq.f32 %p6, %f141, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p6 bra $L__BB0_71; bra.uni $L__BB0_2; $L__BB0_71: mov.u32 %r84, 0; st.global.u32 [%rd4], %r84; bra.uni $L__BB0_72; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r85, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r85, 
%r55, %r40; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r56, %r85, %r5; mul.wide.s32 %rd14, %r56, 4; add.s64 %rd15, %rd3, %rd14; add.s64 %rd16, %rd2, %rd14; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; ld.global.nc.f32 %f8, [%rd16]; ld.global.nc.f32 %f7, [%rd15]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r86, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r40; add.s32 %r58, %r57, %r40; rem.s32 %r86, %r58, %r40; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r59, %r86, %r5; mul.wide.s32 %rd18, %r59, 4; add.s64 %rd19, %rd3, %rd18; add.s64 %rd20, %rd2, %rd18; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f38, [%rd21]; ld.global.nc.f32 %f39, [%rd20]; ld.global.nc.f32 %f40, [%rd19]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r62, %r40, -1; min.s32 %r87, %r14, %r62; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r60, %r14, %r40; add.s32 %r61, %r60, %r40; rem.s32 %r87, %r61, %r40; $L__BB0_15: setp.ge.s32 %p16, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r63, %r87, %r5; mul.wide.s32 %rd22, %r63, 4; add.s64 %rd23, %rd3, %rd22; add.s64 %rd24, %rd2, %rd22; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; ld.global.nc.f32 %f20, [%rd24]; ld.global.nc.f32 %f19, [%rd23]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r66, %r40, -1; min.s32 %r88, %r18, %r66; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r64, %r18, %r40; add.s32 %r65, %r64, %r40; rem.s32 %r88, %r65, %r40; $L__BB0_20: add.s32 %r22, %r88, %r5; setp.ge.s32 %p20, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 
%f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; add.s64 %rd28, %rd2, %rd26; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; ld.global.nc.f32 %f26, [%rd28]; ld.global.nc.f32 %f25, [%rd27]; $L__BB0_22: mul.f32 %f154, %f20, %f20; fma.rn.f32 %f155, %f19, %f19, %f154; fma.rn.f32 %f31, %f21, %f21, %f155; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f159, %f39, %f39; fma.rn.f32 %f160, %f40, %f40, %f159; fma.rn.f32 %f161, %f38, %f38, %f160; setp.eq.f32 %p24, %f161, 0f00000000; mov.f32 %f294, 0f00000000; mov.f32 %f295, %f294; mov.f32 %f296, %f294; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f162, %f8, %f8; fma.rn.f32 %f163, %f7, %f7, %f162; fma.rn.f32 %f44, %f9, %f9, %f163; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f164, %f26, %f26; fma.rn.f32 %f165, %f25, %f25, %f164; fma.rn.f32 %f48, %f27, %f27, %f165; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f166, %f39, %f39; fma.rn.f32 %f167, %f40, %f40, %f166; fma.rn.f32 %f168, %f38, %f38, %f167; setp.neu.f32 %p30, %f168, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f201, %f19, %f40; sub.f32 %f202, %f20, %f39; sub.f32 %f203, %f21, %f38; mul.f32 %f296, %f203, 0f3F000000; mul.f32 %f295, %f202, 0f3F000000; mul.f32 %f294, %f201, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f296, %f3, %f38; sub.f32 %f295, %f2, %f39; sub.f32 %f294, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f169, %f39, %f39; fma.rn.f32 %f170, %f40, %f40, %f169; fma.rn.f32 %f49, %f38, %f38, %f170; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f296, %f21, %f3; sub.f32 %f295, %f20, %f2; sub.f32 
%f294, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f192, %f19, %f40; sub.f32 %f193, %f20, %f39; sub.f32 %f194, %f21, %f38; sub.f32 %f195, %f7, %f25; mul.f32 %f196, %f195, 0f3DAAAAAB; sub.f32 %f197, %f8, %f26; mul.f32 %f198, %f197, 0f3DAAAAAB; sub.f32 %f199, %f9, %f27; mul.f32 %f200, %f199, 0f3DAAAAAB; fma.rn.f32 %f296, %f194, 0f3F2AAAAB, %f200; fma.rn.f32 %f295, %f193, 0f3F2AAAAB, %f198; fma.rn.f32 %f294, %f192, 0f3F2AAAAB, %f196; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f171, %f7, 0f3F000000; add.f32 %f172, %f40, %f40; sub.f32 %f173, %f171, %f172; add.f32 %f174, %f39, %f39; mul.f32 %f175, %f8, 0f3F000000; sub.f32 %f176, %f175, %f174; add.f32 %f177, %f38, %f38; mul.f32 %f178, %f9, 0f3F000000; sub.f32 %f179, %f178, %f177; fma.rn.f32 %f296, %f3, 0f3FC00000, %f179; fma.rn.f32 %f295, %f2, 0f3FC00000, %f176; fma.rn.f32 %f294, %f1, 0f3FC00000, %f173; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f180, %f25, 0f3F000000; add.f32 %f181, %f19, %f19; sub.f32 %f182, %f181, %f180; add.f32 %f183, %f20, %f20; mul.f32 %f184, %f26, 0f3F000000; sub.f32 %f185, %f183, %f184; add.f32 %f186, %f21, %f21; mul.f32 %f187, %f27, 0f3F000000; sub.f32 %f188, %f186, %f187; mul.f32 %f189, %f1, 0f3FC00000; mul.f32 %f190, %f2, 0f3FC00000; mul.f32 %f191, %f3, 0f3FC00000; sub.f32 %f296, %f188, %f191; sub.f32 %f295, %f185, %f190; sub.f32 %f294, %f182, %f189; $L__BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r89, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r67, %r23, %r41; add.s32 %r68, %r67, %r41; rem.s32 %r89, %r68, %r41; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra 
$L__BB0_41; add.s32 %r69, %r89, %r4; mad.lo.s32 %r70, %r69, %r40, %r1; mul.wide.s32 %rd30, %r70, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f76, [%rd33]; ld.global.nc.f32 %f75, [%rd32]; ld.global.nc.f32 %f74, [%rd31]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r90, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r71, %r27, %r41; add.s32 %r72, %r71, %r41; rem.s32 %r90, %r72, %r41; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r73, %r90, %r4; mad.lo.s32 %r74, %r73, %r40, %r1; mul.wide.s32 %rd34, %r74, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f105, [%rd37]; ld.global.nc.f32 %f106, [%rd36]; ld.global.nc.f32 %f107, [%rd35]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r77, %r41, -1; min.s32 %r91, %r31, %r77; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r75, %r31, %r41; add.s32 %r76, %r75, %r41; rem.s32 %r91, %r76, %r41; $L__BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r78, %r91, %r4; mad.lo.s32 %r79, %r78, %r40, %r1; mul.wide.s32 %rd38, %r79, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f88, [%rd41]; ld.global.nc.f32 %f87, [%rd40]; ld.global.nc.f32 %f86, [%rd39]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r82, %r41, -1; min.s32 %r92, %r35, %r82; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r80, %r35, %r41; add.s32 %r81, %r80, %r41; rem.s32 %r92, %r81, %r41; $L__BB0_54: add.s32 %r83, %r92, %r4; mad.lo.s32 %r39, %r83, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, 
%p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f94, [%rd45]; ld.global.nc.f32 %f93, [%rd44]; ld.global.nc.f32 %f92, [%rd43]; $L__BB0_56: mul.f32 %f216, %f87, %f87; fma.rn.f32 %f217, %f86, %f86, %f216; fma.rn.f32 %f98, %f88, %f88, %f217; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f221, %f106, %f106; fma.rn.f32 %f222, %f107, %f107, %f221; fma.rn.f32 %f223, %f105, %f105, %f222; setp.eq.f32 %p60, %f223, 0f00000000; mov.f32 %f312, 0f00000000; mov.f32 %f313, %f312; mov.f32 %f314, %f312; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f224, %f75, %f75; fma.rn.f32 %f225, %f74, %f74, %f224; fma.rn.f32 %f111, %f76, %f76, %f225; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f226, %f93, %f93; fma.rn.f32 %f227, %f92, %f92, %f226; fma.rn.f32 %f115, %f94, %f94, %f227; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f228, %f106, %f106; fma.rn.f32 %f229, %f107, %f107, %f228; fma.rn.f32 %f230, %f105, %f105, %f229; setp.neu.f32 %p66, %f230, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f263, %f86, %f107; sub.f32 %f264, %f87, %f106; sub.f32 %f265, %f88, %f105; mul.f32 %f314, %f265, 0f3F000000; mul.f32 %f313, %f264, 0f3F000000; mul.f32 %f312, %f263, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f314, %f3, %f105; sub.f32 %f313, %f2, %f106; sub.f32 %f312, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f231, %f106, %f106; fma.rn.f32 %f232, %f107, %f107, %f231; fma.rn.f32 %f116, %f105, %f105, %f232; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 
%f314, %f88, %f3; sub.f32 %f313, %f87, %f2; sub.f32 %f312, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f254, %f86, %f107; sub.f32 %f255, %f87, %f106; sub.f32 %f256, %f88, %f105; sub.f32 %f257, %f74, %f92; mul.f32 %f258, %f257, 0f3DAAAAAB; sub.f32 %f259, %f75, %f93; mul.f32 %f260, %f259, 0f3DAAAAAB; sub.f32 %f261, %f76, %f94; mul.f32 %f262, %f261, 0f3DAAAAAB; fma.rn.f32 %f314, %f256, 0f3F2AAAAB, %f262; fma.rn.f32 %f313, %f255, 0f3F2AAAAB, %f260; fma.rn.f32 %f312, %f254, 0f3F2AAAAB, %f258; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f233, %f74, 0f3F000000; add.f32 %f234, %f107, %f107; sub.f32 %f235, %f233, %f234; add.f32 %f236, %f106, %f106; mul.f32 %f237, %f75, 0f3F000000; sub.f32 %f238, %f237, %f236; add.f32 %f239, %f105, %f105; mul.f32 %f240, %f76, 0f3F000000; sub.f32 %f241, %f240, %f239; fma.rn.f32 %f314, %f3, 0f3FC00000, %f241; fma.rn.f32 %f313, %f2, 0f3FC00000, %f238; fma.rn.f32 %f312, %f1, 0f3FC00000, %f235; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f242, %f92, 0f3F000000; add.f32 %f243, %f86, %f86; sub.f32 %f244, %f243, %f242; add.f32 %f245, %f87, %f87; mul.f32 %f246, %f93, 0f3F000000; sub.f32 %f247, %f245, %f246; add.f32 %f248, %f88, %f88; mul.f32 %f249, %f94, 0f3F000000; sub.f32 %f250, %f248, %f249; mul.f32 %f251, %f1, 0f3FC00000; mul.f32 %f252, %f2, 0f3FC00000; mul.f32 %f253, %f3, 0f3FC00000; sub.f32 %f314, %f250, %f253; sub.f32 %f313, %f247, %f252; sub.f32 %f312, %f244, %f251; $L__BB0_70: mul.f32 %f266, %f295, %f314; mul.f32 %f267, %f296, %f313; sub.f32 %f268, %f266, %f267; mul.f32 %f269, %f296, %f312; mul.f32 %f270, %f294, %f314; sub.f32 %f271, %f269, %f270; mul.f32 %f272, %f294, %f313; mul.f32 %f273, %f295, %f312; sub.f32 %f274, %f272, %f273; mul.f32 %f275, %f2, %f271; fma.rn.f32 %f276, %f1, %f268, %f275; fma.rn.f32 
%f277, %f3, %f274, %f276; mul.f32 %f278, %f277, %f138; st.global.f32 [%rd4], %f278; $L__BB0_72: ret; } ` settopologicalcharge_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<79>; .reg .b16 %rs<4>; .reg .f32 %f<315>; .reg .b32 %r<93>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalcharge_param_8]; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f138, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r44, %r43, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r47, %r46, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r50, %r49, %r51; setp.ge.s32 %p1, %r1, %r40; setp.ge.s32 %p2, %r2, %r41; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r42; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_72; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd9, %r53, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, 
[%rd11]; mul.f32 %f139, %f2, %f2; fma.rn.f32 %f140, %f1, %f1, %f139; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f141, %f3, %f3, %f140; setp.eq.f32 %p6, %f141, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p6 bra $L__BB0_71; bra.uni $L__BB0_2; $L__BB0_71: mov.u32 %r84, 0; st.global.u32 [%rd4], %r84; bra.uni $L__BB0_72; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r85, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r85, %r55, %r40; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r56, %r85, %r5; mul.wide.s32 %rd14, %r56, 4; add.s64 %rd15, %rd3, %rd14; add.s64 %rd16, %rd2, %rd14; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; ld.global.nc.f32 %f8, [%rd16]; ld.global.nc.f32 %f7, [%rd15]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r86, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r40; add.s32 %r58, %r57, %r40; rem.s32 %r86, %r58, %r40; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r59, %r86, %r5; mul.wide.s32 %rd18, %r59, 4; add.s64 %rd19, %rd3, %rd18; add.s64 %rd20, %rd2, %rd18; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f38, [%rd21]; ld.global.nc.f32 %f39, [%rd20]; ld.global.nc.f32 %f40, [%rd19]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r62, %r40, -1; min.s32 %r87, %r14, %r62; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r60, %r14, %r40; add.s32 %r61, %r60, %r40; rem.s32 %r87, %r61, %r40; $L__BB0_15: setp.ge.s32 %p16, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 
%r63, %r87, %r5; mul.wide.s32 %rd22, %r63, 4; add.s64 %rd23, %rd3, %rd22; add.s64 %rd24, %rd2, %rd22; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; ld.global.nc.f32 %f20, [%rd24]; ld.global.nc.f32 %f19, [%rd23]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r66, %r40, -1; min.s32 %r88, %r18, %r66; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r64, %r18, %r40; add.s32 %r65, %r64, %r40; rem.s32 %r88, %r65, %r40; $L__BB0_20: add.s32 %r22, %r88, %r5; setp.ge.s32 %p20, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; add.s64 %rd28, %rd2, %rd26; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; ld.global.nc.f32 %f26, [%rd28]; ld.global.nc.f32 %f25, [%rd27]; $L__BB0_22: mul.f32 %f154, %f20, %f20; fma.rn.f32 %f155, %f19, %f19, %f154; fma.rn.f32 %f31, %f21, %f21, %f155; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f159, %f39, %f39; fma.rn.f32 %f160, %f40, %f40, %f159; fma.rn.f32 %f161, %f38, %f38, %f160; setp.eq.f32 %p24, %f161, 0f00000000; mov.f32 %f294, 0f00000000; mov.f32 %f295, %f294; mov.f32 %f296, %f294; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f162, %f8, %f8; fma.rn.f32 %f163, %f7, %f7, %f162; fma.rn.f32 %f44, %f9, %f9, %f163; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f164, %f26, %f26; fma.rn.f32 %f165, %f25, %f25, %f164; fma.rn.f32 %f48, %f27, %f27, %f165; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f166, %f39, %f39; fma.rn.f32 %f167, %f40, %f40, %f166; fma.rn.f32 %f168, %f38, %f38, %f167; setp.neu.f32 %p30, %f168, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f201, %f19, %f40; sub.f32 %f202, %f20, %f39; sub.f32 %f203, %f21, %f38; mul.f32 %f296, %f203, 0f3F000000; mul.f32 %f295, %f202, 0f3F000000; 
mul.f32 %f294, %f201, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f296, %f3, %f38; sub.f32 %f295, %f2, %f39; sub.f32 %f294, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f169, %f39, %f39; fma.rn.f32 %f170, %f40, %f40, %f169; fma.rn.f32 %f49, %f38, %f38, %f170; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f296, %f21, %f3; sub.f32 %f295, %f20, %f2; sub.f32 %f294, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f192, %f19, %f40; sub.f32 %f193, %f20, %f39; sub.f32 %f194, %f21, %f38; sub.f32 %f195, %f7, %f25; mul.f32 %f196, %f195, 0f3DAAAAAB; sub.f32 %f197, %f8, %f26; mul.f32 %f198, %f197, 0f3DAAAAAB; sub.f32 %f199, %f9, %f27; mul.f32 %f200, %f199, 0f3DAAAAAB; fma.rn.f32 %f296, %f194, 0f3F2AAAAB, %f200; fma.rn.f32 %f295, %f193, 0f3F2AAAAB, %f198; fma.rn.f32 %f294, %f192, 0f3F2AAAAB, %f196; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f171, %f7, 0f3F000000; add.f32 %f172, %f40, %f40; sub.f32 %f173, %f171, %f172; add.f32 %f174, %f39, %f39; mul.f32 %f175, %f8, 0f3F000000; sub.f32 %f176, %f175, %f174; add.f32 %f177, %f38, %f38; mul.f32 %f178, %f9, 0f3F000000; sub.f32 %f179, %f178, %f177; fma.rn.f32 %f296, %f3, 0f3FC00000, %f179; fma.rn.f32 %f295, %f2, 0f3FC00000, %f176; fma.rn.f32 %f294, %f1, 0f3FC00000, %f173; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f180, %f25, 0f3F000000; add.f32 %f181, %f19, %f19; sub.f32 %f182, %f181, %f180; add.f32 %f183, %f20, %f20; mul.f32 %f184, %f26, 0f3F000000; sub.f32 %f185, %f183, %f184; add.f32 %f186, %f21, %f21; mul.f32 %f187, %f27, 0f3F000000; sub.f32 %f188, %f186, 
%f187; mul.f32 %f189, %f1, 0f3FC00000; mul.f32 %f190, %f2, 0f3FC00000; mul.f32 %f191, %f3, 0f3FC00000; sub.f32 %f296, %f188, %f191; sub.f32 %f295, %f185, %f190; sub.f32 %f294, %f182, %f189; $L__BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r89, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r67, %r23, %r41; add.s32 %r68, %r67, %r41; rem.s32 %r89, %r68, %r41; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r69, %r89, %r4; mad.lo.s32 %r70, %r69, %r40, %r1; mul.wide.s32 %rd30, %r70, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f76, [%rd33]; ld.global.nc.f32 %f75, [%rd32]; ld.global.nc.f32 %f74, [%rd31]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r90, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r71, %r27, %r41; add.s32 %r72, %r71, %r41; rem.s32 %r90, %r72, %r41; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r73, %r90, %r4; mad.lo.s32 %r74, %r73, %r40, %r1; mul.wide.s32 %rd34, %r74, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f105, [%rd37]; ld.global.nc.f32 %f106, [%rd36]; ld.global.nc.f32 %f107, [%rd35]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r77, %r41, -1; min.s32 %r91, %r31, %r77; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r75, %r31, %r41; add.s32 %r76, %r75, %r41; rem.s32 %r91, %r76, %r41; $L__BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r78, %r91, %r4; mad.lo.s32 %r79, %r78, %r40, %r1; 
mul.wide.s32 %rd38, %r79, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f88, [%rd41]; ld.global.nc.f32 %f87, [%rd40]; ld.global.nc.f32 %f86, [%rd39]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r82, %r41, -1; min.s32 %r92, %r35, %r82; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r80, %r35, %r41; add.s32 %r81, %r80, %r41; rem.s32 %r92, %r81, %r41; $L__BB0_54: add.s32 %r83, %r92, %r4; mad.lo.s32 %r39, %r83, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f94, [%rd45]; ld.global.nc.f32 %f93, [%rd44]; ld.global.nc.f32 %f92, [%rd43]; $L__BB0_56: mul.f32 %f216, %f87, %f87; fma.rn.f32 %f217, %f86, %f86, %f216; fma.rn.f32 %f98, %f88, %f88, %f217; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f221, %f106, %f106; fma.rn.f32 %f222, %f107, %f107, %f221; fma.rn.f32 %f223, %f105, %f105, %f222; setp.eq.f32 %p60, %f223, 0f00000000; mov.f32 %f312, 0f00000000; mov.f32 %f313, %f312; mov.f32 %f314, %f312; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f224, %f75, %f75; fma.rn.f32 %f225, %f74, %f74, %f224; fma.rn.f32 %f111, %f76, %f76, %f225; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f226, %f93, %f93; fma.rn.f32 %f227, %f92, %f92, %f226; fma.rn.f32 %f115, %f94, %f94, %f227; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f228, %f106, %f106; fma.rn.f32 %f229, %f107, %f107, %f228; fma.rn.f32 %f230, %f105, %f105, %f229; setp.neu.f32 %p66, %f230, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f263, %f86, %f107; sub.f32 %f264, %f87, %f106; sub.f32 %f265, %f88, %f105; mul.f32 %f314, %f265, 
0f3F000000; mul.f32 %f313, %f264, 0f3F000000; mul.f32 %f312, %f263, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f314, %f3, %f105; sub.f32 %f313, %f2, %f106; sub.f32 %f312, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f231, %f106, %f106; fma.rn.f32 %f232, %f107, %f107, %f231; fma.rn.f32 %f116, %f105, %f105, %f232; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f314, %f88, %f3; sub.f32 %f313, %f87, %f2; sub.f32 %f312, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f254, %f86, %f107; sub.f32 %f255, %f87, %f106; sub.f32 %f256, %f88, %f105; sub.f32 %f257, %f74, %f92; mul.f32 %f258, %f257, 0f3DAAAAAB; sub.f32 %f259, %f75, %f93; mul.f32 %f260, %f259, 0f3DAAAAAB; sub.f32 %f261, %f76, %f94; mul.f32 %f262, %f261, 0f3DAAAAAB; fma.rn.f32 %f314, %f256, 0f3F2AAAAB, %f262; fma.rn.f32 %f313, %f255, 0f3F2AAAAB, %f260; fma.rn.f32 %f312, %f254, 0f3F2AAAAB, %f258; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f233, %f74, 0f3F000000; add.f32 %f234, %f107, %f107; sub.f32 %f235, %f233, %f234; add.f32 %f236, %f106, %f106; mul.f32 %f237, %f75, 0f3F000000; sub.f32 %f238, %f237, %f236; add.f32 %f239, %f105, %f105; mul.f32 %f240, %f76, 0f3F000000; sub.f32 %f241, %f240, %f239; fma.rn.f32 %f314, %f3, 0f3FC00000, %f241; fma.rn.f32 %f313, %f2, 0f3FC00000, %f238; fma.rn.f32 %f312, %f1, 0f3FC00000, %f235; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f242, %f92, 0f3F000000; add.f32 %f243, %f86, %f86; sub.f32 %f244, %f243, %f242; add.f32 %f245, %f87, %f87; mul.f32 %f246, %f93, 0f3F000000; sub.f32 %f247, %f245, %f246; add.f32 
%f248, %f88, %f88; mul.f32 %f249, %f94, 0f3F000000; sub.f32 %f250, %f248, %f249; mul.f32 %f251, %f1, 0f3FC00000; mul.f32 %f252, %f2, 0f3FC00000; mul.f32 %f253, %f3, 0f3FC00000; sub.f32 %f314, %f250, %f253; sub.f32 %f313, %f247, %f252; sub.f32 %f312, %f244, %f251; $L__BB0_70: mul.f32 %f266, %f295, %f314; mul.f32 %f267, %f296, %f313; sub.f32 %f268, %f266, %f267; mul.f32 %f269, %f296, %f312; mul.f32 %f270, %f294, %f314; sub.f32 %f271, %f269, %f270; mul.f32 %f272, %f294, %f313; mul.f32 %f273, %f295, %f312; sub.f32 %f274, %f272, %f273; mul.f32 %f275, %f2, %f271; fma.rn.f32 %f276, %f1, %f268, %f275; fma.rn.f32 %f277, %f3, %f274, %f276; mul.f32 %f278, %f277, %f138; st.global.f32 [%rd4], %f278; $L__BB0_72: ret; } ` settopologicalcharge_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<79>; .reg .b16 %rs<4>; .reg .f32 %f<315>; .reg .b32 %r<93>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalcharge_param_8]; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f138, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r44, %r43, %r45; mov.u32 
%r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r47, %r46, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r50, %r49, %r51; setp.ge.s32 %p1, %r1, %r40; setp.ge.s32 %p2, %r2, %r41; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r42; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_72; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd9, %r53, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f139, %f2, %f2; fma.rn.f32 %f140, %f1, %f1, %f139; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f141, %f3, %f3, %f140; setp.eq.f32 %p6, %f141, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p6 bra $L__BB0_71; bra.uni $L__BB0_2; $L__BB0_71: mov.u32 %r84, 0; st.global.u32 [%rd4], %r84; bra.uni $L__BB0_72; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r85, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r85, %r55, %r40; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r56, %r85, %r5; mul.wide.s32 %rd14, %r56, 4; add.s64 %rd15, %rd3, %rd14; add.s64 %rd16, %rd2, %rd14; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; ld.global.nc.f32 %f8, [%rd16]; ld.global.nc.f32 %f7, [%rd15]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r86, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r40; add.s32 %r58, %r57, %r40; rem.s32 %r86, %r58, %r40; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r59, %r86, %r5; 
mul.wide.s32 %rd18, %r59, 4; add.s64 %rd19, %rd3, %rd18; add.s64 %rd20, %rd2, %rd18; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f38, [%rd21]; ld.global.nc.f32 %f39, [%rd20]; ld.global.nc.f32 %f40, [%rd19]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r62, %r40, -1; min.s32 %r87, %r14, %r62; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r60, %r14, %r40; add.s32 %r61, %r60, %r40; rem.s32 %r87, %r61, %r40; $L__BB0_15: setp.ge.s32 %p16, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r63, %r87, %r5; mul.wide.s32 %rd22, %r63, 4; add.s64 %rd23, %rd3, %rd22; add.s64 %rd24, %rd2, %rd22; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; ld.global.nc.f32 %f20, [%rd24]; ld.global.nc.f32 %f19, [%rd23]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r66, %r40, -1; min.s32 %r88, %r18, %r66; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r64, %r18, %r40; add.s32 %r65, %r64, %r40; rem.s32 %r88, %r65, %r40; $L__BB0_20: add.s32 %r22, %r88, %r5; setp.ge.s32 %p20, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; add.s64 %rd28, %rd2, %rd26; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; ld.global.nc.f32 %f26, [%rd28]; ld.global.nc.f32 %f25, [%rd27]; $L__BB0_22: mul.f32 %f154, %f20, %f20; fma.rn.f32 %f155, %f19, %f19, %f154; fma.rn.f32 %f31, %f21, %f21, %f155; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f159, %f39, %f39; fma.rn.f32 %f160, %f40, %f40, %f159; fma.rn.f32 %f161, %f38, %f38, %f160; setp.eq.f32 %p24, %f161, 0f00000000; mov.f32 %f294, 0f00000000; mov.f32 %f295, %f294; mov.f32 %f296, %f294; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f162, %f8, %f8; fma.rn.f32 %f163, %f7, %f7, %f162; fma.rn.f32 
%f44, %f9, %f9, %f163; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f164, %f26, %f26; fma.rn.f32 %f165, %f25, %f25, %f164; fma.rn.f32 %f48, %f27, %f27, %f165; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f166, %f39, %f39; fma.rn.f32 %f167, %f40, %f40, %f166; fma.rn.f32 %f168, %f38, %f38, %f167; setp.neu.f32 %p30, %f168, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f201, %f19, %f40; sub.f32 %f202, %f20, %f39; sub.f32 %f203, %f21, %f38; mul.f32 %f296, %f203, 0f3F000000; mul.f32 %f295, %f202, 0f3F000000; mul.f32 %f294, %f201, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f296, %f3, %f38; sub.f32 %f295, %f2, %f39; sub.f32 %f294, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f169, %f39, %f39; fma.rn.f32 %f170, %f40, %f40, %f169; fma.rn.f32 %f49, %f38, %f38, %f170; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f296, %f21, %f3; sub.f32 %f295, %f20, %f2; sub.f32 %f294, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f192, %f19, %f40; sub.f32 %f193, %f20, %f39; sub.f32 %f194, %f21, %f38; sub.f32 %f195, %f7, %f25; mul.f32 %f196, %f195, 0f3DAAAAAB; sub.f32 %f197, %f8, %f26; mul.f32 %f198, %f197, 0f3DAAAAAB; sub.f32 %f199, %f9, %f27; mul.f32 %f200, %f199, 0f3DAAAAAB; fma.rn.f32 %f296, %f194, 0f3F2AAAAB, %f200; fma.rn.f32 %f295, %f193, 0f3F2AAAAB, %f198; fma.rn.f32 %f294, %f192, 0f3F2AAAAB, %f196; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f171, %f7, 0f3F000000; add.f32 %f172, %f40, %f40; sub.f32 
%f173, %f171, %f172; add.f32 %f174, %f39, %f39; mul.f32 %f175, %f8, 0f3F000000; sub.f32 %f176, %f175, %f174; add.f32 %f177, %f38, %f38; mul.f32 %f178, %f9, 0f3F000000; sub.f32 %f179, %f178, %f177; fma.rn.f32 %f296, %f3, 0f3FC00000, %f179; fma.rn.f32 %f295, %f2, 0f3FC00000, %f176; fma.rn.f32 %f294, %f1, 0f3FC00000, %f173; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f180, %f25, 0f3F000000; add.f32 %f181, %f19, %f19; sub.f32 %f182, %f181, %f180; add.f32 %f183, %f20, %f20; mul.f32 %f184, %f26, 0f3F000000; sub.f32 %f185, %f183, %f184; add.f32 %f186, %f21, %f21; mul.f32 %f187, %f27, 0f3F000000; sub.f32 %f188, %f186, %f187; mul.f32 %f189, %f1, 0f3FC00000; mul.f32 %f190, %f2, 0f3FC00000; mul.f32 %f191, %f3, 0f3FC00000; sub.f32 %f296, %f188, %f191; sub.f32 %f295, %f185, %f190; sub.f32 %f294, %f182, %f189; $L__BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r89, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r67, %r23, %r41; add.s32 %r68, %r67, %r41; rem.s32 %r89, %r68, %r41; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r69, %r89, %r4; mad.lo.s32 %r70, %r69, %r40, %r1; mul.wide.s32 %rd30, %r70, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f76, [%rd33]; ld.global.nc.f32 %f75, [%rd32]; ld.global.nc.f32 %f74, [%rd31]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r90, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r71, %r27, %r41; add.s32 %r72, %r71, %r41; rem.s32 %r90, %r72, %r41; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r73, %r90, %r4; mad.lo.s32 %r74, %r73, %r40, %r1; mul.wide.s32 %rd34, %r74, 4; add.s64 %rd35, %rd3, %rd34; 
add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f105, [%rd37]; ld.global.nc.f32 %f106, [%rd36]; ld.global.nc.f32 %f107, [%rd35]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r77, %r41, -1; min.s32 %r91, %r31, %r77; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r75, %r31, %r41; add.s32 %r76, %r75, %r41; rem.s32 %r91, %r76, %r41; $L__BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r78, %r91, %r4; mad.lo.s32 %r79, %r78, %r40, %r1; mul.wide.s32 %rd38, %r79, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f88, [%rd41]; ld.global.nc.f32 %f87, [%rd40]; ld.global.nc.f32 %f86, [%rd39]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r82, %r41, -1; min.s32 %r92, %r35, %r82; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r80, %r35, %r41; add.s32 %r81, %r80, %r41; rem.s32 %r92, %r81, %r41; $L__BB0_54: add.s32 %r83, %r92, %r4; mad.lo.s32 %r39, %r83, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f94, [%rd45]; ld.global.nc.f32 %f93, [%rd44]; ld.global.nc.f32 %f92, [%rd43]; $L__BB0_56: mul.f32 %f216, %f87, %f87; fma.rn.f32 %f217, %f86, %f86, %f216; fma.rn.f32 %f98, %f88, %f88, %f217; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f221, %f106, %f106; fma.rn.f32 %f222, %f107, %f107, %f221; fma.rn.f32 %f223, %f105, %f105, %f222; setp.eq.f32 %p60, %f223, 0f00000000; mov.f32 %f312, 0f00000000; mov.f32 %f313, %f312; mov.f32 %f314, %f312; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f224, %f75, %f75; fma.rn.f32 %f225, 
%f74, %f74, %f224; fma.rn.f32 %f111, %f76, %f76, %f225; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f226, %f93, %f93; fma.rn.f32 %f227, %f92, %f92, %f226; fma.rn.f32 %f115, %f94, %f94, %f227; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f228, %f106, %f106; fma.rn.f32 %f229, %f107, %f107, %f228; fma.rn.f32 %f230, %f105, %f105, %f229; setp.neu.f32 %p66, %f230, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f263, %f86, %f107; sub.f32 %f264, %f87, %f106; sub.f32 %f265, %f88, %f105; mul.f32 %f314, %f265, 0f3F000000; mul.f32 %f313, %f264, 0f3F000000; mul.f32 %f312, %f263, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f314, %f3, %f105; sub.f32 %f313, %f2, %f106; sub.f32 %f312, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f231, %f106, %f106; fma.rn.f32 %f232, %f107, %f107, %f231; fma.rn.f32 %f116, %f105, %f105, %f232; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f314, %f88, %f3; sub.f32 %f313, %f87, %f2; sub.f32 %f312, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f254, %f86, %f107; sub.f32 %f255, %f87, %f106; sub.f32 %f256, %f88, %f105; sub.f32 %f257, %f74, %f92; mul.f32 %f258, %f257, 0f3DAAAAAB; sub.f32 %f259, %f75, %f93; mul.f32 %f260, %f259, 0f3DAAAAAB; sub.f32 %f261, %f76, %f94; mul.f32 %f262, %f261, 0f3DAAAAAB; fma.rn.f32 %f314, %f256, 0f3F2AAAAB, %f262; fma.rn.f32 %f313, %f255, 0f3F2AAAAB, %f260; fma.rn.f32 %f312, %f254, 0f3F2AAAAB, %f258; bra.uni $L__BB0_70; $L__BB0_63: 
mul.f32 %f233, %f74, 0f3F000000; add.f32 %f234, %f107, %f107; sub.f32 %f235, %f233, %f234; add.f32 %f236, %f106, %f106; mul.f32 %f237, %f75, 0f3F000000; sub.f32 %f238, %f237, %f236; add.f32 %f239, %f105, %f105; mul.f32 %f240, %f76, 0f3F000000; sub.f32 %f241, %f240, %f239; fma.rn.f32 %f314, %f3, 0f3FC00000, %f241; fma.rn.f32 %f313, %f2, 0f3FC00000, %f238; fma.rn.f32 %f312, %f1, 0f3FC00000, %f235; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f242, %f92, 0f3F000000; add.f32 %f243, %f86, %f86; sub.f32 %f244, %f243, %f242; add.f32 %f245, %f87, %f87; mul.f32 %f246, %f93, 0f3F000000; sub.f32 %f247, %f245, %f246; add.f32 %f248, %f88, %f88; mul.f32 %f249, %f94, 0f3F000000; sub.f32 %f250, %f248, %f249; mul.f32 %f251, %f1, 0f3FC00000; mul.f32 %f252, %f2, 0f3FC00000; mul.f32 %f253, %f3, 0f3FC00000; sub.f32 %f314, %f250, %f253; sub.f32 %f313, %f247, %f252; sub.f32 %f312, %f244, %f251; $L__BB0_70: mul.f32 %f266, %f295, %f314; mul.f32 %f267, %f296, %f313; sub.f32 %f268, %f266, %f267; mul.f32 %f269, %f296, %f312; mul.f32 %f270, %f294, %f314; sub.f32 %f271, %f269, %f270; mul.f32 %f272, %f294, %f313; mul.f32 %f273, %f295, %f312; sub.f32 %f274, %f272, %f273; mul.f32 %f275, %f2, %f271; fma.rn.f32 %f276, %f1, %f268, %f275; fma.rn.f32 %f277, %f3, %f274, %f276; mul.f32 %f278, %f277, %f138; st.global.f32 [%rd4], %f278; $L__BB0_72: ret; } ` settopologicalcharge_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<79>; .reg .b16 %rs<4>; .reg .f32 %f<315>; .reg .b32 %r<93>; .reg .b64 %rd<46>; ld.param.u8 %rs3, 
[settopologicalcharge_param_8]; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f138, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r44, %r43, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r47, %r46, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r50, %r49, %r51; setp.ge.s32 %p1, %r1, %r40; setp.ge.s32 %p2, %r2, %r41; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r42; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_72; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd9, %r53, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f139, %f2, %f2; fma.rn.f32 %f140, %f1, %f1, %f139; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f141, %f3, %f3, %f140; setp.eq.f32 %p6, %f141, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p6 bra $L__BB0_71; bra.uni $L__BB0_2; $L__BB0_71: mov.u32 %r84, 0; st.global.u32 [%rd4], %r84; bra.uni $L__BB0_72; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r85, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r85, %r55, %r40; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra 
$L__BB0_7; add.s32 %r56, %r85, %r5; mul.wide.s32 %rd14, %r56, 4; add.s64 %rd15, %rd3, %rd14; add.s64 %rd16, %rd2, %rd14; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; ld.global.nc.f32 %f8, [%rd16]; ld.global.nc.f32 %f7, [%rd15]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r86, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r40; add.s32 %r58, %r57, %r40; rem.s32 %r86, %r58, %r40; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r59, %r86, %r5; mul.wide.s32 %rd18, %r59, 4; add.s64 %rd19, %rd3, %rd18; add.s64 %rd20, %rd2, %rd18; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f38, [%rd21]; ld.global.nc.f32 %f39, [%rd20]; ld.global.nc.f32 %f40, [%rd19]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r62, %r40, -1; min.s32 %r87, %r14, %r62; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r60, %r14, %r40; add.s32 %r61, %r60, %r40; rem.s32 %r87, %r61, %r40; $L__BB0_15: setp.ge.s32 %p16, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r63, %r87, %r5; mul.wide.s32 %rd22, %r63, 4; add.s64 %rd23, %rd3, %rd22; add.s64 %rd24, %rd2, %rd22; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; ld.global.nc.f32 %f20, [%rd24]; ld.global.nc.f32 %f19, [%rd23]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r66, %r40, -1; min.s32 %r88, %r18, %r66; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r64, %r18, %r40; add.s32 %r65, %r64, %r40; rem.s32 %r88, %r65, %r40; $L__BB0_20: add.s32 %r22, %r88, %r5; setp.ge.s32 %p20, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; add.s64 %rd28, %rd2, %rd26; 
add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; ld.global.nc.f32 %f26, [%rd28]; ld.global.nc.f32 %f25, [%rd27]; $L__BB0_22: mul.f32 %f154, %f20, %f20; fma.rn.f32 %f155, %f19, %f19, %f154; fma.rn.f32 %f31, %f21, %f21, %f155; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f159, %f39, %f39; fma.rn.f32 %f160, %f40, %f40, %f159; fma.rn.f32 %f161, %f38, %f38, %f160; setp.eq.f32 %p24, %f161, 0f00000000; mov.f32 %f294, 0f00000000; mov.f32 %f295, %f294; mov.f32 %f296, %f294; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f162, %f8, %f8; fma.rn.f32 %f163, %f7, %f7, %f162; fma.rn.f32 %f44, %f9, %f9, %f163; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f164, %f26, %f26; fma.rn.f32 %f165, %f25, %f25, %f164; fma.rn.f32 %f48, %f27, %f27, %f165; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f166, %f39, %f39; fma.rn.f32 %f167, %f40, %f40, %f166; fma.rn.f32 %f168, %f38, %f38, %f167; setp.neu.f32 %p30, %f168, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f201, %f19, %f40; sub.f32 %f202, %f20, %f39; sub.f32 %f203, %f21, %f38; mul.f32 %f296, %f203, 0f3F000000; mul.f32 %f295, %f202, 0f3F000000; mul.f32 %f294, %f201, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f296, %f3, %f38; sub.f32 %f295, %f2, %f39; sub.f32 %f294, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f169, %f39, %f39; fma.rn.f32 %f170, %f40, %f40, %f169; fma.rn.f32 %f49, %f38, %f38, %f170; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f296, %f21, %f3; sub.f32 %f295, %f20, %f2; sub.f32 %f294, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; 
bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f192, %f19, %f40; sub.f32 %f193, %f20, %f39; sub.f32 %f194, %f21, %f38; sub.f32 %f195, %f7, %f25; mul.f32 %f196, %f195, 0f3DAAAAAB; sub.f32 %f197, %f8, %f26; mul.f32 %f198, %f197, 0f3DAAAAAB; sub.f32 %f199, %f9, %f27; mul.f32 %f200, %f199, 0f3DAAAAAB; fma.rn.f32 %f296, %f194, 0f3F2AAAAB, %f200; fma.rn.f32 %f295, %f193, 0f3F2AAAAB, %f198; fma.rn.f32 %f294, %f192, 0f3F2AAAAB, %f196; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f171, %f7, 0f3F000000; add.f32 %f172, %f40, %f40; sub.f32 %f173, %f171, %f172; add.f32 %f174, %f39, %f39; mul.f32 %f175, %f8, 0f3F000000; sub.f32 %f176, %f175, %f174; add.f32 %f177, %f38, %f38; mul.f32 %f178, %f9, 0f3F000000; sub.f32 %f179, %f178, %f177; fma.rn.f32 %f296, %f3, 0f3FC00000, %f179; fma.rn.f32 %f295, %f2, 0f3FC00000, %f176; fma.rn.f32 %f294, %f1, 0f3FC00000, %f173; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f180, %f25, 0f3F000000; add.f32 %f181, %f19, %f19; sub.f32 %f182, %f181, %f180; add.f32 %f183, %f20, %f20; mul.f32 %f184, %f26, 0f3F000000; sub.f32 %f185, %f183, %f184; add.f32 %f186, %f21, %f21; mul.f32 %f187, %f27, 0f3F000000; sub.f32 %f188, %f186, %f187; mul.f32 %f189, %f1, 0f3FC00000; mul.f32 %f190, %f2, 0f3FC00000; mul.f32 %f191, %f3, 0f3FC00000; sub.f32 %f296, %f188, %f191; sub.f32 %f295, %f185, %f190; sub.f32 %f294, %f182, %f189; $L__BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r89, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r67, %r23, %r41; add.s32 %r68, %r67, %r41; rem.s32 %r89, %r68, %r41; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r69, %r89, %r4; mad.lo.s32 %r70, %r69, %r40, %r1; mul.wide.s32 %rd30, %r70, 4; add.s64 %rd31, %rd3, %rd30; add.s64 
%rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f76, [%rd33]; ld.global.nc.f32 %f75, [%rd32]; ld.global.nc.f32 %f74, [%rd31]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r90, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r71, %r27, %r41; add.s32 %r72, %r71, %r41; rem.s32 %r90, %r72, %r41; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r73, %r90, %r4; mad.lo.s32 %r74, %r73, %r40, %r1; mul.wide.s32 %rd34, %r74, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f105, [%rd37]; ld.global.nc.f32 %f106, [%rd36]; ld.global.nc.f32 %f107, [%rd35]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r77, %r41, -1; min.s32 %r91, %r31, %r77; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r75, %r31, %r41; add.s32 %r76, %r75, %r41; rem.s32 %r91, %r76, %r41; $L__BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r78, %r91, %r4; mad.lo.s32 %r79, %r78, %r40, %r1; mul.wide.s32 %rd38, %r79, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f88, [%rd41]; ld.global.nc.f32 %f87, [%rd40]; ld.global.nc.f32 %f86, [%rd39]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r82, %r41, -1; min.s32 %r92, %r35, %r82; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r80, %r35, %r41; add.s32 %r81, %r80, %r41; rem.s32 %r92, %r81, %r41; $L__BB0_54: add.s32 %r83, %r92, %r4; mad.lo.s32 %r39, %r83, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; add.s64 
%rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f94, [%rd45]; ld.global.nc.f32 %f93, [%rd44]; ld.global.nc.f32 %f92, [%rd43]; $L__BB0_56: mul.f32 %f216, %f87, %f87; fma.rn.f32 %f217, %f86, %f86, %f216; fma.rn.f32 %f98, %f88, %f88, %f217; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f221, %f106, %f106; fma.rn.f32 %f222, %f107, %f107, %f221; fma.rn.f32 %f223, %f105, %f105, %f222; setp.eq.f32 %p60, %f223, 0f00000000; mov.f32 %f312, 0f00000000; mov.f32 %f313, %f312; mov.f32 %f314, %f312; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f224, %f75, %f75; fma.rn.f32 %f225, %f74, %f74, %f224; fma.rn.f32 %f111, %f76, %f76, %f225; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f226, %f93, %f93; fma.rn.f32 %f227, %f92, %f92, %f226; fma.rn.f32 %f115, %f94, %f94, %f227; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f228, %f106, %f106; fma.rn.f32 %f229, %f107, %f107, %f228; fma.rn.f32 %f230, %f105, %f105, %f229; setp.neu.f32 %p66, %f230, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f263, %f86, %f107; sub.f32 %f264, %f87, %f106; sub.f32 %f265, %f88, %f105; mul.f32 %f314, %f265, 0f3F000000; mul.f32 %f313, %f264, 0f3F000000; mul.f32 %f312, %f263, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f314, %f3, %f105; sub.f32 %f313, %f2, %f106; sub.f32 %f312, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f231, %f106, %f106; fma.rn.f32 %f232, %f107, %f107, %f231; fma.rn.f32 %f116, %f105, %f105, %f232; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f314, %f88, %f3; sub.f32 %f313, %f87, %f2; sub.f32 %f312, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 
0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f254, %f86, %f107; sub.f32 %f255, %f87, %f106; sub.f32 %f256, %f88, %f105; sub.f32 %f257, %f74, %f92; mul.f32 %f258, %f257, 0f3DAAAAAB; sub.f32 %f259, %f75, %f93; mul.f32 %f260, %f259, 0f3DAAAAAB; sub.f32 %f261, %f76, %f94; mul.f32 %f262, %f261, 0f3DAAAAAB; fma.rn.f32 %f314, %f256, 0f3F2AAAAB, %f262; fma.rn.f32 %f313, %f255, 0f3F2AAAAB, %f260; fma.rn.f32 %f312, %f254, 0f3F2AAAAB, %f258; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f233, %f74, 0f3F000000; add.f32 %f234, %f107, %f107; sub.f32 %f235, %f233, %f234; add.f32 %f236, %f106, %f106; mul.f32 %f237, %f75, 0f3F000000; sub.f32 %f238, %f237, %f236; add.f32 %f239, %f105, %f105; mul.f32 %f240, %f76, 0f3F000000; sub.f32 %f241, %f240, %f239; fma.rn.f32 %f314, %f3, 0f3FC00000, %f241; fma.rn.f32 %f313, %f2, 0f3FC00000, %f238; fma.rn.f32 %f312, %f1, 0f3FC00000, %f235; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f242, %f92, 0f3F000000; add.f32 %f243, %f86, %f86; sub.f32 %f244, %f243, %f242; add.f32 %f245, %f87, %f87; mul.f32 %f246, %f93, 0f3F000000; sub.f32 %f247, %f245, %f246; add.f32 %f248, %f88, %f88; mul.f32 %f249, %f94, 0f3F000000; sub.f32 %f250, %f248, %f249; mul.f32 %f251, %f1, 0f3FC00000; mul.f32 %f252, %f2, 0f3FC00000; mul.f32 %f253, %f3, 0f3FC00000; sub.f32 %f314, %f250, %f253; sub.f32 %f313, %f247, %f252; sub.f32 %f312, %f244, %f251; $L__BB0_70: mul.f32 %f266, %f295, %f314; mul.f32 %f267, %f296, %f313; sub.f32 %f268, %f266, %f267; mul.f32 %f269, %f296, %f312; mul.f32 %f270, %f294, %f314; sub.f32 %f271, %f269, %f270; mul.f32 %f272, %f294, %f313; mul.f32 %f273, %f295, %f312; sub.f32 %f274, %f272, %f273; mul.f32 %f275, %f2, %f271; fma.rn.f32 %f276, %f1, %f268, %f275; fma.rn.f32 %f277, %f3, %f274, %f276; mul.f32 %f278, %f277, %f138; st.global.f32 [%rd4], %f278; $L__BB0_72: ret; } ` 
settopologicalcharge_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<79>; .reg .b16 %rs<4>; .reg .f32 %f<315>; .reg .b32 %r<93>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalcharge_param_8]; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f138, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r44, %r43, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r47, %r46, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r50, %r49, %r51; setp.ge.s32 %p1, %r1, %r40; setp.ge.s32 %p2, %r2, %r41; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r42; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_72; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd9, %r53, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f139, %f2, %f2; fma.rn.f32 %f140, %f1, %f1, %f139; ld.global.nc.f32 %f3, [%rd12]; 
fma.rn.f32 %f141, %f3, %f3, %f140; setp.eq.f32 %p6, %f141, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p6 bra $L__BB0_71; bra.uni $L__BB0_2; $L__BB0_71: mov.u32 %r84, 0; st.global.u32 [%rd4], %r84; bra.uni $L__BB0_72; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r85, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r85, %r55, %r40; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r56, %r85, %r5; mul.wide.s32 %rd14, %r56, 4; add.s64 %rd15, %rd3, %rd14; add.s64 %rd16, %rd2, %rd14; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; ld.global.nc.f32 %f8, [%rd16]; ld.global.nc.f32 %f7, [%rd15]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r86, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r40; add.s32 %r58, %r57, %r40; rem.s32 %r86, %r58, %r40; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r59, %r86, %r5; mul.wide.s32 %rd18, %r59, 4; add.s64 %rd19, %rd3, %rd18; add.s64 %rd20, %rd2, %rd18; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f38, [%rd21]; ld.global.nc.f32 %f39, [%rd20]; ld.global.nc.f32 %f40, [%rd19]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r62, %r40, -1; min.s32 %r87, %r14, %r62; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r60, %r14, %r40; add.s32 %r61, %r60, %r40; rem.s32 %r87, %r61, %r40; $L__BB0_15: setp.ge.s32 %p16, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r63, %r87, %r5; mul.wide.s32 %rd22, %r63, 4; add.s64 %rd23, %rd3, %rd22; add.s64 %rd24, %rd2, %rd22; 
add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; ld.global.nc.f32 %f20, [%rd24]; ld.global.nc.f32 %f19, [%rd23]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r66, %r40, -1; min.s32 %r88, %r18, %r66; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r64, %r18, %r40; add.s32 %r65, %r64, %r40; rem.s32 %r88, %r65, %r40; $L__BB0_20: add.s32 %r22, %r88, %r5; setp.ge.s32 %p20, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; add.s64 %rd28, %rd2, %rd26; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; ld.global.nc.f32 %f26, [%rd28]; ld.global.nc.f32 %f25, [%rd27]; $L__BB0_22: mul.f32 %f154, %f20, %f20; fma.rn.f32 %f155, %f19, %f19, %f154; fma.rn.f32 %f31, %f21, %f21, %f155; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f159, %f39, %f39; fma.rn.f32 %f160, %f40, %f40, %f159; fma.rn.f32 %f161, %f38, %f38, %f160; setp.eq.f32 %p24, %f161, 0f00000000; mov.f32 %f294, 0f00000000; mov.f32 %f295, %f294; mov.f32 %f296, %f294; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f162, %f8, %f8; fma.rn.f32 %f163, %f7, %f7, %f162; fma.rn.f32 %f44, %f9, %f9, %f163; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f164, %f26, %f26; fma.rn.f32 %f165, %f25, %f25, %f164; fma.rn.f32 %f48, %f27, %f27, %f165; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f166, %f39, %f39; fma.rn.f32 %f167, %f40, %f40, %f166; fma.rn.f32 %f168, %f38, %f38, %f167; setp.neu.f32 %p30, %f168, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f201, %f19, %f40; sub.f32 %f202, %f20, %f39; sub.f32 %f203, %f21, %f38; mul.f32 %f296, %f203, 0f3F000000; mul.f32 %f295, %f202, 0f3F000000; mul.f32 %f294, %f201, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; 
and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f296, %f3, %f38; sub.f32 %f295, %f2, %f39; sub.f32 %f294, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f169, %f39, %f39; fma.rn.f32 %f170, %f40, %f40, %f169; fma.rn.f32 %f49, %f38, %f38, %f170; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f296, %f21, %f3; sub.f32 %f295, %f20, %f2; sub.f32 %f294, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f192, %f19, %f40; sub.f32 %f193, %f20, %f39; sub.f32 %f194, %f21, %f38; sub.f32 %f195, %f7, %f25; mul.f32 %f196, %f195, 0f3DAAAAAB; sub.f32 %f197, %f8, %f26; mul.f32 %f198, %f197, 0f3DAAAAAB; sub.f32 %f199, %f9, %f27; mul.f32 %f200, %f199, 0f3DAAAAAB; fma.rn.f32 %f296, %f194, 0f3F2AAAAB, %f200; fma.rn.f32 %f295, %f193, 0f3F2AAAAB, %f198; fma.rn.f32 %f294, %f192, 0f3F2AAAAB, %f196; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f171, %f7, 0f3F000000; add.f32 %f172, %f40, %f40; sub.f32 %f173, %f171, %f172; add.f32 %f174, %f39, %f39; mul.f32 %f175, %f8, 0f3F000000; sub.f32 %f176, %f175, %f174; add.f32 %f177, %f38, %f38; mul.f32 %f178, %f9, 0f3F000000; sub.f32 %f179, %f178, %f177; fma.rn.f32 %f296, %f3, 0f3FC00000, %f179; fma.rn.f32 %f295, %f2, 0f3FC00000, %f176; fma.rn.f32 %f294, %f1, 0f3FC00000, %f173; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f180, %f25, 0f3F000000; add.f32 %f181, %f19, %f19; sub.f32 %f182, %f181, %f180; add.f32 %f183, %f20, %f20; mul.f32 %f184, %f26, 0f3F000000; sub.f32 %f185, %f183, %f184; add.f32 %f186, %f21, %f21; mul.f32 %f187, %f27, 0f3F000000; sub.f32 %f188, %f186, %f187; mul.f32 %f189, %f1, 0f3FC00000; mul.f32 %f190, %f2, 0f3FC00000; mul.f32 %f191, %f3, 0f3FC00000; 
sub.f32 %f296, %f188, %f191; sub.f32 %f295, %f185, %f190; sub.f32 %f294, %f182, %f189; $L__BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r89, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r67, %r23, %r41; add.s32 %r68, %r67, %r41; rem.s32 %r89, %r68, %r41; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r69, %r89, %r4; mad.lo.s32 %r70, %r69, %r40, %r1; mul.wide.s32 %rd30, %r70, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f76, [%rd33]; ld.global.nc.f32 %f75, [%rd32]; ld.global.nc.f32 %f74, [%rd31]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r90, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r71, %r27, %r41; add.s32 %r72, %r71, %r41; rem.s32 %r90, %r72, %r41; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r73, %r90, %r4; mad.lo.s32 %r74, %r73, %r40, %r1; mul.wide.s32 %rd34, %r74, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f105, [%rd37]; ld.global.nc.f32 %f106, [%rd36]; ld.global.nc.f32 %f107, [%rd35]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r77, %r41, -1; min.s32 %r91, %r31, %r77; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r75, %r31, %r41; add.s32 %r76, %r75, %r41; rem.s32 %r91, %r76, %r41; $L__BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r78, %r91, %r4; mad.lo.s32 %r79, %r78, %r40, %r1; mul.wide.s32 %rd38, %r79, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, 
%rd1, %rd38; ld.global.nc.f32 %f88, [%rd41]; ld.global.nc.f32 %f87, [%rd40]; ld.global.nc.f32 %f86, [%rd39]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r82, %r41, -1; min.s32 %r92, %r35, %r82; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r80, %r35, %r41; add.s32 %r81, %r80, %r41; rem.s32 %r92, %r81, %r41; $L__BB0_54: add.s32 %r83, %r92, %r4; mad.lo.s32 %r39, %r83, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f94, [%rd45]; ld.global.nc.f32 %f93, [%rd44]; ld.global.nc.f32 %f92, [%rd43]; $L__BB0_56: mul.f32 %f216, %f87, %f87; fma.rn.f32 %f217, %f86, %f86, %f216; fma.rn.f32 %f98, %f88, %f88, %f217; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f221, %f106, %f106; fma.rn.f32 %f222, %f107, %f107, %f221; fma.rn.f32 %f223, %f105, %f105, %f222; setp.eq.f32 %p60, %f223, 0f00000000; mov.f32 %f312, 0f00000000; mov.f32 %f313, %f312; mov.f32 %f314, %f312; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f224, %f75, %f75; fma.rn.f32 %f225, %f74, %f74, %f224; fma.rn.f32 %f111, %f76, %f76, %f225; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f226, %f93, %f93; fma.rn.f32 %f227, %f92, %f92, %f226; fma.rn.f32 %f115, %f94, %f94, %f227; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f228, %f106, %f106; fma.rn.f32 %f229, %f107, %f107, %f228; fma.rn.f32 %f230, %f105, %f105, %f229; setp.neu.f32 %p66, %f230, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f263, %f86, %f107; sub.f32 %f264, %f87, %f106; sub.f32 %f265, %f88, %f105; mul.f32 %f314, %f265, 0f3F000000; mul.f32 %f313, %f264, 0f3F000000; mul.f32 %f312, %f263, 0f3F000000; bra.uni $L__BB0_70; 
$L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f314, %f3, %f105; sub.f32 %f313, %f2, %f106; sub.f32 %f312, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f231, %f106, %f106; fma.rn.f32 %f232, %f107, %f107, %f231; fma.rn.f32 %f116, %f105, %f105, %f232; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f314, %f88, %f3; sub.f32 %f313, %f87, %f2; sub.f32 %f312, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f254, %f86, %f107; sub.f32 %f255, %f87, %f106; sub.f32 %f256, %f88, %f105; sub.f32 %f257, %f74, %f92; mul.f32 %f258, %f257, 0f3DAAAAAB; sub.f32 %f259, %f75, %f93; mul.f32 %f260, %f259, 0f3DAAAAAB; sub.f32 %f261, %f76, %f94; mul.f32 %f262, %f261, 0f3DAAAAAB; fma.rn.f32 %f314, %f256, 0f3F2AAAAB, %f262; fma.rn.f32 %f313, %f255, 0f3F2AAAAB, %f260; fma.rn.f32 %f312, %f254, 0f3F2AAAAB, %f258; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f233, %f74, 0f3F000000; add.f32 %f234, %f107, %f107; sub.f32 %f235, %f233, %f234; add.f32 %f236, %f106, %f106; mul.f32 %f237, %f75, 0f3F000000; sub.f32 %f238, %f237, %f236; add.f32 %f239, %f105, %f105; mul.f32 %f240, %f76, 0f3F000000; sub.f32 %f241, %f240, %f239; fma.rn.f32 %f314, %f3, 0f3FC00000, %f241; fma.rn.f32 %f313, %f2, 0f3FC00000, %f238; fma.rn.f32 %f312, %f1, 0f3FC00000, %f235; bra.uni $L__BB0_70; $L__BB0_65: mul.f32 %f242, %f92, 0f3F000000; add.f32 %f243, %f86, %f86; sub.f32 %f244, %f243, %f242; add.f32 %f245, %f87, %f87; mul.f32 %f246, %f93, 0f3F000000; sub.f32 %f247, %f245, %f246; add.f32 %f248, %f88, %f88; mul.f32 %f249, %f94, 0f3F000000; sub.f32 %f250, %f248, %f249; mul.f32 %f251, %f1, 
0f3FC00000; mul.f32 %f252, %f2, 0f3FC00000; mul.f32 %f253, %f3, 0f3FC00000; sub.f32 %f314, %f250, %f253; sub.f32 %f313, %f247, %f252; sub.f32 %f312, %f244, %f251; $L__BB0_70: mul.f32 %f266, %f295, %f314; mul.f32 %f267, %f296, %f313; sub.f32 %f268, %f266, %f267; mul.f32 %f269, %f296, %f312; mul.f32 %f270, %f294, %f314; sub.f32 %f271, %f269, %f270; mul.f32 %f272, %f294, %f313; mul.f32 %f273, %f295, %f312; sub.f32 %f274, %f272, %f273; mul.f32 %f275, %f2, %f271; fma.rn.f32 %f276, %f1, %f268, %f275; fma.rn.f32 %f277, %f3, %f274, %f276; mul.f32 %f278, %f277, %f138; st.global.f32 [%rd4], %f278; $L__BB0_72: ret; } ` settopologicalcharge_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<79>; .reg .b16 %rs<4>; .reg .f32 %f<315>; .reg .b32 %r<93>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalcharge_param_8]; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f138, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r44, %r43, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r47, %r46, %r48; mov.u32 
%r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r50, %r49, %r51; setp.ge.s32 %p1, %r1, %r40; setp.ge.s32 %p2, %r2, %r41; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r42; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_72; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd9, %r53, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f139, %f2, %f2; fma.rn.f32 %f140, %f1, %f1, %f139; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f141, %f3, %f3, %f140; setp.eq.f32 %p6, %f141, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p6 bra $L__BB0_71; bra.uni $L__BB0_2; $L__BB0_71: mov.u32 %r84, 0; st.global.u32 [%rd4], %r84; bra.uni $L__BB0_72; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p7, %rs1, 0; add.s32 %r6, %r1, -2; @%p7 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: max.s32 %r85, %r6, 0; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r85, %r55, %r40; $L__BB0_5: setp.lt.s32 %p9, %r1, 2; mov.f32 %f7, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p10 bra $L__BB0_7; add.s32 %r56, %r85, %r5; mul.wide.s32 %rd14, %r56, 4; add.s64 %rd15, %rd3, %rd14; add.s64 %rd16, %rd2, %rd14; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; ld.global.nc.f32 %f8, [%rd16]; ld.global.nc.f32 %f7, [%rd15]; $L__BB0_7: add.s32 %r10, %r1, -1; @%p7 bra $L__BB0_9; bra.uni $L__BB0_8; $L__BB0_9: max.s32 %r86, %r10, 0; bra.uni $L__BB0_10; $L__BB0_8: rem.s32 %r57, %r10, %r40; add.s32 %r58, %r57, %r40; rem.s32 %r86, %r58, %r40; $L__BB0_10: setp.lt.s32 %p12, %r1, 1; mov.f32 %f40, 0f00000000; and.pred %p14, %p12, %p7; mov.f32 %f39, %f40; mov.f32 %f38, %f40; @%p14 bra $L__BB0_12; add.s32 %r59, %r86, %r5; mul.wide.s32 %rd18, %r59, 4; add.s64 %rd19, %rd3, %rd18; add.s64 %rd20, %rd2, %rd18; add.s64 %rd21, %rd1, %rd18; 
ld.global.nc.f32 %f38, [%rd21]; ld.global.nc.f32 %f39, [%rd20]; ld.global.nc.f32 %f40, [%rd19]; $L__BB0_12: add.s32 %r14, %r1, 1; @%p7 bra $L__BB0_14; bra.uni $L__BB0_13; $L__BB0_14: add.s32 %r62, %r40, -1; min.s32 %r87, %r14, %r62; bra.uni $L__BB0_15; $L__BB0_13: rem.s32 %r60, %r14, %r40; add.s32 %r61, %r60, %r40; rem.s32 %r87, %r61, %r40; $L__BB0_15: setp.ge.s32 %p16, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p18, %p16, %p7; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p18 bra $L__BB0_17; add.s32 %r63, %r87, %r5; mul.wide.s32 %rd22, %r63, 4; add.s64 %rd23, %rd3, %rd22; add.s64 %rd24, %rd2, %rd22; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; ld.global.nc.f32 %f20, [%rd24]; ld.global.nc.f32 %f19, [%rd23]; $L__BB0_17: add.s32 %r18, %r1, 2; @%p7 bra $L__BB0_19; bra.uni $L__BB0_18; $L__BB0_19: add.s32 %r66, %r40, -1; min.s32 %r88, %r18, %r66; bra.uni $L__BB0_20; $L__BB0_18: rem.s32 %r64, %r18, %r40; add.s32 %r65, %r64, %r40; rem.s32 %r88, %r65, %r40; $L__BB0_20: add.s32 %r22, %r88, %r5; setp.ge.s32 %p20, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p22, %p20, %p7; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p22 bra $L__BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; add.s64 %rd28, %rd2, %rd26; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; ld.global.nc.f32 %f26, [%rd28]; ld.global.nc.f32 %f25, [%rd27]; $L__BB0_22: mul.f32 %f154, %f20, %f20; fma.rn.f32 %f155, %f19, %f19, %f154; fma.rn.f32 %f31, %f21, %f21, %f155; setp.eq.f32 %p23, %f31, 0f00000000; @%p23 bra $L__BB0_23; bra.uni $L__BB0_24; $L__BB0_23: mul.f32 %f159, %f39, %f39; fma.rn.f32 %f160, %f40, %f40, %f159; fma.rn.f32 %f161, %f38, %f38, %f160; setp.eq.f32 %p24, %f161, 0f00000000; mov.f32 %f294, 0f00000000; mov.f32 %f295, %f294; mov.f32 %f296, %f294; @%p24 bra $L__BB0_36; $L__BB0_24: mul.f32 %f162, %f8, %f8; fma.rn.f32 %f163, %f7, %f7, %f162; fma.rn.f32 %f44, %f9, %f9, %f163; setp.neu.f32 %p25, %f44, 0f00000000; mul.f32 %f164, %f26, %f26; fma.rn.f32 %f165, %f25, 
%f25, %f164; fma.rn.f32 %f48, %f27, %f27, %f165; setp.neu.f32 %p26, %f48, 0f00000000; and.pred %p27, %p25, %p26; or.pred %p29, %p23, %p27; @%p29 bra $L__BB0_26; mul.f32 %f166, %f39, %f39; fma.rn.f32 %f167, %f40, %f40, %f166; fma.rn.f32 %f168, %f38, %f38, %f167; setp.neu.f32 %p30, %f168, 0f00000000; @%p30 bra $L__BB0_35; bra.uni $L__BB0_26; $L__BB0_35: sub.f32 %f201, %f19, %f40; sub.f32 %f202, %f20, %f39; sub.f32 %f203, %f21, %f38; mul.f32 %f296, %f203, 0f3F000000; mul.f32 %f295, %f202, 0f3F000000; mul.f32 %f294, %f201, 0f3F000000; bra.uni $L__BB0_36; $L__BB0_26: setp.eq.f32 %p31, %f44, 0f00000000; and.pred %p33, %p31, %p23; @%p33 bra $L__BB0_34; bra.uni $L__BB0_27; $L__BB0_34: sub.f32 %f296, %f3, %f38; sub.f32 %f295, %f2, %f39; sub.f32 %f294, %f1, %f40; bra.uni $L__BB0_36; $L__BB0_27: setp.eq.f32 %p34, %f48, 0f00000000; mul.f32 %f169, %f39, %f39; fma.rn.f32 %f170, %f40, %f40, %f169; fma.rn.f32 %f49, %f38, %f38, %f170; setp.eq.f32 %p35, %f49, 0f00000000; and.pred %p36, %p35, %p34; @%p36 bra $L__BB0_33; bra.uni $L__BB0_28; $L__BB0_33: sub.f32 %f296, %f21, %f3; sub.f32 %f295, %f20, %f2; sub.f32 %f294, %f19, %f1; bra.uni $L__BB0_36; $L__BB0_28: setp.neu.f32 %p38, %f31, 0f00000000; or.pred %p39, %p31, %p38; @%p39 bra $L__BB0_30; bra.uni $L__BB0_29; $L__BB0_30: setp.neu.f32 %p40, %f49, 0f00000000; or.pred %p42, %p34, %p40; @%p42 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: sub.f32 %f192, %f19, %f40; sub.f32 %f193, %f20, %f39; sub.f32 %f194, %f21, %f38; sub.f32 %f195, %f7, %f25; mul.f32 %f196, %f195, 0f3DAAAAAB; sub.f32 %f197, %f8, %f26; mul.f32 %f198, %f197, 0f3DAAAAAB; sub.f32 %f199, %f9, %f27; mul.f32 %f200, %f199, 0f3DAAAAAB; fma.rn.f32 %f296, %f194, 0f3F2AAAAB, %f200; fma.rn.f32 %f295, %f193, 0f3F2AAAAB, %f198; fma.rn.f32 %f294, %f192, 0f3F2AAAAB, %f196; bra.uni $L__BB0_36; $L__BB0_29: mul.f32 %f171, %f7, 0f3F000000; add.f32 %f172, %f40, %f40; sub.f32 %f173, %f171, %f172; add.f32 %f174, %f39, %f39; mul.f32 %f175, %f8, 0f3F000000; sub.f32 %f176, %f175, %f174; 
add.f32 %f177, %f38, %f38; mul.f32 %f178, %f9, 0f3F000000; sub.f32 %f179, %f178, %f177; fma.rn.f32 %f296, %f3, 0f3FC00000, %f179; fma.rn.f32 %f295, %f2, 0f3FC00000, %f176; fma.rn.f32 %f294, %f1, 0f3FC00000, %f173; bra.uni $L__BB0_36; $L__BB0_31: mul.f32 %f180, %f25, 0f3F000000; add.f32 %f181, %f19, %f19; sub.f32 %f182, %f181, %f180; add.f32 %f183, %f20, %f20; mul.f32 %f184, %f26, 0f3F000000; sub.f32 %f185, %f183, %f184; add.f32 %f186, %f21, %f21; mul.f32 %f187, %f27, 0f3F000000; sub.f32 %f188, %f186, %f187; mul.f32 %f189, %f1, 0f3FC00000; mul.f32 %f190, %f2, 0f3FC00000; mul.f32 %f191, %f3, 0f3FC00000; sub.f32 %f296, %f188, %f191; sub.f32 %f295, %f185, %f190; sub.f32 %f294, %f182, %f189; $L__BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra $L__BB0_38; bra.uni $L__BB0_37; $L__BB0_38: max.s32 %r89, %r23, 0; bra.uni $L__BB0_39; $L__BB0_37: rem.s32 %r67, %r23, %r41; add.s32 %r68, %r67, %r41; rem.s32 %r89, %r68, %r41; $L__BB0_39: setp.lt.s32 %p45, %r2, 2; mov.f32 %f74, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p46 bra $L__BB0_41; add.s32 %r69, %r89, %r4; mad.lo.s32 %r70, %r69, %r40, %r1; mul.wide.s32 %rd30, %r70, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f76, [%rd33]; ld.global.nc.f32 %f75, [%rd32]; ld.global.nc.f32 %f74, [%rd31]; $L__BB0_41: add.s32 %r27, %r2, -1; @%p43 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r90, %r27, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r71, %r27, %r41; add.s32 %r72, %r71, %r41; rem.s32 %r90, %r72, %r41; $L__BB0_44: setp.lt.s32 %p48, %r2, 1; mov.f32 %f107, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f106, %f107; mov.f32 %f105, %f107; @%p50 bra $L__BB0_46; add.s32 %r73, %r90, %r4; mad.lo.s32 %r74, %r73, %r40, %r1; mul.wide.s32 %rd34, %r74, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f105, [%rd37]; ld.global.nc.f32 %f106, 
[%rd36]; ld.global.nc.f32 %f107, [%rd35]; $L__BB0_46: add.s32 %r31, %r2, 1; @%p43 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: add.s32 %r77, %r41, -1; min.s32 %r91, %r31, %r77; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r75, %r31, %r41; add.s32 %r76, %r75, %r41; rem.s32 %r91, %r76, %r41; $L__BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f86, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f87, %f86; mov.f32 %f88, %f86; @%p54 bra $L__BB0_51; add.s32 %r78, %r91, %r4; mad.lo.s32 %r79, %r78, %r40, %r1; mul.wide.s32 %rd38, %r79, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f88, [%rd41]; ld.global.nc.f32 %f87, [%rd40]; ld.global.nc.f32 %f86, [%rd39]; $L__BB0_51: add.s32 %r35, %r2, 2; @%p43 bra $L__BB0_53; bra.uni $L__BB0_52; $L__BB0_53: add.s32 %r82, %r41, -1; min.s32 %r92, %r35, %r82; bra.uni $L__BB0_54; $L__BB0_52: rem.s32 %r80, %r35, %r41; add.s32 %r81, %r80, %r41; rem.s32 %r92, %r81, %r41; $L__BB0_54: add.s32 %r83, %r92, %r4; mad.lo.s32 %r39, %r83, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f92, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f93, %f92; mov.f32 %f94, %f92; @%p58 bra $L__BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f94, [%rd45]; ld.global.nc.f32 %f93, [%rd44]; ld.global.nc.f32 %f92, [%rd43]; $L__BB0_56: mul.f32 %f216, %f87, %f87; fma.rn.f32 %f217, %f86, %f86, %f216; fma.rn.f32 %f98, %f88, %f88, %f217; setp.eq.f32 %p59, %f98, 0f00000000; @%p59 bra $L__BB0_57; bra.uni $L__BB0_58; $L__BB0_57: mul.f32 %f221, %f106, %f106; fma.rn.f32 %f222, %f107, %f107, %f221; fma.rn.f32 %f223, %f105, %f105, %f222; setp.eq.f32 %p60, %f223, 0f00000000; mov.f32 %f312, 0f00000000; mov.f32 %f313, %f312; mov.f32 %f314, %f312; @%p60 bra $L__BB0_70; $L__BB0_58: mul.f32 %f224, %f75, %f75; fma.rn.f32 %f225, %f74, %f74, %f224; fma.rn.f32 %f111, %f76, %f76, %f225; setp.neu.f32 %p61, %f111, 0f00000000; mul.f32 %f226, %f93, 
%f93; fma.rn.f32 %f227, %f92, %f92, %f226; fma.rn.f32 %f115, %f94, %f94, %f227; setp.neu.f32 %p62, %f115, 0f00000000; and.pred %p63, %p61, %p62; or.pred %p65, %p59, %p63; @%p65 bra $L__BB0_60; mul.f32 %f228, %f106, %f106; fma.rn.f32 %f229, %f107, %f107, %f228; fma.rn.f32 %f230, %f105, %f105, %f229; setp.neu.f32 %p66, %f230, 0f00000000; @%p66 bra $L__BB0_69; bra.uni $L__BB0_60; $L__BB0_69: sub.f32 %f263, %f86, %f107; sub.f32 %f264, %f87, %f106; sub.f32 %f265, %f88, %f105; mul.f32 %f314, %f265, 0f3F000000; mul.f32 %f313, %f264, 0f3F000000; mul.f32 %f312, %f263, 0f3F000000; bra.uni $L__BB0_70; $L__BB0_60: setp.eq.f32 %p67, %f111, 0f00000000; and.pred %p69, %p67, %p59; @%p69 bra $L__BB0_68; bra.uni $L__BB0_61; $L__BB0_68: sub.f32 %f314, %f3, %f105; sub.f32 %f313, %f2, %f106; sub.f32 %f312, %f1, %f107; bra.uni $L__BB0_70; $L__BB0_61: setp.eq.f32 %p70, %f115, 0f00000000; mul.f32 %f231, %f106, %f106; fma.rn.f32 %f232, %f107, %f107, %f231; fma.rn.f32 %f116, %f105, %f105, %f232; setp.eq.f32 %p71, %f116, 0f00000000; and.pred %p72, %p71, %p70; @%p72 bra $L__BB0_67; bra.uni $L__BB0_62; $L__BB0_67: sub.f32 %f314, %f88, %f3; sub.f32 %f313, %f87, %f2; sub.f32 %f312, %f86, %f1; bra.uni $L__BB0_70; $L__BB0_62: setp.neu.f32 %p74, %f98, 0f00000000; or.pred %p75, %p67, %p74; @%p75 bra $L__BB0_64; bra.uni $L__BB0_63; $L__BB0_64: setp.neu.f32 %p76, %f116, 0f00000000; or.pred %p78, %p70, %p76; @%p78 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: sub.f32 %f254, %f86, %f107; sub.f32 %f255, %f87, %f106; sub.f32 %f256, %f88, %f105; sub.f32 %f257, %f74, %f92; mul.f32 %f258, %f257, 0f3DAAAAAB; sub.f32 %f259, %f75, %f93; mul.f32 %f260, %f259, 0f3DAAAAAB; sub.f32 %f261, %f76, %f94; mul.f32 %f262, %f261, 0f3DAAAAAB; fma.rn.f32 %f314, %f256, 0f3F2AAAAB, %f262; fma.rn.f32 %f313, %f255, 0f3F2AAAAB, %f260; fma.rn.f32 %f312, %f254, 0f3F2AAAAB, %f258; bra.uni $L__BB0_70; $L__BB0_63: mul.f32 %f233, %f74, 0f3F000000; add.f32 %f234, %f107, %f107; sub.f32 %f235, %f233, %f234; add.f32 %f236, %f106, %f106; 
// Topological charge carried by the elementary triangle spanned by spins (i, j, k).
// The argument order sets the winding of the triangle and hence the sign of the
// result, so callers must pass the three spins in a consistent order.
// The triple product in the numerator vanishes when any argument is zero or when
// two arguments coincide, which gives zero charge as long as the denominator is
// not negative.
__device__ inline float triangleCharge(float3 mi, float3 mj, float3 mk) {
    float tripleProduct = dot(mi, cross(mj, mk));
    float pairSum       = 1.0f + dot(mi, mj) + dot(mi, mk) + dot(mj, mk);
    return 2.0f * atan2(tripleProduct, pairSum);
}
// Set s to the topological charge density for lattices, based on the solid angle
// subtended by the triangle associated with three spins: a,b,c
//
//       s = 2 atan[(a . b x c /(1 + a.b + a.c + b.c)] / (dx dy)
//
// After M Boettcher et al, New J Phys 20, 103014 (2018), adapted from
// B. Berg and M. Luescher, Nucl. Phys. B 190, 412 (1981), and implemented by
// Joo-Von Kim.
//
// A unit cell comprises two triangles, but s is a site-dependent quantity so we
// double-count and average over four triangles.
//
// Parameters:
//   s          output, one value per cell
//   mx,my,mz   components of the magnetization, one value per cell
//   icxcy      factor the accumulated charge is multiplied by before it is stored
//   Nx,Ny,Nz   grid size; threads outside the grid return without writing
//   PBC        periodic-boundary flags, read through the PBCx / PBCy macros
//
// idx, hclampx/lclampx, hclampy/lclampy, PBCx, PBCy and is0 are defined outside
// this file (see the included headers).
extern "C" __global__ void
settopologicalchargelattice(float* __restrict__ s,
                            float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
                            float icxcy, int Nx, int Ny, int Nz, uint8_t PBC) {

    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    if (ix >= Nx || iy >= Ny || iz >= Nz) {
        return;
    }

    int    i0 = idx(ix, iy, iz);                      // central cell index
    float3 m0 = make_float3(mx[i0], my[i0], mz[i0]);  // central cell magnetization

    // A cell without magnetization carries no charge.
    if (is0(m0)) {
        s[i0] = 0.0f;
        return;
    }

    // indices of the 4 neighbors (counter clockwise)
    int i1 = idx(hclampx(ix+1), iy, iz);  // (i+1,j)
    int i2 = idx(ix, hclampy(iy+1), iz);  // (i,j+1)
    int i3 = idx(lclampx(ix-1), iy, iz);  // (i-1,j)
    int i4 = idx(ix, lclampy(iy-1), iz);  // (i,j-1)

    // magnetization of the 4 neighbors
    float3 m1 = make_float3(mx[i1], my[i1], mz[i1]);
    float3 m2 = make_float3(mx[i2], my[i2], mz[i2]);
    float3 m3 = make_float3(mx[i3], my[i3], mz[i3]);
    float3 m4 = make_float3(mx[i4], my[i4], mz[i4]);

    // local topological charge (accumulator)
    float topcharge = 0.0f;

    // Each of the four triangles below is skipped when it would reach across a
    // non-periodic edge of the grid.

    // charge contribution from the upper right triangle
    // if diagonally opposite neighbor is not zero, use a weight of 1/2 to avoid counting charges twice
    if ((ix+1 < Nx || PBCx) && (iy+1 < Ny || PBCy)) {
        int    i_ = idx(hclampx(ix+1), hclampy(iy+1), iz);  // diagonal neighbor in the upper right quadrant
        float3 m_ = make_float3(mx[i_], my[i_], mz[i_]);
        float  weight = is0(m_) ? 1.0f : 0.5f;
        topcharge += weight * triangleCharge(m0, m1, m2);
    }

    // upper left
    if ((ix-1 >= 0 || PBCx) && (iy+1 < Ny || PBCy)) {
        int    i_ = idx(lclampx(ix-1), hclampy(iy+1), iz);
        float3 m_ = make_float3(mx[i_], my[i_], mz[i_]);
        float  weight = is0(m_) ? 1.0f : 0.5f;
        topcharge += weight * triangleCharge(m0, m2, m3);
    }

    // bottom left
    if ((ix-1 >= 0 || PBCx) && (iy-1 >= 0 || PBCy)) {
        int    i_ = idx(lclampx(ix-1), lclampy(iy-1), iz);
        float3 m_ = make_float3(mx[i_], my[i_], mz[i_]);
        float  weight = is0(m_) ? 1.0f : 0.5f;
        topcharge += weight * triangleCharge(m0, m3, m4);
    }

    // bottom right
    if ((ix+1 < Nx || PBCx) && (iy-1 >= 0 || PBCy)) {
        int    i_ = idx(hclampx(ix+1), lclampy(iy-1), iz);
        float3 m_ = make_float3(mx[i_], my[i_], mz[i_]);
        float  weight = is0(m_) ? 1.0f : 0.5f;
        topcharge += weight * triangleCharge(m0, m4, m1);
    }

    s[i0] = icxcy * topcharge;
}
settopologicalchargelattice_args.argptr[0] = unsafe.Pointer(&settopologicalchargelattice_args.arg_s) settopologicalchargelattice_args.argptr[1] = unsafe.Pointer(&settopologicalchargelattice_args.arg_mx) settopologicalchargelattice_args.argptr[2] = unsafe.Pointer(&settopologicalchargelattice_args.arg_my) settopologicalchargelattice_args.argptr[3] = unsafe.Pointer(&settopologicalchargelattice_args.arg_mz) settopologicalchargelattice_args.argptr[4] = unsafe.Pointer(&settopologicalchargelattice_args.arg_icxcy) settopologicalchargelattice_args.argptr[5] = unsafe.Pointer(&settopologicalchargelattice_args.arg_Nx) settopologicalchargelattice_args.argptr[6] = unsafe.Pointer(&settopologicalchargelattice_args.arg_Ny) settopologicalchargelattice_args.argptr[7] = unsafe.Pointer(&settopologicalchargelattice_args.arg_Nz) settopologicalchargelattice_args.argptr[8] = unsafe.Pointer(&settopologicalchargelattice_args.arg_PBC) } // Wrapper for settopologicalchargelattice CUDA kernel, asynchronous. func k_settopologicalchargelattice_async(s unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, icxcy float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("settopologicalchargelattice") } settopologicalchargelattice_args.Lock() defer settopologicalchargelattice_args.Unlock() if settopologicalchargelattice_code == 0 { settopologicalchargelattice_code = fatbinLoad(settopologicalchargelattice_map, "settopologicalchargelattice") } settopologicalchargelattice_args.arg_s = s settopologicalchargelattice_args.arg_mx = mx settopologicalchargelattice_args.arg_my = my settopologicalchargelattice_args.arg_mz = mz settopologicalchargelattice_args.arg_icxcy = icxcy settopologicalchargelattice_args.arg_Nx = Nx settopologicalchargelattice_args.arg_Ny = Ny settopologicalchargelattice_args.arg_Nz = Nz settopologicalchargelattice_args.arg_PBC = PBC args := settopologicalchargelattice_args.argptr[:] 
cu.LaunchKernel(settopologicalchargelattice_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("settopologicalchargelattice") } } // maps compute capability on PTX code for settopologicalchargelattice kernel. var settopologicalchargelattice_map = map[int]string{0: "", 50: settopologicalchargelattice_ptx_50, 52: settopologicalchargelattice_ptx_52, 53: settopologicalchargelattice_ptx_53, 60: settopologicalchargelattice_ptx_60, 61: settopologicalchargelattice_ptx_61, 62: settopologicalchargelattice_ptx_62, 70: settopologicalchargelattice_ptx_70, 72: settopologicalchargelattice_ptx_72, 75: settopologicalchargelattice_ptx_75, 80: settopologicalchargelattice_ptx_80, 86: settopologicalchargelattice_ptx_86, 87: settopologicalchargelattice_ptx_87, 89: settopologicalchargelattice_ptx_89, 90: settopologicalchargelattice_ptx_90} // settopologicalchargelattice PTX code for various compute capabilities. const ( settopologicalchargelattice_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl settopologicalchargelattice .visible .entry settopologicalchargelattice( .param .u64 settopologicalchargelattice_param_0, .param .u64 settopologicalchargelattice_param_1, .param .u64 settopologicalchargelattice_param_2, .param .u64 settopologicalchargelattice_param_3, .param .f32 settopologicalchargelattice_param_4, .param .u32 settopologicalchargelattice_param_5, .param .u32 settopologicalchargelattice_param_6, .param .u32 settopologicalchargelattice_param_7, .param .u8 settopologicalchargelattice_param_8 ) { .reg .pred %p<83>; .reg .b16 %rs<4>; .reg .f32 %f<296>; .reg .b32 %r<181>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalchargelattice_param_8]; ld.param.u64 %rd5, [settopologicalchargelattice_param_0]; ld.param.u64 %rd6, [settopologicalchargelattice_param_1]; ld.param.u64 %rd7, [settopologicalchargelattice_param_2]; ld.param.u64 %rd8, [settopologicalchargelattice_param_3]; 
ld.param.f32 %f60, [settopologicalchargelattice_param_4]; ld.param.u32 %r49, [settopologicalchargelattice_param_5]; ld.param.u32 %r50, [settopologicalchargelattice_param_6]; ld.param.u32 %r51, [settopologicalchargelattice_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r52, %ntid.x; mov.u32 %r53, %ctaid.x; mov.u32 %r54, %tid.x; mad.lo.s32 %r1, %r53, %r52, %r54; mov.u32 %r55, %ntid.y; mov.u32 %r56, %ctaid.y; mov.u32 %r57, %tid.y; mad.lo.s32 %r2, %r56, %r55, %r57; mov.u32 %r58, %ntid.z; mov.u32 %r59, %ctaid.z; mov.u32 %r60, %tid.z; mad.lo.s32 %r3, %r59, %r58, %r60; setp.ge.s32 %p2, %r1, %r49; setp.ge.s32 %p3, %r2, %r50; or.pred %p4, %p2, %p3; setp.ge.s32 %p5, %r3, %r51; or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_71; mul.lo.s32 %r4, %r3, %r50; add.s32 %r61, %r4, %r2; mul.lo.s32 %r5, %r61, %r49; add.s32 %r62, %r5, %r1; mul.wide.s32 %rd9, %r62, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f61, %f2, %f2; fma.rn.f32 %f62, %f1, %f1, %f61; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f63, %f3, %f3, %f62; setp.eq.f32 %p7, %f63, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p7 bra $L__BB0_70; bra.uni $L__BB0_2; $L__BB0_70: mov.u32 %r168, 0; st.global.u32 [%rd4], %r168; bra.uni $L__BB0_71; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p8, %rs1, 0; add.s32 %r6, %r1, 1; @%p8 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r65, %r49, -1; min.s32 %r169, %r6, %r65; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r63, %r6, %r49; add.s32 %r64, %r63, %r49; rem.s32 %r169, %r64, %r49; $L__BB0_5: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p9, %rs2, 0; add.s32 %r10, %r2, 1; @%p9 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r68, %r50, -1; min.s32 %r170, %r10, %r68; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r66, %r10, %r50; add.s32 %r67, %r66, %r50; rem.s32 %r170, %r67, %r50; $L__BB0_8: add.s32 %r14, 
%r1, -1; @%p8 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r171, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r69, %r14, %r49; add.s32 %r70, %r69, %r49; rem.s32 %r171, %r70, %r49; $L__BB0_11: add.s32 %r18, %r171, %r5; add.s32 %r19, %r2, -1; add.s32 %r20, %r169, %r5; @%p9 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r172, %r19, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r71, %r19, %r50; add.s32 %r72, %r71, %r50; rem.s32 %r172, %r72, %r50; $L__BB0_14: add.s32 %r73, %r172, %r4; mad.lo.s32 %r74, %r73, %r49, %r1; mul.wide.s32 %rd14, %r20, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f4, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f5, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f6, [%rd17]; add.s32 %r75, %r170, %r4; mad.lo.s32 %r76, %r75, %r49, %r1; mul.wide.s32 %rd18, %r76, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f7, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f8, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f9, [%rd21]; mul.wide.s32 %rd22, %r18, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f10, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f11, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f12, [%rd25]; mul.wide.s32 %rd26, %r74, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f13, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f14, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f15, [%rd29]; setp.ne.s16 %p12, %rs1, 0; setp.lt.s32 %p13, %r6, %r49; or.pred %p1, %p13, %p12; not.pred %p14, %p1; mov.f32 %f293, 0f00000000; @%p14 bra $L__BB0_28; setp.ge.s32 %p15, %r10, %r50; and.pred %p17, %p15, %p9; @%p17 bra $L__BB0_28; @%p9 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r79, %r50, -1; min.s32 %r173, %r10, %r79; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r77, %r10, %r50; add.s32 %r78, %r77, %r50; rem.s32 %r173, %r78, %r50; $L__BB0_19: @%p8 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r82, %r49, -1; min.s32 %r174, %r6, %r82; bra.uni 
$L__BB0_22; $L__BB0_20: rem.s32 %r80, %r6, %r49; add.s32 %r81, %r80, %r49; rem.s32 %r174, %r81, %r49; $L__BB0_22: add.s32 %r83, %r173, %r4; mad.lo.s32 %r84, %r83, %r49, %r174; mul.wide.s32 %rd30, %r84, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f66, [%rd31]; ld.global.nc.f32 %f67, [%rd32]; mul.f32 %f68, %f67, %f67; fma.rn.f32 %f69, %f66, %f66, %f68; ld.global.nc.f32 %f70, [%rd33]; fma.rn.f32 %f16, %f70, %f70, %f69; mul.f32 %f71, %f6, %f8; mul.f32 %f72, %f5, %f9; sub.f32 %f73, %f72, %f71; mul.f32 %f74, %f4, %f9; mul.f32 %f75, %f6, %f7; sub.f32 %f76, %f75, %f74; mul.f32 %f77, %f5, %f7; mul.f32 %f78, %f4, %f8; sub.f32 %f79, %f78, %f77; mul.f32 %f80, %f2, %f76; fma.rn.f32 %f81, %f1, %f73, %f80; fma.rn.f32 %f17, %f3, %f79, %f81; mul.f32 %f82, %f2, %f5; fma.rn.f32 %f83, %f1, %f4, %f82; fma.rn.f32 %f84, %f3, %f6, %f83; add.f32 %f85, %f84, 0f3F800000; mul.f32 %f86, %f2, %f8; fma.rn.f32 %f87, %f1, %f7, %f86; fma.rn.f32 %f88, %f3, %f9, %f87; add.f32 %f89, %f85, %f88; mul.f32 %f90, %f5, %f8; fma.rn.f32 %f91, %f4, %f7, %f90; fma.rn.f32 %f92, %f6, %f9, %f91; add.f32 %f18, %f92, %f89; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p20, %f19, 0f00000000; setp.eq.f32 %p21, %f20, 0f00000000; and.pred %p22, %p20, %p21; @%p22 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r95, %f18; shr.s32 %r96, %r95, 31; and.b32 %r97, %r96, 1078530011; mov.b32 %r98, %f17; and.b32 %r99, %r98, -2147483648; or.b32 %r100, %r99, %r97; mov.b32 %f288, %r100; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p23, %f19, 0f7F800000; setp.eq.f32 %p24, %f20, 0f7F800000; and.pred %p25, %p23, %p24; @%p25 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r90, %f18; setp.lt.s32 %p29, %r90, 0; selp.b32 %r91, 1075235812, 1061752795, %p29; mov.b32 %r92, %f17; and.b32 %r93, %r92, -2147483648; or.b32 %r94, %r93, %r91; mov.b32 %f288, %r94; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f93, %f20, %f19; min.f32 %f94, %f20, %f19; div.rn.f32 %f95, 
%f94, %f93; mul.rn.f32 %f96, %f95, %f95; mov.f32 %f97, 0fC0B59883; mov.f32 %f98, 0fBF52C7EA; fma.rn.f32 %f99, %f96, %f98, %f97; mov.f32 %f100, 0fC0D21907; fma.rn.f32 %f101, %f99, %f96, %f100; mul.f32 %f102, %f96, %f101; mul.f32 %f103, %f95, %f102; add.f32 %f104, %f96, 0f41355DC0; mov.f32 %f105, 0f41E6BD60; fma.rn.f32 %f106, %f104, %f96, %f105; mov.f32 %f107, 0f419D92C8; fma.rn.f32 %f108, %f106, %f96, %f107; rcp.rn.f32 %f109, %f108; fma.rn.f32 %f110, %f103, %f109, %f95; mov.f32 %f111, 0f3FC90FDB; sub.f32 %f112, %f111, %f110; setp.gt.f32 %p26, %f20, %f19; selp.f32 %f113, %f112, %f110, %p26; mov.b32 %r85, %f18; setp.lt.s32 %p27, %r85, 0; mov.f32 %f114, 0f40490FDB; sub.f32 %f115, %f114, %f113; selp.f32 %f116, %f115, %f113, %p27; mov.b32 %r86, %f116; mov.b32 %r87, %f17; and.b32 %r88, %r87, -2147483648; or.b32 %r89, %r88, %r86; mov.b32 %f117, %r89; add.f32 %f118, %f19, %f20; setp.le.f32 %p28, %f118, 0f7F800000; selp.f32 %f288, %f117, %f118, %p28; $L__BB0_27: add.f32 %f119, %f288, %f288; setp.eq.f32 %p30, %f16, 0f00000000; selp.f32 %f120, 0f3F800000, 0f3F000000, %p30; fma.rn.f32 %f293, %f120, %f119, 0f00000000; $L__BB0_28: setp.lt.s32 %p31, %r1, 1; and.pred %p33, %p31, %p8; @%p33 bra $L__BB0_55; setp.ge.s32 %p34, %r10, %r50; and.pred %p36, %p34, %p9; @%p36 bra $L__BB0_42; @%p9 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r103, %r50, -1; min.s32 %r175, %r10, %r103; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r101, %r10, %r50; add.s32 %r102, %r101, %r50; rem.s32 %r175, %r102, %r50; $L__BB0_33: @%p8 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r176, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r104, %r14, %r49; add.s32 %r105, %r104, %r49; rem.s32 %r176, %r105, %r49; $L__BB0_36: add.s32 %r106, %r175, %r4; mad.lo.s32 %r107, %r106, %r49, %r176; mul.wide.s32 %rd34, %r107, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f121, [%rd35]; ld.global.nc.f32 %f122, [%rd36]; mul.f32 %f123, %f122, %f122; 
fma.rn.f32 %f124, %f121, %f121, %f123; ld.global.nc.f32 %f125, [%rd37]; fma.rn.f32 %f27, %f125, %f125, %f124; mul.f32 %f126, %f9, %f11; mul.f32 %f127, %f8, %f12; sub.f32 %f128, %f127, %f126; mul.f32 %f129, %f7, %f12; mul.f32 %f130, %f9, %f10; sub.f32 %f131, %f130, %f129; mul.f32 %f132, %f8, %f10; mul.f32 %f133, %f7, %f11; sub.f32 %f134, %f133, %f132; mul.f32 %f135, %f2, %f131; fma.rn.f32 %f136, %f1, %f128, %f135; fma.rn.f32 %f28, %f3, %f134, %f136; mul.f32 %f137, %f2, %f8; fma.rn.f32 %f138, %f1, %f7, %f137; fma.rn.f32 %f139, %f3, %f9, %f138; add.f32 %f140, %f139, 0f3F800000; mul.f32 %f141, %f2, %f11; fma.rn.f32 %f142, %f1, %f10, %f141; fma.rn.f32 %f143, %f3, %f12, %f142; add.f32 %f144, %f140, %f143; mul.f32 %f145, %f8, %f11; fma.rn.f32 %f146, %f7, %f10, %f145; fma.rn.f32 %f147, %f9, %f12, %f146; add.f32 %f29, %f147, %f144; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p39, %f30, 0f00000000; setp.eq.f32 %p40, %f31, 0f00000000; and.pred %p41, %p39, %p40; @%p41 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r118, %f29; shr.s32 %r119, %r118, 31; and.b32 %r120, %r119, 1078530011; mov.b32 %r121, %f28; and.b32 %r122, %r121, -2147483648; or.b32 %r123, %r122, %r120; mov.b32 %f290, %r123; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p42, %f30, 0f7F800000; setp.eq.f32 %p43, %f31, 0f7F800000; and.pred %p44, %p42, %p43; @%p44 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r113, %f29; setp.lt.s32 %p48, %r113, 0; selp.b32 %r114, 1075235812, 1061752795, %p48; mov.b32 %r115, %f28; and.b32 %r116, %r115, -2147483648; or.b32 %r117, %r116, %r114; mov.b32 %f290, %r117; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f148, %f31, %f30; min.f32 %f149, %f31, %f30; div.rn.f32 %f150, %f149, %f148; mul.rn.f32 %f151, %f150, %f150; mov.f32 %f152, 0fC0B59883; mov.f32 %f153, 0fBF52C7EA; fma.rn.f32 %f154, %f151, %f153, %f152; mov.f32 %f155, 0fC0D21907; fma.rn.f32 %f156, %f154, %f151, %f155; mul.f32 %f157, %f151, %f156; mul.f32 %f158, %f150, %f157; add.f32 %f159, %f151, 
0f41355DC0; mov.f32 %f160, 0f41E6BD60; fma.rn.f32 %f161, %f159, %f151, %f160; mov.f32 %f162, 0f419D92C8; fma.rn.f32 %f163, %f161, %f151, %f162; rcp.rn.f32 %f164, %f163; fma.rn.f32 %f165, %f158, %f164, %f150; mov.f32 %f166, 0f3FC90FDB; sub.f32 %f167, %f166, %f165; setp.gt.f32 %p45, %f31, %f30; selp.f32 %f168, %f167, %f165, %p45; mov.b32 %r108, %f29; setp.lt.s32 %p46, %r108, 0; mov.f32 %f169, 0f40490FDB; sub.f32 %f170, %f169, %f168; selp.f32 %f171, %f170, %f168, %p46; mov.b32 %r109, %f171; mov.b32 %r110, %f28; and.b32 %r111, %r110, -2147483648; or.b32 %r112, %r111, %r109; mov.b32 %f172, %r112; add.f32 %f173, %f30, %f31; setp.le.f32 %p47, %f173, 0f7F800000; selp.f32 %f290, %f172, %f173, %p47; $L__BB0_41: add.f32 %f174, %f290, %f290; setp.eq.f32 %p49, %f27, 0f00000000; selp.f32 %f175, 0f3F800000, 0f3F000000, %p49; fma.rn.f32 %f293, %f175, %f174, %f293; $L__BB0_42: setp.lt.s32 %p50, %r2, 1; and.pred %p52, %p50, %p9; @%p52 bra $L__BB0_55; @%p9 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r177, %r19, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r124, %r19, %r50; add.s32 %r125, %r124, %r50; rem.s32 %r177, %r125, %r50; $L__BB0_46: @%p8 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r178, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r126, %r14, %r49; add.s32 %r127, %r126, %r49; rem.s32 %r178, %r127, %r49; $L__BB0_49: add.s32 %r128, %r177, %r4; mad.lo.s32 %r129, %r128, %r49, %r178; mul.wide.s32 %rd38, %r129, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f176, [%rd39]; ld.global.nc.f32 %f177, [%rd40]; mul.f32 %f178, %f177, %f177; fma.rn.f32 %f179, %f176, %f176, %f178; ld.global.nc.f32 %f180, [%rd41]; fma.rn.f32 %f38, %f180, %f180, %f179; mul.f32 %f181, %f12, %f14; mul.f32 %f182, %f11, %f15; sub.f32 %f183, %f182, %f181; mul.f32 %f184, %f10, %f15; mul.f32 %f185, %f12, %f13; sub.f32 %f186, %f185, %f184; mul.f32 %f187, %f11, %f13; mul.f32 %f188, %f10, %f14; sub.f32 %f189, %f188, %f187; mul.f32 
%f190, %f2, %f186; fma.rn.f32 %f191, %f1, %f183, %f190; fma.rn.f32 %f39, %f3, %f189, %f191; mul.f32 %f192, %f2, %f11; fma.rn.f32 %f193, %f1, %f10, %f192; fma.rn.f32 %f194, %f3, %f12, %f193; add.f32 %f195, %f194, 0f3F800000; mul.f32 %f196, %f2, %f14; fma.rn.f32 %f197, %f1, %f13, %f196; fma.rn.f32 %f198, %f3, %f15, %f197; add.f32 %f199, %f195, %f198; mul.f32 %f200, %f11, %f14; fma.rn.f32 %f201, %f10, %f13, %f200; fma.rn.f32 %f202, %f12, %f15, %f201; add.f32 %f40, %f202, %f199; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p55, %f41, 0f00000000; setp.eq.f32 %p56, %f42, 0f00000000; and.pred %p57, %p55, %p56; @%p57 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r140, %f40; shr.s32 %r141, %r140, 31; and.b32 %r142, %r141, 1078530011; mov.b32 %r143, %f39; and.b32 %r144, %r143, -2147483648; or.b32 %r145, %r144, %r142; mov.b32 %f292, %r145; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p58, %f41, 0f7F800000; setp.eq.f32 %p59, %f42, 0f7F800000; and.pred %p60, %p58, %p59; @%p60 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r135, %f40; setp.lt.s32 %p64, %r135, 0; selp.b32 %r136, 1075235812, 1061752795, %p64; mov.b32 %r137, %f39; and.b32 %r138, %r137, -2147483648; or.b32 %r139, %r138, %r136; mov.b32 %f292, %r139; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f203, %f42, %f41; min.f32 %f204, %f42, %f41; div.rn.f32 %f205, %f204, %f203; mul.rn.f32 %f206, %f205, %f205; mov.f32 %f207, 0fC0B59883; mov.f32 %f208, 0fBF52C7EA; fma.rn.f32 %f209, %f206, %f208, %f207; mov.f32 %f210, 0fC0D21907; fma.rn.f32 %f211, %f209, %f206, %f210; mul.f32 %f212, %f206, %f211; mul.f32 %f213, %f205, %f212; add.f32 %f214, %f206, 0f41355DC0; mov.f32 %f215, 0f41E6BD60; fma.rn.f32 %f216, %f214, %f206, %f215; mov.f32 %f217, 0f419D92C8; fma.rn.f32 %f218, %f216, %f206, %f217; rcp.rn.f32 %f219, %f218; fma.rn.f32 %f220, %f213, %f219, %f205; mov.f32 %f221, 0f3FC90FDB; sub.f32 %f222, %f221, %f220; setp.gt.f32 %p61, %f42, %f41; selp.f32 %f223, %f222, %f220, %p61; mov.b32 %r130, %f40; setp.lt.s32 
%p62, %r130, 0; mov.f32 %f224, 0f40490FDB; sub.f32 %f225, %f224, %f223; selp.f32 %f226, %f225, %f223, %p62; mov.b32 %r131, %f226; mov.b32 %r132, %f39; and.b32 %r133, %r132, -2147483648; or.b32 %r134, %r133, %r131; mov.b32 %f227, %r134; add.f32 %f228, %f41, %f42; setp.le.f32 %p63, %f228, 0f7F800000; selp.f32 %f292, %f227, %f228, %p63; $L__BB0_54: add.f32 %f229, %f292, %f292; setp.eq.f32 %p65, %f38, 0f00000000; selp.f32 %f230, 0f3F800000, 0f3F000000, %p65; fma.rn.f32 %f293, %f230, %f229, %f293; $L__BB0_55: @%p14 bra $L__BB0_69; setp.lt.s32 %p67, %r2, 1; and.pred %p69, %p67, %p9; @%p69 bra $L__BB0_69; @%p9 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r179, %r19, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r146, %r19, %r50; add.s32 %r147, %r146, %r50; rem.s32 %r179, %r147, %r50; $L__BB0_60: add.s32 %r45, %r179, %r4; @%p8 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r150, %r49, -1; min.s32 %r180, %r6, %r150; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r148, %r6, %r49; add.s32 %r149, %r148, %r49; rem.s32 %r180, %r149, %r49; $L__BB0_63: mad.lo.s32 %r151, %r45, %r49, %r180; mul.wide.s32 %rd42, %r151, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f231, [%rd43]; ld.global.nc.f32 %f232, [%rd44]; mul.f32 %f233, %f232, %f232; fma.rn.f32 %f234, %f231, %f231, %f233; ld.global.nc.f32 %f235, [%rd45]; fma.rn.f32 %f49, %f235, %f235, %f234; mul.f32 %f236, %f5, %f15; mul.f32 %f237, %f6, %f14; sub.f32 %f238, %f237, %f236; mul.f32 %f239, %f6, %f13; mul.f32 %f240, %f4, %f15; sub.f32 %f241, %f240, %f239; mul.f32 %f242, %f4, %f14; mul.f32 %f243, %f5, %f13; sub.f32 %f244, %f243, %f242; mul.f32 %f245, %f2, %f241; fma.rn.f32 %f246, %f1, %f238, %f245; fma.rn.f32 %f50, %f3, %f244, %f246; mul.f32 %f247, %f2, %f14; fma.rn.f32 %f248, %f1, %f13, %f247; fma.rn.f32 %f249, %f3, %f15, %f248; add.f32 %f250, %f249, 0f3F800000; mul.f32 %f251, %f2, %f5; fma.rn.f32 %f252, %f1, %f4, %f251; fma.rn.f32 %f253, %f3, %f6, %f252; 
add.f32 %f254, %f253, %f250; mul.f32 %f255, %f5, %f14; fma.rn.f32 %f256, %f4, %f13, %f255; fma.rn.f32 %f257, %f6, %f15, %f256; add.f32 %f51, %f257, %f254; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p72, %f52, 0f00000000; setp.eq.f32 %p73, %f53, 0f00000000; and.pred %p74, %p72, %p73; @%p74 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r162, %f51; shr.s32 %r163, %r162, 31; and.b32 %r164, %r163, 1078530011; mov.b32 %r165, %f50; and.b32 %r166, %r165, -2147483648; or.b32 %r167, %r164, %r166; mov.b32 %f294, %r167; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p75, %f52, 0f7F800000; setp.eq.f32 %p76, %f53, 0f7F800000; and.pred %p77, %p75, %p76; @%p77 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r157, %f51; setp.lt.s32 %p81, %r157, 0; selp.b32 %r158, 1075235812, 1061752795, %p81; mov.b32 %r159, %f50; and.b32 %r160, %r159, -2147483648; or.b32 %r161, %r158, %r160; mov.b32 %f294, %r161; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f258, %f53, %f52; min.f32 %f259, %f53, %f52; div.rn.f32 %f260, %f259, %f258; mul.rn.f32 %f261, %f260, %f260; mov.f32 %f262, 0fC0B59883; mov.f32 %f263, 0fBF52C7EA; fma.rn.f32 %f264, %f261, %f263, %f262; mov.f32 %f265, 0fC0D21907; fma.rn.f32 %f266, %f264, %f261, %f265; mul.f32 %f267, %f261, %f266; mul.f32 %f268, %f260, %f267; add.f32 %f269, %f261, 0f41355DC0; mov.f32 %f270, 0f41E6BD60; fma.rn.f32 %f271, %f269, %f261, %f270; mov.f32 %f272, 0f419D92C8; fma.rn.f32 %f273, %f271, %f261, %f272; rcp.rn.f32 %f274, %f273; fma.rn.f32 %f275, %f268, %f274, %f260; mov.f32 %f276, 0f3FC90FDB; sub.f32 %f277, %f276, %f275; setp.gt.f32 %p78, %f53, %f52; selp.f32 %f278, %f277, %f275, %p78; mov.b32 %r152, %f51; setp.lt.s32 %p79, %r152, 0; mov.f32 %f279, 0f40490FDB; sub.f32 %f280, %f279, %f278; selp.f32 %f281, %f280, %f278, %p79; mov.b32 %r153, %f281; mov.b32 %r154, %f50; and.b32 %r155, %r154, -2147483648; or.b32 %r156, %r155, %r153; mov.b32 %f282, %r156; add.f32 %f283, %f52, %f53; setp.le.f32 %p80, %f283, 0f7F800000; selp.f32 %f294, %f282, 
%f283, %p80; $L__BB0_68: add.f32 %f284, %f294, %f294; setp.eq.f32 %p82, %f49, 0f00000000; selp.f32 %f285, 0f3F800000, 0f3F000000, %p82; fma.rn.f32 %f293, %f285, %f284, %f293; $L__BB0_69: mul.f32 %f286, %f293, %f60; st.global.f32 [%rd4], %f286; $L__BB0_71: ret; } ` settopologicalchargelattice_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl settopologicalchargelattice .visible .entry settopologicalchargelattice( .param .u64 settopologicalchargelattice_param_0, .param .u64 settopologicalchargelattice_param_1, .param .u64 settopologicalchargelattice_param_2, .param .u64 settopologicalchargelattice_param_3, .param .f32 settopologicalchargelattice_param_4, .param .u32 settopologicalchargelattice_param_5, .param .u32 settopologicalchargelattice_param_6, .param .u32 settopologicalchargelattice_param_7, .param .u8 settopologicalchargelattice_param_8 ) { .reg .pred %p<83>; .reg .b16 %rs<4>; .reg .f32 %f<296>; .reg .b32 %r<181>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalchargelattice_param_8]; ld.param.u64 %rd5, [settopologicalchargelattice_param_0]; ld.param.u64 %rd6, [settopologicalchargelattice_param_1]; ld.param.u64 %rd7, [settopologicalchargelattice_param_2]; ld.param.u64 %rd8, [settopologicalchargelattice_param_3]; ld.param.f32 %f60, [settopologicalchargelattice_param_4]; ld.param.u32 %r49, [settopologicalchargelattice_param_5]; ld.param.u32 %r50, [settopologicalchargelattice_param_6]; ld.param.u32 %r51, [settopologicalchargelattice_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r52, %ntid.x; mov.u32 %r53, %ctaid.x; mov.u32 %r54, %tid.x; mad.lo.s32 %r1, %r53, %r52, %r54; mov.u32 %r55, %ntid.y; mov.u32 %r56, %ctaid.y; mov.u32 %r57, %tid.y; mad.lo.s32 %r2, %r56, %r55, %r57; mov.u32 %r58, %ntid.z; mov.u32 %r59, %ctaid.z; mov.u32 %r60, %tid.z; mad.lo.s32 %r3, %r59, %r58, %r60; setp.ge.s32 %p2, %r1, %r49; setp.ge.s32 %p3, %r2, %r50; or.pred %p4, %p2, %p3; setp.ge.s32 %p5, %r3, %r51; 
or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_71; mul.lo.s32 %r4, %r3, %r50; add.s32 %r61, %r4, %r2; mul.lo.s32 %r5, %r61, %r49; add.s32 %r62, %r5, %r1; mul.wide.s32 %rd9, %r62, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f61, %f2, %f2; fma.rn.f32 %f62, %f1, %f1, %f61; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f63, %f3, %f3, %f62; setp.eq.f32 %p7, %f63, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p7 bra $L__BB0_70; bra.uni $L__BB0_2; $L__BB0_70: mov.u32 %r168, 0; st.global.u32 [%rd4], %r168; bra.uni $L__BB0_71; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p8, %rs1, 0; add.s32 %r6, %r1, 1; @%p8 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r65, %r49, -1; min.s32 %r169, %r6, %r65; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r63, %r6, %r49; add.s32 %r64, %r63, %r49; rem.s32 %r169, %r64, %r49; $L__BB0_5: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p9, %rs2, 0; add.s32 %r10, %r2, 1; @%p9 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r68, %r50, -1; min.s32 %r170, %r10, %r68; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r66, %r10, %r50; add.s32 %r67, %r66, %r50; rem.s32 %r170, %r67, %r50; $L__BB0_8: add.s32 %r14, %r1, -1; @%p8 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r171, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r69, %r14, %r49; add.s32 %r70, %r69, %r49; rem.s32 %r171, %r70, %r49; $L__BB0_11: add.s32 %r18, %r171, %r5; add.s32 %r19, %r2, -1; add.s32 %r20, %r169, %r5; @%p9 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r172, %r19, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r71, %r19, %r50; add.s32 %r72, %r71, %r50; rem.s32 %r172, %r72, %r50; $L__BB0_14: add.s32 %r73, %r172, %r4; mad.lo.s32 %r74, %r73, %r49, %r1; mul.wide.s32 %rd14, %r20, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f4, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f5, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f6, 
[%rd17]; add.s32 %r75, %r170, %r4; mad.lo.s32 %r76, %r75, %r49, %r1; mul.wide.s32 %rd18, %r76, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f7, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f8, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f9, [%rd21]; mul.wide.s32 %rd22, %r18, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f10, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f11, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f12, [%rd25]; mul.wide.s32 %rd26, %r74, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f13, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f14, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f15, [%rd29]; setp.ne.s16 %p12, %rs1, 0; setp.lt.s32 %p13, %r6, %r49; or.pred %p1, %p13, %p12; not.pred %p14, %p1; mov.f32 %f293, 0f00000000; @%p14 bra $L__BB0_28; setp.ge.s32 %p15, %r10, %r50; and.pred %p17, %p15, %p9; @%p17 bra $L__BB0_28; @%p9 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r79, %r50, -1; min.s32 %r173, %r10, %r79; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r77, %r10, %r50; add.s32 %r78, %r77, %r50; rem.s32 %r173, %r78, %r50; $L__BB0_19: @%p8 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r82, %r49, -1; min.s32 %r174, %r6, %r82; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r80, %r6, %r49; add.s32 %r81, %r80, %r49; rem.s32 %r174, %r81, %r49; $L__BB0_22: add.s32 %r83, %r173, %r4; mad.lo.s32 %r84, %r83, %r49, %r174; mul.wide.s32 %rd30, %r84, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f66, [%rd31]; ld.global.nc.f32 %f67, [%rd32]; mul.f32 %f68, %f67, %f67; fma.rn.f32 %f69, %f66, %f66, %f68; ld.global.nc.f32 %f70, [%rd33]; fma.rn.f32 %f16, %f70, %f70, %f69; mul.f32 %f71, %f6, %f8; mul.f32 %f72, %f5, %f9; sub.f32 %f73, %f72, %f71; mul.f32 %f74, %f4, %f9; mul.f32 %f75, %f6, %f7; sub.f32 %f76, %f75, %f74; mul.f32 %f77, %f5, %f7; mul.f32 %f78, %f4, %f8; sub.f32 %f79, %f78, %f77; mul.f32 %f80, %f2, %f76; fma.rn.f32 
%f81, %f1, %f73, %f80; fma.rn.f32 %f17, %f3, %f79, %f81; mul.f32 %f82, %f2, %f5; fma.rn.f32 %f83, %f1, %f4, %f82; fma.rn.f32 %f84, %f3, %f6, %f83; add.f32 %f85, %f84, 0f3F800000; mul.f32 %f86, %f2, %f8; fma.rn.f32 %f87, %f1, %f7, %f86; fma.rn.f32 %f88, %f3, %f9, %f87; add.f32 %f89, %f85, %f88; mul.f32 %f90, %f5, %f8; fma.rn.f32 %f91, %f4, %f7, %f90; fma.rn.f32 %f92, %f6, %f9, %f91; add.f32 %f18, %f92, %f89; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p20, %f19, 0f00000000; setp.eq.f32 %p21, %f20, 0f00000000; and.pred %p22, %p20, %p21; @%p22 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r95, %f18; shr.s32 %r96, %r95, 31; and.b32 %r97, %r96, 1078530011; mov.b32 %r98, %f17; and.b32 %r99, %r98, -2147483648; or.b32 %r100, %r99, %r97; mov.b32 %f288, %r100; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p23, %f19, 0f7F800000; setp.eq.f32 %p24, %f20, 0f7F800000; and.pred %p25, %p23, %p24; @%p25 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r90, %f18; setp.lt.s32 %p29, %r90, 0; selp.b32 %r91, 1075235812, 1061752795, %p29; mov.b32 %r92, %f17; and.b32 %r93, %r92, -2147483648; or.b32 %r94, %r93, %r91; mov.b32 %f288, %r94; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f93, %f20, %f19; min.f32 %f94, %f20, %f19; div.rn.f32 %f95, %f94, %f93; mul.rn.f32 %f96, %f95, %f95; mov.f32 %f97, 0fC0B59883; mov.f32 %f98, 0fBF52C7EA; fma.rn.f32 %f99, %f96, %f98, %f97; mov.f32 %f100, 0fC0D21907; fma.rn.f32 %f101, %f99, %f96, %f100; mul.f32 %f102, %f96, %f101; mul.f32 %f103, %f95, %f102; add.f32 %f104, %f96, 0f41355DC0; mov.f32 %f105, 0f41E6BD60; fma.rn.f32 %f106, %f104, %f96, %f105; mov.f32 %f107, 0f419D92C8; fma.rn.f32 %f108, %f106, %f96, %f107; rcp.rn.f32 %f109, %f108; fma.rn.f32 %f110, %f103, %f109, %f95; mov.f32 %f111, 0f3FC90FDB; sub.f32 %f112, %f111, %f110; setp.gt.f32 %p26, %f20, %f19; selp.f32 %f113, %f112, %f110, %p26; mov.b32 %r85, %f18; setp.lt.s32 %p27, %r85, 0; mov.f32 %f114, 0f40490FDB; sub.f32 %f115, %f114, %f113; selp.f32 %f116, %f115, %f113, %p27; 
mov.b32 %r86, %f116; mov.b32 %r87, %f17; and.b32 %r88, %r87, -2147483648; or.b32 %r89, %r88, %r86; mov.b32 %f117, %r89; add.f32 %f118, %f19, %f20; setp.le.f32 %p28, %f118, 0f7F800000; selp.f32 %f288, %f117, %f118, %p28; $L__BB0_27: add.f32 %f119, %f288, %f288; setp.eq.f32 %p30, %f16, 0f00000000; selp.f32 %f120, 0f3F800000, 0f3F000000, %p30; fma.rn.f32 %f293, %f120, %f119, 0f00000000; $L__BB0_28: setp.lt.s32 %p31, %r1, 1; and.pred %p33, %p31, %p8; @%p33 bra $L__BB0_55; setp.ge.s32 %p34, %r10, %r50; and.pred %p36, %p34, %p9; @%p36 bra $L__BB0_42; @%p9 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r103, %r50, -1; min.s32 %r175, %r10, %r103; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r101, %r10, %r50; add.s32 %r102, %r101, %r50; rem.s32 %r175, %r102, %r50; $L__BB0_33: @%p8 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r176, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r104, %r14, %r49; add.s32 %r105, %r104, %r49; rem.s32 %r176, %r105, %r49; $L__BB0_36: add.s32 %r106, %r175, %r4; mad.lo.s32 %r107, %r106, %r49, %r176; mul.wide.s32 %rd34, %r107, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f121, [%rd35]; ld.global.nc.f32 %f122, [%rd36]; mul.f32 %f123, %f122, %f122; fma.rn.f32 %f124, %f121, %f121, %f123; ld.global.nc.f32 %f125, [%rd37]; fma.rn.f32 %f27, %f125, %f125, %f124; mul.f32 %f126, %f9, %f11; mul.f32 %f127, %f8, %f12; sub.f32 %f128, %f127, %f126; mul.f32 %f129, %f7, %f12; mul.f32 %f130, %f9, %f10; sub.f32 %f131, %f130, %f129; mul.f32 %f132, %f8, %f10; mul.f32 %f133, %f7, %f11; sub.f32 %f134, %f133, %f132; mul.f32 %f135, %f2, %f131; fma.rn.f32 %f136, %f1, %f128, %f135; fma.rn.f32 %f28, %f3, %f134, %f136; mul.f32 %f137, %f2, %f8; fma.rn.f32 %f138, %f1, %f7, %f137; fma.rn.f32 %f139, %f3, %f9, %f138; add.f32 %f140, %f139, 0f3F800000; mul.f32 %f141, %f2, %f11; fma.rn.f32 %f142, %f1, %f10, %f141; fma.rn.f32 %f143, %f3, %f12, %f142; add.f32 %f144, %f140, %f143; mul.f32 %f145, %f8, %f11; 
fma.rn.f32 %f146, %f7, %f10, %f145; fma.rn.f32 %f147, %f9, %f12, %f146; add.f32 %f29, %f147, %f144; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p39, %f30, 0f00000000; setp.eq.f32 %p40, %f31, 0f00000000; and.pred %p41, %p39, %p40; @%p41 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r118, %f29; shr.s32 %r119, %r118, 31; and.b32 %r120, %r119, 1078530011; mov.b32 %r121, %f28; and.b32 %r122, %r121, -2147483648; or.b32 %r123, %r122, %r120; mov.b32 %f290, %r123; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p42, %f30, 0f7F800000; setp.eq.f32 %p43, %f31, 0f7F800000; and.pred %p44, %p42, %p43; @%p44 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r113, %f29; setp.lt.s32 %p48, %r113, 0; selp.b32 %r114, 1075235812, 1061752795, %p48; mov.b32 %r115, %f28; and.b32 %r116, %r115, -2147483648; or.b32 %r117, %r116, %r114; mov.b32 %f290, %r117; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f148, %f31, %f30; min.f32 %f149, %f31, %f30; div.rn.f32 %f150, %f149, %f148; mul.rn.f32 %f151, %f150, %f150; mov.f32 %f152, 0fC0B59883; mov.f32 %f153, 0fBF52C7EA; fma.rn.f32 %f154, %f151, %f153, %f152; mov.f32 %f155, 0fC0D21907; fma.rn.f32 %f156, %f154, %f151, %f155; mul.f32 %f157, %f151, %f156; mul.f32 %f158, %f150, %f157; add.f32 %f159, %f151, 0f41355DC0; mov.f32 %f160, 0f41E6BD60; fma.rn.f32 %f161, %f159, %f151, %f160; mov.f32 %f162, 0f419D92C8; fma.rn.f32 %f163, %f161, %f151, %f162; rcp.rn.f32 %f164, %f163; fma.rn.f32 %f165, %f158, %f164, %f150; mov.f32 %f166, 0f3FC90FDB; sub.f32 %f167, %f166, %f165; setp.gt.f32 %p45, %f31, %f30; selp.f32 %f168, %f167, %f165, %p45; mov.b32 %r108, %f29; setp.lt.s32 %p46, %r108, 0; mov.f32 %f169, 0f40490FDB; sub.f32 %f170, %f169, %f168; selp.f32 %f171, %f170, %f168, %p46; mov.b32 %r109, %f171; mov.b32 %r110, %f28; and.b32 %r111, %r110, -2147483648; or.b32 %r112, %r111, %r109; mov.b32 %f172, %r112; add.f32 %f173, %f30, %f31; setp.le.f32 %p47, %f173, 0f7F800000; selp.f32 %f290, %f172, %f173, %p47; $L__BB0_41: add.f32 %f174, %f290, %f290; 
setp.eq.f32 %p49, %f27, 0f00000000; selp.f32 %f175, 0f3F800000, 0f3F000000, %p49; fma.rn.f32 %f293, %f175, %f174, %f293; $L__BB0_42: setp.lt.s32 %p50, %r2, 1; and.pred %p52, %p50, %p9; @%p52 bra $L__BB0_55; @%p9 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r177, %r19, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r124, %r19, %r50; add.s32 %r125, %r124, %r50; rem.s32 %r177, %r125, %r50; $L__BB0_46: @%p8 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r178, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r126, %r14, %r49; add.s32 %r127, %r126, %r49; rem.s32 %r178, %r127, %r49; $L__BB0_49: add.s32 %r128, %r177, %r4; mad.lo.s32 %r129, %r128, %r49, %r178; mul.wide.s32 %rd38, %r129, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f176, [%rd39]; ld.global.nc.f32 %f177, [%rd40]; mul.f32 %f178, %f177, %f177; fma.rn.f32 %f179, %f176, %f176, %f178; ld.global.nc.f32 %f180, [%rd41]; fma.rn.f32 %f38, %f180, %f180, %f179; mul.f32 %f181, %f12, %f14; mul.f32 %f182, %f11, %f15; sub.f32 %f183, %f182, %f181; mul.f32 %f184, %f10, %f15; mul.f32 %f185, %f12, %f13; sub.f32 %f186, %f185, %f184; mul.f32 %f187, %f11, %f13; mul.f32 %f188, %f10, %f14; sub.f32 %f189, %f188, %f187; mul.f32 %f190, %f2, %f186; fma.rn.f32 %f191, %f1, %f183, %f190; fma.rn.f32 %f39, %f3, %f189, %f191; mul.f32 %f192, %f2, %f11; fma.rn.f32 %f193, %f1, %f10, %f192; fma.rn.f32 %f194, %f3, %f12, %f193; add.f32 %f195, %f194, 0f3F800000; mul.f32 %f196, %f2, %f14; fma.rn.f32 %f197, %f1, %f13, %f196; fma.rn.f32 %f198, %f3, %f15, %f197; add.f32 %f199, %f195, %f198; mul.f32 %f200, %f11, %f14; fma.rn.f32 %f201, %f10, %f13, %f200; fma.rn.f32 %f202, %f12, %f15, %f201; add.f32 %f40, %f202, %f199; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p55, %f41, 0f00000000; setp.eq.f32 %p56, %f42, 0f00000000; and.pred %p57, %p55, %p56; @%p57 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r140, %f40; shr.s32 %r141, %r140, 31; and.b32 %r142, %r141, 
1078530011; mov.b32 %r143, %f39; and.b32 %r144, %r143, -2147483648; or.b32 %r145, %r144, %r142; mov.b32 %f292, %r145; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p58, %f41, 0f7F800000; setp.eq.f32 %p59, %f42, 0f7F800000; and.pred %p60, %p58, %p59; @%p60 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r135, %f40; setp.lt.s32 %p64, %r135, 0; selp.b32 %r136, 1075235812, 1061752795, %p64; mov.b32 %r137, %f39; and.b32 %r138, %r137, -2147483648; or.b32 %r139, %r138, %r136; mov.b32 %f292, %r139; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f203, %f42, %f41; min.f32 %f204, %f42, %f41; div.rn.f32 %f205, %f204, %f203; mul.rn.f32 %f206, %f205, %f205; mov.f32 %f207, 0fC0B59883; mov.f32 %f208, 0fBF52C7EA; fma.rn.f32 %f209, %f206, %f208, %f207; mov.f32 %f210, 0fC0D21907; fma.rn.f32 %f211, %f209, %f206, %f210; mul.f32 %f212, %f206, %f211; mul.f32 %f213, %f205, %f212; add.f32 %f214, %f206, 0f41355DC0; mov.f32 %f215, 0f41E6BD60; fma.rn.f32 %f216, %f214, %f206, %f215; mov.f32 %f217, 0f419D92C8; fma.rn.f32 %f218, %f216, %f206, %f217; rcp.rn.f32 %f219, %f218; fma.rn.f32 %f220, %f213, %f219, %f205; mov.f32 %f221, 0f3FC90FDB; sub.f32 %f222, %f221, %f220; setp.gt.f32 %p61, %f42, %f41; selp.f32 %f223, %f222, %f220, %p61; mov.b32 %r130, %f40; setp.lt.s32 %p62, %r130, 0; mov.f32 %f224, 0f40490FDB; sub.f32 %f225, %f224, %f223; selp.f32 %f226, %f225, %f223, %p62; mov.b32 %r131, %f226; mov.b32 %r132, %f39; and.b32 %r133, %r132, -2147483648; or.b32 %r134, %r133, %r131; mov.b32 %f227, %r134; add.f32 %f228, %f41, %f42; setp.le.f32 %p63, %f228, 0f7F800000; selp.f32 %f292, %f227, %f228, %p63; $L__BB0_54: add.f32 %f229, %f292, %f292; setp.eq.f32 %p65, %f38, 0f00000000; selp.f32 %f230, 0f3F800000, 0f3F000000, %p65; fma.rn.f32 %f293, %f230, %f229, %f293; $L__BB0_55: @%p14 bra $L__BB0_69; setp.lt.s32 %p67, %r2, 1; and.pred %p69, %p67, %p9; @%p69 bra $L__BB0_69; @%p9 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r179, %r19, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r146, %r19, %r50; 
add.s32 %r147, %r146, %r50; rem.s32 %r179, %r147, %r50; $L__BB0_60: add.s32 %r45, %r179, %r4; @%p8 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r150, %r49, -1; min.s32 %r180, %r6, %r150; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r148, %r6, %r49; add.s32 %r149, %r148, %r49; rem.s32 %r180, %r149, %r49; $L__BB0_63: mad.lo.s32 %r151, %r45, %r49, %r180; mul.wide.s32 %rd42, %r151, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f231, [%rd43]; ld.global.nc.f32 %f232, [%rd44]; mul.f32 %f233, %f232, %f232; fma.rn.f32 %f234, %f231, %f231, %f233; ld.global.nc.f32 %f235, [%rd45]; fma.rn.f32 %f49, %f235, %f235, %f234; mul.f32 %f236, %f5, %f15; mul.f32 %f237, %f6, %f14; sub.f32 %f238, %f237, %f236; mul.f32 %f239, %f6, %f13; mul.f32 %f240, %f4, %f15; sub.f32 %f241, %f240, %f239; mul.f32 %f242, %f4, %f14; mul.f32 %f243, %f5, %f13; sub.f32 %f244, %f243, %f242; mul.f32 %f245, %f2, %f241; fma.rn.f32 %f246, %f1, %f238, %f245; fma.rn.f32 %f50, %f3, %f244, %f246; mul.f32 %f247, %f2, %f14; fma.rn.f32 %f248, %f1, %f13, %f247; fma.rn.f32 %f249, %f3, %f15, %f248; add.f32 %f250, %f249, 0f3F800000; mul.f32 %f251, %f2, %f5; fma.rn.f32 %f252, %f1, %f4, %f251; fma.rn.f32 %f253, %f3, %f6, %f252; add.f32 %f254, %f253, %f250; mul.f32 %f255, %f5, %f14; fma.rn.f32 %f256, %f4, %f13, %f255; fma.rn.f32 %f257, %f6, %f15, %f256; add.f32 %f51, %f257, %f254; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p72, %f52, 0f00000000; setp.eq.f32 %p73, %f53, 0f00000000; and.pred %p74, %p72, %p73; @%p74 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r162, %f51; shr.s32 %r163, %r162, 31; and.b32 %r164, %r163, 1078530011; mov.b32 %r165, %f50; and.b32 %r166, %r165, -2147483648; or.b32 %r167, %r164, %r166; mov.b32 %f294, %r167; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p75, %f52, 0f7F800000; setp.eq.f32 %p76, %f53, 0f7F800000; and.pred %p77, %p75, %p76; @%p77 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r157, %f51; 
setp.lt.s32 %p81, %r157, 0; selp.b32 %r158, 1075235812, 1061752795, %p81; mov.b32 %r159, %f50; and.b32 %r160, %r159, -2147483648; or.b32 %r161, %r158, %r160; mov.b32 %f294, %r161; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f258, %f53, %f52; min.f32 %f259, %f53, %f52; div.rn.f32 %f260, %f259, %f258; mul.rn.f32 %f261, %f260, %f260; mov.f32 %f262, 0fC0B59883; mov.f32 %f263, 0fBF52C7EA; fma.rn.f32 %f264, %f261, %f263, %f262; mov.f32 %f265, 0fC0D21907; fma.rn.f32 %f266, %f264, %f261, %f265; mul.f32 %f267, %f261, %f266; mul.f32 %f268, %f260, %f267; add.f32 %f269, %f261, 0f41355DC0; mov.f32 %f270, 0f41E6BD60; fma.rn.f32 %f271, %f269, %f261, %f270; mov.f32 %f272, 0f419D92C8; fma.rn.f32 %f273, %f271, %f261, %f272; rcp.rn.f32 %f274, %f273; fma.rn.f32 %f275, %f268, %f274, %f260; mov.f32 %f276, 0f3FC90FDB; sub.f32 %f277, %f276, %f275; setp.gt.f32 %p78, %f53, %f52; selp.f32 %f278, %f277, %f275, %p78; mov.b32 %r152, %f51; setp.lt.s32 %p79, %r152, 0; mov.f32 %f279, 0f40490FDB; sub.f32 %f280, %f279, %f278; selp.f32 %f281, %f280, %f278, %p79; mov.b32 %r153, %f281; mov.b32 %r154, %f50; and.b32 %r155, %r154, -2147483648; or.b32 %r156, %r155, %r153; mov.b32 %f282, %r156; add.f32 %f283, %f52, %f53; setp.le.f32 %p80, %f283, 0f7F800000; selp.f32 %f294, %f282, %f283, %p80; $L__BB0_68: add.f32 %f284, %f294, %f294; setp.eq.f32 %p82, %f49, 0f00000000; selp.f32 %f285, 0f3F800000, 0f3F000000, %p82; fma.rn.f32 %f293, %f285, %f284, %f293; $L__BB0_69: mul.f32 %f286, %f293, %f60; st.global.f32 [%rd4], %f286; $L__BB0_71: ret; } ` settopologicalchargelattice_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl settopologicalchargelattice .visible .entry settopologicalchargelattice( .param .u64 settopologicalchargelattice_param_0, .param .u64 settopologicalchargelattice_param_1, .param .u64 settopologicalchargelattice_param_2, .param .u64 settopologicalchargelattice_param_3, .param .f32 settopologicalchargelattice_param_4, .param .u32 settopologicalchargelattice_param_5, .param .u32 
settopologicalchargelattice_param_6, .param .u32 settopologicalchargelattice_param_7, .param .u8 settopologicalchargelattice_param_8 ) { .reg .pred %p<83>; .reg .b16 %rs<4>; .reg .f32 %f<296>; .reg .b32 %r<181>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalchargelattice_param_8]; ld.param.u64 %rd5, [settopologicalchargelattice_param_0]; ld.param.u64 %rd6, [settopologicalchargelattice_param_1]; ld.param.u64 %rd7, [settopologicalchargelattice_param_2]; ld.param.u64 %rd8, [settopologicalchargelattice_param_3]; ld.param.f32 %f60, [settopologicalchargelattice_param_4]; ld.param.u32 %r49, [settopologicalchargelattice_param_5]; ld.param.u32 %r50, [settopologicalchargelattice_param_6]; ld.param.u32 %r51, [settopologicalchargelattice_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r52, %ntid.x; mov.u32 %r53, %ctaid.x; mov.u32 %r54, %tid.x; mad.lo.s32 %r1, %r53, %r52, %r54; mov.u32 %r55, %ntid.y; mov.u32 %r56, %ctaid.y; mov.u32 %r57, %tid.y; mad.lo.s32 %r2, %r56, %r55, %r57; mov.u32 %r58, %ntid.z; mov.u32 %r59, %ctaid.z; mov.u32 %r60, %tid.z; mad.lo.s32 %r3, %r59, %r58, %r60; setp.ge.s32 %p2, %r1, %r49; setp.ge.s32 %p3, %r2, %r50; or.pred %p4, %p2, %p3; setp.ge.s32 %p5, %r3, %r51; or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_71; mul.lo.s32 %r4, %r3, %r50; add.s32 %r61, %r4, %r2; mul.lo.s32 %r5, %r61, %r49; add.s32 %r62, %r5, %r1; mul.wide.s32 %rd9, %r62, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f61, %f2, %f2; fma.rn.f32 %f62, %f1, %f1, %f61; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f63, %f3, %f3, %f62; setp.eq.f32 %p7, %f63, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p7 bra $L__BB0_70; bra.uni $L__BB0_2; $L__BB0_70: mov.u32 %r168, 0; st.global.u32 [%rd4], %r168; bra.uni $L__BB0_71; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p8, %rs1, 0; add.s32 %r6, %r1, 1; @%p8 
bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r65, %r49, -1; min.s32 %r169, %r6, %r65; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r63, %r6, %r49; add.s32 %r64, %r63, %r49; rem.s32 %r169, %r64, %r49; $L__BB0_5: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p9, %rs2, 0; add.s32 %r10, %r2, 1; @%p9 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r68, %r50, -1; min.s32 %r170, %r10, %r68; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r66, %r10, %r50; add.s32 %r67, %r66, %r50; rem.s32 %r170, %r67, %r50; $L__BB0_8: add.s32 %r14, %r1, -1; @%p8 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r171, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r69, %r14, %r49; add.s32 %r70, %r69, %r49; rem.s32 %r171, %r70, %r49; $L__BB0_11: add.s32 %r18, %r171, %r5; add.s32 %r19, %r2, -1; add.s32 %r20, %r169, %r5; @%p9 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r172, %r19, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r71, %r19, %r50; add.s32 %r72, %r71, %r50; rem.s32 %r172, %r72, %r50; $L__BB0_14: add.s32 %r73, %r172, %r4; mad.lo.s32 %r74, %r73, %r49, %r1; mul.wide.s32 %rd14, %r20, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f4, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f5, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f6, [%rd17]; add.s32 %r75, %r170, %r4; mad.lo.s32 %r76, %r75, %r49, %r1; mul.wide.s32 %rd18, %r76, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f7, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f8, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f9, [%rd21]; mul.wide.s32 %rd22, %r18, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f10, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f11, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f12, [%rd25]; mul.wide.s32 %rd26, %r74, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f13, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f14, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f15, [%rd29]; setp.ne.s16 %p12, %rs1, 0; setp.lt.s32 %p13, 
%r6, %r49; or.pred %p1, %p13, %p12; not.pred %p14, %p1; mov.f32 %f293, 0f00000000; @%p14 bra $L__BB0_28; setp.ge.s32 %p15, %r10, %r50; and.pred %p17, %p15, %p9; @%p17 bra $L__BB0_28; @%p9 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r79, %r50, -1; min.s32 %r173, %r10, %r79; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r77, %r10, %r50; add.s32 %r78, %r77, %r50; rem.s32 %r173, %r78, %r50; $L__BB0_19: @%p8 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r82, %r49, -1; min.s32 %r174, %r6, %r82; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r80, %r6, %r49; add.s32 %r81, %r80, %r49; rem.s32 %r174, %r81, %r49; $L__BB0_22: add.s32 %r83, %r173, %r4; mad.lo.s32 %r84, %r83, %r49, %r174; mul.wide.s32 %rd30, %r84, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f66, [%rd31]; ld.global.nc.f32 %f67, [%rd32]; mul.f32 %f68, %f67, %f67; fma.rn.f32 %f69, %f66, %f66, %f68; ld.global.nc.f32 %f70, [%rd33]; fma.rn.f32 %f16, %f70, %f70, %f69; mul.f32 %f71, %f6, %f8; mul.f32 %f72, %f5, %f9; sub.f32 %f73, %f72, %f71; mul.f32 %f74, %f4, %f9; mul.f32 %f75, %f6, %f7; sub.f32 %f76, %f75, %f74; mul.f32 %f77, %f5, %f7; mul.f32 %f78, %f4, %f8; sub.f32 %f79, %f78, %f77; mul.f32 %f80, %f2, %f76; fma.rn.f32 %f81, %f1, %f73, %f80; fma.rn.f32 %f17, %f3, %f79, %f81; mul.f32 %f82, %f2, %f5; fma.rn.f32 %f83, %f1, %f4, %f82; fma.rn.f32 %f84, %f3, %f6, %f83; add.f32 %f85, %f84, 0f3F800000; mul.f32 %f86, %f2, %f8; fma.rn.f32 %f87, %f1, %f7, %f86; fma.rn.f32 %f88, %f3, %f9, %f87; add.f32 %f89, %f85, %f88; mul.f32 %f90, %f5, %f8; fma.rn.f32 %f91, %f4, %f7, %f90; fma.rn.f32 %f92, %f6, %f9, %f91; add.f32 %f18, %f92, %f89; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p20, %f19, 0f00000000; setp.eq.f32 %p21, %f20, 0f00000000; and.pred %p22, %p20, %p21; @%p22 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r95, %f18; shr.s32 %r96, %r95, 31; and.b32 %r97, %r96, 1078530011; mov.b32 %r98, %f17; and.b32 %r99, %r98, -2147483648; or.b32 
%r100, %r99, %r97; mov.b32 %f288, %r100; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p23, %f19, 0f7F800000; setp.eq.f32 %p24, %f20, 0f7F800000; and.pred %p25, %p23, %p24; @%p25 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r90, %f18; setp.lt.s32 %p29, %r90, 0; selp.b32 %r91, 1075235812, 1061752795, %p29; mov.b32 %r92, %f17; and.b32 %r93, %r92, -2147483648; or.b32 %r94, %r93, %r91; mov.b32 %f288, %r94; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f93, %f20, %f19; min.f32 %f94, %f20, %f19; div.rn.f32 %f95, %f94, %f93; mul.rn.f32 %f96, %f95, %f95; mov.f32 %f97, 0fC0B59883; mov.f32 %f98, 0fBF52C7EA; fma.rn.f32 %f99, %f96, %f98, %f97; mov.f32 %f100, 0fC0D21907; fma.rn.f32 %f101, %f99, %f96, %f100; mul.f32 %f102, %f96, %f101; mul.f32 %f103, %f95, %f102; add.f32 %f104, %f96, 0f41355DC0; mov.f32 %f105, 0f41E6BD60; fma.rn.f32 %f106, %f104, %f96, %f105; mov.f32 %f107, 0f419D92C8; fma.rn.f32 %f108, %f106, %f96, %f107; rcp.rn.f32 %f109, %f108; fma.rn.f32 %f110, %f103, %f109, %f95; mov.f32 %f111, 0f3FC90FDB; sub.f32 %f112, %f111, %f110; setp.gt.f32 %p26, %f20, %f19; selp.f32 %f113, %f112, %f110, %p26; mov.b32 %r85, %f18; setp.lt.s32 %p27, %r85, 0; mov.f32 %f114, 0f40490FDB; sub.f32 %f115, %f114, %f113; selp.f32 %f116, %f115, %f113, %p27; mov.b32 %r86, %f116; mov.b32 %r87, %f17; and.b32 %r88, %r87, -2147483648; or.b32 %r89, %r88, %r86; mov.b32 %f117, %r89; add.f32 %f118, %f19, %f20; setp.le.f32 %p28, %f118, 0f7F800000; selp.f32 %f288, %f117, %f118, %p28; $L__BB0_27: add.f32 %f119, %f288, %f288; setp.eq.f32 %p30, %f16, 0f00000000; selp.f32 %f120, 0f3F800000, 0f3F000000, %p30; fma.rn.f32 %f293, %f120, %f119, 0f00000000; $L__BB0_28: setp.lt.s32 %p31, %r1, 1; and.pred %p33, %p31, %p8; @%p33 bra $L__BB0_55; setp.ge.s32 %p34, %r10, %r50; and.pred %p36, %p34, %p9; @%p36 bra $L__BB0_42; @%p9 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r103, %r50, -1; min.s32 %r175, %r10, %r103; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r101, %r10, %r50; add.s32 %r102, %r101, %r50; 
rem.s32 %r175, %r102, %r50; $L__BB0_33: @%p8 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r176, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r104, %r14, %r49; add.s32 %r105, %r104, %r49; rem.s32 %r176, %r105, %r49; $L__BB0_36: add.s32 %r106, %r175, %r4; mad.lo.s32 %r107, %r106, %r49, %r176; mul.wide.s32 %rd34, %r107, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f121, [%rd35]; ld.global.nc.f32 %f122, [%rd36]; mul.f32 %f123, %f122, %f122; fma.rn.f32 %f124, %f121, %f121, %f123; ld.global.nc.f32 %f125, [%rd37]; fma.rn.f32 %f27, %f125, %f125, %f124; mul.f32 %f126, %f9, %f11; mul.f32 %f127, %f8, %f12; sub.f32 %f128, %f127, %f126; mul.f32 %f129, %f7, %f12; mul.f32 %f130, %f9, %f10; sub.f32 %f131, %f130, %f129; mul.f32 %f132, %f8, %f10; mul.f32 %f133, %f7, %f11; sub.f32 %f134, %f133, %f132; mul.f32 %f135, %f2, %f131; fma.rn.f32 %f136, %f1, %f128, %f135; fma.rn.f32 %f28, %f3, %f134, %f136; mul.f32 %f137, %f2, %f8; fma.rn.f32 %f138, %f1, %f7, %f137; fma.rn.f32 %f139, %f3, %f9, %f138; add.f32 %f140, %f139, 0f3F800000; mul.f32 %f141, %f2, %f11; fma.rn.f32 %f142, %f1, %f10, %f141; fma.rn.f32 %f143, %f3, %f12, %f142; add.f32 %f144, %f140, %f143; mul.f32 %f145, %f8, %f11; fma.rn.f32 %f146, %f7, %f10, %f145; fma.rn.f32 %f147, %f9, %f12, %f146; add.f32 %f29, %f147, %f144; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p39, %f30, 0f00000000; setp.eq.f32 %p40, %f31, 0f00000000; and.pred %p41, %p39, %p40; @%p41 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r118, %f29; shr.s32 %r119, %r118, 31; and.b32 %r120, %r119, 1078530011; mov.b32 %r121, %f28; and.b32 %r122, %r121, -2147483648; or.b32 %r123, %r122, %r120; mov.b32 %f290, %r123; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p42, %f30, 0f7F800000; setp.eq.f32 %p43, %f31, 0f7F800000; and.pred %p44, %p42, %p43; @%p44 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r113, %f29; setp.lt.s32 %p48, %r113, 0; selp.b32 %r114, 1075235812, 
1061752795, %p48; mov.b32 %r115, %f28; and.b32 %r116, %r115, -2147483648; or.b32 %r117, %r116, %r114; mov.b32 %f290, %r117; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f148, %f31, %f30; min.f32 %f149, %f31, %f30; div.rn.f32 %f150, %f149, %f148; mul.rn.f32 %f151, %f150, %f150; mov.f32 %f152, 0fC0B59883; mov.f32 %f153, 0fBF52C7EA; fma.rn.f32 %f154, %f151, %f153, %f152; mov.f32 %f155, 0fC0D21907; fma.rn.f32 %f156, %f154, %f151, %f155; mul.f32 %f157, %f151, %f156; mul.f32 %f158, %f150, %f157; add.f32 %f159, %f151, 0f41355DC0; mov.f32 %f160, 0f41E6BD60; fma.rn.f32 %f161, %f159, %f151, %f160; mov.f32 %f162, 0f419D92C8; fma.rn.f32 %f163, %f161, %f151, %f162; rcp.rn.f32 %f164, %f163; fma.rn.f32 %f165, %f158, %f164, %f150; mov.f32 %f166, 0f3FC90FDB; sub.f32 %f167, %f166, %f165; setp.gt.f32 %p45, %f31, %f30; selp.f32 %f168, %f167, %f165, %p45; mov.b32 %r108, %f29; setp.lt.s32 %p46, %r108, 0; mov.f32 %f169, 0f40490FDB; sub.f32 %f170, %f169, %f168; selp.f32 %f171, %f170, %f168, %p46; mov.b32 %r109, %f171; mov.b32 %r110, %f28; and.b32 %r111, %r110, -2147483648; or.b32 %r112, %r111, %r109; mov.b32 %f172, %r112; add.f32 %f173, %f30, %f31; setp.le.f32 %p47, %f173, 0f7F800000; selp.f32 %f290, %f172, %f173, %p47; $L__BB0_41: add.f32 %f174, %f290, %f290; setp.eq.f32 %p49, %f27, 0f00000000; selp.f32 %f175, 0f3F800000, 0f3F000000, %p49; fma.rn.f32 %f293, %f175, %f174, %f293; $L__BB0_42: setp.lt.s32 %p50, %r2, 1; and.pred %p52, %p50, %p9; @%p52 bra $L__BB0_55; @%p9 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r177, %r19, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r124, %r19, %r50; add.s32 %r125, %r124, %r50; rem.s32 %r177, %r125, %r50; $L__BB0_46: @%p8 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r178, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r126, %r14, %r49; add.s32 %r127, %r126, %r49; rem.s32 %r178, %r127, %r49; $L__BB0_49: add.s32 %r128, %r177, %r4; mad.lo.s32 %r129, %r128, %r49, %r178; mul.wide.s32 %rd38, %r129, 4; add.s64 %rd39, %rd3, %rd38; add.s64 
%rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f176, [%rd39]; ld.global.nc.f32 %f177, [%rd40]; mul.f32 %f178, %f177, %f177; fma.rn.f32 %f179, %f176, %f176, %f178; ld.global.nc.f32 %f180, [%rd41]; fma.rn.f32 %f38, %f180, %f180, %f179; mul.f32 %f181, %f12, %f14; mul.f32 %f182, %f11, %f15; sub.f32 %f183, %f182, %f181; mul.f32 %f184, %f10, %f15; mul.f32 %f185, %f12, %f13; sub.f32 %f186, %f185, %f184; mul.f32 %f187, %f11, %f13; mul.f32 %f188, %f10, %f14; sub.f32 %f189, %f188, %f187; mul.f32 %f190, %f2, %f186; fma.rn.f32 %f191, %f1, %f183, %f190; fma.rn.f32 %f39, %f3, %f189, %f191; mul.f32 %f192, %f2, %f11; fma.rn.f32 %f193, %f1, %f10, %f192; fma.rn.f32 %f194, %f3, %f12, %f193; add.f32 %f195, %f194, 0f3F800000; mul.f32 %f196, %f2, %f14; fma.rn.f32 %f197, %f1, %f13, %f196; fma.rn.f32 %f198, %f3, %f15, %f197; add.f32 %f199, %f195, %f198; mul.f32 %f200, %f11, %f14; fma.rn.f32 %f201, %f10, %f13, %f200; fma.rn.f32 %f202, %f12, %f15, %f201; add.f32 %f40, %f202, %f199; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p55, %f41, 0f00000000; setp.eq.f32 %p56, %f42, 0f00000000; and.pred %p57, %p55, %p56; @%p57 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r140, %f40; shr.s32 %r141, %r140, 31; and.b32 %r142, %r141, 1078530011; mov.b32 %r143, %f39; and.b32 %r144, %r143, -2147483648; or.b32 %r145, %r144, %r142; mov.b32 %f292, %r145; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p58, %f41, 0f7F800000; setp.eq.f32 %p59, %f42, 0f7F800000; and.pred %p60, %p58, %p59; @%p60 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r135, %f40; setp.lt.s32 %p64, %r135, 0; selp.b32 %r136, 1075235812, 1061752795, %p64; mov.b32 %r137, %f39; and.b32 %r138, %r137, -2147483648; or.b32 %r139, %r138, %r136; mov.b32 %f292, %r139; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f203, %f42, %f41; min.f32 %f204, %f42, %f41; div.rn.f32 %f205, %f204, %f203; mul.rn.f32 %f206, %f205, %f205; mov.f32 %f207, 0fC0B59883; mov.f32 %f208, 0fBF52C7EA; fma.rn.f32 %f209, %f206, %f208, %f207; 
mov.f32 %f210, 0fC0D21907; fma.rn.f32 %f211, %f209, %f206, %f210; mul.f32 %f212, %f206, %f211; mul.f32 %f213, %f205, %f212; add.f32 %f214, %f206, 0f41355DC0; mov.f32 %f215, 0f41E6BD60; fma.rn.f32 %f216, %f214, %f206, %f215; mov.f32 %f217, 0f419D92C8; fma.rn.f32 %f218, %f216, %f206, %f217; rcp.rn.f32 %f219, %f218; fma.rn.f32 %f220, %f213, %f219, %f205; mov.f32 %f221, 0f3FC90FDB; sub.f32 %f222, %f221, %f220; setp.gt.f32 %p61, %f42, %f41; selp.f32 %f223, %f222, %f220, %p61; mov.b32 %r130, %f40; setp.lt.s32 %p62, %r130, 0; mov.f32 %f224, 0f40490FDB; sub.f32 %f225, %f224, %f223; selp.f32 %f226, %f225, %f223, %p62; mov.b32 %r131, %f226; mov.b32 %r132, %f39; and.b32 %r133, %r132, -2147483648; or.b32 %r134, %r133, %r131; mov.b32 %f227, %r134; add.f32 %f228, %f41, %f42; setp.le.f32 %p63, %f228, 0f7F800000; selp.f32 %f292, %f227, %f228, %p63; $L__BB0_54: add.f32 %f229, %f292, %f292; setp.eq.f32 %p65, %f38, 0f00000000; selp.f32 %f230, 0f3F800000, 0f3F000000, %p65; fma.rn.f32 %f293, %f230, %f229, %f293; $L__BB0_55: @%p14 bra $L__BB0_69; setp.lt.s32 %p67, %r2, 1; and.pred %p69, %p67, %p9; @%p69 bra $L__BB0_69; @%p9 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r179, %r19, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r146, %r19, %r50; add.s32 %r147, %r146, %r50; rem.s32 %r179, %r147, %r50; $L__BB0_60: add.s32 %r45, %r179, %r4; @%p8 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r150, %r49, -1; min.s32 %r180, %r6, %r150; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r148, %r6, %r49; add.s32 %r149, %r148, %r49; rem.s32 %r180, %r149, %r49; $L__BB0_63: mad.lo.s32 %r151, %r45, %r49, %r180; mul.wide.s32 %rd42, %r151, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f231, [%rd43]; ld.global.nc.f32 %f232, [%rd44]; mul.f32 %f233, %f232, %f232; fma.rn.f32 %f234, %f231, %f231, %f233; ld.global.nc.f32 %f235, [%rd45]; fma.rn.f32 %f49, %f235, %f235, %f234; mul.f32 %f236, %f5, %f15; mul.f32 %f237, %f6, %f14; sub.f32 %f238, 
%f237, %f236; mul.f32 %f239, %f6, %f13; mul.f32 %f240, %f4, %f15; sub.f32 %f241, %f240, %f239; mul.f32 %f242, %f4, %f14; mul.f32 %f243, %f5, %f13; sub.f32 %f244, %f243, %f242; mul.f32 %f245, %f2, %f241; fma.rn.f32 %f246, %f1, %f238, %f245; fma.rn.f32 %f50, %f3, %f244, %f246; mul.f32 %f247, %f2, %f14; fma.rn.f32 %f248, %f1, %f13, %f247; fma.rn.f32 %f249, %f3, %f15, %f248; add.f32 %f250, %f249, 0f3F800000; mul.f32 %f251, %f2, %f5; fma.rn.f32 %f252, %f1, %f4, %f251; fma.rn.f32 %f253, %f3, %f6, %f252; add.f32 %f254, %f253, %f250; mul.f32 %f255, %f5, %f14; fma.rn.f32 %f256, %f4, %f13, %f255; fma.rn.f32 %f257, %f6, %f15, %f256; add.f32 %f51, %f257, %f254; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p72, %f52, 0f00000000; setp.eq.f32 %p73, %f53, 0f00000000; and.pred %p74, %p72, %p73; @%p74 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r162, %f51; shr.s32 %r163, %r162, 31; and.b32 %r164, %r163, 1078530011; mov.b32 %r165, %f50; and.b32 %r166, %r165, -2147483648; or.b32 %r167, %r164, %r166; mov.b32 %f294, %r167; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p75, %f52, 0f7F800000; setp.eq.f32 %p76, %f53, 0f7F800000; and.pred %p77, %p75, %p76; @%p77 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r157, %f51; setp.lt.s32 %p81, %r157, 0; selp.b32 %r158, 1075235812, 1061752795, %p81; mov.b32 %r159, %f50; and.b32 %r160, %r159, -2147483648; or.b32 %r161, %r158, %r160; mov.b32 %f294, %r161; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f258, %f53, %f52; min.f32 %f259, %f53, %f52; div.rn.f32 %f260, %f259, %f258; mul.rn.f32 %f261, %f260, %f260; mov.f32 %f262, 0fC0B59883; mov.f32 %f263, 0fBF52C7EA; fma.rn.f32 %f264, %f261, %f263, %f262; mov.f32 %f265, 0fC0D21907; fma.rn.f32 %f266, %f264, %f261, %f265; mul.f32 %f267, %f261, %f266; mul.f32 %f268, %f260, %f267; add.f32 %f269, %f261, 0f41355DC0; mov.f32 %f270, 0f41E6BD60; fma.rn.f32 %f271, %f269, %f261, %f270; mov.f32 %f272, 0f419D92C8; fma.rn.f32 %f273, %f271, %f261, %f272; rcp.rn.f32 %f274, %f273; fma.rn.f32 %f275, 
%f268, %f274, %f260; mov.f32 %f276, 0f3FC90FDB; sub.f32 %f277, %f276, %f275; setp.gt.f32 %p78, %f53, %f52; selp.f32 %f278, %f277, %f275, %p78; mov.b32 %r152, %f51; setp.lt.s32 %p79, %r152, 0; mov.f32 %f279, 0f40490FDB; sub.f32 %f280, %f279, %f278; selp.f32 %f281, %f280, %f278, %p79; mov.b32 %r153, %f281; mov.b32 %r154, %f50; and.b32 %r155, %r154, -2147483648; or.b32 %r156, %r155, %r153; mov.b32 %f282, %r156; add.f32 %f283, %f52, %f53; setp.le.f32 %p80, %f283, 0f7F800000; selp.f32 %f294, %f282, %f283, %p80; $L__BB0_68: add.f32 %f284, %f294, %f294; setp.eq.f32 %p82, %f49, 0f00000000; selp.f32 %f285, 0f3F800000, 0f3F000000, %p82; fma.rn.f32 %f293, %f285, %f284, %f293; $L__BB0_69: mul.f32 %f286, %f293, %f60; st.global.f32 [%rd4], %f286; $L__BB0_71: ret; } ` settopologicalchargelattice_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl settopologicalchargelattice .visible .entry settopologicalchargelattice( .param .u64 settopologicalchargelattice_param_0, .param .u64 settopologicalchargelattice_param_1, .param .u64 settopologicalchargelattice_param_2, .param .u64 settopologicalchargelattice_param_3, .param .f32 settopologicalchargelattice_param_4, .param .u32 settopologicalchargelattice_param_5, .param .u32 settopologicalchargelattice_param_6, .param .u32 settopologicalchargelattice_param_7, .param .u8 settopologicalchargelattice_param_8 ) { .reg .pred %p<83>; .reg .b16 %rs<4>; .reg .f32 %f<296>; .reg .b32 %r<181>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalchargelattice_param_8]; ld.param.u64 %rd5, [settopologicalchargelattice_param_0]; ld.param.u64 %rd6, [settopologicalchargelattice_param_1]; ld.param.u64 %rd7, [settopologicalchargelattice_param_2]; ld.param.u64 %rd8, [settopologicalchargelattice_param_3]; ld.param.f32 %f60, [settopologicalchargelattice_param_4]; ld.param.u32 %r49, [settopologicalchargelattice_param_5]; ld.param.u32 %r50, [settopologicalchargelattice_param_6]; ld.param.u32 %r51, [settopologicalchargelattice_param_7]; 
cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r52, %ntid.x; mov.u32 %r53, %ctaid.x; mov.u32 %r54, %tid.x; mad.lo.s32 %r1, %r53, %r52, %r54; mov.u32 %r55, %ntid.y; mov.u32 %r56, %ctaid.y; mov.u32 %r57, %tid.y; mad.lo.s32 %r2, %r56, %r55, %r57; mov.u32 %r58, %ntid.z; mov.u32 %r59, %ctaid.z; mov.u32 %r60, %tid.z; mad.lo.s32 %r3, %r59, %r58, %r60; setp.ge.s32 %p2, %r1, %r49; setp.ge.s32 %p3, %r2, %r50; or.pred %p4, %p2, %p3; setp.ge.s32 %p5, %r3, %r51; or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_71; mul.lo.s32 %r4, %r3, %r50; add.s32 %r61, %r4, %r2; mul.lo.s32 %r5, %r61, %r49; add.s32 %r62, %r5, %r1; mul.wide.s32 %rd9, %r62, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f61, %f2, %f2; fma.rn.f32 %f62, %f1, %f1, %f61; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f63, %f3, %f3, %f62; setp.eq.f32 %p7, %f63, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p7 bra $L__BB0_70; bra.uni $L__BB0_2; $L__BB0_70: mov.u32 %r168, 0; st.global.u32 [%rd4], %r168; bra.uni $L__BB0_71; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p8, %rs1, 0; add.s32 %r6, %r1, 1; @%p8 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r65, %r49, -1; min.s32 %r169, %r6, %r65; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r63, %r6, %r49; add.s32 %r64, %r63, %r49; rem.s32 %r169, %r64, %r49; $L__BB0_5: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p9, %rs2, 0; add.s32 %r10, %r2, 1; @%p9 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r68, %r50, -1; min.s32 %r170, %r10, %r68; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r66, %r10, %r50; add.s32 %r67, %r66, %r50; rem.s32 %r170, %r67, %r50; $L__BB0_8: add.s32 %r14, %r1, -1; @%p8 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r171, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r69, %r14, %r49; add.s32 %r70, %r69, %r49; rem.s32 %r171, %r70, %r49; $L__BB0_11: add.s32 %r18, %r171, %r5; 
add.s32 %r19, %r2, -1; add.s32 %r20, %r169, %r5; @%p9 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r172, %r19, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r71, %r19, %r50; add.s32 %r72, %r71, %r50; rem.s32 %r172, %r72, %r50; $L__BB0_14: add.s32 %r73, %r172, %r4; mad.lo.s32 %r74, %r73, %r49, %r1; mul.wide.s32 %rd14, %r20, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f4, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f5, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f6, [%rd17]; add.s32 %r75, %r170, %r4; mad.lo.s32 %r76, %r75, %r49, %r1; mul.wide.s32 %rd18, %r76, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f7, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f8, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f9, [%rd21]; mul.wide.s32 %rd22, %r18, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f10, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f11, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f12, [%rd25]; mul.wide.s32 %rd26, %r74, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f13, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f14, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f15, [%rd29]; setp.ne.s16 %p12, %rs1, 0; setp.lt.s32 %p13, %r6, %r49; or.pred %p1, %p13, %p12; not.pred %p14, %p1; mov.f32 %f293, 0f00000000; @%p14 bra $L__BB0_28; setp.ge.s32 %p15, %r10, %r50; and.pred %p17, %p15, %p9; @%p17 bra $L__BB0_28; @%p9 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r79, %r50, -1; min.s32 %r173, %r10, %r79; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r77, %r10, %r50; add.s32 %r78, %r77, %r50; rem.s32 %r173, %r78, %r50; $L__BB0_19: @%p8 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r82, %r49, -1; min.s32 %r174, %r6, %r82; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r80, %r6, %r49; add.s32 %r81, %r80, %r49; rem.s32 %r174, %r81, %r49; $L__BB0_22: add.s32 %r83, %r173, %r4; mad.lo.s32 %r84, %r83, %r49, %r174; mul.wide.s32 %rd30, %r84, 4; add.s64 %rd31, %rd3, %rd30; 
add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f66, [%rd31]; ld.global.nc.f32 %f67, [%rd32]; mul.f32 %f68, %f67, %f67; fma.rn.f32 %f69, %f66, %f66, %f68; ld.global.nc.f32 %f70, [%rd33]; fma.rn.f32 %f16, %f70, %f70, %f69; mul.f32 %f71, %f6, %f8; mul.f32 %f72, %f5, %f9; sub.f32 %f73, %f72, %f71; mul.f32 %f74, %f4, %f9; mul.f32 %f75, %f6, %f7; sub.f32 %f76, %f75, %f74; mul.f32 %f77, %f5, %f7; mul.f32 %f78, %f4, %f8; sub.f32 %f79, %f78, %f77; mul.f32 %f80, %f2, %f76; fma.rn.f32 %f81, %f1, %f73, %f80; fma.rn.f32 %f17, %f3, %f79, %f81; mul.f32 %f82, %f2, %f5; fma.rn.f32 %f83, %f1, %f4, %f82; fma.rn.f32 %f84, %f3, %f6, %f83; add.f32 %f85, %f84, 0f3F800000; mul.f32 %f86, %f2, %f8; fma.rn.f32 %f87, %f1, %f7, %f86; fma.rn.f32 %f88, %f3, %f9, %f87; add.f32 %f89, %f85, %f88; mul.f32 %f90, %f5, %f8; fma.rn.f32 %f91, %f4, %f7, %f90; fma.rn.f32 %f92, %f6, %f9, %f91; add.f32 %f18, %f92, %f89; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p20, %f19, 0f00000000; setp.eq.f32 %p21, %f20, 0f00000000; and.pred %p22, %p20, %p21; @%p22 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r95, %f18; shr.s32 %r96, %r95, 31; and.b32 %r97, %r96, 1078530011; mov.b32 %r98, %f17; and.b32 %r99, %r98, -2147483648; or.b32 %r100, %r99, %r97; mov.b32 %f288, %r100; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p23, %f19, 0f7F800000; setp.eq.f32 %p24, %f20, 0f7F800000; and.pred %p25, %p23, %p24; @%p25 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r90, %f18; setp.lt.s32 %p29, %r90, 0; selp.b32 %r91, 1075235812, 1061752795, %p29; mov.b32 %r92, %f17; and.b32 %r93, %r92, -2147483648; or.b32 %r94, %r93, %r91; mov.b32 %f288, %r94; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f93, %f20, %f19; min.f32 %f94, %f20, %f19; div.rn.f32 %f95, %f94, %f93; mul.rn.f32 %f96, %f95, %f95; mov.f32 %f97, 0fC0B59883; mov.f32 %f98, 0fBF52C7EA; fma.rn.f32 %f99, %f96, %f98, %f97; mov.f32 %f100, 0fC0D21907; fma.rn.f32 %f101, %f99, %f96, %f100; mul.f32 %f102, %f96, %f101; mul.f32 %f103, 
%f95, %f102; add.f32 %f104, %f96, 0f41355DC0; mov.f32 %f105, 0f41E6BD60; fma.rn.f32 %f106, %f104, %f96, %f105; mov.f32 %f107, 0f419D92C8; fma.rn.f32 %f108, %f106, %f96, %f107; rcp.rn.f32 %f109, %f108; fma.rn.f32 %f110, %f103, %f109, %f95; mov.f32 %f111, 0f3FC90FDB; sub.f32 %f112, %f111, %f110; setp.gt.f32 %p26, %f20, %f19; selp.f32 %f113, %f112, %f110, %p26; mov.b32 %r85, %f18; setp.lt.s32 %p27, %r85, 0; mov.f32 %f114, 0f40490FDB; sub.f32 %f115, %f114, %f113; selp.f32 %f116, %f115, %f113, %p27; mov.b32 %r86, %f116; mov.b32 %r87, %f17; and.b32 %r88, %r87, -2147483648; or.b32 %r89, %r88, %r86; mov.b32 %f117, %r89; add.f32 %f118, %f19, %f20; setp.le.f32 %p28, %f118, 0f7F800000; selp.f32 %f288, %f117, %f118, %p28; $L__BB0_27: add.f32 %f119, %f288, %f288; setp.eq.f32 %p30, %f16, 0f00000000; selp.f32 %f120, 0f3F800000, 0f3F000000, %p30; fma.rn.f32 %f293, %f120, %f119, 0f00000000; $L__BB0_28: setp.lt.s32 %p31, %r1, 1; and.pred %p33, %p31, %p8; @%p33 bra $L__BB0_55; setp.ge.s32 %p34, %r10, %r50; and.pred %p36, %p34, %p9; @%p36 bra $L__BB0_42; @%p9 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r103, %r50, -1; min.s32 %r175, %r10, %r103; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r101, %r10, %r50; add.s32 %r102, %r101, %r50; rem.s32 %r175, %r102, %r50; $L__BB0_33: @%p8 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r176, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r104, %r14, %r49; add.s32 %r105, %r104, %r49; rem.s32 %r176, %r105, %r49; $L__BB0_36: add.s32 %r106, %r175, %r4; mad.lo.s32 %r107, %r106, %r49, %r176; mul.wide.s32 %rd34, %r107, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f121, [%rd35]; ld.global.nc.f32 %f122, [%rd36]; mul.f32 %f123, %f122, %f122; fma.rn.f32 %f124, %f121, %f121, %f123; ld.global.nc.f32 %f125, [%rd37]; fma.rn.f32 %f27, %f125, %f125, %f124; mul.f32 %f126, %f9, %f11; mul.f32 %f127, %f8, %f12; sub.f32 %f128, %f127, %f126; mul.f32 %f129, %f7, %f12; mul.f32 %f130, %f9, 
%f10; sub.f32 %f131, %f130, %f129; mul.f32 %f132, %f8, %f10; mul.f32 %f133, %f7, %f11; sub.f32 %f134, %f133, %f132; mul.f32 %f135, %f2, %f131; fma.rn.f32 %f136, %f1, %f128, %f135; fma.rn.f32 %f28, %f3, %f134, %f136; mul.f32 %f137, %f2, %f8; fma.rn.f32 %f138, %f1, %f7, %f137; fma.rn.f32 %f139, %f3, %f9, %f138; add.f32 %f140, %f139, 0f3F800000; mul.f32 %f141, %f2, %f11; fma.rn.f32 %f142, %f1, %f10, %f141; fma.rn.f32 %f143, %f3, %f12, %f142; add.f32 %f144, %f140, %f143; mul.f32 %f145, %f8, %f11; fma.rn.f32 %f146, %f7, %f10, %f145; fma.rn.f32 %f147, %f9, %f12, %f146; add.f32 %f29, %f147, %f144; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p39, %f30, 0f00000000; setp.eq.f32 %p40, %f31, 0f00000000; and.pred %p41, %p39, %p40; @%p41 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r118, %f29; shr.s32 %r119, %r118, 31; and.b32 %r120, %r119, 1078530011; mov.b32 %r121, %f28; and.b32 %r122, %r121, -2147483648; or.b32 %r123, %r122, %r120; mov.b32 %f290, %r123; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p42, %f30, 0f7F800000; setp.eq.f32 %p43, %f31, 0f7F800000; and.pred %p44, %p42, %p43; @%p44 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r113, %f29; setp.lt.s32 %p48, %r113, 0; selp.b32 %r114, 1075235812, 1061752795, %p48; mov.b32 %r115, %f28; and.b32 %r116, %r115, -2147483648; or.b32 %r117, %r116, %r114; mov.b32 %f290, %r117; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f148, %f31, %f30; min.f32 %f149, %f31, %f30; div.rn.f32 %f150, %f149, %f148; mul.rn.f32 %f151, %f150, %f150; mov.f32 %f152, 0fC0B59883; mov.f32 %f153, 0fBF52C7EA; fma.rn.f32 %f154, %f151, %f153, %f152; mov.f32 %f155, 0fC0D21907; fma.rn.f32 %f156, %f154, %f151, %f155; mul.f32 %f157, %f151, %f156; mul.f32 %f158, %f150, %f157; add.f32 %f159, %f151, 0f41355DC0; mov.f32 %f160, 0f41E6BD60; fma.rn.f32 %f161, %f159, %f151, %f160; mov.f32 %f162, 0f419D92C8; fma.rn.f32 %f163, %f161, %f151, %f162; rcp.rn.f32 %f164, %f163; fma.rn.f32 %f165, %f158, %f164, %f150; mov.f32 %f166, 0f3FC90FDB; sub.f32 
%f167, %f166, %f165; setp.gt.f32 %p45, %f31, %f30; selp.f32 %f168, %f167, %f165, %p45; mov.b32 %r108, %f29; setp.lt.s32 %p46, %r108, 0; mov.f32 %f169, 0f40490FDB; sub.f32 %f170, %f169, %f168; selp.f32 %f171, %f170, %f168, %p46; mov.b32 %r109, %f171; mov.b32 %r110, %f28; and.b32 %r111, %r110, -2147483648; or.b32 %r112, %r111, %r109; mov.b32 %f172, %r112; add.f32 %f173, %f30, %f31; setp.le.f32 %p47, %f173, 0f7F800000; selp.f32 %f290, %f172, %f173, %p47; $L__BB0_41: add.f32 %f174, %f290, %f290; setp.eq.f32 %p49, %f27, 0f00000000; selp.f32 %f175, 0f3F800000, 0f3F000000, %p49; fma.rn.f32 %f293, %f175, %f174, %f293; $L__BB0_42: setp.lt.s32 %p50, %r2, 1; and.pred %p52, %p50, %p9; @%p52 bra $L__BB0_55; @%p9 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r177, %r19, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r124, %r19, %r50; add.s32 %r125, %r124, %r50; rem.s32 %r177, %r125, %r50; $L__BB0_46: @%p8 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r178, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r126, %r14, %r49; add.s32 %r127, %r126, %r49; rem.s32 %r178, %r127, %r49; $L__BB0_49: add.s32 %r128, %r177, %r4; mad.lo.s32 %r129, %r128, %r49, %r178; mul.wide.s32 %rd38, %r129, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f176, [%rd39]; ld.global.nc.f32 %f177, [%rd40]; mul.f32 %f178, %f177, %f177; fma.rn.f32 %f179, %f176, %f176, %f178; ld.global.nc.f32 %f180, [%rd41]; fma.rn.f32 %f38, %f180, %f180, %f179; mul.f32 %f181, %f12, %f14; mul.f32 %f182, %f11, %f15; sub.f32 %f183, %f182, %f181; mul.f32 %f184, %f10, %f15; mul.f32 %f185, %f12, %f13; sub.f32 %f186, %f185, %f184; mul.f32 %f187, %f11, %f13; mul.f32 %f188, %f10, %f14; sub.f32 %f189, %f188, %f187; mul.f32 %f190, %f2, %f186; fma.rn.f32 %f191, %f1, %f183, %f190; fma.rn.f32 %f39, %f3, %f189, %f191; mul.f32 %f192, %f2, %f11; fma.rn.f32 %f193, %f1, %f10, %f192; fma.rn.f32 %f194, %f3, %f12, %f193; add.f32 %f195, %f194, 0f3F800000; mul.f32 %f196, %f2, 
%f14; fma.rn.f32 %f197, %f1, %f13, %f196; fma.rn.f32 %f198, %f3, %f15, %f197; add.f32 %f199, %f195, %f198; mul.f32 %f200, %f11, %f14; fma.rn.f32 %f201, %f10, %f13, %f200; fma.rn.f32 %f202, %f12, %f15, %f201; add.f32 %f40, %f202, %f199; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p55, %f41, 0f00000000; setp.eq.f32 %p56, %f42, 0f00000000; and.pred %p57, %p55, %p56; @%p57 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r140, %f40; shr.s32 %r141, %r140, 31; and.b32 %r142, %r141, 1078530011; mov.b32 %r143, %f39; and.b32 %r144, %r143, -2147483648; or.b32 %r145, %r144, %r142; mov.b32 %f292, %r145; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p58, %f41, 0f7F800000; setp.eq.f32 %p59, %f42, 0f7F800000; and.pred %p60, %p58, %p59; @%p60 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r135, %f40; setp.lt.s32 %p64, %r135, 0; selp.b32 %r136, 1075235812, 1061752795, %p64; mov.b32 %r137, %f39; and.b32 %r138, %r137, -2147483648; or.b32 %r139, %r138, %r136; mov.b32 %f292, %r139; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f203, %f42, %f41; min.f32 %f204, %f42, %f41; div.rn.f32 %f205, %f204, %f203; mul.rn.f32 %f206, %f205, %f205; mov.f32 %f207, 0fC0B59883; mov.f32 %f208, 0fBF52C7EA; fma.rn.f32 %f209, %f206, %f208, %f207; mov.f32 %f210, 0fC0D21907; fma.rn.f32 %f211, %f209, %f206, %f210; mul.f32 %f212, %f206, %f211; mul.f32 %f213, %f205, %f212; add.f32 %f214, %f206, 0f41355DC0; mov.f32 %f215, 0f41E6BD60; fma.rn.f32 %f216, %f214, %f206, %f215; mov.f32 %f217, 0f419D92C8; fma.rn.f32 %f218, %f216, %f206, %f217; rcp.rn.f32 %f219, %f218; fma.rn.f32 %f220, %f213, %f219, %f205; mov.f32 %f221, 0f3FC90FDB; sub.f32 %f222, %f221, %f220; setp.gt.f32 %p61, %f42, %f41; selp.f32 %f223, %f222, %f220, %p61; mov.b32 %r130, %f40; setp.lt.s32 %p62, %r130, 0; mov.f32 %f224, 0f40490FDB; sub.f32 %f225, %f224, %f223; selp.f32 %f226, %f225, %f223, %p62; mov.b32 %r131, %f226; mov.b32 %r132, %f39; and.b32 %r133, %r132, -2147483648; or.b32 %r134, %r133, %r131; mov.b32 %f227, %r134; add.f32 
%f228, %f41, %f42; setp.le.f32 %p63, %f228, 0f7F800000; selp.f32 %f292, %f227, %f228, %p63; $L__BB0_54: add.f32 %f229, %f292, %f292; setp.eq.f32 %p65, %f38, 0f00000000; selp.f32 %f230, 0f3F800000, 0f3F000000, %p65; fma.rn.f32 %f293, %f230, %f229, %f293; $L__BB0_55: @%p14 bra $L__BB0_69; setp.lt.s32 %p67, %r2, 1; and.pred %p69, %p67, %p9; @%p69 bra $L__BB0_69; @%p9 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r179, %r19, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r146, %r19, %r50; add.s32 %r147, %r146, %r50; rem.s32 %r179, %r147, %r50; $L__BB0_60: add.s32 %r45, %r179, %r4; @%p8 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r150, %r49, -1; min.s32 %r180, %r6, %r150; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r148, %r6, %r49; add.s32 %r149, %r148, %r49; rem.s32 %r180, %r149, %r49; $L__BB0_63: mad.lo.s32 %r151, %r45, %r49, %r180; mul.wide.s32 %rd42, %r151, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f231, [%rd43]; ld.global.nc.f32 %f232, [%rd44]; mul.f32 %f233, %f232, %f232; fma.rn.f32 %f234, %f231, %f231, %f233; ld.global.nc.f32 %f235, [%rd45]; fma.rn.f32 %f49, %f235, %f235, %f234; mul.f32 %f236, %f5, %f15; mul.f32 %f237, %f6, %f14; sub.f32 %f238, %f237, %f236; mul.f32 %f239, %f6, %f13; mul.f32 %f240, %f4, %f15; sub.f32 %f241, %f240, %f239; mul.f32 %f242, %f4, %f14; mul.f32 %f243, %f5, %f13; sub.f32 %f244, %f243, %f242; mul.f32 %f245, %f2, %f241; fma.rn.f32 %f246, %f1, %f238, %f245; fma.rn.f32 %f50, %f3, %f244, %f246; mul.f32 %f247, %f2, %f14; fma.rn.f32 %f248, %f1, %f13, %f247; fma.rn.f32 %f249, %f3, %f15, %f248; add.f32 %f250, %f249, 0f3F800000; mul.f32 %f251, %f2, %f5; fma.rn.f32 %f252, %f1, %f4, %f251; fma.rn.f32 %f253, %f3, %f6, %f252; add.f32 %f254, %f253, %f250; mul.f32 %f255, %f5, %f14; fma.rn.f32 %f256, %f4, %f13, %f255; fma.rn.f32 %f257, %f6, %f15, %f256; add.f32 %f51, %f257, %f254; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p72, %f52, 0f00000000; setp.eq.f32 %p73, 
%f53, 0f00000000; and.pred %p74, %p72, %p73; @%p74 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r162, %f51; shr.s32 %r163, %r162, 31; and.b32 %r164, %r163, 1078530011; mov.b32 %r165, %f50; and.b32 %r166, %r165, -2147483648; or.b32 %r167, %r164, %r166; mov.b32 %f294, %r167; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p75, %f52, 0f7F800000; setp.eq.f32 %p76, %f53, 0f7F800000; and.pred %p77, %p75, %p76; @%p77 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r157, %f51; setp.lt.s32 %p81, %r157, 0; selp.b32 %r158, 1075235812, 1061752795, %p81; mov.b32 %r159, %f50; and.b32 %r160, %r159, -2147483648; or.b32 %r161, %r158, %r160; mov.b32 %f294, %r161; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f258, %f53, %f52; min.f32 %f259, %f53, %f52; div.rn.f32 %f260, %f259, %f258; mul.rn.f32 %f261, %f260, %f260; mov.f32 %f262, 0fC0B59883; mov.f32 %f263, 0fBF52C7EA; fma.rn.f32 %f264, %f261, %f263, %f262; mov.f32 %f265, 0fC0D21907; fma.rn.f32 %f266, %f264, %f261, %f265; mul.f32 %f267, %f261, %f266; mul.f32 %f268, %f260, %f267; add.f32 %f269, %f261, 0f41355DC0; mov.f32 %f270, 0f41E6BD60; fma.rn.f32 %f271, %f269, %f261, %f270; mov.f32 %f272, 0f419D92C8; fma.rn.f32 %f273, %f271, %f261, %f272; rcp.rn.f32 %f274, %f273; fma.rn.f32 %f275, %f268, %f274, %f260; mov.f32 %f276, 0f3FC90FDB; sub.f32 %f277, %f276, %f275; setp.gt.f32 %p78, %f53, %f52; selp.f32 %f278, %f277, %f275, %p78; mov.b32 %r152, %f51; setp.lt.s32 %p79, %r152, 0; mov.f32 %f279, 0f40490FDB; sub.f32 %f280, %f279, %f278; selp.f32 %f281, %f280, %f278, %p79; mov.b32 %r153, %f281; mov.b32 %r154, %f50; and.b32 %r155, %r154, -2147483648; or.b32 %r156, %r155, %r153; mov.b32 %f282, %r156; add.f32 %f283, %f52, %f53; setp.le.f32 %p80, %f283, 0f7F800000; selp.f32 %f294, %f282, %f283, %p80; $L__BB0_68: add.f32 %f284, %f294, %f294; setp.eq.f32 %p82, %f49, 0f00000000; selp.f32 %f285, 0f3F800000, 0f3F000000, %p82; fma.rn.f32 %f293, %f285, %f284, %f293; $L__BB0_69: mul.f32 %f286, %f293, %f60; st.global.f32 [%rd4], %f286; 
$L__BB0_71: ret; } ` settopologicalchargelattice_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl settopologicalchargelattice .visible .entry settopologicalchargelattice( .param .u64 settopologicalchargelattice_param_0, .param .u64 settopologicalchargelattice_param_1, .param .u64 settopologicalchargelattice_param_2, .param .u64 settopologicalchargelattice_param_3, .param .f32 settopologicalchargelattice_param_4, .param .u32 settopologicalchargelattice_param_5, .param .u32 settopologicalchargelattice_param_6, .param .u32 settopologicalchargelattice_param_7, .param .u8 settopologicalchargelattice_param_8 ) { .reg .pred %p<83>; .reg .b16 %rs<4>; .reg .f32 %f<296>; .reg .b32 %r<181>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalchargelattice_param_8]; ld.param.u64 %rd5, [settopologicalchargelattice_param_0]; ld.param.u64 %rd6, [settopologicalchargelattice_param_1]; ld.param.u64 %rd7, [settopologicalchargelattice_param_2]; ld.param.u64 %rd8, [settopologicalchargelattice_param_3]; ld.param.f32 %f60, [settopologicalchargelattice_param_4]; ld.param.u32 %r49, [settopologicalchargelattice_param_5]; ld.param.u32 %r50, [settopologicalchargelattice_param_6]; ld.param.u32 %r51, [settopologicalchargelattice_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r52, %ntid.x; mov.u32 %r53, %ctaid.x; mov.u32 %r54, %tid.x; mad.lo.s32 %r1, %r53, %r52, %r54; mov.u32 %r55, %ntid.y; mov.u32 %r56, %ctaid.y; mov.u32 %r57, %tid.y; mad.lo.s32 %r2, %r56, %r55, %r57; mov.u32 %r58, %ntid.z; mov.u32 %r59, %ctaid.z; mov.u32 %r60, %tid.z; mad.lo.s32 %r3, %r59, %r58, %r60; setp.ge.s32 %p2, %r1, %r49; setp.ge.s32 %p3, %r2, %r50; or.pred %p4, %p2, %p3; setp.ge.s32 %p5, %r3, %r51; or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_71; mul.lo.s32 %r4, %r3, %r50; add.s32 %r61, %r4, %r2; mul.lo.s32 %r5, %r61, %r49; add.s32 %r62, %r5, %r1; mul.wide.s32 %rd9, %r62, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, 
%rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f61, %f2, %f2; fma.rn.f32 %f62, %f1, %f1, %f61; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f63, %f3, %f3, %f62; setp.eq.f32 %p7, %f63, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p7 bra $L__BB0_70; bra.uni $L__BB0_2; $L__BB0_70: mov.u32 %r168, 0; st.global.u32 [%rd4], %r168; bra.uni $L__BB0_71; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p8, %rs1, 0; add.s32 %r6, %r1, 1; @%p8 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r65, %r49, -1; min.s32 %r169, %r6, %r65; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r63, %r6, %r49; add.s32 %r64, %r63, %r49; rem.s32 %r169, %r64, %r49; $L__BB0_5: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p9, %rs2, 0; add.s32 %r10, %r2, 1; @%p9 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r68, %r50, -1; min.s32 %r170, %r10, %r68; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r66, %r10, %r50; add.s32 %r67, %r66, %r50; rem.s32 %r170, %r67, %r50; $L__BB0_8: add.s32 %r14, %r1, -1; @%p8 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r171, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r69, %r14, %r49; add.s32 %r70, %r69, %r49; rem.s32 %r171, %r70, %r49; $L__BB0_11: add.s32 %r18, %r171, %r5; add.s32 %r19, %r2, -1; add.s32 %r20, %r169, %r5; @%p9 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r172, %r19, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r71, %r19, %r50; add.s32 %r72, %r71, %r50; rem.s32 %r172, %r72, %r50; $L__BB0_14: add.s32 %r73, %r172, %r4; mad.lo.s32 %r74, %r73, %r49, %r1; mul.wide.s32 %rd14, %r20, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f4, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f5, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f6, [%rd17]; add.s32 %r75, %r170, %r4; mad.lo.s32 %r76, %r75, %r49, %r1; mul.wide.s32 %rd18, %r76, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f7, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f8, [%rd20]; add.s64 %rd21, %rd1, %rd18; 
ld.global.nc.f32 %f9, [%rd21]; mul.wide.s32 %rd22, %r18, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f10, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f11, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f12, [%rd25]; mul.wide.s32 %rd26, %r74, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f13, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f14, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f15, [%rd29]; setp.ne.s16 %p12, %rs1, 0; setp.lt.s32 %p13, %r6, %r49; or.pred %p1, %p13, %p12; not.pred %p14, %p1; mov.f32 %f293, 0f00000000; @%p14 bra $L__BB0_28; setp.ge.s32 %p15, %r10, %r50; and.pred %p17, %p15, %p9; @%p17 bra $L__BB0_28; @%p9 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r79, %r50, -1; min.s32 %r173, %r10, %r79; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r77, %r10, %r50; add.s32 %r78, %r77, %r50; rem.s32 %r173, %r78, %r50; $L__BB0_19: @%p8 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r82, %r49, -1; min.s32 %r174, %r6, %r82; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r80, %r6, %r49; add.s32 %r81, %r80, %r49; rem.s32 %r174, %r81, %r49; $L__BB0_22: add.s32 %r83, %r173, %r4; mad.lo.s32 %r84, %r83, %r49, %r174; mul.wide.s32 %rd30, %r84, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f66, [%rd31]; ld.global.nc.f32 %f67, [%rd32]; mul.f32 %f68, %f67, %f67; fma.rn.f32 %f69, %f66, %f66, %f68; ld.global.nc.f32 %f70, [%rd33]; fma.rn.f32 %f16, %f70, %f70, %f69; mul.f32 %f71, %f6, %f8; mul.f32 %f72, %f5, %f9; sub.f32 %f73, %f72, %f71; mul.f32 %f74, %f4, %f9; mul.f32 %f75, %f6, %f7; sub.f32 %f76, %f75, %f74; mul.f32 %f77, %f5, %f7; mul.f32 %f78, %f4, %f8; sub.f32 %f79, %f78, %f77; mul.f32 %f80, %f2, %f76; fma.rn.f32 %f81, %f1, %f73, %f80; fma.rn.f32 %f17, %f3, %f79, %f81; mul.f32 %f82, %f2, %f5; fma.rn.f32 %f83, %f1, %f4, %f82; fma.rn.f32 %f84, %f3, %f6, %f83; add.f32 %f85, %f84, 0f3F800000; mul.f32 %f86, %f2, %f8; fma.rn.f32 %f87, %f1, %f7, %f86; 
fma.rn.f32 %f88, %f3, %f9, %f87; add.f32 %f89, %f85, %f88; mul.f32 %f90, %f5, %f8; fma.rn.f32 %f91, %f4, %f7, %f90; fma.rn.f32 %f92, %f6, %f9, %f91; add.f32 %f18, %f92, %f89; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p20, %f19, 0f00000000; setp.eq.f32 %p21, %f20, 0f00000000; and.pred %p22, %p20, %p21; @%p22 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r95, %f18; shr.s32 %r96, %r95, 31; and.b32 %r97, %r96, 1078530011; mov.b32 %r98, %f17; and.b32 %r99, %r98, -2147483648; or.b32 %r100, %r99, %r97; mov.b32 %f288, %r100; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p23, %f19, 0f7F800000; setp.eq.f32 %p24, %f20, 0f7F800000; and.pred %p25, %p23, %p24; @%p25 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r90, %f18; setp.lt.s32 %p29, %r90, 0; selp.b32 %r91, 1075235812, 1061752795, %p29; mov.b32 %r92, %f17; and.b32 %r93, %r92, -2147483648; or.b32 %r94, %r93, %r91; mov.b32 %f288, %r94; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f93, %f20, %f19; min.f32 %f94, %f20, %f19; div.rn.f32 %f95, %f94, %f93; mul.rn.f32 %f96, %f95, %f95; mov.f32 %f97, 0fC0B59883; mov.f32 %f98, 0fBF52C7EA; fma.rn.f32 %f99, %f96, %f98, %f97; mov.f32 %f100, 0fC0D21907; fma.rn.f32 %f101, %f99, %f96, %f100; mul.f32 %f102, %f96, %f101; mul.f32 %f103, %f95, %f102; add.f32 %f104, %f96, 0f41355DC0; mov.f32 %f105, 0f41E6BD60; fma.rn.f32 %f106, %f104, %f96, %f105; mov.f32 %f107, 0f419D92C8; fma.rn.f32 %f108, %f106, %f96, %f107; rcp.rn.f32 %f109, %f108; fma.rn.f32 %f110, %f103, %f109, %f95; mov.f32 %f111, 0f3FC90FDB; sub.f32 %f112, %f111, %f110; setp.gt.f32 %p26, %f20, %f19; selp.f32 %f113, %f112, %f110, %p26; mov.b32 %r85, %f18; setp.lt.s32 %p27, %r85, 0; mov.f32 %f114, 0f40490FDB; sub.f32 %f115, %f114, %f113; selp.f32 %f116, %f115, %f113, %p27; mov.b32 %r86, %f116; mov.b32 %r87, %f17; and.b32 %r88, %r87, -2147483648; or.b32 %r89, %r88, %r86; mov.b32 %f117, %r89; add.f32 %f118, %f19, %f20; setp.le.f32 %p28, %f118, 0f7F800000; selp.f32 %f288, %f117, %f118, %p28; $L__BB0_27: add.f32 
%f119, %f288, %f288; setp.eq.f32 %p30, %f16, 0f00000000; selp.f32 %f120, 0f3F800000, 0f3F000000, %p30; fma.rn.f32 %f293, %f120, %f119, 0f00000000; $L__BB0_28: setp.lt.s32 %p31, %r1, 1; and.pred %p33, %p31, %p8; @%p33 bra $L__BB0_55; setp.ge.s32 %p34, %r10, %r50; and.pred %p36, %p34, %p9; @%p36 bra $L__BB0_42; @%p9 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r103, %r50, -1; min.s32 %r175, %r10, %r103; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r101, %r10, %r50; add.s32 %r102, %r101, %r50; rem.s32 %r175, %r102, %r50; $L__BB0_33: @%p8 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r176, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r104, %r14, %r49; add.s32 %r105, %r104, %r49; rem.s32 %r176, %r105, %r49; $L__BB0_36: add.s32 %r106, %r175, %r4; mad.lo.s32 %r107, %r106, %r49, %r176; mul.wide.s32 %rd34, %r107, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f121, [%rd35]; ld.global.nc.f32 %f122, [%rd36]; mul.f32 %f123, %f122, %f122; fma.rn.f32 %f124, %f121, %f121, %f123; ld.global.nc.f32 %f125, [%rd37]; fma.rn.f32 %f27, %f125, %f125, %f124; mul.f32 %f126, %f9, %f11; mul.f32 %f127, %f8, %f12; sub.f32 %f128, %f127, %f126; mul.f32 %f129, %f7, %f12; mul.f32 %f130, %f9, %f10; sub.f32 %f131, %f130, %f129; mul.f32 %f132, %f8, %f10; mul.f32 %f133, %f7, %f11; sub.f32 %f134, %f133, %f132; mul.f32 %f135, %f2, %f131; fma.rn.f32 %f136, %f1, %f128, %f135; fma.rn.f32 %f28, %f3, %f134, %f136; mul.f32 %f137, %f2, %f8; fma.rn.f32 %f138, %f1, %f7, %f137; fma.rn.f32 %f139, %f3, %f9, %f138; add.f32 %f140, %f139, 0f3F800000; mul.f32 %f141, %f2, %f11; fma.rn.f32 %f142, %f1, %f10, %f141; fma.rn.f32 %f143, %f3, %f12, %f142; add.f32 %f144, %f140, %f143; mul.f32 %f145, %f8, %f11; fma.rn.f32 %f146, %f7, %f10, %f145; fma.rn.f32 %f147, %f9, %f12, %f146; add.f32 %f29, %f147, %f144; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p39, %f30, 0f00000000; setp.eq.f32 %p40, %f31, 0f00000000; and.pred %p41, %p39, %p40; 
@%p41 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r118, %f29; shr.s32 %r119, %r118, 31; and.b32 %r120, %r119, 1078530011; mov.b32 %r121, %f28; and.b32 %r122, %r121, -2147483648; or.b32 %r123, %r122, %r120; mov.b32 %f290, %r123; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p42, %f30, 0f7F800000; setp.eq.f32 %p43, %f31, 0f7F800000; and.pred %p44, %p42, %p43; @%p44 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r113, %f29; setp.lt.s32 %p48, %r113, 0; selp.b32 %r114, 1075235812, 1061752795, %p48; mov.b32 %r115, %f28; and.b32 %r116, %r115, -2147483648; or.b32 %r117, %r116, %r114; mov.b32 %f290, %r117; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f148, %f31, %f30; min.f32 %f149, %f31, %f30; div.rn.f32 %f150, %f149, %f148; mul.rn.f32 %f151, %f150, %f150; mov.f32 %f152, 0fC0B59883; mov.f32 %f153, 0fBF52C7EA; fma.rn.f32 %f154, %f151, %f153, %f152; mov.f32 %f155, 0fC0D21907; fma.rn.f32 %f156, %f154, %f151, %f155; mul.f32 %f157, %f151, %f156; mul.f32 %f158, %f150, %f157; add.f32 %f159, %f151, 0f41355DC0; mov.f32 %f160, 0f41E6BD60; fma.rn.f32 %f161, %f159, %f151, %f160; mov.f32 %f162, 0f419D92C8; fma.rn.f32 %f163, %f161, %f151, %f162; rcp.rn.f32 %f164, %f163; fma.rn.f32 %f165, %f158, %f164, %f150; mov.f32 %f166, 0f3FC90FDB; sub.f32 %f167, %f166, %f165; setp.gt.f32 %p45, %f31, %f30; selp.f32 %f168, %f167, %f165, %p45; mov.b32 %r108, %f29; setp.lt.s32 %p46, %r108, 0; mov.f32 %f169, 0f40490FDB; sub.f32 %f170, %f169, %f168; selp.f32 %f171, %f170, %f168, %p46; mov.b32 %r109, %f171; mov.b32 %r110, %f28; and.b32 %r111, %r110, -2147483648; or.b32 %r112, %r111, %r109; mov.b32 %f172, %r112; add.f32 %f173, %f30, %f31; setp.le.f32 %p47, %f173, 0f7F800000; selp.f32 %f290, %f172, %f173, %p47; $L__BB0_41: add.f32 %f174, %f290, %f290; setp.eq.f32 %p49, %f27, 0f00000000; selp.f32 %f175, 0f3F800000, 0f3F000000, %p49; fma.rn.f32 %f293, %f175, %f174, %f293; $L__BB0_42: setp.lt.s32 %p50, %r2, 1; and.pred %p52, %p50, %p9; @%p52 bra $L__BB0_55; @%p9 bra $L__BB0_45; bra.uni 
$L__BB0_44; $L__BB0_45: max.s32 %r177, %r19, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r124, %r19, %r50; add.s32 %r125, %r124, %r50; rem.s32 %r177, %r125, %r50; $L__BB0_46: @%p8 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r178, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r126, %r14, %r49; add.s32 %r127, %r126, %r49; rem.s32 %r178, %r127, %r49; $L__BB0_49: add.s32 %r128, %r177, %r4; mad.lo.s32 %r129, %r128, %r49, %r178; mul.wide.s32 %rd38, %r129, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f176, [%rd39]; ld.global.nc.f32 %f177, [%rd40]; mul.f32 %f178, %f177, %f177; fma.rn.f32 %f179, %f176, %f176, %f178; ld.global.nc.f32 %f180, [%rd41]; fma.rn.f32 %f38, %f180, %f180, %f179; mul.f32 %f181, %f12, %f14; mul.f32 %f182, %f11, %f15; sub.f32 %f183, %f182, %f181; mul.f32 %f184, %f10, %f15; mul.f32 %f185, %f12, %f13; sub.f32 %f186, %f185, %f184; mul.f32 %f187, %f11, %f13; mul.f32 %f188, %f10, %f14; sub.f32 %f189, %f188, %f187; mul.f32 %f190, %f2, %f186; fma.rn.f32 %f191, %f1, %f183, %f190; fma.rn.f32 %f39, %f3, %f189, %f191; mul.f32 %f192, %f2, %f11; fma.rn.f32 %f193, %f1, %f10, %f192; fma.rn.f32 %f194, %f3, %f12, %f193; add.f32 %f195, %f194, 0f3F800000; mul.f32 %f196, %f2, %f14; fma.rn.f32 %f197, %f1, %f13, %f196; fma.rn.f32 %f198, %f3, %f15, %f197; add.f32 %f199, %f195, %f198; mul.f32 %f200, %f11, %f14; fma.rn.f32 %f201, %f10, %f13, %f200; fma.rn.f32 %f202, %f12, %f15, %f201; add.f32 %f40, %f202, %f199; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p55, %f41, 0f00000000; setp.eq.f32 %p56, %f42, 0f00000000; and.pred %p57, %p55, %p56; @%p57 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r140, %f40; shr.s32 %r141, %r140, 31; and.b32 %r142, %r141, 1078530011; mov.b32 %r143, %f39; and.b32 %r144, %r143, -2147483648; or.b32 %r145, %r144, %r142; mov.b32 %f292, %r145; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p58, %f41, 0f7F800000; setp.eq.f32 %p59, %f42, 0f7F800000; and.pred %p60, 
%p58, %p59; @%p60 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r135, %f40; setp.lt.s32 %p64, %r135, 0; selp.b32 %r136, 1075235812, 1061752795, %p64; mov.b32 %r137, %f39; and.b32 %r138, %r137, -2147483648; or.b32 %r139, %r138, %r136; mov.b32 %f292, %r139; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f203, %f42, %f41; min.f32 %f204, %f42, %f41; div.rn.f32 %f205, %f204, %f203; mul.rn.f32 %f206, %f205, %f205; mov.f32 %f207, 0fC0B59883; mov.f32 %f208, 0fBF52C7EA; fma.rn.f32 %f209, %f206, %f208, %f207; mov.f32 %f210, 0fC0D21907; fma.rn.f32 %f211, %f209, %f206, %f210; mul.f32 %f212, %f206, %f211; mul.f32 %f213, %f205, %f212; add.f32 %f214, %f206, 0f41355DC0; mov.f32 %f215, 0f41E6BD60; fma.rn.f32 %f216, %f214, %f206, %f215; mov.f32 %f217, 0f419D92C8; fma.rn.f32 %f218, %f216, %f206, %f217; rcp.rn.f32 %f219, %f218; fma.rn.f32 %f220, %f213, %f219, %f205; mov.f32 %f221, 0f3FC90FDB; sub.f32 %f222, %f221, %f220; setp.gt.f32 %p61, %f42, %f41; selp.f32 %f223, %f222, %f220, %p61; mov.b32 %r130, %f40; setp.lt.s32 %p62, %r130, 0; mov.f32 %f224, 0f40490FDB; sub.f32 %f225, %f224, %f223; selp.f32 %f226, %f225, %f223, %p62; mov.b32 %r131, %f226; mov.b32 %r132, %f39; and.b32 %r133, %r132, -2147483648; or.b32 %r134, %r133, %r131; mov.b32 %f227, %r134; add.f32 %f228, %f41, %f42; setp.le.f32 %p63, %f228, 0f7F800000; selp.f32 %f292, %f227, %f228, %p63; $L__BB0_54: add.f32 %f229, %f292, %f292; setp.eq.f32 %p65, %f38, 0f00000000; selp.f32 %f230, 0f3F800000, 0f3F000000, %p65; fma.rn.f32 %f293, %f230, %f229, %f293; $L__BB0_55: @%p14 bra $L__BB0_69; setp.lt.s32 %p67, %r2, 1; and.pred %p69, %p67, %p9; @%p69 bra $L__BB0_69; @%p9 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r179, %r19, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r146, %r19, %r50; add.s32 %r147, %r146, %r50; rem.s32 %r179, %r147, %r50; $L__BB0_60: add.s32 %r45, %r179, %r4; @%p8 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r150, %r49, -1; min.s32 %r180, %r6, %r150; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 
%r148, %r6, %r49; add.s32 %r149, %r148, %r49; rem.s32 %r180, %r149, %r49; $L__BB0_63: mad.lo.s32 %r151, %r45, %r49, %r180; mul.wide.s32 %rd42, %r151, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f231, [%rd43]; ld.global.nc.f32 %f232, [%rd44]; mul.f32 %f233, %f232, %f232; fma.rn.f32 %f234, %f231, %f231, %f233; ld.global.nc.f32 %f235, [%rd45]; fma.rn.f32 %f49, %f235, %f235, %f234; mul.f32 %f236, %f5, %f15; mul.f32 %f237, %f6, %f14; sub.f32 %f238, %f237, %f236; mul.f32 %f239, %f6, %f13; mul.f32 %f240, %f4, %f15; sub.f32 %f241, %f240, %f239; mul.f32 %f242, %f4, %f14; mul.f32 %f243, %f5, %f13; sub.f32 %f244, %f243, %f242; mul.f32 %f245, %f2, %f241; fma.rn.f32 %f246, %f1, %f238, %f245; fma.rn.f32 %f50, %f3, %f244, %f246; mul.f32 %f247, %f2, %f14; fma.rn.f32 %f248, %f1, %f13, %f247; fma.rn.f32 %f249, %f3, %f15, %f248; add.f32 %f250, %f249, 0f3F800000; mul.f32 %f251, %f2, %f5; fma.rn.f32 %f252, %f1, %f4, %f251; fma.rn.f32 %f253, %f3, %f6, %f252; add.f32 %f254, %f253, %f250; mul.f32 %f255, %f5, %f14; fma.rn.f32 %f256, %f4, %f13, %f255; fma.rn.f32 %f257, %f6, %f15, %f256; add.f32 %f51, %f257, %f254; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p72, %f52, 0f00000000; setp.eq.f32 %p73, %f53, 0f00000000; and.pred %p74, %p72, %p73; @%p74 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r162, %f51; shr.s32 %r163, %r162, 31; and.b32 %r164, %r163, 1078530011; mov.b32 %r165, %f50; and.b32 %r166, %r165, -2147483648; or.b32 %r167, %r164, %r166; mov.b32 %f294, %r167; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p75, %f52, 0f7F800000; setp.eq.f32 %p76, %f53, 0f7F800000; and.pred %p77, %p75, %p76; @%p77 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r157, %f51; setp.lt.s32 %p81, %r157, 0; selp.b32 %r158, 1075235812, 1061752795, %p81; mov.b32 %r159, %f50; and.b32 %r160, %r159, -2147483648; or.b32 %r161, %r158, %r160; mov.b32 %f294, %r161; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f258, %f53, %f52; min.f32 
%f259, %f53, %f52; div.rn.f32 %f260, %f259, %f258; mul.rn.f32 %f261, %f260, %f260; mov.f32 %f262, 0fC0B59883; mov.f32 %f263, 0fBF52C7EA; fma.rn.f32 %f264, %f261, %f263, %f262; mov.f32 %f265, 0fC0D21907; fma.rn.f32 %f266, %f264, %f261, %f265; mul.f32 %f267, %f261, %f266; mul.f32 %f268, %f260, %f267; add.f32 %f269, %f261, 0f41355DC0; mov.f32 %f270, 0f41E6BD60; fma.rn.f32 %f271, %f269, %f261, %f270; mov.f32 %f272, 0f419D92C8; fma.rn.f32 %f273, %f271, %f261, %f272; rcp.rn.f32 %f274, %f273; fma.rn.f32 %f275, %f268, %f274, %f260; mov.f32 %f276, 0f3FC90FDB; sub.f32 %f277, %f276, %f275; setp.gt.f32 %p78, %f53, %f52; selp.f32 %f278, %f277, %f275, %p78; mov.b32 %r152, %f51; setp.lt.s32 %p79, %r152, 0; mov.f32 %f279, 0f40490FDB; sub.f32 %f280, %f279, %f278; selp.f32 %f281, %f280, %f278, %p79; mov.b32 %r153, %f281; mov.b32 %r154, %f50; and.b32 %r155, %r154, -2147483648; or.b32 %r156, %r155, %r153; mov.b32 %f282, %r156; add.f32 %f283, %f52, %f53; setp.le.f32 %p80, %f283, 0f7F800000; selp.f32 %f294, %f282, %f283, %p80; $L__BB0_68: add.f32 %f284, %f294, %f294; setp.eq.f32 %p82, %f49, 0f00000000; selp.f32 %f285, 0f3F800000, 0f3F000000, %p82; fma.rn.f32 %f293, %f285, %f284, %f293; $L__BB0_69: mul.f32 %f286, %f293, %f60; st.global.f32 [%rd4], %f286; $L__BB0_71: ret; } `
// settopologicalchargelattice_ptx_62 holds the PTX text (ISA .version 8.5,
// .target sm_62, 64-bit addresses) for the settopologicalchargelattice
// kernel entry point.
//
// Kernel parameters, as used by the PTX below:
//   param_0          u64  destination array; one f32 is stored per cell
//   param_1..param_3 u64  three source f32 arrays, read with ld.global.nc
//   param_4          f32  factor multiplied into the accumulated sum before the store
//   param_5..param_7 u32  extents compared against the x, y and z thread indices;
//                         a thread at or beyond any extent returns without storing
//   param_8          u8   bit flags: bit 0 (x) and bit 1 (y) select modulo
//                         wrap-around of neighbour indices; when a bit is clear
//                         the neighbour index is clamped to the array edge instead
//
// A cell whose three source components have zero squared norm is written as 0.
// Otherwise up to four neighbour-triple terms are accumulated; a term is
// skipped when its neighbour lies outside the extents and the matching wrap
// bit is clear. Each term is twice what looks like an inlined atan2
// evaluation, weighted by 1.0 when the diagonal neighbour's squared norm is
// zero and by 0.5 otherwise.
// NOTE(review): presumably compiler-generated from the CUDA kernel source;
// regenerate rather than editing by hand - confirm against the build scripts.
settopologicalchargelattice_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl settopologicalchargelattice .visible .entry settopologicalchargelattice( .param .u64 settopologicalchargelattice_param_0, .param .u64 settopologicalchargelattice_param_1, .param .u64 settopologicalchargelattice_param_2, .param .u64 settopologicalchargelattice_param_3, .param .f32 settopologicalchargelattice_param_4, .param .u32 settopologicalchargelattice_param_5, .param .u32 settopologicalchargelattice_param_6, .param .u32 settopologicalchargelattice_param_7, .param .u8 settopologicalchargelattice_param_8 ) { .reg .pred %p<83>; .reg .b16 %rs<4>; .reg .f32 %f<296>; .reg .b32 %r<181>; .reg .b64 %rd<46>; ld.param.u8 %rs3, 
[settopologicalchargelattice_param_8]; ld.param.u64 %rd5, [settopologicalchargelattice_param_0]; ld.param.u64 %rd6, [settopologicalchargelattice_param_1]; ld.param.u64 %rd7, [settopologicalchargelattice_param_2]; ld.param.u64 %rd8, [settopologicalchargelattice_param_3]; ld.param.f32 %f60, [settopologicalchargelattice_param_4]; ld.param.u32 %r49, [settopologicalchargelattice_param_5]; ld.param.u32 %r50, [settopologicalchargelattice_param_6]; ld.param.u32 %r51, [settopologicalchargelattice_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r52, %ntid.x; mov.u32 %r53, %ctaid.x; mov.u32 %r54, %tid.x; mad.lo.s32 %r1, %r53, %r52, %r54; mov.u32 %r55, %ntid.y; mov.u32 %r56, %ctaid.y; mov.u32 %r57, %tid.y; mad.lo.s32 %r2, %r56, %r55, %r57; mov.u32 %r58, %ntid.z; mov.u32 %r59, %ctaid.z; mov.u32 %r60, %tid.z; mad.lo.s32 %r3, %r59, %r58, %r60; setp.ge.s32 %p2, %r1, %r49; setp.ge.s32 %p3, %r2, %r50; or.pred %p4, %p2, %p3; setp.ge.s32 %p5, %r3, %r51; or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_71; mul.lo.s32 %r4, %r3, %r50; add.s32 %r61, %r4, %r2; mul.lo.s32 %r5, %r61, %r49; add.s32 %r62, %r5, %r1; mul.wide.s32 %rd9, %r62, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f61, %f2, %f2; fma.rn.f32 %f62, %f1, %f1, %f61; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f63, %f3, %f3, %f62; setp.eq.f32 %p7, %f63, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p7 bra $L__BB0_70; bra.uni $L__BB0_2; $L__BB0_70: mov.u32 %r168, 0; st.global.u32 [%rd4], %r168; bra.uni $L__BB0_71; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p8, %rs1, 0; add.s32 %r6, %r1, 1; @%p8 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r65, %r49, -1; min.s32 %r169, %r6, %r65; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r63, %r6, %r49; add.s32 %r64, %r63, %r49; rem.s32 %r169, %r64, %r49; $L__BB0_5: and.b16 %rs2, %rs3, 2; setp.eq.s16 
%p9, %rs2, 0; add.s32 %r10, %r2, 1; @%p9 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r68, %r50, -1; min.s32 %r170, %r10, %r68; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r66, %r10, %r50; add.s32 %r67, %r66, %r50; rem.s32 %r170, %r67, %r50; $L__BB0_8: add.s32 %r14, %r1, -1; @%p8 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r171, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r69, %r14, %r49; add.s32 %r70, %r69, %r49; rem.s32 %r171, %r70, %r49; $L__BB0_11: add.s32 %r18, %r171, %r5; add.s32 %r19, %r2, -1; add.s32 %r20, %r169, %r5; @%p9 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r172, %r19, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r71, %r19, %r50; add.s32 %r72, %r71, %r50; rem.s32 %r172, %r72, %r50; $L__BB0_14: add.s32 %r73, %r172, %r4; mad.lo.s32 %r74, %r73, %r49, %r1; mul.wide.s32 %rd14, %r20, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f4, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f5, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f6, [%rd17]; add.s32 %r75, %r170, %r4; mad.lo.s32 %r76, %r75, %r49, %r1; mul.wide.s32 %rd18, %r76, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f7, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f8, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f9, [%rd21]; mul.wide.s32 %rd22, %r18, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f10, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f11, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f12, [%rd25]; mul.wide.s32 %rd26, %r74, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f13, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f14, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f15, [%rd29]; setp.ne.s16 %p12, %rs1, 0; setp.lt.s32 %p13, %r6, %r49; or.pred %p1, %p13, %p12; not.pred %p14, %p1; mov.f32 %f293, 0f00000000; @%p14 bra $L__BB0_28; setp.ge.s32 %p15, %r10, %r50; and.pred %p17, %p15, %p9; @%p17 bra $L__BB0_28; @%p9 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r79, 
%r50, -1; min.s32 %r173, %r10, %r79; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r77, %r10, %r50; add.s32 %r78, %r77, %r50; rem.s32 %r173, %r78, %r50; $L__BB0_19: @%p8 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r82, %r49, -1; min.s32 %r174, %r6, %r82; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r80, %r6, %r49; add.s32 %r81, %r80, %r49; rem.s32 %r174, %r81, %r49; $L__BB0_22: add.s32 %r83, %r173, %r4; mad.lo.s32 %r84, %r83, %r49, %r174; mul.wide.s32 %rd30, %r84, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f66, [%rd31]; ld.global.nc.f32 %f67, [%rd32]; mul.f32 %f68, %f67, %f67; fma.rn.f32 %f69, %f66, %f66, %f68; ld.global.nc.f32 %f70, [%rd33]; fma.rn.f32 %f16, %f70, %f70, %f69; mul.f32 %f71, %f6, %f8; mul.f32 %f72, %f5, %f9; sub.f32 %f73, %f72, %f71; mul.f32 %f74, %f4, %f9; mul.f32 %f75, %f6, %f7; sub.f32 %f76, %f75, %f74; mul.f32 %f77, %f5, %f7; mul.f32 %f78, %f4, %f8; sub.f32 %f79, %f78, %f77; mul.f32 %f80, %f2, %f76; fma.rn.f32 %f81, %f1, %f73, %f80; fma.rn.f32 %f17, %f3, %f79, %f81; mul.f32 %f82, %f2, %f5; fma.rn.f32 %f83, %f1, %f4, %f82; fma.rn.f32 %f84, %f3, %f6, %f83; add.f32 %f85, %f84, 0f3F800000; mul.f32 %f86, %f2, %f8; fma.rn.f32 %f87, %f1, %f7, %f86; fma.rn.f32 %f88, %f3, %f9, %f87; add.f32 %f89, %f85, %f88; mul.f32 %f90, %f5, %f8; fma.rn.f32 %f91, %f4, %f7, %f90; fma.rn.f32 %f92, %f6, %f9, %f91; add.f32 %f18, %f92, %f89; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p20, %f19, 0f00000000; setp.eq.f32 %p21, %f20, 0f00000000; and.pred %p22, %p20, %p21; @%p22 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r95, %f18; shr.s32 %r96, %r95, 31; and.b32 %r97, %r96, 1078530011; mov.b32 %r98, %f17; and.b32 %r99, %r98, -2147483648; or.b32 %r100, %r99, %r97; mov.b32 %f288, %r100; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p23, %f19, 0f7F800000; setp.eq.f32 %p24, %f20, 0f7F800000; and.pred %p25, %p23, %p24; @%p25 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r90, %f18; 
setp.lt.s32 %p29, %r90, 0; selp.b32 %r91, 1075235812, 1061752795, %p29; mov.b32 %r92, %f17; and.b32 %r93, %r92, -2147483648; or.b32 %r94, %r93, %r91; mov.b32 %f288, %r94; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f93, %f20, %f19; min.f32 %f94, %f20, %f19; div.rn.f32 %f95, %f94, %f93; mul.rn.f32 %f96, %f95, %f95; mov.f32 %f97, 0fC0B59883; mov.f32 %f98, 0fBF52C7EA; fma.rn.f32 %f99, %f96, %f98, %f97; mov.f32 %f100, 0fC0D21907; fma.rn.f32 %f101, %f99, %f96, %f100; mul.f32 %f102, %f96, %f101; mul.f32 %f103, %f95, %f102; add.f32 %f104, %f96, 0f41355DC0; mov.f32 %f105, 0f41E6BD60; fma.rn.f32 %f106, %f104, %f96, %f105; mov.f32 %f107, 0f419D92C8; fma.rn.f32 %f108, %f106, %f96, %f107; rcp.rn.f32 %f109, %f108; fma.rn.f32 %f110, %f103, %f109, %f95; mov.f32 %f111, 0f3FC90FDB; sub.f32 %f112, %f111, %f110; setp.gt.f32 %p26, %f20, %f19; selp.f32 %f113, %f112, %f110, %p26; mov.b32 %r85, %f18; setp.lt.s32 %p27, %r85, 0; mov.f32 %f114, 0f40490FDB; sub.f32 %f115, %f114, %f113; selp.f32 %f116, %f115, %f113, %p27; mov.b32 %r86, %f116; mov.b32 %r87, %f17; and.b32 %r88, %r87, -2147483648; or.b32 %r89, %r88, %r86; mov.b32 %f117, %r89; add.f32 %f118, %f19, %f20; setp.le.f32 %p28, %f118, 0f7F800000; selp.f32 %f288, %f117, %f118, %p28; $L__BB0_27: add.f32 %f119, %f288, %f288; setp.eq.f32 %p30, %f16, 0f00000000; selp.f32 %f120, 0f3F800000, 0f3F000000, %p30; fma.rn.f32 %f293, %f120, %f119, 0f00000000; $L__BB0_28: setp.lt.s32 %p31, %r1, 1; and.pred %p33, %p31, %p8; @%p33 bra $L__BB0_55; setp.ge.s32 %p34, %r10, %r50; and.pred %p36, %p34, %p9; @%p36 bra $L__BB0_42; @%p9 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r103, %r50, -1; min.s32 %r175, %r10, %r103; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r101, %r10, %r50; add.s32 %r102, %r101, %r50; rem.s32 %r175, %r102, %r50; $L__BB0_33: @%p8 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r176, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r104, %r14, %r49; add.s32 %r105, %r104, %r49; rem.s32 %r176, %r105, %r49; $L__BB0_36: 
add.s32 %r106, %r175, %r4; mad.lo.s32 %r107, %r106, %r49, %r176; mul.wide.s32 %rd34, %r107, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f121, [%rd35]; ld.global.nc.f32 %f122, [%rd36]; mul.f32 %f123, %f122, %f122; fma.rn.f32 %f124, %f121, %f121, %f123; ld.global.nc.f32 %f125, [%rd37]; fma.rn.f32 %f27, %f125, %f125, %f124; mul.f32 %f126, %f9, %f11; mul.f32 %f127, %f8, %f12; sub.f32 %f128, %f127, %f126; mul.f32 %f129, %f7, %f12; mul.f32 %f130, %f9, %f10; sub.f32 %f131, %f130, %f129; mul.f32 %f132, %f8, %f10; mul.f32 %f133, %f7, %f11; sub.f32 %f134, %f133, %f132; mul.f32 %f135, %f2, %f131; fma.rn.f32 %f136, %f1, %f128, %f135; fma.rn.f32 %f28, %f3, %f134, %f136; mul.f32 %f137, %f2, %f8; fma.rn.f32 %f138, %f1, %f7, %f137; fma.rn.f32 %f139, %f3, %f9, %f138; add.f32 %f140, %f139, 0f3F800000; mul.f32 %f141, %f2, %f11; fma.rn.f32 %f142, %f1, %f10, %f141; fma.rn.f32 %f143, %f3, %f12, %f142; add.f32 %f144, %f140, %f143; mul.f32 %f145, %f8, %f11; fma.rn.f32 %f146, %f7, %f10, %f145; fma.rn.f32 %f147, %f9, %f12, %f146; add.f32 %f29, %f147, %f144; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p39, %f30, 0f00000000; setp.eq.f32 %p40, %f31, 0f00000000; and.pred %p41, %p39, %p40; @%p41 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r118, %f29; shr.s32 %r119, %r118, 31; and.b32 %r120, %r119, 1078530011; mov.b32 %r121, %f28; and.b32 %r122, %r121, -2147483648; or.b32 %r123, %r122, %r120; mov.b32 %f290, %r123; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p42, %f30, 0f7F800000; setp.eq.f32 %p43, %f31, 0f7F800000; and.pred %p44, %p42, %p43; @%p44 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r113, %f29; setp.lt.s32 %p48, %r113, 0; selp.b32 %r114, 1075235812, 1061752795, %p48; mov.b32 %r115, %f28; and.b32 %r116, %r115, -2147483648; or.b32 %r117, %r116, %r114; mov.b32 %f290, %r117; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f148, %f31, %f30; min.f32 %f149, %f31, %f30; div.rn.f32 %f150, %f149, %f148; 
mul.rn.f32 %f151, %f150, %f150; mov.f32 %f152, 0fC0B59883; mov.f32 %f153, 0fBF52C7EA; fma.rn.f32 %f154, %f151, %f153, %f152; mov.f32 %f155, 0fC0D21907; fma.rn.f32 %f156, %f154, %f151, %f155; mul.f32 %f157, %f151, %f156; mul.f32 %f158, %f150, %f157; add.f32 %f159, %f151, 0f41355DC0; mov.f32 %f160, 0f41E6BD60; fma.rn.f32 %f161, %f159, %f151, %f160; mov.f32 %f162, 0f419D92C8; fma.rn.f32 %f163, %f161, %f151, %f162; rcp.rn.f32 %f164, %f163; fma.rn.f32 %f165, %f158, %f164, %f150; mov.f32 %f166, 0f3FC90FDB; sub.f32 %f167, %f166, %f165; setp.gt.f32 %p45, %f31, %f30; selp.f32 %f168, %f167, %f165, %p45; mov.b32 %r108, %f29; setp.lt.s32 %p46, %r108, 0; mov.f32 %f169, 0f40490FDB; sub.f32 %f170, %f169, %f168; selp.f32 %f171, %f170, %f168, %p46; mov.b32 %r109, %f171; mov.b32 %r110, %f28; and.b32 %r111, %r110, -2147483648; or.b32 %r112, %r111, %r109; mov.b32 %f172, %r112; add.f32 %f173, %f30, %f31; setp.le.f32 %p47, %f173, 0f7F800000; selp.f32 %f290, %f172, %f173, %p47; $L__BB0_41: add.f32 %f174, %f290, %f290; setp.eq.f32 %p49, %f27, 0f00000000; selp.f32 %f175, 0f3F800000, 0f3F000000, %p49; fma.rn.f32 %f293, %f175, %f174, %f293; $L__BB0_42: setp.lt.s32 %p50, %r2, 1; and.pred %p52, %p50, %p9; @%p52 bra $L__BB0_55; @%p9 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r177, %r19, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r124, %r19, %r50; add.s32 %r125, %r124, %r50; rem.s32 %r177, %r125, %r50; $L__BB0_46: @%p8 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r178, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r126, %r14, %r49; add.s32 %r127, %r126, %r49; rem.s32 %r178, %r127, %r49; $L__BB0_49: add.s32 %r128, %r177, %r4; mad.lo.s32 %r129, %r128, %r49, %r178; mul.wide.s32 %rd38, %r129, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f176, [%rd39]; ld.global.nc.f32 %f177, [%rd40]; mul.f32 %f178, %f177, %f177; fma.rn.f32 %f179, %f176, %f176, %f178; ld.global.nc.f32 %f180, [%rd41]; fma.rn.f32 %f38, %f180, 
%f180, %f179; mul.f32 %f181, %f12, %f14; mul.f32 %f182, %f11, %f15; sub.f32 %f183, %f182, %f181; mul.f32 %f184, %f10, %f15; mul.f32 %f185, %f12, %f13; sub.f32 %f186, %f185, %f184; mul.f32 %f187, %f11, %f13; mul.f32 %f188, %f10, %f14; sub.f32 %f189, %f188, %f187; mul.f32 %f190, %f2, %f186; fma.rn.f32 %f191, %f1, %f183, %f190; fma.rn.f32 %f39, %f3, %f189, %f191; mul.f32 %f192, %f2, %f11; fma.rn.f32 %f193, %f1, %f10, %f192; fma.rn.f32 %f194, %f3, %f12, %f193; add.f32 %f195, %f194, 0f3F800000; mul.f32 %f196, %f2, %f14; fma.rn.f32 %f197, %f1, %f13, %f196; fma.rn.f32 %f198, %f3, %f15, %f197; add.f32 %f199, %f195, %f198; mul.f32 %f200, %f11, %f14; fma.rn.f32 %f201, %f10, %f13, %f200; fma.rn.f32 %f202, %f12, %f15, %f201; add.f32 %f40, %f202, %f199; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p55, %f41, 0f00000000; setp.eq.f32 %p56, %f42, 0f00000000; and.pred %p57, %p55, %p56; @%p57 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r140, %f40; shr.s32 %r141, %r140, 31; and.b32 %r142, %r141, 1078530011; mov.b32 %r143, %f39; and.b32 %r144, %r143, -2147483648; or.b32 %r145, %r144, %r142; mov.b32 %f292, %r145; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p58, %f41, 0f7F800000; setp.eq.f32 %p59, %f42, 0f7F800000; and.pred %p60, %p58, %p59; @%p60 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r135, %f40; setp.lt.s32 %p64, %r135, 0; selp.b32 %r136, 1075235812, 1061752795, %p64; mov.b32 %r137, %f39; and.b32 %r138, %r137, -2147483648; or.b32 %r139, %r138, %r136; mov.b32 %f292, %r139; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f203, %f42, %f41; min.f32 %f204, %f42, %f41; div.rn.f32 %f205, %f204, %f203; mul.rn.f32 %f206, %f205, %f205; mov.f32 %f207, 0fC0B59883; mov.f32 %f208, 0fBF52C7EA; fma.rn.f32 %f209, %f206, %f208, %f207; mov.f32 %f210, 0fC0D21907; fma.rn.f32 %f211, %f209, %f206, %f210; mul.f32 %f212, %f206, %f211; mul.f32 %f213, %f205, %f212; add.f32 %f214, %f206, 0f41355DC0; mov.f32 %f215, 0f41E6BD60; fma.rn.f32 %f216, %f214, %f206, %f215; mov.f32 %f217, 
0f419D92C8; fma.rn.f32 %f218, %f216, %f206, %f217; rcp.rn.f32 %f219, %f218; fma.rn.f32 %f220, %f213, %f219, %f205; mov.f32 %f221, 0f3FC90FDB; sub.f32 %f222, %f221, %f220; setp.gt.f32 %p61, %f42, %f41; selp.f32 %f223, %f222, %f220, %p61; mov.b32 %r130, %f40; setp.lt.s32 %p62, %r130, 0; mov.f32 %f224, 0f40490FDB; sub.f32 %f225, %f224, %f223; selp.f32 %f226, %f225, %f223, %p62; mov.b32 %r131, %f226; mov.b32 %r132, %f39; and.b32 %r133, %r132, -2147483648; or.b32 %r134, %r133, %r131; mov.b32 %f227, %r134; add.f32 %f228, %f41, %f42; setp.le.f32 %p63, %f228, 0f7F800000; selp.f32 %f292, %f227, %f228, %p63; $L__BB0_54: add.f32 %f229, %f292, %f292; setp.eq.f32 %p65, %f38, 0f00000000; selp.f32 %f230, 0f3F800000, 0f3F000000, %p65; fma.rn.f32 %f293, %f230, %f229, %f293; $L__BB0_55: @%p14 bra $L__BB0_69; setp.lt.s32 %p67, %r2, 1; and.pred %p69, %p67, %p9; @%p69 bra $L__BB0_69; @%p9 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r179, %r19, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r146, %r19, %r50; add.s32 %r147, %r146, %r50; rem.s32 %r179, %r147, %r50; $L__BB0_60: add.s32 %r45, %r179, %r4; @%p8 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r150, %r49, -1; min.s32 %r180, %r6, %r150; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r148, %r6, %r49; add.s32 %r149, %r148, %r49; rem.s32 %r180, %r149, %r49; $L__BB0_63: mad.lo.s32 %r151, %r45, %r49, %r180; mul.wide.s32 %rd42, %r151, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f231, [%rd43]; ld.global.nc.f32 %f232, [%rd44]; mul.f32 %f233, %f232, %f232; fma.rn.f32 %f234, %f231, %f231, %f233; ld.global.nc.f32 %f235, [%rd45]; fma.rn.f32 %f49, %f235, %f235, %f234; mul.f32 %f236, %f5, %f15; mul.f32 %f237, %f6, %f14; sub.f32 %f238, %f237, %f236; mul.f32 %f239, %f6, %f13; mul.f32 %f240, %f4, %f15; sub.f32 %f241, %f240, %f239; mul.f32 %f242, %f4, %f14; mul.f32 %f243, %f5, %f13; sub.f32 %f244, %f243, %f242; mul.f32 %f245, %f2, %f241; fma.rn.f32 %f246, %f1, %f238, %f245; 
fma.rn.f32 %f50, %f3, %f244, %f246; mul.f32 %f247, %f2, %f14; fma.rn.f32 %f248, %f1, %f13, %f247; fma.rn.f32 %f249, %f3, %f15, %f248; add.f32 %f250, %f249, 0f3F800000; mul.f32 %f251, %f2, %f5; fma.rn.f32 %f252, %f1, %f4, %f251; fma.rn.f32 %f253, %f3, %f6, %f252; add.f32 %f254, %f253, %f250; mul.f32 %f255, %f5, %f14; fma.rn.f32 %f256, %f4, %f13, %f255; fma.rn.f32 %f257, %f6, %f15, %f256; add.f32 %f51, %f257, %f254; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p72, %f52, 0f00000000; setp.eq.f32 %p73, %f53, 0f00000000; and.pred %p74, %p72, %p73; @%p74 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r162, %f51; shr.s32 %r163, %r162, 31; and.b32 %r164, %r163, 1078530011; mov.b32 %r165, %f50; and.b32 %r166, %r165, -2147483648; or.b32 %r167, %r164, %r166; mov.b32 %f294, %r167; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p75, %f52, 0f7F800000; setp.eq.f32 %p76, %f53, 0f7F800000; and.pred %p77, %p75, %p76; @%p77 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r157, %f51; setp.lt.s32 %p81, %r157, 0; selp.b32 %r158, 1075235812, 1061752795, %p81; mov.b32 %r159, %f50; and.b32 %r160, %r159, -2147483648; or.b32 %r161, %r158, %r160; mov.b32 %f294, %r161; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f258, %f53, %f52; min.f32 %f259, %f53, %f52; div.rn.f32 %f260, %f259, %f258; mul.rn.f32 %f261, %f260, %f260; mov.f32 %f262, 0fC0B59883; mov.f32 %f263, 0fBF52C7EA; fma.rn.f32 %f264, %f261, %f263, %f262; mov.f32 %f265, 0fC0D21907; fma.rn.f32 %f266, %f264, %f261, %f265; mul.f32 %f267, %f261, %f266; mul.f32 %f268, %f260, %f267; add.f32 %f269, %f261, 0f41355DC0; mov.f32 %f270, 0f41E6BD60; fma.rn.f32 %f271, %f269, %f261, %f270; mov.f32 %f272, 0f419D92C8; fma.rn.f32 %f273, %f271, %f261, %f272; rcp.rn.f32 %f274, %f273; fma.rn.f32 %f275, %f268, %f274, %f260; mov.f32 %f276, 0f3FC90FDB; sub.f32 %f277, %f276, %f275; setp.gt.f32 %p78, %f53, %f52; selp.f32 %f278, %f277, %f275, %p78; mov.b32 %r152, %f51; setp.lt.s32 %p79, %r152, 0; mov.f32 %f279, 0f40490FDB; sub.f32 %f280, 
%f279, %f278; selp.f32 %f281, %f280, %f278, %p79; mov.b32 %r153, %f281; mov.b32 %r154, %f50; and.b32 %r155, %r154, -2147483648; or.b32 %r156, %r155, %r153; mov.b32 %f282, %r156; add.f32 %f283, %f52, %f53; setp.le.f32 %p80, %f283, 0f7F800000; selp.f32 %f294, %f282, %f283, %p80; $L__BB0_68: add.f32 %f284, %f294, %f294; setp.eq.f32 %p82, %f49, 0f00000000; selp.f32 %f285, 0f3F800000, 0f3F000000, %p82; fma.rn.f32 %f293, %f285, %f284, %f293; $L__BB0_69: mul.f32 %f286, %f293, %f60; st.global.f32 [%rd4], %f286; $L__BB0_71: ret; } ` settopologicalchargelattice_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl settopologicalchargelattice .visible .entry settopologicalchargelattice( .param .u64 settopologicalchargelattice_param_0, .param .u64 settopologicalchargelattice_param_1, .param .u64 settopologicalchargelattice_param_2, .param .u64 settopologicalchargelattice_param_3, .param .f32 settopologicalchargelattice_param_4, .param .u32 settopologicalchargelattice_param_5, .param .u32 settopologicalchargelattice_param_6, .param .u32 settopologicalchargelattice_param_7, .param .u8 settopologicalchargelattice_param_8 ) { .reg .pred %p<83>; .reg .b16 %rs<4>; .reg .f32 %f<296>; .reg .b32 %r<181>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalchargelattice_param_8]; ld.param.u64 %rd5, [settopologicalchargelattice_param_0]; ld.param.u64 %rd6, [settopologicalchargelattice_param_1]; ld.param.u64 %rd7, [settopologicalchargelattice_param_2]; ld.param.u64 %rd8, [settopologicalchargelattice_param_3]; ld.param.f32 %f60, [settopologicalchargelattice_param_4]; ld.param.u32 %r49, [settopologicalchargelattice_param_5]; ld.param.u32 %r50, [settopologicalchargelattice_param_6]; ld.param.u32 %r51, [settopologicalchargelattice_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r52, %ntid.x; mov.u32 %r53, %ctaid.x; mov.u32 %r54, %tid.x; mad.lo.s32 %r1, %r53, %r52, %r54; mov.u32 %r55, %ntid.y; mov.u32 %r56, %ctaid.y; 
mov.u32 %r57, %tid.y; mad.lo.s32 %r2, %r56, %r55, %r57; mov.u32 %r58, %ntid.z; mov.u32 %r59, %ctaid.z; mov.u32 %r60, %tid.z; mad.lo.s32 %r3, %r59, %r58, %r60; setp.ge.s32 %p2, %r1, %r49; setp.ge.s32 %p3, %r2, %r50; or.pred %p4, %p2, %p3; setp.ge.s32 %p5, %r3, %r51; or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_71; mul.lo.s32 %r4, %r3, %r50; add.s32 %r61, %r4, %r2; mul.lo.s32 %r5, %r61, %r49; add.s32 %r62, %r5, %r1; mul.wide.s32 %rd9, %r62, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f61, %f2, %f2; fma.rn.f32 %f62, %f1, %f1, %f61; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f63, %f3, %f3, %f62; setp.eq.f32 %p7, %f63, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p7 bra $L__BB0_70; bra.uni $L__BB0_2; $L__BB0_70: mov.u32 %r168, 0; st.global.u32 [%rd4], %r168; bra.uni $L__BB0_71; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p8, %rs1, 0; add.s32 %r6, %r1, 1; @%p8 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r65, %r49, -1; min.s32 %r169, %r6, %r65; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r63, %r6, %r49; add.s32 %r64, %r63, %r49; rem.s32 %r169, %r64, %r49; $L__BB0_5: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p9, %rs2, 0; add.s32 %r10, %r2, 1; @%p9 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r68, %r50, -1; min.s32 %r170, %r10, %r68; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r66, %r10, %r50; add.s32 %r67, %r66, %r50; rem.s32 %r170, %r67, %r50; $L__BB0_8: add.s32 %r14, %r1, -1; @%p8 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r171, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r69, %r14, %r49; add.s32 %r70, %r69, %r49; rem.s32 %r171, %r70, %r49; $L__BB0_11: add.s32 %r18, %r171, %r5; add.s32 %r19, %r2, -1; add.s32 %r20, %r169, %r5; @%p9 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r172, %r19, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r71, %r19, %r50; add.s32 %r72, %r71, %r50; rem.s32 %r172, %r72, %r50; 
$L__BB0_14: add.s32 %r73, %r172, %r4; mad.lo.s32 %r74, %r73, %r49, %r1; mul.wide.s32 %rd14, %r20, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f4, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f5, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f6, [%rd17]; add.s32 %r75, %r170, %r4; mad.lo.s32 %r76, %r75, %r49, %r1; mul.wide.s32 %rd18, %r76, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f7, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f8, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f9, [%rd21]; mul.wide.s32 %rd22, %r18, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f10, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f11, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f12, [%rd25]; mul.wide.s32 %rd26, %r74, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f13, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f14, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f15, [%rd29]; setp.ne.s16 %p12, %rs1, 0; setp.lt.s32 %p13, %r6, %r49; or.pred %p1, %p13, %p12; not.pred %p14, %p1; mov.f32 %f293, 0f00000000; @%p14 bra $L__BB0_28; setp.ge.s32 %p15, %r10, %r50; and.pred %p17, %p15, %p9; @%p17 bra $L__BB0_28; @%p9 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r79, %r50, -1; min.s32 %r173, %r10, %r79; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r77, %r10, %r50; add.s32 %r78, %r77, %r50; rem.s32 %r173, %r78, %r50; $L__BB0_19: @%p8 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r82, %r49, -1; min.s32 %r174, %r6, %r82; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r80, %r6, %r49; add.s32 %r81, %r80, %r49; rem.s32 %r174, %r81, %r49; $L__BB0_22: add.s32 %r83, %r173, %r4; mad.lo.s32 %r84, %r83, %r49, %r174; mul.wide.s32 %rd30, %r84, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f66, [%rd31]; ld.global.nc.f32 %f67, [%rd32]; mul.f32 %f68, %f67, %f67; fma.rn.f32 %f69, %f66, %f66, %f68; ld.global.nc.f32 %f70, [%rd33]; fma.rn.f32 %f16, %f70, 
%f70, %f69; mul.f32 %f71, %f6, %f8; mul.f32 %f72, %f5, %f9; sub.f32 %f73, %f72, %f71; mul.f32 %f74, %f4, %f9; mul.f32 %f75, %f6, %f7; sub.f32 %f76, %f75, %f74; mul.f32 %f77, %f5, %f7; mul.f32 %f78, %f4, %f8; sub.f32 %f79, %f78, %f77; mul.f32 %f80, %f2, %f76; fma.rn.f32 %f81, %f1, %f73, %f80; fma.rn.f32 %f17, %f3, %f79, %f81; mul.f32 %f82, %f2, %f5; fma.rn.f32 %f83, %f1, %f4, %f82; fma.rn.f32 %f84, %f3, %f6, %f83; add.f32 %f85, %f84, 0f3F800000; mul.f32 %f86, %f2, %f8; fma.rn.f32 %f87, %f1, %f7, %f86; fma.rn.f32 %f88, %f3, %f9, %f87; add.f32 %f89, %f85, %f88; mul.f32 %f90, %f5, %f8; fma.rn.f32 %f91, %f4, %f7, %f90; fma.rn.f32 %f92, %f6, %f9, %f91; add.f32 %f18, %f92, %f89; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p20, %f19, 0f00000000; setp.eq.f32 %p21, %f20, 0f00000000; and.pred %p22, %p20, %p21; @%p22 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r95, %f18; shr.s32 %r96, %r95, 31; and.b32 %r97, %r96, 1078530011; mov.b32 %r98, %f17; and.b32 %r99, %r98, -2147483648; or.b32 %r100, %r99, %r97; mov.b32 %f288, %r100; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p23, %f19, 0f7F800000; setp.eq.f32 %p24, %f20, 0f7F800000; and.pred %p25, %p23, %p24; @%p25 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r90, %f18; setp.lt.s32 %p29, %r90, 0; selp.b32 %r91, 1075235812, 1061752795, %p29; mov.b32 %r92, %f17; and.b32 %r93, %r92, -2147483648; or.b32 %r94, %r93, %r91; mov.b32 %f288, %r94; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f93, %f20, %f19; min.f32 %f94, %f20, %f19; div.rn.f32 %f95, %f94, %f93; mul.rn.f32 %f96, %f95, %f95; mov.f32 %f97, 0fC0B59883; mov.f32 %f98, 0fBF52C7EA; fma.rn.f32 %f99, %f96, %f98, %f97; mov.f32 %f100, 0fC0D21907; fma.rn.f32 %f101, %f99, %f96, %f100; mul.f32 %f102, %f96, %f101; mul.f32 %f103, %f95, %f102; add.f32 %f104, %f96, 0f41355DC0; mov.f32 %f105, 0f41E6BD60; fma.rn.f32 %f106, %f104, %f96, %f105; mov.f32 %f107, 0f419D92C8; fma.rn.f32 %f108, %f106, %f96, %f107; rcp.rn.f32 %f109, %f108; fma.rn.f32 %f110, %f103, %f109, 
%f95; mov.f32 %f111, 0f3FC90FDB; sub.f32 %f112, %f111, %f110; setp.gt.f32 %p26, %f20, %f19; selp.f32 %f113, %f112, %f110, %p26; mov.b32 %r85, %f18; setp.lt.s32 %p27, %r85, 0; mov.f32 %f114, 0f40490FDB; sub.f32 %f115, %f114, %f113; selp.f32 %f116, %f115, %f113, %p27; mov.b32 %r86, %f116; mov.b32 %r87, %f17; and.b32 %r88, %r87, -2147483648; or.b32 %r89, %r88, %r86; mov.b32 %f117, %r89; add.f32 %f118, %f19, %f20; setp.le.f32 %p28, %f118, 0f7F800000; selp.f32 %f288, %f117, %f118, %p28; $L__BB0_27: add.f32 %f119, %f288, %f288; setp.eq.f32 %p30, %f16, 0f00000000; selp.f32 %f120, 0f3F800000, 0f3F000000, %p30; fma.rn.f32 %f293, %f120, %f119, 0f00000000; $L__BB0_28: setp.lt.s32 %p31, %r1, 1; and.pred %p33, %p31, %p8; @%p33 bra $L__BB0_55; setp.ge.s32 %p34, %r10, %r50; and.pred %p36, %p34, %p9; @%p36 bra $L__BB0_42; @%p9 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r103, %r50, -1; min.s32 %r175, %r10, %r103; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r101, %r10, %r50; add.s32 %r102, %r101, %r50; rem.s32 %r175, %r102, %r50; $L__BB0_33: @%p8 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r176, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r104, %r14, %r49; add.s32 %r105, %r104, %r49; rem.s32 %r176, %r105, %r49; $L__BB0_36: add.s32 %r106, %r175, %r4; mad.lo.s32 %r107, %r106, %r49, %r176; mul.wide.s32 %rd34, %r107, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f121, [%rd35]; ld.global.nc.f32 %f122, [%rd36]; mul.f32 %f123, %f122, %f122; fma.rn.f32 %f124, %f121, %f121, %f123; ld.global.nc.f32 %f125, [%rd37]; fma.rn.f32 %f27, %f125, %f125, %f124; mul.f32 %f126, %f9, %f11; mul.f32 %f127, %f8, %f12; sub.f32 %f128, %f127, %f126; mul.f32 %f129, %f7, %f12; mul.f32 %f130, %f9, %f10; sub.f32 %f131, %f130, %f129; mul.f32 %f132, %f8, %f10; mul.f32 %f133, %f7, %f11; sub.f32 %f134, %f133, %f132; mul.f32 %f135, %f2, %f131; fma.rn.f32 %f136, %f1, %f128, %f135; fma.rn.f32 %f28, %f3, %f134, %f136; mul.f32 %f137, 
%f2, %f8; fma.rn.f32 %f138, %f1, %f7, %f137; fma.rn.f32 %f139, %f3, %f9, %f138; add.f32 %f140, %f139, 0f3F800000; mul.f32 %f141, %f2, %f11; fma.rn.f32 %f142, %f1, %f10, %f141; fma.rn.f32 %f143, %f3, %f12, %f142; add.f32 %f144, %f140, %f143; mul.f32 %f145, %f8, %f11; fma.rn.f32 %f146, %f7, %f10, %f145; fma.rn.f32 %f147, %f9, %f12, %f146; add.f32 %f29, %f147, %f144; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p39, %f30, 0f00000000; setp.eq.f32 %p40, %f31, 0f00000000; and.pred %p41, %p39, %p40; @%p41 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r118, %f29; shr.s32 %r119, %r118, 31; and.b32 %r120, %r119, 1078530011; mov.b32 %r121, %f28; and.b32 %r122, %r121, -2147483648; or.b32 %r123, %r122, %r120; mov.b32 %f290, %r123; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p42, %f30, 0f7F800000; setp.eq.f32 %p43, %f31, 0f7F800000; and.pred %p44, %p42, %p43; @%p44 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r113, %f29; setp.lt.s32 %p48, %r113, 0; selp.b32 %r114, 1075235812, 1061752795, %p48; mov.b32 %r115, %f28; and.b32 %r116, %r115, -2147483648; or.b32 %r117, %r116, %r114; mov.b32 %f290, %r117; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f148, %f31, %f30; min.f32 %f149, %f31, %f30; div.rn.f32 %f150, %f149, %f148; mul.rn.f32 %f151, %f150, %f150; mov.f32 %f152, 0fC0B59883; mov.f32 %f153, 0fBF52C7EA; fma.rn.f32 %f154, %f151, %f153, %f152; mov.f32 %f155, 0fC0D21907; fma.rn.f32 %f156, %f154, %f151, %f155; mul.f32 %f157, %f151, %f156; mul.f32 %f158, %f150, %f157; add.f32 %f159, %f151, 0f41355DC0; mov.f32 %f160, 0f41E6BD60; fma.rn.f32 %f161, %f159, %f151, %f160; mov.f32 %f162, 0f419D92C8; fma.rn.f32 %f163, %f161, %f151, %f162; rcp.rn.f32 %f164, %f163; fma.rn.f32 %f165, %f158, %f164, %f150; mov.f32 %f166, 0f3FC90FDB; sub.f32 %f167, %f166, %f165; setp.gt.f32 %p45, %f31, %f30; selp.f32 %f168, %f167, %f165, %p45; mov.b32 %r108, %f29; setp.lt.s32 %p46, %r108, 0; mov.f32 %f169, 0f40490FDB; sub.f32 %f170, %f169, %f168; selp.f32 %f171, %f170, %f168, %p46; 
mov.b32 %r109, %f171; mov.b32 %r110, %f28; and.b32 %r111, %r110, -2147483648; or.b32 %r112, %r111, %r109; mov.b32 %f172, %r112; add.f32 %f173, %f30, %f31; setp.le.f32 %p47, %f173, 0f7F800000; selp.f32 %f290, %f172, %f173, %p47; $L__BB0_41: add.f32 %f174, %f290, %f290; setp.eq.f32 %p49, %f27, 0f00000000; selp.f32 %f175, 0f3F800000, 0f3F000000, %p49; fma.rn.f32 %f293, %f175, %f174, %f293; $L__BB0_42: setp.lt.s32 %p50, %r2, 1; and.pred %p52, %p50, %p9; @%p52 bra $L__BB0_55; @%p9 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r177, %r19, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r124, %r19, %r50; add.s32 %r125, %r124, %r50; rem.s32 %r177, %r125, %r50; $L__BB0_46: @%p8 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r178, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r126, %r14, %r49; add.s32 %r127, %r126, %r49; rem.s32 %r178, %r127, %r49; $L__BB0_49: add.s32 %r128, %r177, %r4; mad.lo.s32 %r129, %r128, %r49, %r178; mul.wide.s32 %rd38, %r129, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f176, [%rd39]; ld.global.nc.f32 %f177, [%rd40]; mul.f32 %f178, %f177, %f177; fma.rn.f32 %f179, %f176, %f176, %f178; ld.global.nc.f32 %f180, [%rd41]; fma.rn.f32 %f38, %f180, %f180, %f179; mul.f32 %f181, %f12, %f14; mul.f32 %f182, %f11, %f15; sub.f32 %f183, %f182, %f181; mul.f32 %f184, %f10, %f15; mul.f32 %f185, %f12, %f13; sub.f32 %f186, %f185, %f184; mul.f32 %f187, %f11, %f13; mul.f32 %f188, %f10, %f14; sub.f32 %f189, %f188, %f187; mul.f32 %f190, %f2, %f186; fma.rn.f32 %f191, %f1, %f183, %f190; fma.rn.f32 %f39, %f3, %f189, %f191; mul.f32 %f192, %f2, %f11; fma.rn.f32 %f193, %f1, %f10, %f192; fma.rn.f32 %f194, %f3, %f12, %f193; add.f32 %f195, %f194, 0f3F800000; mul.f32 %f196, %f2, %f14; fma.rn.f32 %f197, %f1, %f13, %f196; fma.rn.f32 %f198, %f3, %f15, %f197; add.f32 %f199, %f195, %f198; mul.f32 %f200, %f11, %f14; fma.rn.f32 %f201, %f10, %f13, %f200; fma.rn.f32 %f202, %f12, %f15, %f201; add.f32 %f40, %f202, 
%f199; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p55, %f41, 0f00000000; setp.eq.f32 %p56, %f42, 0f00000000; and.pred %p57, %p55, %p56; @%p57 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r140, %f40; shr.s32 %r141, %r140, 31; and.b32 %r142, %r141, 1078530011; mov.b32 %r143, %f39; and.b32 %r144, %r143, -2147483648; or.b32 %r145, %r144, %r142; mov.b32 %f292, %r145; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p58, %f41, 0f7F800000; setp.eq.f32 %p59, %f42, 0f7F800000; and.pred %p60, %p58, %p59; @%p60 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r135, %f40; setp.lt.s32 %p64, %r135, 0; selp.b32 %r136, 1075235812, 1061752795, %p64; mov.b32 %r137, %f39; and.b32 %r138, %r137, -2147483648; or.b32 %r139, %r138, %r136; mov.b32 %f292, %r139; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f203, %f42, %f41; min.f32 %f204, %f42, %f41; div.rn.f32 %f205, %f204, %f203; mul.rn.f32 %f206, %f205, %f205; mov.f32 %f207, 0fC0B59883; mov.f32 %f208, 0fBF52C7EA; fma.rn.f32 %f209, %f206, %f208, %f207; mov.f32 %f210, 0fC0D21907; fma.rn.f32 %f211, %f209, %f206, %f210; mul.f32 %f212, %f206, %f211; mul.f32 %f213, %f205, %f212; add.f32 %f214, %f206, 0f41355DC0; mov.f32 %f215, 0f41E6BD60; fma.rn.f32 %f216, %f214, %f206, %f215; mov.f32 %f217, 0f419D92C8; fma.rn.f32 %f218, %f216, %f206, %f217; rcp.rn.f32 %f219, %f218; fma.rn.f32 %f220, %f213, %f219, %f205; mov.f32 %f221, 0f3FC90FDB; sub.f32 %f222, %f221, %f220; setp.gt.f32 %p61, %f42, %f41; selp.f32 %f223, %f222, %f220, %p61; mov.b32 %r130, %f40; setp.lt.s32 %p62, %r130, 0; mov.f32 %f224, 0f40490FDB; sub.f32 %f225, %f224, %f223; selp.f32 %f226, %f225, %f223, %p62; mov.b32 %r131, %f226; mov.b32 %r132, %f39; and.b32 %r133, %r132, -2147483648; or.b32 %r134, %r133, %r131; mov.b32 %f227, %r134; add.f32 %f228, %f41, %f42; setp.le.f32 %p63, %f228, 0f7F800000; selp.f32 %f292, %f227, %f228, %p63; $L__BB0_54: add.f32 %f229, %f292, %f292; setp.eq.f32 %p65, %f38, 0f00000000; selp.f32 %f230, 0f3F800000, 0f3F000000, %p65; fma.rn.f32 
%f293, %f230, %f229, %f293; $L__BB0_55: @%p14 bra $L__BB0_69; setp.lt.s32 %p67, %r2, 1; and.pred %p69, %p67, %p9; @%p69 bra $L__BB0_69; @%p9 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r179, %r19, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r146, %r19, %r50; add.s32 %r147, %r146, %r50; rem.s32 %r179, %r147, %r50; $L__BB0_60: add.s32 %r45, %r179, %r4; @%p8 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r150, %r49, -1; min.s32 %r180, %r6, %r150; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r148, %r6, %r49; add.s32 %r149, %r148, %r49; rem.s32 %r180, %r149, %r49; $L__BB0_63: mad.lo.s32 %r151, %r45, %r49, %r180; mul.wide.s32 %rd42, %r151, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f231, [%rd43]; ld.global.nc.f32 %f232, [%rd44]; mul.f32 %f233, %f232, %f232; fma.rn.f32 %f234, %f231, %f231, %f233; ld.global.nc.f32 %f235, [%rd45]; fma.rn.f32 %f49, %f235, %f235, %f234; mul.f32 %f236, %f5, %f15; mul.f32 %f237, %f6, %f14; sub.f32 %f238, %f237, %f236; mul.f32 %f239, %f6, %f13; mul.f32 %f240, %f4, %f15; sub.f32 %f241, %f240, %f239; mul.f32 %f242, %f4, %f14; mul.f32 %f243, %f5, %f13; sub.f32 %f244, %f243, %f242; mul.f32 %f245, %f2, %f241; fma.rn.f32 %f246, %f1, %f238, %f245; fma.rn.f32 %f50, %f3, %f244, %f246; mul.f32 %f247, %f2, %f14; fma.rn.f32 %f248, %f1, %f13, %f247; fma.rn.f32 %f249, %f3, %f15, %f248; add.f32 %f250, %f249, 0f3F800000; mul.f32 %f251, %f2, %f5; fma.rn.f32 %f252, %f1, %f4, %f251; fma.rn.f32 %f253, %f3, %f6, %f252; add.f32 %f254, %f253, %f250; mul.f32 %f255, %f5, %f14; fma.rn.f32 %f256, %f4, %f13, %f255; fma.rn.f32 %f257, %f6, %f15, %f256; add.f32 %f51, %f257, %f254; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p72, %f52, 0f00000000; setp.eq.f32 %p73, %f53, 0f00000000; and.pred %p74, %p72, %p73; @%p74 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r162, %f51; shr.s32 %r163, %r162, 31; and.b32 %r164, %r163, 1078530011; mov.b32 %r165, %f50; and.b32 %r166, %r165, 
-2147483648; or.b32 %r167, %r164, %r166; mov.b32 %f294, %r167; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p75, %f52, 0f7F800000; setp.eq.f32 %p76, %f53, 0f7F800000; and.pred %p77, %p75, %p76; @%p77 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r157, %f51; setp.lt.s32 %p81, %r157, 0; selp.b32 %r158, 1075235812, 1061752795, %p81; mov.b32 %r159, %f50; and.b32 %r160, %r159, -2147483648; or.b32 %r161, %r158, %r160; mov.b32 %f294, %r161; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f258, %f53, %f52; min.f32 %f259, %f53, %f52; div.rn.f32 %f260, %f259, %f258; mul.rn.f32 %f261, %f260, %f260; mov.f32 %f262, 0fC0B59883; mov.f32 %f263, 0fBF52C7EA; fma.rn.f32 %f264, %f261, %f263, %f262; mov.f32 %f265, 0fC0D21907; fma.rn.f32 %f266, %f264, %f261, %f265; mul.f32 %f267, %f261, %f266; mul.f32 %f268, %f260, %f267; add.f32 %f269, %f261, 0f41355DC0; mov.f32 %f270, 0f41E6BD60; fma.rn.f32 %f271, %f269, %f261, %f270; mov.f32 %f272, 0f419D92C8; fma.rn.f32 %f273, %f271, %f261, %f272; rcp.rn.f32 %f274, %f273; fma.rn.f32 %f275, %f268, %f274, %f260; mov.f32 %f276, 0f3FC90FDB; sub.f32 %f277, %f276, %f275; setp.gt.f32 %p78, %f53, %f52; selp.f32 %f278, %f277, %f275, %p78; mov.b32 %r152, %f51; setp.lt.s32 %p79, %r152, 0; mov.f32 %f279, 0f40490FDB; sub.f32 %f280, %f279, %f278; selp.f32 %f281, %f280, %f278, %p79; mov.b32 %r153, %f281; mov.b32 %r154, %f50; and.b32 %r155, %r154, -2147483648; or.b32 %r156, %r155, %r153; mov.b32 %f282, %r156; add.f32 %f283, %f52, %f53; setp.le.f32 %p80, %f283, 0f7F800000; selp.f32 %f294, %f282, %f283, %p80; $L__BB0_68: add.f32 %f284, %f294, %f294; setp.eq.f32 %p82, %f49, 0f00000000; selp.f32 %f285, 0f3F800000, 0f3F000000, %p82; fma.rn.f32 %f293, %f285, %f284, %f293; $L__BB0_69: mul.f32 %f286, %f293, %f60; st.global.f32 [%rd4], %f286; $L__BB0_71: ret; } ` settopologicalchargelattice_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl settopologicalchargelattice .visible .entry settopologicalchargelattice( .param .u64 
settopologicalchargelattice_param_0, .param .u64 settopologicalchargelattice_param_1, .param .u64 settopologicalchargelattice_param_2, .param .u64 settopologicalchargelattice_param_3, .param .f32 settopologicalchargelattice_param_4, .param .u32 settopologicalchargelattice_param_5, .param .u32 settopologicalchargelattice_param_6, .param .u32 settopologicalchargelattice_param_7, .param .u8 settopologicalchargelattice_param_8 ) { .reg .pred %p<83>; .reg .b16 %rs<4>; .reg .f32 %f<296>; .reg .b32 %r<181>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalchargelattice_param_8]; ld.param.u64 %rd5, [settopologicalchargelattice_param_0]; ld.param.u64 %rd6, [settopologicalchargelattice_param_1]; ld.param.u64 %rd7, [settopologicalchargelattice_param_2]; ld.param.u64 %rd8, [settopologicalchargelattice_param_3]; ld.param.f32 %f60, [settopologicalchargelattice_param_4]; ld.param.u32 %r49, [settopologicalchargelattice_param_5]; ld.param.u32 %r50, [settopologicalchargelattice_param_6]; ld.param.u32 %r51, [settopologicalchargelattice_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r52, %ntid.x; mov.u32 %r53, %ctaid.x; mov.u32 %r54, %tid.x; mad.lo.s32 %r1, %r53, %r52, %r54; mov.u32 %r55, %ntid.y; mov.u32 %r56, %ctaid.y; mov.u32 %r57, %tid.y; mad.lo.s32 %r2, %r56, %r55, %r57; mov.u32 %r58, %ntid.z; mov.u32 %r59, %ctaid.z; mov.u32 %r60, %tid.z; mad.lo.s32 %r3, %r59, %r58, %r60; setp.ge.s32 %p2, %r1, %r49; setp.ge.s32 %p3, %r2, %r50; or.pred %p4, %p2, %p3; setp.ge.s32 %p5, %r3, %r51; or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_71; mul.lo.s32 %r4, %r3, %r50; add.s32 %r61, %r4, %r2; mul.lo.s32 %r5, %r61, %r49; add.s32 %r62, %r5, %r1; mul.wide.s32 %rd9, %r62, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f61, %f2, %f2; fma.rn.f32 %f62, %f1, %f1, %f61; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f63, %f3, %f3, %f62; 
setp.eq.f32 %p7, %f63, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p7 bra $L__BB0_70; bra.uni $L__BB0_2; $L__BB0_70: mov.u32 %r168, 0; st.global.u32 [%rd4], %r168; bra.uni $L__BB0_71; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p8, %rs1, 0; add.s32 %r6, %r1, 1; @%p8 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r65, %r49, -1; min.s32 %r169, %r6, %r65; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r63, %r6, %r49; add.s32 %r64, %r63, %r49; rem.s32 %r169, %r64, %r49; $L__BB0_5: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p9, %rs2, 0; add.s32 %r10, %r2, 1; @%p9 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r68, %r50, -1; min.s32 %r170, %r10, %r68; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r66, %r10, %r50; add.s32 %r67, %r66, %r50; rem.s32 %r170, %r67, %r50; $L__BB0_8: add.s32 %r14, %r1, -1; @%p8 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r171, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r69, %r14, %r49; add.s32 %r70, %r69, %r49; rem.s32 %r171, %r70, %r49; $L__BB0_11: add.s32 %r18, %r171, %r5; add.s32 %r19, %r2, -1; add.s32 %r20, %r169, %r5; @%p9 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r172, %r19, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r71, %r19, %r50; add.s32 %r72, %r71, %r50; rem.s32 %r172, %r72, %r50; $L__BB0_14: add.s32 %r73, %r172, %r4; mad.lo.s32 %r74, %r73, %r49, %r1; mul.wide.s32 %rd14, %r20, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f4, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f5, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f6, [%rd17]; add.s32 %r75, %r170, %r4; mad.lo.s32 %r76, %r75, %r49, %r1; mul.wide.s32 %rd18, %r76, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f7, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f8, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f9, [%rd21]; mul.wide.s32 %rd22, %r18, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f10, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f11, [%rd24]; add.s64 %rd25, 
%rd1, %rd22; ld.global.nc.f32 %f12, [%rd25]; mul.wide.s32 %rd26, %r74, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f13, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f14, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f15, [%rd29]; setp.ne.s16 %p12, %rs1, 0; setp.lt.s32 %p13, %r6, %r49; or.pred %p1, %p13, %p12; not.pred %p14, %p1; mov.f32 %f293, 0f00000000; @%p14 bra $L__BB0_28; setp.ge.s32 %p15, %r10, %r50; and.pred %p17, %p15, %p9; @%p17 bra $L__BB0_28; @%p9 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r79, %r50, -1; min.s32 %r173, %r10, %r79; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r77, %r10, %r50; add.s32 %r78, %r77, %r50; rem.s32 %r173, %r78, %r50; $L__BB0_19: @%p8 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r82, %r49, -1; min.s32 %r174, %r6, %r82; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r80, %r6, %r49; add.s32 %r81, %r80, %r49; rem.s32 %r174, %r81, %r49; $L__BB0_22: add.s32 %r83, %r173, %r4; mad.lo.s32 %r84, %r83, %r49, %r174; mul.wide.s32 %rd30, %r84, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f66, [%rd31]; ld.global.nc.f32 %f67, [%rd32]; mul.f32 %f68, %f67, %f67; fma.rn.f32 %f69, %f66, %f66, %f68; ld.global.nc.f32 %f70, [%rd33]; fma.rn.f32 %f16, %f70, %f70, %f69; mul.f32 %f71, %f6, %f8; mul.f32 %f72, %f5, %f9; sub.f32 %f73, %f72, %f71; mul.f32 %f74, %f4, %f9; mul.f32 %f75, %f6, %f7; sub.f32 %f76, %f75, %f74; mul.f32 %f77, %f5, %f7; mul.f32 %f78, %f4, %f8; sub.f32 %f79, %f78, %f77; mul.f32 %f80, %f2, %f76; fma.rn.f32 %f81, %f1, %f73, %f80; fma.rn.f32 %f17, %f3, %f79, %f81; mul.f32 %f82, %f2, %f5; fma.rn.f32 %f83, %f1, %f4, %f82; fma.rn.f32 %f84, %f3, %f6, %f83; add.f32 %f85, %f84, 0f3F800000; mul.f32 %f86, %f2, %f8; fma.rn.f32 %f87, %f1, %f7, %f86; fma.rn.f32 %f88, %f3, %f9, %f87; add.f32 %f89, %f85, %f88; mul.f32 %f90, %f5, %f8; fma.rn.f32 %f91, %f4, %f7, %f90; fma.rn.f32 %f92, %f6, %f9, %f91; add.f32 %f18, %f92, %f89; abs.f32 %f19, %f18; abs.f32 
%f20, %f17; setp.eq.f32 %p20, %f19, 0f00000000; setp.eq.f32 %p21, %f20, 0f00000000; and.pred %p22, %p20, %p21; @%p22 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r95, %f18; shr.s32 %r96, %r95, 31; and.b32 %r97, %r96, 1078530011; mov.b32 %r98, %f17; and.b32 %r99, %r98, -2147483648; or.b32 %r100, %r99, %r97; mov.b32 %f288, %r100; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p23, %f19, 0f7F800000; setp.eq.f32 %p24, %f20, 0f7F800000; and.pred %p25, %p23, %p24; @%p25 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r90, %f18; setp.lt.s32 %p29, %r90, 0; selp.b32 %r91, 1075235812, 1061752795, %p29; mov.b32 %r92, %f17; and.b32 %r93, %r92, -2147483648; or.b32 %r94, %r93, %r91; mov.b32 %f288, %r94; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f93, %f20, %f19; min.f32 %f94, %f20, %f19; div.rn.f32 %f95, %f94, %f93; mul.rn.f32 %f96, %f95, %f95; mov.f32 %f97, 0fC0B59883; mov.f32 %f98, 0fBF52C7EA; fma.rn.f32 %f99, %f96, %f98, %f97; mov.f32 %f100, 0fC0D21907; fma.rn.f32 %f101, %f99, %f96, %f100; mul.f32 %f102, %f96, %f101; mul.f32 %f103, %f95, %f102; add.f32 %f104, %f96, 0f41355DC0; mov.f32 %f105, 0f41E6BD60; fma.rn.f32 %f106, %f104, %f96, %f105; mov.f32 %f107, 0f419D92C8; fma.rn.f32 %f108, %f106, %f96, %f107; rcp.rn.f32 %f109, %f108; fma.rn.f32 %f110, %f103, %f109, %f95; mov.f32 %f111, 0f3FC90FDB; sub.f32 %f112, %f111, %f110; setp.gt.f32 %p26, %f20, %f19; selp.f32 %f113, %f112, %f110, %p26; mov.b32 %r85, %f18; setp.lt.s32 %p27, %r85, 0; mov.f32 %f114, 0f40490FDB; sub.f32 %f115, %f114, %f113; selp.f32 %f116, %f115, %f113, %p27; mov.b32 %r86, %f116; mov.b32 %r87, %f17; and.b32 %r88, %r87, -2147483648; or.b32 %r89, %r88, %r86; mov.b32 %f117, %r89; add.f32 %f118, %f19, %f20; setp.le.f32 %p28, %f118, 0f7F800000; selp.f32 %f288, %f117, %f118, %p28; $L__BB0_27: add.f32 %f119, %f288, %f288; setp.eq.f32 %p30, %f16, 0f00000000; selp.f32 %f120, 0f3F800000, 0f3F000000, %p30; fma.rn.f32 %f293, %f120, %f119, 0f00000000; $L__BB0_28: setp.lt.s32 %p31, %r1, 1; and.pred %p33, %p31, 
%p8; @%p33 bra $L__BB0_55; setp.ge.s32 %p34, %r10, %r50; and.pred %p36, %p34, %p9; @%p36 bra $L__BB0_42; @%p9 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r103, %r50, -1; min.s32 %r175, %r10, %r103; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r101, %r10, %r50; add.s32 %r102, %r101, %r50; rem.s32 %r175, %r102, %r50; $L__BB0_33: @%p8 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r176, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r104, %r14, %r49; add.s32 %r105, %r104, %r49; rem.s32 %r176, %r105, %r49; $L__BB0_36: add.s32 %r106, %r175, %r4; mad.lo.s32 %r107, %r106, %r49, %r176; mul.wide.s32 %rd34, %r107, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f121, [%rd35]; ld.global.nc.f32 %f122, [%rd36]; mul.f32 %f123, %f122, %f122; fma.rn.f32 %f124, %f121, %f121, %f123; ld.global.nc.f32 %f125, [%rd37]; fma.rn.f32 %f27, %f125, %f125, %f124; mul.f32 %f126, %f9, %f11; mul.f32 %f127, %f8, %f12; sub.f32 %f128, %f127, %f126; mul.f32 %f129, %f7, %f12; mul.f32 %f130, %f9, %f10; sub.f32 %f131, %f130, %f129; mul.f32 %f132, %f8, %f10; mul.f32 %f133, %f7, %f11; sub.f32 %f134, %f133, %f132; mul.f32 %f135, %f2, %f131; fma.rn.f32 %f136, %f1, %f128, %f135; fma.rn.f32 %f28, %f3, %f134, %f136; mul.f32 %f137, %f2, %f8; fma.rn.f32 %f138, %f1, %f7, %f137; fma.rn.f32 %f139, %f3, %f9, %f138; add.f32 %f140, %f139, 0f3F800000; mul.f32 %f141, %f2, %f11; fma.rn.f32 %f142, %f1, %f10, %f141; fma.rn.f32 %f143, %f3, %f12, %f142; add.f32 %f144, %f140, %f143; mul.f32 %f145, %f8, %f11; fma.rn.f32 %f146, %f7, %f10, %f145; fma.rn.f32 %f147, %f9, %f12, %f146; add.f32 %f29, %f147, %f144; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p39, %f30, 0f00000000; setp.eq.f32 %p40, %f31, 0f00000000; and.pred %p41, %p39, %p40; @%p41 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r118, %f29; shr.s32 %r119, %r118, 31; and.b32 %r120, %r119, 1078530011; mov.b32 %r121, %f28; and.b32 %r122, %r121, -2147483648; or.b32 %r123, 
%r122, %r120; mov.b32 %f290, %r123; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p42, %f30, 0f7F800000; setp.eq.f32 %p43, %f31, 0f7F800000; and.pred %p44, %p42, %p43; @%p44 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r113, %f29; setp.lt.s32 %p48, %r113, 0; selp.b32 %r114, 1075235812, 1061752795, %p48; mov.b32 %r115, %f28; and.b32 %r116, %r115, -2147483648; or.b32 %r117, %r116, %r114; mov.b32 %f290, %r117; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f148, %f31, %f30; min.f32 %f149, %f31, %f30; div.rn.f32 %f150, %f149, %f148; mul.rn.f32 %f151, %f150, %f150; mov.f32 %f152, 0fC0B59883; mov.f32 %f153, 0fBF52C7EA; fma.rn.f32 %f154, %f151, %f153, %f152; mov.f32 %f155, 0fC0D21907; fma.rn.f32 %f156, %f154, %f151, %f155; mul.f32 %f157, %f151, %f156; mul.f32 %f158, %f150, %f157; add.f32 %f159, %f151, 0f41355DC0; mov.f32 %f160, 0f41E6BD60; fma.rn.f32 %f161, %f159, %f151, %f160; mov.f32 %f162, 0f419D92C8; fma.rn.f32 %f163, %f161, %f151, %f162; rcp.rn.f32 %f164, %f163; fma.rn.f32 %f165, %f158, %f164, %f150; mov.f32 %f166, 0f3FC90FDB; sub.f32 %f167, %f166, %f165; setp.gt.f32 %p45, %f31, %f30; selp.f32 %f168, %f167, %f165, %p45; mov.b32 %r108, %f29; setp.lt.s32 %p46, %r108, 0; mov.f32 %f169, 0f40490FDB; sub.f32 %f170, %f169, %f168; selp.f32 %f171, %f170, %f168, %p46; mov.b32 %r109, %f171; mov.b32 %r110, %f28; and.b32 %r111, %r110, -2147483648; or.b32 %r112, %r111, %r109; mov.b32 %f172, %r112; add.f32 %f173, %f30, %f31; setp.le.f32 %p47, %f173, 0f7F800000; selp.f32 %f290, %f172, %f173, %p47; $L__BB0_41: add.f32 %f174, %f290, %f290; setp.eq.f32 %p49, %f27, 0f00000000; selp.f32 %f175, 0f3F800000, 0f3F000000, %p49; fma.rn.f32 %f293, %f175, %f174, %f293; $L__BB0_42: setp.lt.s32 %p50, %r2, 1; and.pred %p52, %p50, %p9; @%p52 bra $L__BB0_55; @%p9 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r177, %r19, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r124, %r19, %r50; add.s32 %r125, %r124, %r50; rem.s32 %r177, %r125, %r50; $L__BB0_46: @%p8 bra $L__BB0_48; bra.uni 
$L__BB0_47; $L__BB0_48: max.s32 %r178, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r126, %r14, %r49; add.s32 %r127, %r126, %r49; rem.s32 %r178, %r127, %r49; $L__BB0_49: add.s32 %r128, %r177, %r4; mad.lo.s32 %r129, %r128, %r49, %r178; mul.wide.s32 %rd38, %r129, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f176, [%rd39]; ld.global.nc.f32 %f177, [%rd40]; mul.f32 %f178, %f177, %f177; fma.rn.f32 %f179, %f176, %f176, %f178; ld.global.nc.f32 %f180, [%rd41]; fma.rn.f32 %f38, %f180, %f180, %f179; mul.f32 %f181, %f12, %f14; mul.f32 %f182, %f11, %f15; sub.f32 %f183, %f182, %f181; mul.f32 %f184, %f10, %f15; mul.f32 %f185, %f12, %f13; sub.f32 %f186, %f185, %f184; mul.f32 %f187, %f11, %f13; mul.f32 %f188, %f10, %f14; sub.f32 %f189, %f188, %f187; mul.f32 %f190, %f2, %f186; fma.rn.f32 %f191, %f1, %f183, %f190; fma.rn.f32 %f39, %f3, %f189, %f191; mul.f32 %f192, %f2, %f11; fma.rn.f32 %f193, %f1, %f10, %f192; fma.rn.f32 %f194, %f3, %f12, %f193; add.f32 %f195, %f194, 0f3F800000; mul.f32 %f196, %f2, %f14; fma.rn.f32 %f197, %f1, %f13, %f196; fma.rn.f32 %f198, %f3, %f15, %f197; add.f32 %f199, %f195, %f198; mul.f32 %f200, %f11, %f14; fma.rn.f32 %f201, %f10, %f13, %f200; fma.rn.f32 %f202, %f12, %f15, %f201; add.f32 %f40, %f202, %f199; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p55, %f41, 0f00000000; setp.eq.f32 %p56, %f42, 0f00000000; and.pred %p57, %p55, %p56; @%p57 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r140, %f40; shr.s32 %r141, %r140, 31; and.b32 %r142, %r141, 1078530011; mov.b32 %r143, %f39; and.b32 %r144, %r143, -2147483648; or.b32 %r145, %r144, %r142; mov.b32 %f292, %r145; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p58, %f41, 0f7F800000; setp.eq.f32 %p59, %f42, 0f7F800000; and.pred %p60, %p58, %p59; @%p60 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r135, %f40; setp.lt.s32 %p64, %r135, 0; selp.b32 %r136, 1075235812, 1061752795, %p64; mov.b32 %r137, %f39; and.b32 %r138, %r137, 
-2147483648; or.b32 %r139, %r138, %r136; mov.b32 %f292, %r139; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f203, %f42, %f41; min.f32 %f204, %f42, %f41; div.rn.f32 %f205, %f204, %f203; mul.rn.f32 %f206, %f205, %f205; mov.f32 %f207, 0fC0B59883; mov.f32 %f208, 0fBF52C7EA; fma.rn.f32 %f209, %f206, %f208, %f207; mov.f32 %f210, 0fC0D21907; fma.rn.f32 %f211, %f209, %f206, %f210; mul.f32 %f212, %f206, %f211; mul.f32 %f213, %f205, %f212; add.f32 %f214, %f206, 0f41355DC0; mov.f32 %f215, 0f41E6BD60; fma.rn.f32 %f216, %f214, %f206, %f215; mov.f32 %f217, 0f419D92C8; fma.rn.f32 %f218, %f216, %f206, %f217; rcp.rn.f32 %f219, %f218; fma.rn.f32 %f220, %f213, %f219, %f205; mov.f32 %f221, 0f3FC90FDB; sub.f32 %f222, %f221, %f220; setp.gt.f32 %p61, %f42, %f41; selp.f32 %f223, %f222, %f220, %p61; mov.b32 %r130, %f40; setp.lt.s32 %p62, %r130, 0; mov.f32 %f224, 0f40490FDB; sub.f32 %f225, %f224, %f223; selp.f32 %f226, %f225, %f223, %p62; mov.b32 %r131, %f226; mov.b32 %r132, %f39; and.b32 %r133, %r132, -2147483648; or.b32 %r134, %r133, %r131; mov.b32 %f227, %r134; add.f32 %f228, %f41, %f42; setp.le.f32 %p63, %f228, 0f7F800000; selp.f32 %f292, %f227, %f228, %p63; $L__BB0_54: add.f32 %f229, %f292, %f292; setp.eq.f32 %p65, %f38, 0f00000000; selp.f32 %f230, 0f3F800000, 0f3F000000, %p65; fma.rn.f32 %f293, %f230, %f229, %f293; $L__BB0_55: @%p14 bra $L__BB0_69; setp.lt.s32 %p67, %r2, 1; and.pred %p69, %p67, %p9; @%p69 bra $L__BB0_69; @%p9 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r179, %r19, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r146, %r19, %r50; add.s32 %r147, %r146, %r50; rem.s32 %r179, %r147, %r50; $L__BB0_60: add.s32 %r45, %r179, %r4; @%p8 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r150, %r49, -1; min.s32 %r180, %r6, %r150; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r148, %r6, %r49; add.s32 %r149, %r148, %r49; rem.s32 %r180, %r149, %r49; $L__BB0_63: mad.lo.s32 %r151, %r45, %r49, %r180; mul.wide.s32 %rd42, %r151, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, 
%rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f231, [%rd43]; ld.global.nc.f32 %f232, [%rd44]; mul.f32 %f233, %f232, %f232; fma.rn.f32 %f234, %f231, %f231, %f233; ld.global.nc.f32 %f235, [%rd45]; fma.rn.f32 %f49, %f235, %f235, %f234; mul.f32 %f236, %f5, %f15; mul.f32 %f237, %f6, %f14; sub.f32 %f238, %f237, %f236; mul.f32 %f239, %f6, %f13; mul.f32 %f240, %f4, %f15; sub.f32 %f241, %f240, %f239; mul.f32 %f242, %f4, %f14; mul.f32 %f243, %f5, %f13; sub.f32 %f244, %f243, %f242; mul.f32 %f245, %f2, %f241; fma.rn.f32 %f246, %f1, %f238, %f245; fma.rn.f32 %f50, %f3, %f244, %f246; mul.f32 %f247, %f2, %f14; fma.rn.f32 %f248, %f1, %f13, %f247; fma.rn.f32 %f249, %f3, %f15, %f248; add.f32 %f250, %f249, 0f3F800000; mul.f32 %f251, %f2, %f5; fma.rn.f32 %f252, %f1, %f4, %f251; fma.rn.f32 %f253, %f3, %f6, %f252; add.f32 %f254, %f253, %f250; mul.f32 %f255, %f5, %f14; fma.rn.f32 %f256, %f4, %f13, %f255; fma.rn.f32 %f257, %f6, %f15, %f256; add.f32 %f51, %f257, %f254; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p72, %f52, 0f00000000; setp.eq.f32 %p73, %f53, 0f00000000; and.pred %p74, %p72, %p73; @%p74 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r162, %f51; shr.s32 %r163, %r162, 31; and.b32 %r164, %r163, 1078530011; mov.b32 %r165, %f50; and.b32 %r166, %r165, -2147483648; or.b32 %r167, %r164, %r166; mov.b32 %f294, %r167; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p75, %f52, 0f7F800000; setp.eq.f32 %p76, %f53, 0f7F800000; and.pred %p77, %p75, %p76; @%p77 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r157, %f51; setp.lt.s32 %p81, %r157, 0; selp.b32 %r158, 1075235812, 1061752795, %p81; mov.b32 %r159, %f50; and.b32 %r160, %r159, -2147483648; or.b32 %r161, %r158, %r160; mov.b32 %f294, %r161; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f258, %f53, %f52; min.f32 %f259, %f53, %f52; div.rn.f32 %f260, %f259, %f258; mul.rn.f32 %f261, %f260, %f260; mov.f32 %f262, 0fC0B59883; mov.f32 %f263, 0fBF52C7EA; fma.rn.f32 %f264, %f261, %f263, %f262; mov.f32 %f265, 
0fC0D21907; fma.rn.f32 %f266, %f264, %f261, %f265; mul.f32 %f267, %f261, %f266; mul.f32 %f268, %f260, %f267; add.f32 %f269, %f261, 0f41355DC0; mov.f32 %f270, 0f41E6BD60; fma.rn.f32 %f271, %f269, %f261, %f270; mov.f32 %f272, 0f419D92C8; fma.rn.f32 %f273, %f271, %f261, %f272; rcp.rn.f32 %f274, %f273; fma.rn.f32 %f275, %f268, %f274, %f260; mov.f32 %f276, 0f3FC90FDB; sub.f32 %f277, %f276, %f275; setp.gt.f32 %p78, %f53, %f52; selp.f32 %f278, %f277, %f275, %p78; mov.b32 %r152, %f51; setp.lt.s32 %p79, %r152, 0; mov.f32 %f279, 0f40490FDB; sub.f32 %f280, %f279, %f278; selp.f32 %f281, %f280, %f278, %p79; mov.b32 %r153, %f281; mov.b32 %r154, %f50; and.b32 %r155, %r154, -2147483648; or.b32 %r156, %r155, %r153; mov.b32 %f282, %r156; add.f32 %f283, %f52, %f53; setp.le.f32 %p80, %f283, 0f7F800000; selp.f32 %f294, %f282, %f283, %p80; $L__BB0_68: add.f32 %f284, %f294, %f294; setp.eq.f32 %p82, %f49, 0f00000000; selp.f32 %f285, 0f3F800000, 0f3F000000, %p82; fma.rn.f32 %f293, %f285, %f284, %f293; $L__BB0_69: mul.f32 %f286, %f293, %f60; st.global.f32 [%rd4], %f286; $L__BB0_71: ret; } ` settopologicalchargelattice_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl settopologicalchargelattice .visible .entry settopologicalchargelattice( .param .u64 settopologicalchargelattice_param_0, .param .u64 settopologicalchargelattice_param_1, .param .u64 settopologicalchargelattice_param_2, .param .u64 settopologicalchargelattice_param_3, .param .f32 settopologicalchargelattice_param_4, .param .u32 settopologicalchargelattice_param_5, .param .u32 settopologicalchargelattice_param_6, .param .u32 settopologicalchargelattice_param_7, .param .u8 settopologicalchargelattice_param_8 ) { .reg .pred %p<83>; .reg .b16 %rs<4>; .reg .f32 %f<296>; .reg .b32 %r<181>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalchargelattice_param_8]; ld.param.u64 %rd5, [settopologicalchargelattice_param_0]; ld.param.u64 %rd6, [settopologicalchargelattice_param_1]; ld.param.u64 %rd7, 
[settopologicalchargelattice_param_2]; ld.param.u64 %rd8, [settopologicalchargelattice_param_3]; ld.param.f32 %f60, [settopologicalchargelattice_param_4]; ld.param.u32 %r49, [settopologicalchargelattice_param_5]; ld.param.u32 %r50, [settopologicalchargelattice_param_6]; ld.param.u32 %r51, [settopologicalchargelattice_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r52, %ntid.x; mov.u32 %r53, %ctaid.x; mov.u32 %r54, %tid.x; mad.lo.s32 %r1, %r53, %r52, %r54; mov.u32 %r55, %ntid.y; mov.u32 %r56, %ctaid.y; mov.u32 %r57, %tid.y; mad.lo.s32 %r2, %r56, %r55, %r57; mov.u32 %r58, %ntid.z; mov.u32 %r59, %ctaid.z; mov.u32 %r60, %tid.z; mad.lo.s32 %r3, %r59, %r58, %r60; setp.ge.s32 %p2, %r1, %r49; setp.ge.s32 %p3, %r2, %r50; or.pred %p4, %p2, %p3; setp.ge.s32 %p5, %r3, %r51; or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_71; mul.lo.s32 %r4, %r3, %r50; add.s32 %r61, %r4, %r2; mul.lo.s32 %r5, %r61, %r49; add.s32 %r62, %r5, %r1; mul.wide.s32 %rd9, %r62, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f61, %f2, %f2; fma.rn.f32 %f62, %f1, %f1, %f61; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f63, %f3, %f3, %f62; setp.eq.f32 %p7, %f63, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p7 bra $L__BB0_70; bra.uni $L__BB0_2; $L__BB0_70: mov.u32 %r168, 0; st.global.u32 [%rd4], %r168; bra.uni $L__BB0_71; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p8, %rs1, 0; add.s32 %r6, %r1, 1; @%p8 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r65, %r49, -1; min.s32 %r169, %r6, %r65; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r63, %r6, %r49; add.s32 %r64, %r63, %r49; rem.s32 %r169, %r64, %r49; $L__BB0_5: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p9, %rs2, 0; add.s32 %r10, %r2, 1; @%p9 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r68, %r50, -1; min.s32 %r170, %r10, %r68; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 
%r66, %r10, %r50; add.s32 %r67, %r66, %r50; rem.s32 %r170, %r67, %r50; $L__BB0_8: add.s32 %r14, %r1, -1; @%p8 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r171, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r69, %r14, %r49; add.s32 %r70, %r69, %r49; rem.s32 %r171, %r70, %r49; $L__BB0_11: add.s32 %r18, %r171, %r5; add.s32 %r19, %r2, -1; add.s32 %r20, %r169, %r5; @%p9 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r172, %r19, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r71, %r19, %r50; add.s32 %r72, %r71, %r50; rem.s32 %r172, %r72, %r50; $L__BB0_14: add.s32 %r73, %r172, %r4; mad.lo.s32 %r74, %r73, %r49, %r1; mul.wide.s32 %rd14, %r20, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f4, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f5, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f6, [%rd17]; add.s32 %r75, %r170, %r4; mad.lo.s32 %r76, %r75, %r49, %r1; mul.wide.s32 %rd18, %r76, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f7, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f8, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f9, [%rd21]; mul.wide.s32 %rd22, %r18, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f10, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f11, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f12, [%rd25]; mul.wide.s32 %rd26, %r74, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f13, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f14, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f15, [%rd29]; setp.ne.s16 %p12, %rs1, 0; setp.lt.s32 %p13, %r6, %r49; or.pred %p1, %p13, %p12; not.pred %p14, %p1; mov.f32 %f293, 0f00000000; @%p14 bra $L__BB0_28; setp.ge.s32 %p15, %r10, %r50; and.pred %p17, %p15, %p9; @%p17 bra $L__BB0_28; @%p9 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r79, %r50, -1; min.s32 %r173, %r10, %r79; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r77, %r10, %r50; add.s32 %r78, %r77, %r50; rem.s32 %r173, %r78, %r50; $L__BB0_19: @%p8 bra 
$L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r82, %r49, -1; min.s32 %r174, %r6, %r82; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r80, %r6, %r49; add.s32 %r81, %r80, %r49; rem.s32 %r174, %r81, %r49; $L__BB0_22: add.s32 %r83, %r173, %r4; mad.lo.s32 %r84, %r83, %r49, %r174; mul.wide.s32 %rd30, %r84, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f66, [%rd31]; ld.global.nc.f32 %f67, [%rd32]; mul.f32 %f68, %f67, %f67; fma.rn.f32 %f69, %f66, %f66, %f68; ld.global.nc.f32 %f70, [%rd33]; fma.rn.f32 %f16, %f70, %f70, %f69; mul.f32 %f71, %f6, %f8; mul.f32 %f72, %f5, %f9; sub.f32 %f73, %f72, %f71; mul.f32 %f74, %f4, %f9; mul.f32 %f75, %f6, %f7; sub.f32 %f76, %f75, %f74; mul.f32 %f77, %f5, %f7; mul.f32 %f78, %f4, %f8; sub.f32 %f79, %f78, %f77; mul.f32 %f80, %f2, %f76; fma.rn.f32 %f81, %f1, %f73, %f80; fma.rn.f32 %f17, %f3, %f79, %f81; mul.f32 %f82, %f2, %f5; fma.rn.f32 %f83, %f1, %f4, %f82; fma.rn.f32 %f84, %f3, %f6, %f83; add.f32 %f85, %f84, 0f3F800000; mul.f32 %f86, %f2, %f8; fma.rn.f32 %f87, %f1, %f7, %f86; fma.rn.f32 %f88, %f3, %f9, %f87; add.f32 %f89, %f85, %f88; mul.f32 %f90, %f5, %f8; fma.rn.f32 %f91, %f4, %f7, %f90; fma.rn.f32 %f92, %f6, %f9, %f91; add.f32 %f18, %f92, %f89; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p20, %f19, 0f00000000; setp.eq.f32 %p21, %f20, 0f00000000; and.pred %p22, %p20, %p21; @%p22 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r95, %f18; shr.s32 %r96, %r95, 31; and.b32 %r97, %r96, 1078530011; mov.b32 %r98, %f17; and.b32 %r99, %r98, -2147483648; or.b32 %r100, %r99, %r97; mov.b32 %f288, %r100; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p23, %f19, 0f7F800000; setp.eq.f32 %p24, %f20, 0f7F800000; and.pred %p25, %p23, %p24; @%p25 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r90, %f18; setp.lt.s32 %p29, %r90, 0; selp.b32 %r91, 1075235812, 1061752795, %p29; mov.b32 %r92, %f17; and.b32 %r93, %r92, -2147483648; or.b32 %r94, %r93, %r91; mov.b32 %f288, %r94; 
bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f93, %f20, %f19; min.f32 %f94, %f20, %f19; div.rn.f32 %f95, %f94, %f93; mul.rn.f32 %f96, %f95, %f95; mov.f32 %f97, 0fC0B59883; mov.f32 %f98, 0fBF52C7EA; fma.rn.f32 %f99, %f96, %f98, %f97; mov.f32 %f100, 0fC0D21907; fma.rn.f32 %f101, %f99, %f96, %f100; mul.f32 %f102, %f96, %f101; mul.f32 %f103, %f95, %f102; add.f32 %f104, %f96, 0f41355DC0; mov.f32 %f105, 0f41E6BD60; fma.rn.f32 %f106, %f104, %f96, %f105; mov.f32 %f107, 0f419D92C8; fma.rn.f32 %f108, %f106, %f96, %f107; rcp.rn.f32 %f109, %f108; fma.rn.f32 %f110, %f103, %f109, %f95; mov.f32 %f111, 0f3FC90FDB; sub.f32 %f112, %f111, %f110; setp.gt.f32 %p26, %f20, %f19; selp.f32 %f113, %f112, %f110, %p26; mov.b32 %r85, %f18; setp.lt.s32 %p27, %r85, 0; mov.f32 %f114, 0f40490FDB; sub.f32 %f115, %f114, %f113; selp.f32 %f116, %f115, %f113, %p27; mov.b32 %r86, %f116; mov.b32 %r87, %f17; and.b32 %r88, %r87, -2147483648; or.b32 %r89, %r88, %r86; mov.b32 %f117, %r89; add.f32 %f118, %f19, %f20; setp.le.f32 %p28, %f118, 0f7F800000; selp.f32 %f288, %f117, %f118, %p28; $L__BB0_27: add.f32 %f119, %f288, %f288; setp.eq.f32 %p30, %f16, 0f00000000; selp.f32 %f120, 0f3F800000, 0f3F000000, %p30; fma.rn.f32 %f293, %f120, %f119, 0f00000000; $L__BB0_28: setp.lt.s32 %p31, %r1, 1; and.pred %p33, %p31, %p8; @%p33 bra $L__BB0_55; setp.ge.s32 %p34, %r10, %r50; and.pred %p36, %p34, %p9; @%p36 bra $L__BB0_42; @%p9 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r103, %r50, -1; min.s32 %r175, %r10, %r103; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r101, %r10, %r50; add.s32 %r102, %r101, %r50; rem.s32 %r175, %r102, %r50; $L__BB0_33: @%p8 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r176, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r104, %r14, %r49; add.s32 %r105, %r104, %r49; rem.s32 %r176, %r105, %r49; $L__BB0_36: add.s32 %r106, %r175, %r4; mad.lo.s32 %r107, %r106, %r49, %r176; mul.wide.s32 %rd34, %r107, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, 
%rd34; ld.global.nc.f32 %f121, [%rd35]; ld.global.nc.f32 %f122, [%rd36]; mul.f32 %f123, %f122, %f122; fma.rn.f32 %f124, %f121, %f121, %f123; ld.global.nc.f32 %f125, [%rd37]; fma.rn.f32 %f27, %f125, %f125, %f124; mul.f32 %f126, %f9, %f11; mul.f32 %f127, %f8, %f12; sub.f32 %f128, %f127, %f126; mul.f32 %f129, %f7, %f12; mul.f32 %f130, %f9, %f10; sub.f32 %f131, %f130, %f129; mul.f32 %f132, %f8, %f10; mul.f32 %f133, %f7, %f11; sub.f32 %f134, %f133, %f132; mul.f32 %f135, %f2, %f131; fma.rn.f32 %f136, %f1, %f128, %f135; fma.rn.f32 %f28, %f3, %f134, %f136; mul.f32 %f137, %f2, %f8; fma.rn.f32 %f138, %f1, %f7, %f137; fma.rn.f32 %f139, %f3, %f9, %f138; add.f32 %f140, %f139, 0f3F800000; mul.f32 %f141, %f2, %f11; fma.rn.f32 %f142, %f1, %f10, %f141; fma.rn.f32 %f143, %f3, %f12, %f142; add.f32 %f144, %f140, %f143; mul.f32 %f145, %f8, %f11; fma.rn.f32 %f146, %f7, %f10, %f145; fma.rn.f32 %f147, %f9, %f12, %f146; add.f32 %f29, %f147, %f144; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p39, %f30, 0f00000000; setp.eq.f32 %p40, %f31, 0f00000000; and.pred %p41, %p39, %p40; @%p41 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r118, %f29; shr.s32 %r119, %r118, 31; and.b32 %r120, %r119, 1078530011; mov.b32 %r121, %f28; and.b32 %r122, %r121, -2147483648; or.b32 %r123, %r122, %r120; mov.b32 %f290, %r123; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p42, %f30, 0f7F800000; setp.eq.f32 %p43, %f31, 0f7F800000; and.pred %p44, %p42, %p43; @%p44 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r113, %f29; setp.lt.s32 %p48, %r113, 0; selp.b32 %r114, 1075235812, 1061752795, %p48; mov.b32 %r115, %f28; and.b32 %r116, %r115, -2147483648; or.b32 %r117, %r116, %r114; mov.b32 %f290, %r117; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f148, %f31, %f30; min.f32 %f149, %f31, %f30; div.rn.f32 %f150, %f149, %f148; mul.rn.f32 %f151, %f150, %f150; mov.f32 %f152, 0fC0B59883; mov.f32 %f153, 0fBF52C7EA; fma.rn.f32 %f154, %f151, %f153, %f152; mov.f32 %f155, 0fC0D21907; fma.rn.f32 %f156, %f154, 
%f151, %f155; mul.f32 %f157, %f151, %f156; mul.f32 %f158, %f150, %f157; add.f32 %f159, %f151, 0f41355DC0; mov.f32 %f160, 0f41E6BD60; fma.rn.f32 %f161, %f159, %f151, %f160; mov.f32 %f162, 0f419D92C8; fma.rn.f32 %f163, %f161, %f151, %f162; rcp.rn.f32 %f164, %f163; fma.rn.f32 %f165, %f158, %f164, %f150; mov.f32 %f166, 0f3FC90FDB; sub.f32 %f167, %f166, %f165; setp.gt.f32 %p45, %f31, %f30; selp.f32 %f168, %f167, %f165, %p45; mov.b32 %r108, %f29; setp.lt.s32 %p46, %r108, 0; mov.f32 %f169, 0f40490FDB; sub.f32 %f170, %f169, %f168; selp.f32 %f171, %f170, %f168, %p46; mov.b32 %r109, %f171; mov.b32 %r110, %f28; and.b32 %r111, %r110, -2147483648; or.b32 %r112, %r111, %r109; mov.b32 %f172, %r112; add.f32 %f173, %f30, %f31; setp.le.f32 %p47, %f173, 0f7F800000; selp.f32 %f290, %f172, %f173, %p47; $L__BB0_41: add.f32 %f174, %f290, %f290; setp.eq.f32 %p49, %f27, 0f00000000; selp.f32 %f175, 0f3F800000, 0f3F000000, %p49; fma.rn.f32 %f293, %f175, %f174, %f293; $L__BB0_42: setp.lt.s32 %p50, %r2, 1; and.pred %p52, %p50, %p9; @%p52 bra $L__BB0_55; @%p9 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r177, %r19, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r124, %r19, %r50; add.s32 %r125, %r124, %r50; rem.s32 %r177, %r125, %r50; $L__BB0_46: @%p8 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r178, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r126, %r14, %r49; add.s32 %r127, %r126, %r49; rem.s32 %r178, %r127, %r49; $L__BB0_49: add.s32 %r128, %r177, %r4; mad.lo.s32 %r129, %r128, %r49, %r178; mul.wide.s32 %rd38, %r129, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f176, [%rd39]; ld.global.nc.f32 %f177, [%rd40]; mul.f32 %f178, %f177, %f177; fma.rn.f32 %f179, %f176, %f176, %f178; ld.global.nc.f32 %f180, [%rd41]; fma.rn.f32 %f38, %f180, %f180, %f179; mul.f32 %f181, %f12, %f14; mul.f32 %f182, %f11, %f15; sub.f32 %f183, %f182, %f181; mul.f32 %f184, %f10, %f15; mul.f32 %f185, %f12, %f13; sub.f32 %f186, %f185, %f184; 
mul.f32 %f187, %f11, %f13; mul.f32 %f188, %f10, %f14; sub.f32 %f189, %f188, %f187; mul.f32 %f190, %f2, %f186; fma.rn.f32 %f191, %f1, %f183, %f190; fma.rn.f32 %f39, %f3, %f189, %f191; mul.f32 %f192, %f2, %f11; fma.rn.f32 %f193, %f1, %f10, %f192; fma.rn.f32 %f194, %f3, %f12, %f193; add.f32 %f195, %f194, 0f3F800000; mul.f32 %f196, %f2, %f14; fma.rn.f32 %f197, %f1, %f13, %f196; fma.rn.f32 %f198, %f3, %f15, %f197; add.f32 %f199, %f195, %f198; mul.f32 %f200, %f11, %f14; fma.rn.f32 %f201, %f10, %f13, %f200; fma.rn.f32 %f202, %f12, %f15, %f201; add.f32 %f40, %f202, %f199; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p55, %f41, 0f00000000; setp.eq.f32 %p56, %f42, 0f00000000; and.pred %p57, %p55, %p56; @%p57 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r140, %f40; shr.s32 %r141, %r140, 31; and.b32 %r142, %r141, 1078530011; mov.b32 %r143, %f39; and.b32 %r144, %r143, -2147483648; or.b32 %r145, %r144, %r142; mov.b32 %f292, %r145; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p58, %f41, 0f7F800000; setp.eq.f32 %p59, %f42, 0f7F800000; and.pred %p60, %p58, %p59; @%p60 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r135, %f40; setp.lt.s32 %p64, %r135, 0; selp.b32 %r136, 1075235812, 1061752795, %p64; mov.b32 %r137, %f39; and.b32 %r138, %r137, -2147483648; or.b32 %r139, %r138, %r136; mov.b32 %f292, %r139; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f203, %f42, %f41; min.f32 %f204, %f42, %f41; div.rn.f32 %f205, %f204, %f203; mul.rn.f32 %f206, %f205, %f205; mov.f32 %f207, 0fC0B59883; mov.f32 %f208, 0fBF52C7EA; fma.rn.f32 %f209, %f206, %f208, %f207; mov.f32 %f210, 0fC0D21907; fma.rn.f32 %f211, %f209, %f206, %f210; mul.f32 %f212, %f206, %f211; mul.f32 %f213, %f205, %f212; add.f32 %f214, %f206, 0f41355DC0; mov.f32 %f215, 0f41E6BD60; fma.rn.f32 %f216, %f214, %f206, %f215; mov.f32 %f217, 0f419D92C8; fma.rn.f32 %f218, %f216, %f206, %f217; rcp.rn.f32 %f219, %f218; fma.rn.f32 %f220, %f213, %f219, %f205; mov.f32 %f221, 0f3FC90FDB; sub.f32 %f222, %f221, %f220; 
setp.gt.f32 %p61, %f42, %f41; selp.f32 %f223, %f222, %f220, %p61; mov.b32 %r130, %f40; setp.lt.s32 %p62, %r130, 0; mov.f32 %f224, 0f40490FDB; sub.f32 %f225, %f224, %f223; selp.f32 %f226, %f225, %f223, %p62; mov.b32 %r131, %f226; mov.b32 %r132, %f39; and.b32 %r133, %r132, -2147483648; or.b32 %r134, %r133, %r131; mov.b32 %f227, %r134; add.f32 %f228, %f41, %f42; setp.le.f32 %p63, %f228, 0f7F800000; selp.f32 %f292, %f227, %f228, %p63; $L__BB0_54: add.f32 %f229, %f292, %f292; setp.eq.f32 %p65, %f38, 0f00000000; selp.f32 %f230, 0f3F800000, 0f3F000000, %p65; fma.rn.f32 %f293, %f230, %f229, %f293; $L__BB0_55: @%p14 bra $L__BB0_69; setp.lt.s32 %p67, %r2, 1; and.pred %p69, %p67, %p9; @%p69 bra $L__BB0_69; @%p9 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r179, %r19, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r146, %r19, %r50; add.s32 %r147, %r146, %r50; rem.s32 %r179, %r147, %r50; $L__BB0_60: add.s32 %r45, %r179, %r4; @%p8 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r150, %r49, -1; min.s32 %r180, %r6, %r150; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r148, %r6, %r49; add.s32 %r149, %r148, %r49; rem.s32 %r180, %r149, %r49; $L__BB0_63: mad.lo.s32 %r151, %r45, %r49, %r180; mul.wide.s32 %rd42, %r151, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f231, [%rd43]; ld.global.nc.f32 %f232, [%rd44]; mul.f32 %f233, %f232, %f232; fma.rn.f32 %f234, %f231, %f231, %f233; ld.global.nc.f32 %f235, [%rd45]; fma.rn.f32 %f49, %f235, %f235, %f234; mul.f32 %f236, %f5, %f15; mul.f32 %f237, %f6, %f14; sub.f32 %f238, %f237, %f236; mul.f32 %f239, %f6, %f13; mul.f32 %f240, %f4, %f15; sub.f32 %f241, %f240, %f239; mul.f32 %f242, %f4, %f14; mul.f32 %f243, %f5, %f13; sub.f32 %f244, %f243, %f242; mul.f32 %f245, %f2, %f241; fma.rn.f32 %f246, %f1, %f238, %f245; fma.rn.f32 %f50, %f3, %f244, %f246; mul.f32 %f247, %f2, %f14; fma.rn.f32 %f248, %f1, %f13, %f247; fma.rn.f32 %f249, %f3, %f15, %f248; add.f32 %f250, %f249, 0f3F800000; 
mul.f32 %f251, %f2, %f5; fma.rn.f32 %f252, %f1, %f4, %f251; fma.rn.f32 %f253, %f3, %f6, %f252; add.f32 %f254, %f253, %f250; mul.f32 %f255, %f5, %f14; fma.rn.f32 %f256, %f4, %f13, %f255; fma.rn.f32 %f257, %f6, %f15, %f256; add.f32 %f51, %f257, %f254; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p72, %f52, 0f00000000; setp.eq.f32 %p73, %f53, 0f00000000; and.pred %p74, %p72, %p73; @%p74 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r162, %f51; shr.s32 %r163, %r162, 31; and.b32 %r164, %r163, 1078530011; mov.b32 %r165, %f50; and.b32 %r166, %r165, -2147483648; or.b32 %r167, %r164, %r166; mov.b32 %f294, %r167; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p75, %f52, 0f7F800000; setp.eq.f32 %p76, %f53, 0f7F800000; and.pred %p77, %p75, %p76; @%p77 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r157, %f51; setp.lt.s32 %p81, %r157, 0; selp.b32 %r158, 1075235812, 1061752795, %p81; mov.b32 %r159, %f50; and.b32 %r160, %r159, -2147483648; or.b32 %r161, %r158, %r160; mov.b32 %f294, %r161; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f258, %f53, %f52; min.f32 %f259, %f53, %f52; div.rn.f32 %f260, %f259, %f258; mul.rn.f32 %f261, %f260, %f260; mov.f32 %f262, 0fC0B59883; mov.f32 %f263, 0fBF52C7EA; fma.rn.f32 %f264, %f261, %f263, %f262; mov.f32 %f265, 0fC0D21907; fma.rn.f32 %f266, %f264, %f261, %f265; mul.f32 %f267, %f261, %f266; mul.f32 %f268, %f260, %f267; add.f32 %f269, %f261, 0f41355DC0; mov.f32 %f270, 0f41E6BD60; fma.rn.f32 %f271, %f269, %f261, %f270; mov.f32 %f272, 0f419D92C8; fma.rn.f32 %f273, %f271, %f261, %f272; rcp.rn.f32 %f274, %f273; fma.rn.f32 %f275, %f268, %f274, %f260; mov.f32 %f276, 0f3FC90FDB; sub.f32 %f277, %f276, %f275; setp.gt.f32 %p78, %f53, %f52; selp.f32 %f278, %f277, %f275, %p78; mov.b32 %r152, %f51; setp.lt.s32 %p79, %r152, 0; mov.f32 %f279, 0f40490FDB; sub.f32 %f280, %f279, %f278; selp.f32 %f281, %f280, %f278, %p79; mov.b32 %r153, %f281; mov.b32 %r154, %f50; and.b32 %r155, %r154, -2147483648; or.b32 %r156, %r155, %r153; mov.b32 %f282, 
%r156; add.f32 %f283, %f52, %f53; setp.le.f32 %p80, %f283, 0f7F800000; selp.f32 %f294, %f282, %f283, %p80; $L__BB0_68: add.f32 %f284, %f294, %f294; setp.eq.f32 %p82, %f49, 0f00000000; selp.f32 %f285, 0f3F800000, 0f3F000000, %p82; fma.rn.f32 %f293, %f285, %f284, %f293; $L__BB0_69: mul.f32 %f286, %f293, %f60; st.global.f32 [%rd4], %f286; $L__BB0_71: ret; } ` settopologicalchargelattice_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl settopologicalchargelattice .visible .entry settopologicalchargelattice( .param .u64 settopologicalchargelattice_param_0, .param .u64 settopologicalchargelattice_param_1, .param .u64 settopologicalchargelattice_param_2, .param .u64 settopologicalchargelattice_param_3, .param .f32 settopologicalchargelattice_param_4, .param .u32 settopologicalchargelattice_param_5, .param .u32 settopologicalchargelattice_param_6, .param .u32 settopologicalchargelattice_param_7, .param .u8 settopologicalchargelattice_param_8 ) { .reg .pred %p<83>; .reg .b16 %rs<4>; .reg .f32 %f<296>; .reg .b32 %r<181>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalchargelattice_param_8]; ld.param.u64 %rd5, [settopologicalchargelattice_param_0]; ld.param.u64 %rd6, [settopologicalchargelattice_param_1]; ld.param.u64 %rd7, [settopologicalchargelattice_param_2]; ld.param.u64 %rd8, [settopologicalchargelattice_param_3]; ld.param.f32 %f60, [settopologicalchargelattice_param_4]; ld.param.u32 %r49, [settopologicalchargelattice_param_5]; ld.param.u32 %r50, [settopologicalchargelattice_param_6]; ld.param.u32 %r51, [settopologicalchargelattice_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r52, %ntid.x; mov.u32 %r53, %ctaid.x; mov.u32 %r54, %tid.x; mad.lo.s32 %r1, %r53, %r52, %r54; mov.u32 %r55, %ntid.y; mov.u32 %r56, %ctaid.y; mov.u32 %r57, %tid.y; mad.lo.s32 %r2, %r56, %r55, %r57; mov.u32 %r58, %ntid.z; mov.u32 %r59, %ctaid.z; mov.u32 %r60, %tid.z; mad.lo.s32 %r3, %r59, %r58, %r60; setp.ge.s32 
%p2, %r1, %r49; setp.ge.s32 %p3, %r2, %r50; or.pred %p4, %p2, %p3; setp.ge.s32 %p5, %r3, %r51; or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_71; mul.lo.s32 %r4, %r3, %r50; add.s32 %r61, %r4, %r2; mul.lo.s32 %r5, %r61, %r49; add.s32 %r62, %r5, %r1; mul.wide.s32 %rd9, %r62, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f61, %f2, %f2; fma.rn.f32 %f62, %f1, %f1, %f61; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f63, %f3, %f3, %f62; setp.eq.f32 %p7, %f63, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p7 bra $L__BB0_70; bra.uni $L__BB0_2; $L__BB0_70: mov.u32 %r168, 0; st.global.u32 [%rd4], %r168; bra.uni $L__BB0_71; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p8, %rs1, 0; add.s32 %r6, %r1, 1; @%p8 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r65, %r49, -1; min.s32 %r169, %r6, %r65; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r63, %r6, %r49; add.s32 %r64, %r63, %r49; rem.s32 %r169, %r64, %r49; $L__BB0_5: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p9, %rs2, 0; add.s32 %r10, %r2, 1; @%p9 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r68, %r50, -1; min.s32 %r170, %r10, %r68; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r66, %r10, %r50; add.s32 %r67, %r66, %r50; rem.s32 %r170, %r67, %r50; $L__BB0_8: add.s32 %r14, %r1, -1; @%p8 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r171, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r69, %r14, %r49; add.s32 %r70, %r69, %r49; rem.s32 %r171, %r70, %r49; $L__BB0_11: add.s32 %r18, %r171, %r5; add.s32 %r19, %r2, -1; add.s32 %r20, %r169, %r5; @%p9 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r172, %r19, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r71, %r19, %r50; add.s32 %r72, %r71, %r50; rem.s32 %r172, %r72, %r50; $L__BB0_14: add.s32 %r73, %r172, %r4; mad.lo.s32 %r74, %r73, %r49, %r1; mul.wide.s32 %rd14, %r20, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f4, [%rd15]; add.s64 %rd16, 
%rd2, %rd14; ld.global.nc.f32 %f5, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f6, [%rd17]; add.s32 %r75, %r170, %r4; mad.lo.s32 %r76, %r75, %r49, %r1; mul.wide.s32 %rd18, %r76, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f7, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f8, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f9, [%rd21]; mul.wide.s32 %rd22, %r18, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f10, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f11, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f12, [%rd25]; mul.wide.s32 %rd26, %r74, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f13, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f14, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f15, [%rd29]; setp.ne.s16 %p12, %rs1, 0; setp.lt.s32 %p13, %r6, %r49; or.pred %p1, %p13, %p12; not.pred %p14, %p1; mov.f32 %f293, 0f00000000; @%p14 bra $L__BB0_28; setp.ge.s32 %p15, %r10, %r50; and.pred %p17, %p15, %p9; @%p17 bra $L__BB0_28; @%p9 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r79, %r50, -1; min.s32 %r173, %r10, %r79; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r77, %r10, %r50; add.s32 %r78, %r77, %r50; rem.s32 %r173, %r78, %r50; $L__BB0_19: @%p8 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r82, %r49, -1; min.s32 %r174, %r6, %r82; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r80, %r6, %r49; add.s32 %r81, %r80, %r49; rem.s32 %r174, %r81, %r49; $L__BB0_22: add.s32 %r83, %r173, %r4; mad.lo.s32 %r84, %r83, %r49, %r174; mul.wide.s32 %rd30, %r84, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f66, [%rd31]; ld.global.nc.f32 %f67, [%rd32]; mul.f32 %f68, %f67, %f67; fma.rn.f32 %f69, %f66, %f66, %f68; ld.global.nc.f32 %f70, [%rd33]; fma.rn.f32 %f16, %f70, %f70, %f69; mul.f32 %f71, %f6, %f8; mul.f32 %f72, %f5, %f9; sub.f32 %f73, %f72, %f71; mul.f32 %f74, %f4, %f9; mul.f32 %f75, %f6, %f7; sub.f32 %f76, %f75, %f74; mul.f32 %f77, 
%f5, %f7; mul.f32 %f78, %f4, %f8; sub.f32 %f79, %f78, %f77; mul.f32 %f80, %f2, %f76; fma.rn.f32 %f81, %f1, %f73, %f80; fma.rn.f32 %f17, %f3, %f79, %f81; mul.f32 %f82, %f2, %f5; fma.rn.f32 %f83, %f1, %f4, %f82; fma.rn.f32 %f84, %f3, %f6, %f83; add.f32 %f85, %f84, 0f3F800000; mul.f32 %f86, %f2, %f8; fma.rn.f32 %f87, %f1, %f7, %f86; fma.rn.f32 %f88, %f3, %f9, %f87; add.f32 %f89, %f85, %f88; mul.f32 %f90, %f5, %f8; fma.rn.f32 %f91, %f4, %f7, %f90; fma.rn.f32 %f92, %f6, %f9, %f91; add.f32 %f18, %f92, %f89; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p20, %f19, 0f00000000; setp.eq.f32 %p21, %f20, 0f00000000; and.pred %p22, %p20, %p21; @%p22 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r95, %f18; shr.s32 %r96, %r95, 31; and.b32 %r97, %r96, 1078530011; mov.b32 %r98, %f17; and.b32 %r99, %r98, -2147483648; or.b32 %r100, %r99, %r97; mov.b32 %f288, %r100; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p23, %f19, 0f7F800000; setp.eq.f32 %p24, %f20, 0f7F800000; and.pred %p25, %p23, %p24; @%p25 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r90, %f18; setp.lt.s32 %p29, %r90, 0; selp.b32 %r91, 1075235812, 1061752795, %p29; mov.b32 %r92, %f17; and.b32 %r93, %r92, -2147483648; or.b32 %r94, %r93, %r91; mov.b32 %f288, %r94; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f93, %f20, %f19; min.f32 %f94, %f20, %f19; div.rn.f32 %f95, %f94, %f93; mul.rn.f32 %f96, %f95, %f95; mov.f32 %f97, 0fC0B59883; mov.f32 %f98, 0fBF52C7EA; fma.rn.f32 %f99, %f96, %f98, %f97; mov.f32 %f100, 0fC0D21907; fma.rn.f32 %f101, %f99, %f96, %f100; mul.f32 %f102, %f96, %f101; mul.f32 %f103, %f95, %f102; add.f32 %f104, %f96, 0f41355DC0; mov.f32 %f105, 0f41E6BD60; fma.rn.f32 %f106, %f104, %f96, %f105; mov.f32 %f107, 0f419D92C8; fma.rn.f32 %f108, %f106, %f96, %f107; rcp.rn.f32 %f109, %f108; fma.rn.f32 %f110, %f103, %f109, %f95; mov.f32 %f111, 0f3FC90FDB; sub.f32 %f112, %f111, %f110; setp.gt.f32 %p26, %f20, %f19; selp.f32 %f113, %f112, %f110, %p26; mov.b32 %r85, %f18; setp.lt.s32 %p27, %r85, 0; 
mov.f32 %f114, 0f40490FDB; sub.f32 %f115, %f114, %f113; selp.f32 %f116, %f115, %f113, %p27; mov.b32 %r86, %f116; mov.b32 %r87, %f17; and.b32 %r88, %r87, -2147483648; or.b32 %r89, %r88, %r86; mov.b32 %f117, %r89; add.f32 %f118, %f19, %f20; setp.le.f32 %p28, %f118, 0f7F800000; selp.f32 %f288, %f117, %f118, %p28; $L__BB0_27: add.f32 %f119, %f288, %f288; setp.eq.f32 %p30, %f16, 0f00000000; selp.f32 %f120, 0f3F800000, 0f3F000000, %p30; fma.rn.f32 %f293, %f120, %f119, 0f00000000; $L__BB0_28: setp.lt.s32 %p31, %r1, 1; and.pred %p33, %p31, %p8; @%p33 bra $L__BB0_55; setp.ge.s32 %p34, %r10, %r50; and.pred %p36, %p34, %p9; @%p36 bra $L__BB0_42; @%p9 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r103, %r50, -1; min.s32 %r175, %r10, %r103; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r101, %r10, %r50; add.s32 %r102, %r101, %r50; rem.s32 %r175, %r102, %r50; $L__BB0_33: @%p8 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r176, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r104, %r14, %r49; add.s32 %r105, %r104, %r49; rem.s32 %r176, %r105, %r49; $L__BB0_36: add.s32 %r106, %r175, %r4; mad.lo.s32 %r107, %r106, %r49, %r176; mul.wide.s32 %rd34, %r107, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f121, [%rd35]; ld.global.nc.f32 %f122, [%rd36]; mul.f32 %f123, %f122, %f122; fma.rn.f32 %f124, %f121, %f121, %f123; ld.global.nc.f32 %f125, [%rd37]; fma.rn.f32 %f27, %f125, %f125, %f124; mul.f32 %f126, %f9, %f11; mul.f32 %f127, %f8, %f12; sub.f32 %f128, %f127, %f126; mul.f32 %f129, %f7, %f12; mul.f32 %f130, %f9, %f10; sub.f32 %f131, %f130, %f129; mul.f32 %f132, %f8, %f10; mul.f32 %f133, %f7, %f11; sub.f32 %f134, %f133, %f132; mul.f32 %f135, %f2, %f131; fma.rn.f32 %f136, %f1, %f128, %f135; fma.rn.f32 %f28, %f3, %f134, %f136; mul.f32 %f137, %f2, %f8; fma.rn.f32 %f138, %f1, %f7, %f137; fma.rn.f32 %f139, %f3, %f9, %f138; add.f32 %f140, %f139, 0f3F800000; mul.f32 %f141, %f2, %f11; fma.rn.f32 %f142, %f1, %f10, %f141; 
fma.rn.f32 %f143, %f3, %f12, %f142; add.f32 %f144, %f140, %f143; mul.f32 %f145, %f8, %f11; fma.rn.f32 %f146, %f7, %f10, %f145; fma.rn.f32 %f147, %f9, %f12, %f146; add.f32 %f29, %f147, %f144; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p39, %f30, 0f00000000; setp.eq.f32 %p40, %f31, 0f00000000; and.pred %p41, %p39, %p40; @%p41 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r118, %f29; shr.s32 %r119, %r118, 31; and.b32 %r120, %r119, 1078530011; mov.b32 %r121, %f28; and.b32 %r122, %r121, -2147483648; or.b32 %r123, %r122, %r120; mov.b32 %f290, %r123; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p42, %f30, 0f7F800000; setp.eq.f32 %p43, %f31, 0f7F800000; and.pred %p44, %p42, %p43; @%p44 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r113, %f29; setp.lt.s32 %p48, %r113, 0; selp.b32 %r114, 1075235812, 1061752795, %p48; mov.b32 %r115, %f28; and.b32 %r116, %r115, -2147483648; or.b32 %r117, %r116, %r114; mov.b32 %f290, %r117; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f148, %f31, %f30; min.f32 %f149, %f31, %f30; div.rn.f32 %f150, %f149, %f148; mul.rn.f32 %f151, %f150, %f150; mov.f32 %f152, 0fC0B59883; mov.f32 %f153, 0fBF52C7EA; fma.rn.f32 %f154, %f151, %f153, %f152; mov.f32 %f155, 0fC0D21907; fma.rn.f32 %f156, %f154, %f151, %f155; mul.f32 %f157, %f151, %f156; mul.f32 %f158, %f150, %f157; add.f32 %f159, %f151, 0f41355DC0; mov.f32 %f160, 0f41E6BD60; fma.rn.f32 %f161, %f159, %f151, %f160; mov.f32 %f162, 0f419D92C8; fma.rn.f32 %f163, %f161, %f151, %f162; rcp.rn.f32 %f164, %f163; fma.rn.f32 %f165, %f158, %f164, %f150; mov.f32 %f166, 0f3FC90FDB; sub.f32 %f167, %f166, %f165; setp.gt.f32 %p45, %f31, %f30; selp.f32 %f168, %f167, %f165, %p45; mov.b32 %r108, %f29; setp.lt.s32 %p46, %r108, 0; mov.f32 %f169, 0f40490FDB; sub.f32 %f170, %f169, %f168; selp.f32 %f171, %f170, %f168, %p46; mov.b32 %r109, %f171; mov.b32 %r110, %f28; and.b32 %r111, %r110, -2147483648; or.b32 %r112, %r111, %r109; mov.b32 %f172, %r112; add.f32 %f173, %f30, %f31; setp.le.f32 %p47, %f173, 
0f7F800000; selp.f32 %f290, %f172, %f173, %p47; $L__BB0_41: add.f32 %f174, %f290, %f290; setp.eq.f32 %p49, %f27, 0f00000000; selp.f32 %f175, 0f3F800000, 0f3F000000, %p49; fma.rn.f32 %f293, %f175, %f174, %f293; $L__BB0_42: setp.lt.s32 %p50, %r2, 1; and.pred %p52, %p50, %p9; @%p52 bra $L__BB0_55; @%p9 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r177, %r19, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r124, %r19, %r50; add.s32 %r125, %r124, %r50; rem.s32 %r177, %r125, %r50; $L__BB0_46: @%p8 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r178, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r126, %r14, %r49; add.s32 %r127, %r126, %r49; rem.s32 %r178, %r127, %r49; $L__BB0_49: add.s32 %r128, %r177, %r4; mad.lo.s32 %r129, %r128, %r49, %r178; mul.wide.s32 %rd38, %r129, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f176, [%rd39]; ld.global.nc.f32 %f177, [%rd40]; mul.f32 %f178, %f177, %f177; fma.rn.f32 %f179, %f176, %f176, %f178; ld.global.nc.f32 %f180, [%rd41]; fma.rn.f32 %f38, %f180, %f180, %f179; mul.f32 %f181, %f12, %f14; mul.f32 %f182, %f11, %f15; sub.f32 %f183, %f182, %f181; mul.f32 %f184, %f10, %f15; mul.f32 %f185, %f12, %f13; sub.f32 %f186, %f185, %f184; mul.f32 %f187, %f11, %f13; mul.f32 %f188, %f10, %f14; sub.f32 %f189, %f188, %f187; mul.f32 %f190, %f2, %f186; fma.rn.f32 %f191, %f1, %f183, %f190; fma.rn.f32 %f39, %f3, %f189, %f191; mul.f32 %f192, %f2, %f11; fma.rn.f32 %f193, %f1, %f10, %f192; fma.rn.f32 %f194, %f3, %f12, %f193; add.f32 %f195, %f194, 0f3F800000; mul.f32 %f196, %f2, %f14; fma.rn.f32 %f197, %f1, %f13, %f196; fma.rn.f32 %f198, %f3, %f15, %f197; add.f32 %f199, %f195, %f198; mul.f32 %f200, %f11, %f14; fma.rn.f32 %f201, %f10, %f13, %f200; fma.rn.f32 %f202, %f12, %f15, %f201; add.f32 %f40, %f202, %f199; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p55, %f41, 0f00000000; setp.eq.f32 %p56, %f42, 0f00000000; and.pred %p57, %p55, %p56; @%p57 bra $L__BB0_53; bra.uni 
$L__BB0_50; $L__BB0_53: mov.b32 %r140, %f40; shr.s32 %r141, %r140, 31; and.b32 %r142, %r141, 1078530011; mov.b32 %r143, %f39; and.b32 %r144, %r143, -2147483648; or.b32 %r145, %r144, %r142; mov.b32 %f292, %r145; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p58, %f41, 0f7F800000; setp.eq.f32 %p59, %f42, 0f7F800000; and.pred %p60, %p58, %p59; @%p60 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r135, %f40; setp.lt.s32 %p64, %r135, 0; selp.b32 %r136, 1075235812, 1061752795, %p64; mov.b32 %r137, %f39; and.b32 %r138, %r137, -2147483648; or.b32 %r139, %r138, %r136; mov.b32 %f292, %r139; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f203, %f42, %f41; min.f32 %f204, %f42, %f41; div.rn.f32 %f205, %f204, %f203; mul.rn.f32 %f206, %f205, %f205; mov.f32 %f207, 0fC0B59883; mov.f32 %f208, 0fBF52C7EA; fma.rn.f32 %f209, %f206, %f208, %f207; mov.f32 %f210, 0fC0D21907; fma.rn.f32 %f211, %f209, %f206, %f210; mul.f32 %f212, %f206, %f211; mul.f32 %f213, %f205, %f212; add.f32 %f214, %f206, 0f41355DC0; mov.f32 %f215, 0f41E6BD60; fma.rn.f32 %f216, %f214, %f206, %f215; mov.f32 %f217, 0f419D92C8; fma.rn.f32 %f218, %f216, %f206, %f217; rcp.rn.f32 %f219, %f218; fma.rn.f32 %f220, %f213, %f219, %f205; mov.f32 %f221, 0f3FC90FDB; sub.f32 %f222, %f221, %f220; setp.gt.f32 %p61, %f42, %f41; selp.f32 %f223, %f222, %f220, %p61; mov.b32 %r130, %f40; setp.lt.s32 %p62, %r130, 0; mov.f32 %f224, 0f40490FDB; sub.f32 %f225, %f224, %f223; selp.f32 %f226, %f225, %f223, %p62; mov.b32 %r131, %f226; mov.b32 %r132, %f39; and.b32 %r133, %r132, -2147483648; or.b32 %r134, %r133, %r131; mov.b32 %f227, %r134; add.f32 %f228, %f41, %f42; setp.le.f32 %p63, %f228, 0f7F800000; selp.f32 %f292, %f227, %f228, %p63; $L__BB0_54: add.f32 %f229, %f292, %f292; setp.eq.f32 %p65, %f38, 0f00000000; selp.f32 %f230, 0f3F800000, 0f3F000000, %p65; fma.rn.f32 %f293, %f230, %f229, %f293; $L__BB0_55: @%p14 bra $L__BB0_69; setp.lt.s32 %p67, %r2, 1; and.pred %p69, %p67, %p9; @%p69 bra $L__BB0_69; @%p9 bra $L__BB0_59; bra.uni $L__BB0_58; 
$L__BB0_59: max.s32 %r179, %r19, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r146, %r19, %r50; add.s32 %r147, %r146, %r50; rem.s32 %r179, %r147, %r50; $L__BB0_60: add.s32 %r45, %r179, %r4; @%p8 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r150, %r49, -1; min.s32 %r180, %r6, %r150; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r148, %r6, %r49; add.s32 %r149, %r148, %r49; rem.s32 %r180, %r149, %r49; $L__BB0_63: mad.lo.s32 %r151, %r45, %r49, %r180; mul.wide.s32 %rd42, %r151, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f231, [%rd43]; ld.global.nc.f32 %f232, [%rd44]; mul.f32 %f233, %f232, %f232; fma.rn.f32 %f234, %f231, %f231, %f233; ld.global.nc.f32 %f235, [%rd45]; fma.rn.f32 %f49, %f235, %f235, %f234; mul.f32 %f236, %f5, %f15; mul.f32 %f237, %f6, %f14; sub.f32 %f238, %f237, %f236; mul.f32 %f239, %f6, %f13; mul.f32 %f240, %f4, %f15; sub.f32 %f241, %f240, %f239; mul.f32 %f242, %f4, %f14; mul.f32 %f243, %f5, %f13; sub.f32 %f244, %f243, %f242; mul.f32 %f245, %f2, %f241; fma.rn.f32 %f246, %f1, %f238, %f245; fma.rn.f32 %f50, %f3, %f244, %f246; mul.f32 %f247, %f2, %f14; fma.rn.f32 %f248, %f1, %f13, %f247; fma.rn.f32 %f249, %f3, %f15, %f248; add.f32 %f250, %f249, 0f3F800000; mul.f32 %f251, %f2, %f5; fma.rn.f32 %f252, %f1, %f4, %f251; fma.rn.f32 %f253, %f3, %f6, %f252; add.f32 %f254, %f253, %f250; mul.f32 %f255, %f5, %f14; fma.rn.f32 %f256, %f4, %f13, %f255; fma.rn.f32 %f257, %f6, %f15, %f256; add.f32 %f51, %f257, %f254; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p72, %f52, 0f00000000; setp.eq.f32 %p73, %f53, 0f00000000; and.pred %p74, %p72, %p73; @%p74 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r162, %f51; shr.s32 %r163, %r162, 31; and.b32 %r164, %r163, 1078530011; mov.b32 %r165, %f50; and.b32 %r166, %r165, -2147483648; or.b32 %r167, %r164, %r166; mov.b32 %f294, %r167; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p75, %f52, 0f7F800000; setp.eq.f32 %p76, %f53, 0f7F800000; and.pred %p77, 
%p75, %p76; @%p77 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r157, %f51; setp.lt.s32 %p81, %r157, 0; selp.b32 %r158, 1075235812, 1061752795, %p81; mov.b32 %r159, %f50; and.b32 %r160, %r159, -2147483648; or.b32 %r161, %r158, %r160; mov.b32 %f294, %r161; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f258, %f53, %f52; min.f32 %f259, %f53, %f52; div.rn.f32 %f260, %f259, %f258; mul.rn.f32 %f261, %f260, %f260; mov.f32 %f262, 0fC0B59883; mov.f32 %f263, 0fBF52C7EA; fma.rn.f32 %f264, %f261, %f263, %f262; mov.f32 %f265, 0fC0D21907; fma.rn.f32 %f266, %f264, %f261, %f265; mul.f32 %f267, %f261, %f266; mul.f32 %f268, %f260, %f267; add.f32 %f269, %f261, 0f41355DC0; mov.f32 %f270, 0f41E6BD60; fma.rn.f32 %f271, %f269, %f261, %f270; mov.f32 %f272, 0f419D92C8; fma.rn.f32 %f273, %f271, %f261, %f272; rcp.rn.f32 %f274, %f273; fma.rn.f32 %f275, %f268, %f274, %f260; mov.f32 %f276, 0f3FC90FDB; sub.f32 %f277, %f276, %f275; setp.gt.f32 %p78, %f53, %f52; selp.f32 %f278, %f277, %f275, %p78; mov.b32 %r152, %f51; setp.lt.s32 %p79, %r152, 0; mov.f32 %f279, 0f40490FDB; sub.f32 %f280, %f279, %f278; selp.f32 %f281, %f280, %f278, %p79; mov.b32 %r153, %f281; mov.b32 %r154, %f50; and.b32 %r155, %r154, -2147483648; or.b32 %r156, %r155, %r153; mov.b32 %f282, %r156; add.f32 %f283, %f52, %f53; setp.le.f32 %p80, %f283, 0f7F800000; selp.f32 %f294, %f282, %f283, %p80; $L__BB0_68: add.f32 %f284, %f294, %f294; setp.eq.f32 %p82, %f49, 0f00000000; selp.f32 %f285, 0f3F800000, 0f3F000000, %p82; fma.rn.f32 %f293, %f285, %f284, %f293; $L__BB0_69: mul.f32 %f286, %f293, %f60; st.global.f32 [%rd4], %f286; $L__BB0_71: ret; } ` settopologicalchargelattice_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl settopologicalchargelattice .visible .entry settopologicalchargelattice( .param .u64 settopologicalchargelattice_param_0, .param .u64 settopologicalchargelattice_param_1, .param .u64 settopologicalchargelattice_param_2, .param .u64 settopologicalchargelattice_param_3, .param .f32 
settopologicalchargelattice_param_4, .param .u32 settopologicalchargelattice_param_5, .param .u32 settopologicalchargelattice_param_6, .param .u32 settopologicalchargelattice_param_7, .param .u8 settopologicalchargelattice_param_8 ) { .reg .pred %p<83>; .reg .b16 %rs<4>; .reg .f32 %f<296>; .reg .b32 %r<181>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalchargelattice_param_8]; ld.param.u64 %rd5, [settopologicalchargelattice_param_0]; ld.param.u64 %rd6, [settopologicalchargelattice_param_1]; ld.param.u64 %rd7, [settopologicalchargelattice_param_2]; ld.param.u64 %rd8, [settopologicalchargelattice_param_3]; ld.param.f32 %f60, [settopologicalchargelattice_param_4]; ld.param.u32 %r49, [settopologicalchargelattice_param_5]; ld.param.u32 %r50, [settopologicalchargelattice_param_6]; ld.param.u32 %r51, [settopologicalchargelattice_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r52, %ntid.x; mov.u32 %r53, %ctaid.x; mov.u32 %r54, %tid.x; mad.lo.s32 %r1, %r53, %r52, %r54; mov.u32 %r55, %ntid.y; mov.u32 %r56, %ctaid.y; mov.u32 %r57, %tid.y; mad.lo.s32 %r2, %r56, %r55, %r57; mov.u32 %r58, %ntid.z; mov.u32 %r59, %ctaid.z; mov.u32 %r60, %tid.z; mad.lo.s32 %r3, %r59, %r58, %r60; setp.ge.s32 %p2, %r1, %r49; setp.ge.s32 %p3, %r2, %r50; or.pred %p4, %p2, %p3; setp.ge.s32 %p5, %r3, %r51; or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_71; mul.lo.s32 %r4, %r3, %r50; add.s32 %r61, %r4, %r2; mul.lo.s32 %r5, %r61, %r49; add.s32 %r62, %r5, %r1; mul.wide.s32 %rd9, %r62, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f61, %f2, %f2; fma.rn.f32 %f62, %f1, %f1, %f61; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f63, %f3, %f3, %f62; setp.eq.f32 %p7, %f63, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p7 bra $L__BB0_70; bra.uni $L__BB0_2; $L__BB0_70: mov.u32 %r168, 0; st.global.u32 [%rd4], %r168; bra.uni 
$L__BB0_71; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p8, %rs1, 0; add.s32 %r6, %r1, 1; @%p8 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r65, %r49, -1; min.s32 %r169, %r6, %r65; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r63, %r6, %r49; add.s32 %r64, %r63, %r49; rem.s32 %r169, %r64, %r49; $L__BB0_5: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p9, %rs2, 0; add.s32 %r10, %r2, 1; @%p9 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r68, %r50, -1; min.s32 %r170, %r10, %r68; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r66, %r10, %r50; add.s32 %r67, %r66, %r50; rem.s32 %r170, %r67, %r50; $L__BB0_8: add.s32 %r14, %r1, -1; @%p8 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r171, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r69, %r14, %r49; add.s32 %r70, %r69, %r49; rem.s32 %r171, %r70, %r49; $L__BB0_11: add.s32 %r18, %r171, %r5; add.s32 %r19, %r2, -1; add.s32 %r20, %r169, %r5; @%p9 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r172, %r19, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r71, %r19, %r50; add.s32 %r72, %r71, %r50; rem.s32 %r172, %r72, %r50; $L__BB0_14: add.s32 %r73, %r172, %r4; mad.lo.s32 %r74, %r73, %r49, %r1; mul.wide.s32 %rd14, %r20, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f4, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f5, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f6, [%rd17]; add.s32 %r75, %r170, %r4; mad.lo.s32 %r76, %r75, %r49, %r1; mul.wide.s32 %rd18, %r76, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f7, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f8, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f9, [%rd21]; mul.wide.s32 %rd22, %r18, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f10, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f11, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f12, [%rd25]; mul.wide.s32 %rd26, %r74, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f13, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f14, [%rd28]; add.s64 
%rd29, %rd1, %rd26; ld.global.nc.f32 %f15, [%rd29]; setp.ne.s16 %p12, %rs1, 0; setp.lt.s32 %p13, %r6, %r49; or.pred %p1, %p13, %p12; not.pred %p14, %p1; mov.f32 %f293, 0f00000000; @%p14 bra $L__BB0_28; setp.ge.s32 %p15, %r10, %r50; and.pred %p17, %p15, %p9; @%p17 bra $L__BB0_28; @%p9 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r79, %r50, -1; min.s32 %r173, %r10, %r79; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r77, %r10, %r50; add.s32 %r78, %r77, %r50; rem.s32 %r173, %r78, %r50; $L__BB0_19: @%p8 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r82, %r49, -1; min.s32 %r174, %r6, %r82; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r80, %r6, %r49; add.s32 %r81, %r80, %r49; rem.s32 %r174, %r81, %r49; $L__BB0_22: add.s32 %r83, %r173, %r4; mad.lo.s32 %r84, %r83, %r49, %r174; mul.wide.s32 %rd30, %r84, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f66, [%rd31]; ld.global.nc.f32 %f67, [%rd32]; mul.f32 %f68, %f67, %f67; fma.rn.f32 %f69, %f66, %f66, %f68; ld.global.nc.f32 %f70, [%rd33]; fma.rn.f32 %f16, %f70, %f70, %f69; mul.f32 %f71, %f6, %f8; mul.f32 %f72, %f5, %f9; sub.f32 %f73, %f72, %f71; mul.f32 %f74, %f4, %f9; mul.f32 %f75, %f6, %f7; sub.f32 %f76, %f75, %f74; mul.f32 %f77, %f5, %f7; mul.f32 %f78, %f4, %f8; sub.f32 %f79, %f78, %f77; mul.f32 %f80, %f2, %f76; fma.rn.f32 %f81, %f1, %f73, %f80; fma.rn.f32 %f17, %f3, %f79, %f81; mul.f32 %f82, %f2, %f5; fma.rn.f32 %f83, %f1, %f4, %f82; fma.rn.f32 %f84, %f3, %f6, %f83; add.f32 %f85, %f84, 0f3F800000; mul.f32 %f86, %f2, %f8; fma.rn.f32 %f87, %f1, %f7, %f86; fma.rn.f32 %f88, %f3, %f9, %f87; add.f32 %f89, %f85, %f88; mul.f32 %f90, %f5, %f8; fma.rn.f32 %f91, %f4, %f7, %f90; fma.rn.f32 %f92, %f6, %f9, %f91; add.f32 %f18, %f92, %f89; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p20, %f19, 0f00000000; setp.eq.f32 %p21, %f20, 0f00000000; and.pred %p22, %p20, %p21; @%p22 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r95, %f18; shr.s32 %r96, 
%r95, 31; and.b32 %r97, %r96, 1078530011; mov.b32 %r98, %f17; and.b32 %r99, %r98, -2147483648; or.b32 %r100, %r99, %r97; mov.b32 %f288, %r100; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p23, %f19, 0f7F800000; setp.eq.f32 %p24, %f20, 0f7F800000; and.pred %p25, %p23, %p24; @%p25 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r90, %f18; setp.lt.s32 %p29, %r90, 0; selp.b32 %r91, 1075235812, 1061752795, %p29; mov.b32 %r92, %f17; and.b32 %r93, %r92, -2147483648; or.b32 %r94, %r93, %r91; mov.b32 %f288, %r94; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f93, %f20, %f19; min.f32 %f94, %f20, %f19; div.rn.f32 %f95, %f94, %f93; mul.rn.f32 %f96, %f95, %f95; mov.f32 %f97, 0fC0B59883; mov.f32 %f98, 0fBF52C7EA; fma.rn.f32 %f99, %f96, %f98, %f97; mov.f32 %f100, 0fC0D21907; fma.rn.f32 %f101, %f99, %f96, %f100; mul.f32 %f102, %f96, %f101; mul.f32 %f103, %f95, %f102; add.f32 %f104, %f96, 0f41355DC0; mov.f32 %f105, 0f41E6BD60; fma.rn.f32 %f106, %f104, %f96, %f105; mov.f32 %f107, 0f419D92C8; fma.rn.f32 %f108, %f106, %f96, %f107; rcp.rn.f32 %f109, %f108; fma.rn.f32 %f110, %f103, %f109, %f95; mov.f32 %f111, 0f3FC90FDB; sub.f32 %f112, %f111, %f110; setp.gt.f32 %p26, %f20, %f19; selp.f32 %f113, %f112, %f110, %p26; mov.b32 %r85, %f18; setp.lt.s32 %p27, %r85, 0; mov.f32 %f114, 0f40490FDB; sub.f32 %f115, %f114, %f113; selp.f32 %f116, %f115, %f113, %p27; mov.b32 %r86, %f116; mov.b32 %r87, %f17; and.b32 %r88, %r87, -2147483648; or.b32 %r89, %r88, %r86; mov.b32 %f117, %r89; add.f32 %f118, %f19, %f20; setp.le.f32 %p28, %f118, 0f7F800000; selp.f32 %f288, %f117, %f118, %p28; $L__BB0_27: add.f32 %f119, %f288, %f288; setp.eq.f32 %p30, %f16, 0f00000000; selp.f32 %f120, 0f3F800000, 0f3F000000, %p30; fma.rn.f32 %f293, %f120, %f119, 0f00000000; $L__BB0_28: setp.lt.s32 %p31, %r1, 1; and.pred %p33, %p31, %p8; @%p33 bra $L__BB0_55; setp.ge.s32 %p34, %r10, %r50; and.pred %p36, %p34, %p9; @%p36 bra $L__BB0_42; @%p9 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r103, %r50, -1; min.s32 %r175, 
%r10, %r103; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r101, %r10, %r50; add.s32 %r102, %r101, %r50; rem.s32 %r175, %r102, %r50; $L__BB0_33: @%p8 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r176, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r104, %r14, %r49; add.s32 %r105, %r104, %r49; rem.s32 %r176, %r105, %r49; $L__BB0_36: add.s32 %r106, %r175, %r4; mad.lo.s32 %r107, %r106, %r49, %r176; mul.wide.s32 %rd34, %r107, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f121, [%rd35]; ld.global.nc.f32 %f122, [%rd36]; mul.f32 %f123, %f122, %f122; fma.rn.f32 %f124, %f121, %f121, %f123; ld.global.nc.f32 %f125, [%rd37]; fma.rn.f32 %f27, %f125, %f125, %f124; mul.f32 %f126, %f9, %f11; mul.f32 %f127, %f8, %f12; sub.f32 %f128, %f127, %f126; mul.f32 %f129, %f7, %f12; mul.f32 %f130, %f9, %f10; sub.f32 %f131, %f130, %f129; mul.f32 %f132, %f8, %f10; mul.f32 %f133, %f7, %f11; sub.f32 %f134, %f133, %f132; mul.f32 %f135, %f2, %f131; fma.rn.f32 %f136, %f1, %f128, %f135; fma.rn.f32 %f28, %f3, %f134, %f136; mul.f32 %f137, %f2, %f8; fma.rn.f32 %f138, %f1, %f7, %f137; fma.rn.f32 %f139, %f3, %f9, %f138; add.f32 %f140, %f139, 0f3F800000; mul.f32 %f141, %f2, %f11; fma.rn.f32 %f142, %f1, %f10, %f141; fma.rn.f32 %f143, %f3, %f12, %f142; add.f32 %f144, %f140, %f143; mul.f32 %f145, %f8, %f11; fma.rn.f32 %f146, %f7, %f10, %f145; fma.rn.f32 %f147, %f9, %f12, %f146; add.f32 %f29, %f147, %f144; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p39, %f30, 0f00000000; setp.eq.f32 %p40, %f31, 0f00000000; and.pred %p41, %p39, %p40; @%p41 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r118, %f29; shr.s32 %r119, %r118, 31; and.b32 %r120, %r119, 1078530011; mov.b32 %r121, %f28; and.b32 %r122, %r121, -2147483648; or.b32 %r123, %r122, %r120; mov.b32 %f290, %r123; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p42, %f30, 0f7F800000; setp.eq.f32 %p43, %f31, 0f7F800000; and.pred %p44, %p42, %p43; @%p44 bra $L__BB0_39; bra.uni 
$L__BB0_38; $L__BB0_39: mov.b32 %r113, %f29; setp.lt.s32 %p48, %r113, 0; selp.b32 %r114, 1075235812, 1061752795, %p48; mov.b32 %r115, %f28; and.b32 %r116, %r115, -2147483648; or.b32 %r117, %r116, %r114; mov.b32 %f290, %r117; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f148, %f31, %f30; min.f32 %f149, %f31, %f30; div.rn.f32 %f150, %f149, %f148; mul.rn.f32 %f151, %f150, %f150; mov.f32 %f152, 0fC0B59883; mov.f32 %f153, 0fBF52C7EA; fma.rn.f32 %f154, %f151, %f153, %f152; mov.f32 %f155, 0fC0D21907; fma.rn.f32 %f156, %f154, %f151, %f155; mul.f32 %f157, %f151, %f156; mul.f32 %f158, %f150, %f157; add.f32 %f159, %f151, 0f41355DC0; mov.f32 %f160, 0f41E6BD60; fma.rn.f32 %f161, %f159, %f151, %f160; mov.f32 %f162, 0f419D92C8; fma.rn.f32 %f163, %f161, %f151, %f162; rcp.rn.f32 %f164, %f163; fma.rn.f32 %f165, %f158, %f164, %f150; mov.f32 %f166, 0f3FC90FDB; sub.f32 %f167, %f166, %f165; setp.gt.f32 %p45, %f31, %f30; selp.f32 %f168, %f167, %f165, %p45; mov.b32 %r108, %f29; setp.lt.s32 %p46, %r108, 0; mov.f32 %f169, 0f40490FDB; sub.f32 %f170, %f169, %f168; selp.f32 %f171, %f170, %f168, %p46; mov.b32 %r109, %f171; mov.b32 %r110, %f28; and.b32 %r111, %r110, -2147483648; or.b32 %r112, %r111, %r109; mov.b32 %f172, %r112; add.f32 %f173, %f30, %f31; setp.le.f32 %p47, %f173, 0f7F800000; selp.f32 %f290, %f172, %f173, %p47; $L__BB0_41: add.f32 %f174, %f290, %f290; setp.eq.f32 %p49, %f27, 0f00000000; selp.f32 %f175, 0f3F800000, 0f3F000000, %p49; fma.rn.f32 %f293, %f175, %f174, %f293; $L__BB0_42: setp.lt.s32 %p50, %r2, 1; and.pred %p52, %p50, %p9; @%p52 bra $L__BB0_55; @%p9 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r177, %r19, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r124, %r19, %r50; add.s32 %r125, %r124, %r50; rem.s32 %r177, %r125, %r50; $L__BB0_46: @%p8 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r178, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r126, %r14, %r49; add.s32 %r127, %r126, %r49; rem.s32 %r178, %r127, %r49; $L__BB0_49: add.s32 %r128, %r177, %r4; 
mad.lo.s32 %r129, %r128, %r49, %r178; mul.wide.s32 %rd38, %r129, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f176, [%rd39]; ld.global.nc.f32 %f177, [%rd40]; mul.f32 %f178, %f177, %f177; fma.rn.f32 %f179, %f176, %f176, %f178; ld.global.nc.f32 %f180, [%rd41]; fma.rn.f32 %f38, %f180, %f180, %f179; mul.f32 %f181, %f12, %f14; mul.f32 %f182, %f11, %f15; sub.f32 %f183, %f182, %f181; mul.f32 %f184, %f10, %f15; mul.f32 %f185, %f12, %f13; sub.f32 %f186, %f185, %f184; mul.f32 %f187, %f11, %f13; mul.f32 %f188, %f10, %f14; sub.f32 %f189, %f188, %f187; mul.f32 %f190, %f2, %f186; fma.rn.f32 %f191, %f1, %f183, %f190; fma.rn.f32 %f39, %f3, %f189, %f191; mul.f32 %f192, %f2, %f11; fma.rn.f32 %f193, %f1, %f10, %f192; fma.rn.f32 %f194, %f3, %f12, %f193; add.f32 %f195, %f194, 0f3F800000; mul.f32 %f196, %f2, %f14; fma.rn.f32 %f197, %f1, %f13, %f196; fma.rn.f32 %f198, %f3, %f15, %f197; add.f32 %f199, %f195, %f198; mul.f32 %f200, %f11, %f14; fma.rn.f32 %f201, %f10, %f13, %f200; fma.rn.f32 %f202, %f12, %f15, %f201; add.f32 %f40, %f202, %f199; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p55, %f41, 0f00000000; setp.eq.f32 %p56, %f42, 0f00000000; and.pred %p57, %p55, %p56; @%p57 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r140, %f40; shr.s32 %r141, %r140, 31; and.b32 %r142, %r141, 1078530011; mov.b32 %r143, %f39; and.b32 %r144, %r143, -2147483648; or.b32 %r145, %r144, %r142; mov.b32 %f292, %r145; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p58, %f41, 0f7F800000; setp.eq.f32 %p59, %f42, 0f7F800000; and.pred %p60, %p58, %p59; @%p60 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r135, %f40; setp.lt.s32 %p64, %r135, 0; selp.b32 %r136, 1075235812, 1061752795, %p64; mov.b32 %r137, %f39; and.b32 %r138, %r137, -2147483648; or.b32 %r139, %r138, %r136; mov.b32 %f292, %r139; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f203, %f42, %f41; min.f32 %f204, %f42, %f41; div.rn.f32 %f205, %f204, %f203; mul.rn.f32 %f206, 
%f205, %f205; mov.f32 %f207, 0fC0B59883; mov.f32 %f208, 0fBF52C7EA; fma.rn.f32 %f209, %f206, %f208, %f207; mov.f32 %f210, 0fC0D21907; fma.rn.f32 %f211, %f209, %f206, %f210; mul.f32 %f212, %f206, %f211; mul.f32 %f213, %f205, %f212; add.f32 %f214, %f206, 0f41355DC0; mov.f32 %f215, 0f41E6BD60; fma.rn.f32 %f216, %f214, %f206, %f215; mov.f32 %f217, 0f419D92C8; fma.rn.f32 %f218, %f216, %f206, %f217; rcp.rn.f32 %f219, %f218; fma.rn.f32 %f220, %f213, %f219, %f205; mov.f32 %f221, 0f3FC90FDB; sub.f32 %f222, %f221, %f220; setp.gt.f32 %p61, %f42, %f41; selp.f32 %f223, %f222, %f220, %p61; mov.b32 %r130, %f40; setp.lt.s32 %p62, %r130, 0; mov.f32 %f224, 0f40490FDB; sub.f32 %f225, %f224, %f223; selp.f32 %f226, %f225, %f223, %p62; mov.b32 %r131, %f226; mov.b32 %r132, %f39; and.b32 %r133, %r132, -2147483648; or.b32 %r134, %r133, %r131; mov.b32 %f227, %r134; add.f32 %f228, %f41, %f42; setp.le.f32 %p63, %f228, 0f7F800000; selp.f32 %f292, %f227, %f228, %p63; $L__BB0_54: add.f32 %f229, %f292, %f292; setp.eq.f32 %p65, %f38, 0f00000000; selp.f32 %f230, 0f3F800000, 0f3F000000, %p65; fma.rn.f32 %f293, %f230, %f229, %f293; $L__BB0_55: @%p14 bra $L__BB0_69; setp.lt.s32 %p67, %r2, 1; and.pred %p69, %p67, %p9; @%p69 bra $L__BB0_69; @%p9 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r179, %r19, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r146, %r19, %r50; add.s32 %r147, %r146, %r50; rem.s32 %r179, %r147, %r50; $L__BB0_60: add.s32 %r45, %r179, %r4; @%p8 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r150, %r49, -1; min.s32 %r180, %r6, %r150; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r148, %r6, %r49; add.s32 %r149, %r148, %r49; rem.s32 %r180, %r149, %r49; $L__BB0_63: mad.lo.s32 %r151, %r45, %r49, %r180; mul.wide.s32 %rd42, %r151, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f231, [%rd43]; ld.global.nc.f32 %f232, [%rd44]; mul.f32 %f233, %f232, %f232; fma.rn.f32 %f234, %f231, %f231, %f233; ld.global.nc.f32 %f235, 
[%rd45]; fma.rn.f32 %f49, %f235, %f235, %f234; mul.f32 %f236, %f5, %f15; mul.f32 %f237, %f6, %f14; sub.f32 %f238, %f237, %f236; mul.f32 %f239, %f6, %f13; mul.f32 %f240, %f4, %f15; sub.f32 %f241, %f240, %f239; mul.f32 %f242, %f4, %f14; mul.f32 %f243, %f5, %f13; sub.f32 %f244, %f243, %f242; mul.f32 %f245, %f2, %f241; fma.rn.f32 %f246, %f1, %f238, %f245; fma.rn.f32 %f50, %f3, %f244, %f246; mul.f32 %f247, %f2, %f14; fma.rn.f32 %f248, %f1, %f13, %f247; fma.rn.f32 %f249, %f3, %f15, %f248; add.f32 %f250, %f249, 0f3F800000; mul.f32 %f251, %f2, %f5; fma.rn.f32 %f252, %f1, %f4, %f251; fma.rn.f32 %f253, %f3, %f6, %f252; add.f32 %f254, %f253, %f250; mul.f32 %f255, %f5, %f14; fma.rn.f32 %f256, %f4, %f13, %f255; fma.rn.f32 %f257, %f6, %f15, %f256; add.f32 %f51, %f257, %f254; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p72, %f52, 0f00000000; setp.eq.f32 %p73, %f53, 0f00000000; and.pred %p74, %p72, %p73; @%p74 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r162, %f51; shr.s32 %r163, %r162, 31; and.b32 %r164, %r163, 1078530011; mov.b32 %r165, %f50; and.b32 %r166, %r165, -2147483648; or.b32 %r167, %r164, %r166; mov.b32 %f294, %r167; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p75, %f52, 0f7F800000; setp.eq.f32 %p76, %f53, 0f7F800000; and.pred %p77, %p75, %p76; @%p77 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r157, %f51; setp.lt.s32 %p81, %r157, 0; selp.b32 %r158, 1075235812, 1061752795, %p81; mov.b32 %r159, %f50; and.b32 %r160, %r159, -2147483648; or.b32 %r161, %r158, %r160; mov.b32 %f294, %r161; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f258, %f53, %f52; min.f32 %f259, %f53, %f52; div.rn.f32 %f260, %f259, %f258; mul.rn.f32 %f261, %f260, %f260; mov.f32 %f262, 0fC0B59883; mov.f32 %f263, 0fBF52C7EA; fma.rn.f32 %f264, %f261, %f263, %f262; mov.f32 %f265, 0fC0D21907; fma.rn.f32 %f266, %f264, %f261, %f265; mul.f32 %f267, %f261, %f266; mul.f32 %f268, %f260, %f267; add.f32 %f269, %f261, 0f41355DC0; mov.f32 %f270, 0f41E6BD60; fma.rn.f32 %f271, %f269, %f261, 
%f270; mov.f32 %f272, 0f419D92C8; fma.rn.f32 %f273, %f271, %f261, %f272; rcp.rn.f32 %f274, %f273; fma.rn.f32 %f275, %f268, %f274, %f260; mov.f32 %f276, 0f3FC90FDB; sub.f32 %f277, %f276, %f275; setp.gt.f32 %p78, %f53, %f52; selp.f32 %f278, %f277, %f275, %p78; mov.b32 %r152, %f51; setp.lt.s32 %p79, %r152, 0; mov.f32 %f279, 0f40490FDB; sub.f32 %f280, %f279, %f278; selp.f32 %f281, %f280, %f278, %p79; mov.b32 %r153, %f281; mov.b32 %r154, %f50; and.b32 %r155, %r154, -2147483648; or.b32 %r156, %r155, %r153; mov.b32 %f282, %r156; add.f32 %f283, %f52, %f53; setp.le.f32 %p80, %f283, 0f7F800000; selp.f32 %f294, %f282, %f283, %p80; $L__BB0_68: add.f32 %f284, %f294, %f294; setp.eq.f32 %p82, %f49, 0f00000000; selp.f32 %f285, 0f3F800000, 0f3F000000, %p82; fma.rn.f32 %f293, %f285, %f284, %f293; $L__BB0_69: mul.f32 %f286, %f293, %f60; st.global.f32 [%rd4], %f286; $L__BB0_71: ret; } ` settopologicalchargelattice_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl settopologicalchargelattice .visible .entry settopologicalchargelattice( .param .u64 settopologicalchargelattice_param_0, .param .u64 settopologicalchargelattice_param_1, .param .u64 settopologicalchargelattice_param_2, .param .u64 settopologicalchargelattice_param_3, .param .f32 settopologicalchargelattice_param_4, .param .u32 settopologicalchargelattice_param_5, .param .u32 settopologicalchargelattice_param_6, .param .u32 settopologicalchargelattice_param_7, .param .u8 settopologicalchargelattice_param_8 ) { .reg .pred %p<83>; .reg .b16 %rs<4>; .reg .f32 %f<296>; .reg .b32 %r<181>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalchargelattice_param_8]; ld.param.u64 %rd5, [settopologicalchargelattice_param_0]; ld.param.u64 %rd6, [settopologicalchargelattice_param_1]; ld.param.u64 %rd7, [settopologicalchargelattice_param_2]; ld.param.u64 %rd8, [settopologicalchargelattice_param_3]; ld.param.f32 %f60, [settopologicalchargelattice_param_4]; ld.param.u32 %r49, [settopologicalchargelattice_param_5]; 
ld.param.u32 %r50, [settopologicalchargelattice_param_6]; ld.param.u32 %r51, [settopologicalchargelattice_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r52, %ntid.x; mov.u32 %r53, %ctaid.x; mov.u32 %r54, %tid.x; mad.lo.s32 %r1, %r53, %r52, %r54; mov.u32 %r55, %ntid.y; mov.u32 %r56, %ctaid.y; mov.u32 %r57, %tid.y; mad.lo.s32 %r2, %r56, %r55, %r57; mov.u32 %r58, %ntid.z; mov.u32 %r59, %ctaid.z; mov.u32 %r60, %tid.z; mad.lo.s32 %r3, %r59, %r58, %r60; setp.ge.s32 %p2, %r1, %r49; setp.ge.s32 %p3, %r2, %r50; or.pred %p4, %p2, %p3; setp.ge.s32 %p5, %r3, %r51; or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_71; mul.lo.s32 %r4, %r3, %r50; add.s32 %r61, %r4, %r2; mul.lo.s32 %r5, %r61, %r49; add.s32 %r62, %r5, %r1; mul.wide.s32 %rd9, %r62, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f61, %f2, %f2; fma.rn.f32 %f62, %f1, %f1, %f61; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f63, %f3, %f3, %f62; setp.eq.f32 %p7, %f63, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p7 bra $L__BB0_70; bra.uni $L__BB0_2; $L__BB0_70: mov.u32 %r168, 0; st.global.u32 [%rd4], %r168; bra.uni $L__BB0_71; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p8, %rs1, 0; add.s32 %r6, %r1, 1; @%p8 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r65, %r49, -1; min.s32 %r169, %r6, %r65; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r63, %r6, %r49; add.s32 %r64, %r63, %r49; rem.s32 %r169, %r64, %r49; $L__BB0_5: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p9, %rs2, 0; add.s32 %r10, %r2, 1; @%p9 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r68, %r50, -1; min.s32 %r170, %r10, %r68; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r66, %r10, %r50; add.s32 %r67, %r66, %r50; rem.s32 %r170, %r67, %r50; $L__BB0_8: add.s32 %r14, %r1, -1; @%p8 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r171, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: 
rem.s32 %r69, %r14, %r49; add.s32 %r70, %r69, %r49; rem.s32 %r171, %r70, %r49; $L__BB0_11: add.s32 %r18, %r171, %r5; add.s32 %r19, %r2, -1; add.s32 %r20, %r169, %r5; @%p9 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r172, %r19, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r71, %r19, %r50; add.s32 %r72, %r71, %r50; rem.s32 %r172, %r72, %r50; $L__BB0_14: add.s32 %r73, %r172, %r4; mad.lo.s32 %r74, %r73, %r49, %r1; mul.wide.s32 %rd14, %r20, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f4, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f5, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f6, [%rd17]; add.s32 %r75, %r170, %r4; mad.lo.s32 %r76, %r75, %r49, %r1; mul.wide.s32 %rd18, %r76, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f7, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f8, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f9, [%rd21]; mul.wide.s32 %rd22, %r18, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f10, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f11, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f12, [%rd25]; mul.wide.s32 %rd26, %r74, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f13, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f14, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f15, [%rd29]; setp.ne.s16 %p12, %rs1, 0; setp.lt.s32 %p13, %r6, %r49; or.pred %p1, %p13, %p12; not.pred %p14, %p1; mov.f32 %f293, 0f00000000; @%p14 bra $L__BB0_28; setp.ge.s32 %p15, %r10, %r50; and.pred %p17, %p15, %p9; @%p17 bra $L__BB0_28; @%p9 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r79, %r50, -1; min.s32 %r173, %r10, %r79; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r77, %r10, %r50; add.s32 %r78, %r77, %r50; rem.s32 %r173, %r78, %r50; $L__BB0_19: @%p8 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r82, %r49, -1; min.s32 %r174, %r6, %r82; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r80, %r6, %r49; add.s32 %r81, %r80, %r49; rem.s32 %r174, %r81, %r49; $L__BB0_22: 
add.s32 %r83, %r173, %r4; mad.lo.s32 %r84, %r83, %r49, %r174; mul.wide.s32 %rd30, %r84, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f66, [%rd31]; ld.global.nc.f32 %f67, [%rd32]; mul.f32 %f68, %f67, %f67; fma.rn.f32 %f69, %f66, %f66, %f68; ld.global.nc.f32 %f70, [%rd33]; fma.rn.f32 %f16, %f70, %f70, %f69; mul.f32 %f71, %f6, %f8; mul.f32 %f72, %f5, %f9; sub.f32 %f73, %f72, %f71; mul.f32 %f74, %f4, %f9; mul.f32 %f75, %f6, %f7; sub.f32 %f76, %f75, %f74; mul.f32 %f77, %f5, %f7; mul.f32 %f78, %f4, %f8; sub.f32 %f79, %f78, %f77; mul.f32 %f80, %f2, %f76; fma.rn.f32 %f81, %f1, %f73, %f80; fma.rn.f32 %f17, %f3, %f79, %f81; mul.f32 %f82, %f2, %f5; fma.rn.f32 %f83, %f1, %f4, %f82; fma.rn.f32 %f84, %f3, %f6, %f83; add.f32 %f85, %f84, 0f3F800000; mul.f32 %f86, %f2, %f8; fma.rn.f32 %f87, %f1, %f7, %f86; fma.rn.f32 %f88, %f3, %f9, %f87; add.f32 %f89, %f85, %f88; mul.f32 %f90, %f5, %f8; fma.rn.f32 %f91, %f4, %f7, %f90; fma.rn.f32 %f92, %f6, %f9, %f91; add.f32 %f18, %f92, %f89; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p20, %f19, 0f00000000; setp.eq.f32 %p21, %f20, 0f00000000; and.pred %p22, %p20, %p21; @%p22 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r95, %f18; shr.s32 %r96, %r95, 31; and.b32 %r97, %r96, 1078530011; mov.b32 %r98, %f17; and.b32 %r99, %r98, -2147483648; or.b32 %r100, %r99, %r97; mov.b32 %f288, %r100; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p23, %f19, 0f7F800000; setp.eq.f32 %p24, %f20, 0f7F800000; and.pred %p25, %p23, %p24; @%p25 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r90, %f18; setp.lt.s32 %p29, %r90, 0; selp.b32 %r91, 1075235812, 1061752795, %p29; mov.b32 %r92, %f17; and.b32 %r93, %r92, -2147483648; or.b32 %r94, %r93, %r91; mov.b32 %f288, %r94; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f93, %f20, %f19; min.f32 %f94, %f20, %f19; div.rn.f32 %f95, %f94, %f93; mul.rn.f32 %f96, %f95, %f95; mov.f32 %f97, 0fC0B59883; mov.f32 %f98, 0fBF52C7EA; fma.rn.f32 %f99, %f96, 
%f98, %f97; mov.f32 %f100, 0fC0D21907; fma.rn.f32 %f101, %f99, %f96, %f100; mul.f32 %f102, %f96, %f101; mul.f32 %f103, %f95, %f102; add.f32 %f104, %f96, 0f41355DC0; mov.f32 %f105, 0f41E6BD60; fma.rn.f32 %f106, %f104, %f96, %f105; mov.f32 %f107, 0f419D92C8; fma.rn.f32 %f108, %f106, %f96, %f107; rcp.rn.f32 %f109, %f108; fma.rn.f32 %f110, %f103, %f109, %f95; mov.f32 %f111, 0f3FC90FDB; sub.f32 %f112, %f111, %f110; setp.gt.f32 %p26, %f20, %f19; selp.f32 %f113, %f112, %f110, %p26; mov.b32 %r85, %f18; setp.lt.s32 %p27, %r85, 0; mov.f32 %f114, 0f40490FDB; sub.f32 %f115, %f114, %f113; selp.f32 %f116, %f115, %f113, %p27; mov.b32 %r86, %f116; mov.b32 %r87, %f17; and.b32 %r88, %r87, -2147483648; or.b32 %r89, %r88, %r86; mov.b32 %f117, %r89; add.f32 %f118, %f19, %f20; setp.le.f32 %p28, %f118, 0f7F800000; selp.f32 %f288, %f117, %f118, %p28; $L__BB0_27: add.f32 %f119, %f288, %f288; setp.eq.f32 %p30, %f16, 0f00000000; selp.f32 %f120, 0f3F800000, 0f3F000000, %p30; fma.rn.f32 %f293, %f120, %f119, 0f00000000; $L__BB0_28: setp.lt.s32 %p31, %r1, 1; and.pred %p33, %p31, %p8; @%p33 bra $L__BB0_55; setp.ge.s32 %p34, %r10, %r50; and.pred %p36, %p34, %p9; @%p36 bra $L__BB0_42; @%p9 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r103, %r50, -1; min.s32 %r175, %r10, %r103; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r101, %r10, %r50; add.s32 %r102, %r101, %r50; rem.s32 %r175, %r102, %r50; $L__BB0_33: @%p8 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r176, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r104, %r14, %r49; add.s32 %r105, %r104, %r49; rem.s32 %r176, %r105, %r49; $L__BB0_36: add.s32 %r106, %r175, %r4; mad.lo.s32 %r107, %r106, %r49, %r176; mul.wide.s32 %rd34, %r107, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f121, [%rd35]; ld.global.nc.f32 %f122, [%rd36]; mul.f32 %f123, %f122, %f122; fma.rn.f32 %f124, %f121, %f121, %f123; ld.global.nc.f32 %f125, [%rd37]; fma.rn.f32 %f27, %f125, %f125, %f124; mul.f32 
%f126, %f9, %f11; mul.f32 %f127, %f8, %f12; sub.f32 %f128, %f127, %f126; mul.f32 %f129, %f7, %f12; mul.f32 %f130, %f9, %f10; sub.f32 %f131, %f130, %f129; mul.f32 %f132, %f8, %f10; mul.f32 %f133, %f7, %f11; sub.f32 %f134, %f133, %f132; mul.f32 %f135, %f2, %f131; fma.rn.f32 %f136, %f1, %f128, %f135; fma.rn.f32 %f28, %f3, %f134, %f136; mul.f32 %f137, %f2, %f8; fma.rn.f32 %f138, %f1, %f7, %f137; fma.rn.f32 %f139, %f3, %f9, %f138; add.f32 %f140, %f139, 0f3F800000; mul.f32 %f141, %f2, %f11; fma.rn.f32 %f142, %f1, %f10, %f141; fma.rn.f32 %f143, %f3, %f12, %f142; add.f32 %f144, %f140, %f143; mul.f32 %f145, %f8, %f11; fma.rn.f32 %f146, %f7, %f10, %f145; fma.rn.f32 %f147, %f9, %f12, %f146; add.f32 %f29, %f147, %f144; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p39, %f30, 0f00000000; setp.eq.f32 %p40, %f31, 0f00000000; and.pred %p41, %p39, %p40; @%p41 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r118, %f29; shr.s32 %r119, %r118, 31; and.b32 %r120, %r119, 1078530011; mov.b32 %r121, %f28; and.b32 %r122, %r121, -2147483648; or.b32 %r123, %r122, %r120; mov.b32 %f290, %r123; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p42, %f30, 0f7F800000; setp.eq.f32 %p43, %f31, 0f7F800000; and.pred %p44, %p42, %p43; @%p44 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r113, %f29; setp.lt.s32 %p48, %r113, 0; selp.b32 %r114, 1075235812, 1061752795, %p48; mov.b32 %r115, %f28; and.b32 %r116, %r115, -2147483648; or.b32 %r117, %r116, %r114; mov.b32 %f290, %r117; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f148, %f31, %f30; min.f32 %f149, %f31, %f30; div.rn.f32 %f150, %f149, %f148; mul.rn.f32 %f151, %f150, %f150; mov.f32 %f152, 0fC0B59883; mov.f32 %f153, 0fBF52C7EA; fma.rn.f32 %f154, %f151, %f153, %f152; mov.f32 %f155, 0fC0D21907; fma.rn.f32 %f156, %f154, %f151, %f155; mul.f32 %f157, %f151, %f156; mul.f32 %f158, %f150, %f157; add.f32 %f159, %f151, 0f41355DC0; mov.f32 %f160, 0f41E6BD60; fma.rn.f32 %f161, %f159, %f151, %f160; mov.f32 %f162, 0f419D92C8; fma.rn.f32 %f163, 
%f161, %f151, %f162; rcp.rn.f32 %f164, %f163; fma.rn.f32 %f165, %f158, %f164, %f150; mov.f32 %f166, 0f3FC90FDB; sub.f32 %f167, %f166, %f165; setp.gt.f32 %p45, %f31, %f30; selp.f32 %f168, %f167, %f165, %p45; mov.b32 %r108, %f29; setp.lt.s32 %p46, %r108, 0; mov.f32 %f169, 0f40490FDB; sub.f32 %f170, %f169, %f168; selp.f32 %f171, %f170, %f168, %p46; mov.b32 %r109, %f171; mov.b32 %r110, %f28; and.b32 %r111, %r110, -2147483648; or.b32 %r112, %r111, %r109; mov.b32 %f172, %r112; add.f32 %f173, %f30, %f31; setp.le.f32 %p47, %f173, 0f7F800000; selp.f32 %f290, %f172, %f173, %p47; $L__BB0_41: add.f32 %f174, %f290, %f290; setp.eq.f32 %p49, %f27, 0f00000000; selp.f32 %f175, 0f3F800000, 0f3F000000, %p49; fma.rn.f32 %f293, %f175, %f174, %f293; $L__BB0_42: setp.lt.s32 %p50, %r2, 1; and.pred %p52, %p50, %p9; @%p52 bra $L__BB0_55; @%p9 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r177, %r19, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r124, %r19, %r50; add.s32 %r125, %r124, %r50; rem.s32 %r177, %r125, %r50; $L__BB0_46: @%p8 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r178, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r126, %r14, %r49; add.s32 %r127, %r126, %r49; rem.s32 %r178, %r127, %r49; $L__BB0_49: add.s32 %r128, %r177, %r4; mad.lo.s32 %r129, %r128, %r49, %r178; mul.wide.s32 %rd38, %r129, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f176, [%rd39]; ld.global.nc.f32 %f177, [%rd40]; mul.f32 %f178, %f177, %f177; fma.rn.f32 %f179, %f176, %f176, %f178; ld.global.nc.f32 %f180, [%rd41]; fma.rn.f32 %f38, %f180, %f180, %f179; mul.f32 %f181, %f12, %f14; mul.f32 %f182, %f11, %f15; sub.f32 %f183, %f182, %f181; mul.f32 %f184, %f10, %f15; mul.f32 %f185, %f12, %f13; sub.f32 %f186, %f185, %f184; mul.f32 %f187, %f11, %f13; mul.f32 %f188, %f10, %f14; sub.f32 %f189, %f188, %f187; mul.f32 %f190, %f2, %f186; fma.rn.f32 %f191, %f1, %f183, %f190; fma.rn.f32 %f39, %f3, %f189, %f191; mul.f32 %f192, %f2, %f11; 
fma.rn.f32 %f193, %f1, %f10, %f192; fma.rn.f32 %f194, %f3, %f12, %f193; add.f32 %f195, %f194, 0f3F800000; mul.f32 %f196, %f2, %f14; fma.rn.f32 %f197, %f1, %f13, %f196; fma.rn.f32 %f198, %f3, %f15, %f197; add.f32 %f199, %f195, %f198; mul.f32 %f200, %f11, %f14; fma.rn.f32 %f201, %f10, %f13, %f200; fma.rn.f32 %f202, %f12, %f15, %f201; add.f32 %f40, %f202, %f199; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p55, %f41, 0f00000000; setp.eq.f32 %p56, %f42, 0f00000000; and.pred %p57, %p55, %p56; @%p57 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r140, %f40; shr.s32 %r141, %r140, 31; and.b32 %r142, %r141, 1078530011; mov.b32 %r143, %f39; and.b32 %r144, %r143, -2147483648; or.b32 %r145, %r144, %r142; mov.b32 %f292, %r145; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p58, %f41, 0f7F800000; setp.eq.f32 %p59, %f42, 0f7F800000; and.pred %p60, %p58, %p59; @%p60 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r135, %f40; setp.lt.s32 %p64, %r135, 0; selp.b32 %r136, 1075235812, 1061752795, %p64; mov.b32 %r137, %f39; and.b32 %r138, %r137, -2147483648; or.b32 %r139, %r138, %r136; mov.b32 %f292, %r139; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f203, %f42, %f41; min.f32 %f204, %f42, %f41; div.rn.f32 %f205, %f204, %f203; mul.rn.f32 %f206, %f205, %f205; mov.f32 %f207, 0fC0B59883; mov.f32 %f208, 0fBF52C7EA; fma.rn.f32 %f209, %f206, %f208, %f207; mov.f32 %f210, 0fC0D21907; fma.rn.f32 %f211, %f209, %f206, %f210; mul.f32 %f212, %f206, %f211; mul.f32 %f213, %f205, %f212; add.f32 %f214, %f206, 0f41355DC0; mov.f32 %f215, 0f41E6BD60; fma.rn.f32 %f216, %f214, %f206, %f215; mov.f32 %f217, 0f419D92C8; fma.rn.f32 %f218, %f216, %f206, %f217; rcp.rn.f32 %f219, %f218; fma.rn.f32 %f220, %f213, %f219, %f205; mov.f32 %f221, 0f3FC90FDB; sub.f32 %f222, %f221, %f220; setp.gt.f32 %p61, %f42, %f41; selp.f32 %f223, %f222, %f220, %p61; mov.b32 %r130, %f40; setp.lt.s32 %p62, %r130, 0; mov.f32 %f224, 0f40490FDB; sub.f32 %f225, %f224, %f223; selp.f32 %f226, %f225, %f223, %p62; mov.b32 
%r131, %f226; mov.b32 %r132, %f39; and.b32 %r133, %r132, -2147483648; or.b32 %r134, %r133, %r131; mov.b32 %f227, %r134; add.f32 %f228, %f41, %f42; setp.le.f32 %p63, %f228, 0f7F800000; selp.f32 %f292, %f227, %f228, %p63; $L__BB0_54: add.f32 %f229, %f292, %f292; setp.eq.f32 %p65, %f38, 0f00000000; selp.f32 %f230, 0f3F800000, 0f3F000000, %p65; fma.rn.f32 %f293, %f230, %f229, %f293; $L__BB0_55: @%p14 bra $L__BB0_69; setp.lt.s32 %p67, %r2, 1; and.pred %p69, %p67, %p9; @%p69 bra $L__BB0_69; @%p9 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r179, %r19, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r146, %r19, %r50; add.s32 %r147, %r146, %r50; rem.s32 %r179, %r147, %r50; $L__BB0_60: add.s32 %r45, %r179, %r4; @%p8 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r150, %r49, -1; min.s32 %r180, %r6, %r150; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r148, %r6, %r49; add.s32 %r149, %r148, %r49; rem.s32 %r180, %r149, %r49; $L__BB0_63: mad.lo.s32 %r151, %r45, %r49, %r180; mul.wide.s32 %rd42, %r151, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f231, [%rd43]; ld.global.nc.f32 %f232, [%rd44]; mul.f32 %f233, %f232, %f232; fma.rn.f32 %f234, %f231, %f231, %f233; ld.global.nc.f32 %f235, [%rd45]; fma.rn.f32 %f49, %f235, %f235, %f234; mul.f32 %f236, %f5, %f15; mul.f32 %f237, %f6, %f14; sub.f32 %f238, %f237, %f236; mul.f32 %f239, %f6, %f13; mul.f32 %f240, %f4, %f15; sub.f32 %f241, %f240, %f239; mul.f32 %f242, %f4, %f14; mul.f32 %f243, %f5, %f13; sub.f32 %f244, %f243, %f242; mul.f32 %f245, %f2, %f241; fma.rn.f32 %f246, %f1, %f238, %f245; fma.rn.f32 %f50, %f3, %f244, %f246; mul.f32 %f247, %f2, %f14; fma.rn.f32 %f248, %f1, %f13, %f247; fma.rn.f32 %f249, %f3, %f15, %f248; add.f32 %f250, %f249, 0f3F800000; mul.f32 %f251, %f2, %f5; fma.rn.f32 %f252, %f1, %f4, %f251; fma.rn.f32 %f253, %f3, %f6, %f252; add.f32 %f254, %f253, %f250; mul.f32 %f255, %f5, %f14; fma.rn.f32 %f256, %f4, %f13, %f255; fma.rn.f32 %f257, %f6, %f15, 
%f256; add.f32 %f51, %f257, %f254; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p72, %f52, 0f00000000; setp.eq.f32 %p73, %f53, 0f00000000; and.pred %p74, %p72, %p73; @%p74 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r162, %f51; shr.s32 %r163, %r162, 31; and.b32 %r164, %r163, 1078530011; mov.b32 %r165, %f50; and.b32 %r166, %r165, -2147483648; or.b32 %r167, %r164, %r166; mov.b32 %f294, %r167; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p75, %f52, 0f7F800000; setp.eq.f32 %p76, %f53, 0f7F800000; and.pred %p77, %p75, %p76; @%p77 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r157, %f51; setp.lt.s32 %p81, %r157, 0; selp.b32 %r158, 1075235812, 1061752795, %p81; mov.b32 %r159, %f50; and.b32 %r160, %r159, -2147483648; or.b32 %r161, %r158, %r160; mov.b32 %f294, %r161; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f258, %f53, %f52; min.f32 %f259, %f53, %f52; div.rn.f32 %f260, %f259, %f258; mul.rn.f32 %f261, %f260, %f260; mov.f32 %f262, 0fC0B59883; mov.f32 %f263, 0fBF52C7EA; fma.rn.f32 %f264, %f261, %f263, %f262; mov.f32 %f265, 0fC0D21907; fma.rn.f32 %f266, %f264, %f261, %f265; mul.f32 %f267, %f261, %f266; mul.f32 %f268, %f260, %f267; add.f32 %f269, %f261, 0f41355DC0; mov.f32 %f270, 0f41E6BD60; fma.rn.f32 %f271, %f269, %f261, %f270; mov.f32 %f272, 0f419D92C8; fma.rn.f32 %f273, %f271, %f261, %f272; rcp.rn.f32 %f274, %f273; fma.rn.f32 %f275, %f268, %f274, %f260; mov.f32 %f276, 0f3FC90FDB; sub.f32 %f277, %f276, %f275; setp.gt.f32 %p78, %f53, %f52; selp.f32 %f278, %f277, %f275, %p78; mov.b32 %r152, %f51; setp.lt.s32 %p79, %r152, 0; mov.f32 %f279, 0f40490FDB; sub.f32 %f280, %f279, %f278; selp.f32 %f281, %f280, %f278, %p79; mov.b32 %r153, %f281; mov.b32 %r154, %f50; and.b32 %r155, %r154, -2147483648; or.b32 %r156, %r155, %r153; mov.b32 %f282, %r156; add.f32 %f283, %f52, %f53; setp.le.f32 %p80, %f283, 0f7F800000; selp.f32 %f294, %f282, %f283, %p80; $L__BB0_68: add.f32 %f284, %f294, %f294; setp.eq.f32 %p82, %f49, 0f00000000; selp.f32 %f285, 0f3F800000, 
0f3F000000, %p82; fma.rn.f32 %f293, %f285, %f284, %f293; $L__BB0_69: mul.f32 %f286, %f293, %f60; st.global.f32 [%rd4], %f286; $L__BB0_71: ret; } ` settopologicalchargelattice_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl settopologicalchargelattice .visible .entry settopologicalchargelattice( .param .u64 settopologicalchargelattice_param_0, .param .u64 settopologicalchargelattice_param_1, .param .u64 settopologicalchargelattice_param_2, .param .u64 settopologicalchargelattice_param_3, .param .f32 settopologicalchargelattice_param_4, .param .u32 settopologicalchargelattice_param_5, .param .u32 settopologicalchargelattice_param_6, .param .u32 settopologicalchargelattice_param_7, .param .u8 settopologicalchargelattice_param_8 ) { .reg .pred %p<83>; .reg .b16 %rs<4>; .reg .f32 %f<296>; .reg .b32 %r<181>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalchargelattice_param_8]; ld.param.u64 %rd5, [settopologicalchargelattice_param_0]; ld.param.u64 %rd6, [settopologicalchargelattice_param_1]; ld.param.u64 %rd7, [settopologicalchargelattice_param_2]; ld.param.u64 %rd8, [settopologicalchargelattice_param_3]; ld.param.f32 %f60, [settopologicalchargelattice_param_4]; ld.param.u32 %r49, [settopologicalchargelattice_param_5]; ld.param.u32 %r50, [settopologicalchargelattice_param_6]; ld.param.u32 %r51, [settopologicalchargelattice_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r52, %ntid.x; mov.u32 %r53, %ctaid.x; mov.u32 %r54, %tid.x; mad.lo.s32 %r1, %r53, %r52, %r54; mov.u32 %r55, %ntid.y; mov.u32 %r56, %ctaid.y; mov.u32 %r57, %tid.y; mad.lo.s32 %r2, %r56, %r55, %r57; mov.u32 %r58, %ntid.z; mov.u32 %r59, %ctaid.z; mov.u32 %r60, %tid.z; mad.lo.s32 %r3, %r59, %r58, %r60; setp.ge.s32 %p2, %r1, %r49; setp.ge.s32 %p3, %r2, %r50; or.pred %p4, %p2, %p3; setp.ge.s32 %p5, %r3, %r51; or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_71; mul.lo.s32 %r4, %r3, %r50; add.s32 %r61, %r4, %r2; mul.lo.s32 %r5, %r61, 
%r49; add.s32 %r62, %r5, %r1; mul.wide.s32 %rd9, %r62, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f61, %f2, %f2; fma.rn.f32 %f62, %f1, %f1, %f61; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f63, %f3, %f3, %f62; setp.eq.f32 %p7, %f63, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p7 bra $L__BB0_70; bra.uni $L__BB0_2; $L__BB0_70: mov.u32 %r168, 0; st.global.u32 [%rd4], %r168; bra.uni $L__BB0_71; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p8, %rs1, 0; add.s32 %r6, %r1, 1; @%p8 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r65, %r49, -1; min.s32 %r169, %r6, %r65; bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r63, %r6, %r49; add.s32 %r64, %r63, %r49; rem.s32 %r169, %r64, %r49; $L__BB0_5: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p9, %rs2, 0; add.s32 %r10, %r2, 1; @%p9 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r68, %r50, -1; min.s32 %r170, %r10, %r68; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r66, %r10, %r50; add.s32 %r67, %r66, %r50; rem.s32 %r170, %r67, %r50; $L__BB0_8: add.s32 %r14, %r1, -1; @%p8 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r171, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r69, %r14, %r49; add.s32 %r70, %r69, %r49; rem.s32 %r171, %r70, %r49; $L__BB0_11: add.s32 %r18, %r171, %r5; add.s32 %r19, %r2, -1; add.s32 %r20, %r169, %r5; @%p9 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r172, %r19, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r71, %r19, %r50; add.s32 %r72, %r71, %r50; rem.s32 %r172, %r72, %r50; $L__BB0_14: add.s32 %r73, %r172, %r4; mad.lo.s32 %r74, %r73, %r49, %r1; mul.wide.s32 %rd14, %r20, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f4, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f5, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f6, [%rd17]; add.s32 %r75, %r170, %r4; mad.lo.s32 %r76, %r75, %r49, %r1; mul.wide.s32 %rd18, %r76, 4; add.s64 %rd19, %rd3, 
%rd18; ld.global.nc.f32 %f7, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f8, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f9, [%rd21]; mul.wide.s32 %rd22, %r18, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f10, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f11, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f12, [%rd25]; mul.wide.s32 %rd26, %r74, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f13, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f14, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f15, [%rd29]; setp.ne.s16 %p12, %rs1, 0; setp.lt.s32 %p13, %r6, %r49; or.pred %p1, %p13, %p12; not.pred %p14, %p1; mov.f32 %f293, 0f00000000; @%p14 bra $L__BB0_28; setp.ge.s32 %p15, %r10, %r50; and.pred %p17, %p15, %p9; @%p17 bra $L__BB0_28; @%p9 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r79, %r50, -1; min.s32 %r173, %r10, %r79; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r77, %r10, %r50; add.s32 %r78, %r77, %r50; rem.s32 %r173, %r78, %r50; $L__BB0_19: @%p8 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r82, %r49, -1; min.s32 %r174, %r6, %r82; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r80, %r6, %r49; add.s32 %r81, %r80, %r49; rem.s32 %r174, %r81, %r49; $L__BB0_22: add.s32 %r83, %r173, %r4; mad.lo.s32 %r84, %r83, %r49, %r174; mul.wide.s32 %rd30, %r84, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f66, [%rd31]; ld.global.nc.f32 %f67, [%rd32]; mul.f32 %f68, %f67, %f67; fma.rn.f32 %f69, %f66, %f66, %f68; ld.global.nc.f32 %f70, [%rd33]; fma.rn.f32 %f16, %f70, %f70, %f69; mul.f32 %f71, %f6, %f8; mul.f32 %f72, %f5, %f9; sub.f32 %f73, %f72, %f71; mul.f32 %f74, %f4, %f9; mul.f32 %f75, %f6, %f7; sub.f32 %f76, %f75, %f74; mul.f32 %f77, %f5, %f7; mul.f32 %f78, %f4, %f8; sub.f32 %f79, %f78, %f77; mul.f32 %f80, %f2, %f76; fma.rn.f32 %f81, %f1, %f73, %f80; fma.rn.f32 %f17, %f3, %f79, %f81; mul.f32 %f82, %f2, %f5; fma.rn.f32 %f83, %f1, %f4, %f82; 
fma.rn.f32 %f84, %f3, %f6, %f83; add.f32 %f85, %f84, 0f3F800000; mul.f32 %f86, %f2, %f8; fma.rn.f32 %f87, %f1, %f7, %f86; fma.rn.f32 %f88, %f3, %f9, %f87; add.f32 %f89, %f85, %f88; mul.f32 %f90, %f5, %f8; fma.rn.f32 %f91, %f4, %f7, %f90; fma.rn.f32 %f92, %f6, %f9, %f91; add.f32 %f18, %f92, %f89; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p20, %f19, 0f00000000; setp.eq.f32 %p21, %f20, 0f00000000; and.pred %p22, %p20, %p21; @%p22 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r95, %f18; shr.s32 %r96, %r95, 31; and.b32 %r97, %r96, 1078530011; mov.b32 %r98, %f17; and.b32 %r99, %r98, -2147483648; or.b32 %r100, %r99, %r97; mov.b32 %f288, %r100; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p23, %f19, 0f7F800000; setp.eq.f32 %p24, %f20, 0f7F800000; and.pred %p25, %p23, %p24; @%p25 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r90, %f18; setp.lt.s32 %p29, %r90, 0; selp.b32 %r91, 1075235812, 1061752795, %p29; mov.b32 %r92, %f17; and.b32 %r93, %r92, -2147483648; or.b32 %r94, %r93, %r91; mov.b32 %f288, %r94; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f93, %f20, %f19; min.f32 %f94, %f20, %f19; div.rn.f32 %f95, %f94, %f93; mul.rn.f32 %f96, %f95, %f95; mov.f32 %f97, 0fC0B59883; mov.f32 %f98, 0fBF52C7EA; fma.rn.f32 %f99, %f96, %f98, %f97; mov.f32 %f100, 0fC0D21907; fma.rn.f32 %f101, %f99, %f96, %f100; mul.f32 %f102, %f96, %f101; mul.f32 %f103, %f95, %f102; add.f32 %f104, %f96, 0f41355DC0; mov.f32 %f105, 0f41E6BD60; fma.rn.f32 %f106, %f104, %f96, %f105; mov.f32 %f107, 0f419D92C8; fma.rn.f32 %f108, %f106, %f96, %f107; rcp.rn.f32 %f109, %f108; fma.rn.f32 %f110, %f103, %f109, %f95; mov.f32 %f111, 0f3FC90FDB; sub.f32 %f112, %f111, %f110; setp.gt.f32 %p26, %f20, %f19; selp.f32 %f113, %f112, %f110, %p26; mov.b32 %r85, %f18; setp.lt.s32 %p27, %r85, 0; mov.f32 %f114, 0f40490FDB; sub.f32 %f115, %f114, %f113; selp.f32 %f116, %f115, %f113, %p27; mov.b32 %r86, %f116; mov.b32 %r87, %f17; and.b32 %r88, %r87, -2147483648; or.b32 %r89, %r88, %r86; mov.b32 %f117, %r89; 
add.f32 %f118, %f19, %f20; setp.le.f32 %p28, %f118, 0f7F800000; selp.f32 %f288, %f117, %f118, %p28; $L__BB0_27: add.f32 %f119, %f288, %f288; setp.eq.f32 %p30, %f16, 0f00000000; selp.f32 %f120, 0f3F800000, 0f3F000000, %p30; fma.rn.f32 %f293, %f120, %f119, 0f00000000; $L__BB0_28: setp.lt.s32 %p31, %r1, 1; and.pred %p33, %p31, %p8; @%p33 bra $L__BB0_55; setp.ge.s32 %p34, %r10, %r50; and.pred %p36, %p34, %p9; @%p36 bra $L__BB0_42; @%p9 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r103, %r50, -1; min.s32 %r175, %r10, %r103; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r101, %r10, %r50; add.s32 %r102, %r101, %r50; rem.s32 %r175, %r102, %r50; $L__BB0_33: @%p8 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r176, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r104, %r14, %r49; add.s32 %r105, %r104, %r49; rem.s32 %r176, %r105, %r49; $L__BB0_36: add.s32 %r106, %r175, %r4; mad.lo.s32 %r107, %r106, %r49, %r176; mul.wide.s32 %rd34, %r107, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f121, [%rd35]; ld.global.nc.f32 %f122, [%rd36]; mul.f32 %f123, %f122, %f122; fma.rn.f32 %f124, %f121, %f121, %f123; ld.global.nc.f32 %f125, [%rd37]; fma.rn.f32 %f27, %f125, %f125, %f124; mul.f32 %f126, %f9, %f11; mul.f32 %f127, %f8, %f12; sub.f32 %f128, %f127, %f126; mul.f32 %f129, %f7, %f12; mul.f32 %f130, %f9, %f10; sub.f32 %f131, %f130, %f129; mul.f32 %f132, %f8, %f10; mul.f32 %f133, %f7, %f11; sub.f32 %f134, %f133, %f132; mul.f32 %f135, %f2, %f131; fma.rn.f32 %f136, %f1, %f128, %f135; fma.rn.f32 %f28, %f3, %f134, %f136; mul.f32 %f137, %f2, %f8; fma.rn.f32 %f138, %f1, %f7, %f137; fma.rn.f32 %f139, %f3, %f9, %f138; add.f32 %f140, %f139, 0f3F800000; mul.f32 %f141, %f2, %f11; fma.rn.f32 %f142, %f1, %f10, %f141; fma.rn.f32 %f143, %f3, %f12, %f142; add.f32 %f144, %f140, %f143; mul.f32 %f145, %f8, %f11; fma.rn.f32 %f146, %f7, %f10, %f145; fma.rn.f32 %f147, %f9, %f12, %f146; add.f32 %f29, %f147, %f144; abs.f32 %f30, %f29; 
abs.f32 %f31, %f28; setp.eq.f32 %p39, %f30, 0f00000000; setp.eq.f32 %p40, %f31, 0f00000000; and.pred %p41, %p39, %p40; @%p41 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r118, %f29; shr.s32 %r119, %r118, 31; and.b32 %r120, %r119, 1078530011; mov.b32 %r121, %f28; and.b32 %r122, %r121, -2147483648; or.b32 %r123, %r122, %r120; mov.b32 %f290, %r123; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p42, %f30, 0f7F800000; setp.eq.f32 %p43, %f31, 0f7F800000; and.pred %p44, %p42, %p43; @%p44 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r113, %f29; setp.lt.s32 %p48, %r113, 0; selp.b32 %r114, 1075235812, 1061752795, %p48; mov.b32 %r115, %f28; and.b32 %r116, %r115, -2147483648; or.b32 %r117, %r116, %r114; mov.b32 %f290, %r117; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f148, %f31, %f30; min.f32 %f149, %f31, %f30; div.rn.f32 %f150, %f149, %f148; mul.rn.f32 %f151, %f150, %f150; mov.f32 %f152, 0fC0B59883; mov.f32 %f153, 0fBF52C7EA; fma.rn.f32 %f154, %f151, %f153, %f152; mov.f32 %f155, 0fC0D21907; fma.rn.f32 %f156, %f154, %f151, %f155; mul.f32 %f157, %f151, %f156; mul.f32 %f158, %f150, %f157; add.f32 %f159, %f151, 0f41355DC0; mov.f32 %f160, 0f41E6BD60; fma.rn.f32 %f161, %f159, %f151, %f160; mov.f32 %f162, 0f419D92C8; fma.rn.f32 %f163, %f161, %f151, %f162; rcp.rn.f32 %f164, %f163; fma.rn.f32 %f165, %f158, %f164, %f150; mov.f32 %f166, 0f3FC90FDB; sub.f32 %f167, %f166, %f165; setp.gt.f32 %p45, %f31, %f30; selp.f32 %f168, %f167, %f165, %p45; mov.b32 %r108, %f29; setp.lt.s32 %p46, %r108, 0; mov.f32 %f169, 0f40490FDB; sub.f32 %f170, %f169, %f168; selp.f32 %f171, %f170, %f168, %p46; mov.b32 %r109, %f171; mov.b32 %r110, %f28; and.b32 %r111, %r110, -2147483648; or.b32 %r112, %r111, %r109; mov.b32 %f172, %r112; add.f32 %f173, %f30, %f31; setp.le.f32 %p47, %f173, 0f7F800000; selp.f32 %f290, %f172, %f173, %p47; $L__BB0_41: add.f32 %f174, %f290, %f290; setp.eq.f32 %p49, %f27, 0f00000000; selp.f32 %f175, 0f3F800000, 0f3F000000, %p49; fma.rn.f32 %f293, %f175, %f174, %f293; 
$L__BB0_42: setp.lt.s32 %p50, %r2, 1; and.pred %p52, %p50, %p9; @%p52 bra $L__BB0_55; @%p9 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r177, %r19, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r124, %r19, %r50; add.s32 %r125, %r124, %r50; rem.s32 %r177, %r125, %r50; $L__BB0_46: @%p8 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r178, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r126, %r14, %r49; add.s32 %r127, %r126, %r49; rem.s32 %r178, %r127, %r49; $L__BB0_49: add.s32 %r128, %r177, %r4; mad.lo.s32 %r129, %r128, %r49, %r178; mul.wide.s32 %rd38, %r129, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f176, [%rd39]; ld.global.nc.f32 %f177, [%rd40]; mul.f32 %f178, %f177, %f177; fma.rn.f32 %f179, %f176, %f176, %f178; ld.global.nc.f32 %f180, [%rd41]; fma.rn.f32 %f38, %f180, %f180, %f179; mul.f32 %f181, %f12, %f14; mul.f32 %f182, %f11, %f15; sub.f32 %f183, %f182, %f181; mul.f32 %f184, %f10, %f15; mul.f32 %f185, %f12, %f13; sub.f32 %f186, %f185, %f184; mul.f32 %f187, %f11, %f13; mul.f32 %f188, %f10, %f14; sub.f32 %f189, %f188, %f187; mul.f32 %f190, %f2, %f186; fma.rn.f32 %f191, %f1, %f183, %f190; fma.rn.f32 %f39, %f3, %f189, %f191; mul.f32 %f192, %f2, %f11; fma.rn.f32 %f193, %f1, %f10, %f192; fma.rn.f32 %f194, %f3, %f12, %f193; add.f32 %f195, %f194, 0f3F800000; mul.f32 %f196, %f2, %f14; fma.rn.f32 %f197, %f1, %f13, %f196; fma.rn.f32 %f198, %f3, %f15, %f197; add.f32 %f199, %f195, %f198; mul.f32 %f200, %f11, %f14; fma.rn.f32 %f201, %f10, %f13, %f200; fma.rn.f32 %f202, %f12, %f15, %f201; add.f32 %f40, %f202, %f199; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p55, %f41, 0f00000000; setp.eq.f32 %p56, %f42, 0f00000000; and.pred %p57, %p55, %p56; @%p57 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r140, %f40; shr.s32 %r141, %r140, 31; and.b32 %r142, %r141, 1078530011; mov.b32 %r143, %f39; and.b32 %r144, %r143, -2147483648; or.b32 %r145, %r144, %r142; mov.b32 %f292, %r145; 
bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p58, %f41, 0f7F800000; setp.eq.f32 %p59, %f42, 0f7F800000; and.pred %p60, %p58, %p59; @%p60 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r135, %f40; setp.lt.s32 %p64, %r135, 0; selp.b32 %r136, 1075235812, 1061752795, %p64; mov.b32 %r137, %f39; and.b32 %r138, %r137, -2147483648; or.b32 %r139, %r138, %r136; mov.b32 %f292, %r139; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f203, %f42, %f41; min.f32 %f204, %f42, %f41; div.rn.f32 %f205, %f204, %f203; mul.rn.f32 %f206, %f205, %f205; mov.f32 %f207, 0fC0B59883; mov.f32 %f208, 0fBF52C7EA; fma.rn.f32 %f209, %f206, %f208, %f207; mov.f32 %f210, 0fC0D21907; fma.rn.f32 %f211, %f209, %f206, %f210; mul.f32 %f212, %f206, %f211; mul.f32 %f213, %f205, %f212; add.f32 %f214, %f206, 0f41355DC0; mov.f32 %f215, 0f41E6BD60; fma.rn.f32 %f216, %f214, %f206, %f215; mov.f32 %f217, 0f419D92C8; fma.rn.f32 %f218, %f216, %f206, %f217; rcp.rn.f32 %f219, %f218; fma.rn.f32 %f220, %f213, %f219, %f205; mov.f32 %f221, 0f3FC90FDB; sub.f32 %f222, %f221, %f220; setp.gt.f32 %p61, %f42, %f41; selp.f32 %f223, %f222, %f220, %p61; mov.b32 %r130, %f40; setp.lt.s32 %p62, %r130, 0; mov.f32 %f224, 0f40490FDB; sub.f32 %f225, %f224, %f223; selp.f32 %f226, %f225, %f223, %p62; mov.b32 %r131, %f226; mov.b32 %r132, %f39; and.b32 %r133, %r132, -2147483648; or.b32 %r134, %r133, %r131; mov.b32 %f227, %r134; add.f32 %f228, %f41, %f42; setp.le.f32 %p63, %f228, 0f7F800000; selp.f32 %f292, %f227, %f228, %p63; $L__BB0_54: add.f32 %f229, %f292, %f292; setp.eq.f32 %p65, %f38, 0f00000000; selp.f32 %f230, 0f3F800000, 0f3F000000, %p65; fma.rn.f32 %f293, %f230, %f229, %f293; $L__BB0_55: @%p14 bra $L__BB0_69; setp.lt.s32 %p67, %r2, 1; and.pred %p69, %p67, %p9; @%p69 bra $L__BB0_69; @%p9 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r179, %r19, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r146, %r19, %r50; add.s32 %r147, %r146, %r50; rem.s32 %r179, %r147, %r50; $L__BB0_60: add.s32 %r45, %r179, %r4; @%p8 bra $L__BB0_62; 
bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r150, %r49, -1; min.s32 %r180, %r6, %r150; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r148, %r6, %r49; add.s32 %r149, %r148, %r49; rem.s32 %r180, %r149, %r49; $L__BB0_63: mad.lo.s32 %r151, %r45, %r49, %r180; mul.wide.s32 %rd42, %r151, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f231, [%rd43]; ld.global.nc.f32 %f232, [%rd44]; mul.f32 %f233, %f232, %f232; fma.rn.f32 %f234, %f231, %f231, %f233; ld.global.nc.f32 %f235, [%rd45]; fma.rn.f32 %f49, %f235, %f235, %f234; mul.f32 %f236, %f5, %f15; mul.f32 %f237, %f6, %f14; sub.f32 %f238, %f237, %f236; mul.f32 %f239, %f6, %f13; mul.f32 %f240, %f4, %f15; sub.f32 %f241, %f240, %f239; mul.f32 %f242, %f4, %f14; mul.f32 %f243, %f5, %f13; sub.f32 %f244, %f243, %f242; mul.f32 %f245, %f2, %f241; fma.rn.f32 %f246, %f1, %f238, %f245; fma.rn.f32 %f50, %f3, %f244, %f246; mul.f32 %f247, %f2, %f14; fma.rn.f32 %f248, %f1, %f13, %f247; fma.rn.f32 %f249, %f3, %f15, %f248; add.f32 %f250, %f249, 0f3F800000; mul.f32 %f251, %f2, %f5; fma.rn.f32 %f252, %f1, %f4, %f251; fma.rn.f32 %f253, %f3, %f6, %f252; add.f32 %f254, %f253, %f250; mul.f32 %f255, %f5, %f14; fma.rn.f32 %f256, %f4, %f13, %f255; fma.rn.f32 %f257, %f6, %f15, %f256; add.f32 %f51, %f257, %f254; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p72, %f52, 0f00000000; setp.eq.f32 %p73, %f53, 0f00000000; and.pred %p74, %p72, %p73; @%p74 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r162, %f51; shr.s32 %r163, %r162, 31; and.b32 %r164, %r163, 1078530011; mov.b32 %r165, %f50; and.b32 %r166, %r165, -2147483648; or.b32 %r167, %r164, %r166; mov.b32 %f294, %r167; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p75, %f52, 0f7F800000; setp.eq.f32 %p76, %f53, 0f7F800000; and.pred %p77, %p75, %p76; @%p77 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r157, %f51; setp.lt.s32 %p81, %r157, 0; selp.b32 %r158, 1075235812, 1061752795, %p81; mov.b32 %r159, %f50; and.b32 %r160, %r159, 
-2147483648; or.b32 %r161, %r158, %r160; mov.b32 %f294, %r161; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f258, %f53, %f52; min.f32 %f259, %f53, %f52; div.rn.f32 %f260, %f259, %f258; mul.rn.f32 %f261, %f260, %f260; mov.f32 %f262, 0fC0B59883; mov.f32 %f263, 0fBF52C7EA; fma.rn.f32 %f264, %f261, %f263, %f262; mov.f32 %f265, 0fC0D21907; fma.rn.f32 %f266, %f264, %f261, %f265; mul.f32 %f267, %f261, %f266; mul.f32 %f268, %f260, %f267; add.f32 %f269, %f261, 0f41355DC0; mov.f32 %f270, 0f41E6BD60; fma.rn.f32 %f271, %f269, %f261, %f270; mov.f32 %f272, 0f419D92C8; fma.rn.f32 %f273, %f271, %f261, %f272; rcp.rn.f32 %f274, %f273; fma.rn.f32 %f275, %f268, %f274, %f260; mov.f32 %f276, 0f3FC90FDB; sub.f32 %f277, %f276, %f275; setp.gt.f32 %p78, %f53, %f52; selp.f32 %f278, %f277, %f275, %p78; mov.b32 %r152, %f51; setp.lt.s32 %p79, %r152, 0; mov.f32 %f279, 0f40490FDB; sub.f32 %f280, %f279, %f278; selp.f32 %f281, %f280, %f278, %p79; mov.b32 %r153, %f281; mov.b32 %r154, %f50; and.b32 %r155, %r154, -2147483648; or.b32 %r156, %r155, %r153; mov.b32 %f282, %r156; add.f32 %f283, %f52, %f53; setp.le.f32 %p80, %f283, 0f7F800000; selp.f32 %f294, %f282, %f283, %p80; $L__BB0_68: add.f32 %f284, %f294, %f294; setp.eq.f32 %p82, %f49, 0f00000000; selp.f32 %f285, 0f3F800000, 0f3F000000, %p82; fma.rn.f32 %f293, %f285, %f284, %f293; $L__BB0_69: mul.f32 %f286, %f293, %f60; st.global.f32 [%rd4], %f286; $L__BB0_71: ret; } ` settopologicalchargelattice_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl settopologicalchargelattice .visible .entry settopologicalchargelattice( .param .u64 settopologicalchargelattice_param_0, .param .u64 settopologicalchargelattice_param_1, .param .u64 settopologicalchargelattice_param_2, .param .u64 settopologicalchargelattice_param_3, .param .f32 settopologicalchargelattice_param_4, .param .u32 settopologicalchargelattice_param_5, .param .u32 settopologicalchargelattice_param_6, .param .u32 settopologicalchargelattice_param_7, .param .u8 
settopologicalchargelattice_param_8 ) { .reg .pred %p<83>; .reg .b16 %rs<4>; .reg .f32 %f<296>; .reg .b32 %r<181>; .reg .b64 %rd<46>; ld.param.u8 %rs3, [settopologicalchargelattice_param_8]; ld.param.u64 %rd5, [settopologicalchargelattice_param_0]; ld.param.u64 %rd6, [settopologicalchargelattice_param_1]; ld.param.u64 %rd7, [settopologicalchargelattice_param_2]; ld.param.u64 %rd8, [settopologicalchargelattice_param_3]; ld.param.f32 %f60, [settopologicalchargelattice_param_4]; ld.param.u32 %r49, [settopologicalchargelattice_param_5]; ld.param.u32 %r50, [settopologicalchargelattice_param_6]; ld.param.u32 %r51, [settopologicalchargelattice_param_7]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r52, %ntid.x; mov.u32 %r53, %ctaid.x; mov.u32 %r54, %tid.x; mad.lo.s32 %r1, %r53, %r52, %r54; mov.u32 %r55, %ntid.y; mov.u32 %r56, %ctaid.y; mov.u32 %r57, %tid.y; mad.lo.s32 %r2, %r56, %r55, %r57; mov.u32 %r58, %ntid.z; mov.u32 %r59, %ctaid.z; mov.u32 %r60, %tid.z; mad.lo.s32 %r3, %r59, %r58, %r60; setp.ge.s32 %p2, %r1, %r49; setp.ge.s32 %p3, %r2, %r50; or.pred %p4, %p2, %p3; setp.ge.s32 %p5, %r3, %r51; or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_71; mul.lo.s32 %r4, %r3, %r50; add.s32 %r61, %r4, %r2; mul.lo.s32 %r5, %r61, %r49; add.s32 %r62, %r5, %r1; mul.wide.s32 %rd9, %r62, 4; add.s64 %rd10, %rd3, %rd9; add.s64 %rd11, %rd2, %rd9; add.s64 %rd12, %rd1, %rd9; ld.global.nc.f32 %f1, [%rd10]; ld.global.nc.f32 %f2, [%rd11]; mul.f32 %f61, %f2, %f2; fma.rn.f32 %f62, %f1, %f1, %f61; ld.global.nc.f32 %f3, [%rd12]; fma.rn.f32 %f63, %f3, %f3, %f62; setp.eq.f32 %p7, %f63, 0f00000000; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd4, %rd13, %rd9; @%p7 bra $L__BB0_70; bra.uni $L__BB0_2; $L__BB0_70: mov.u32 %r168, 0; st.global.u32 [%rd4], %r168; bra.uni $L__BB0_71; $L__BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p8, %rs1, 0; add.s32 %r6, %r1, 1; @%p8 bra $L__BB0_4; bra.uni $L__BB0_3; $L__BB0_4: add.s32 %r65, %r49, -1; min.s32 %r169, %r6, %r65; 
bra.uni $L__BB0_5; $L__BB0_3: rem.s32 %r63, %r6, %r49; add.s32 %r64, %r63, %r49; rem.s32 %r169, %r64, %r49; $L__BB0_5: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p9, %rs2, 0; add.s32 %r10, %r2, 1; @%p9 bra $L__BB0_7; bra.uni $L__BB0_6; $L__BB0_7: add.s32 %r68, %r50, -1; min.s32 %r170, %r10, %r68; bra.uni $L__BB0_8; $L__BB0_6: rem.s32 %r66, %r10, %r50; add.s32 %r67, %r66, %r50; rem.s32 %r170, %r67, %r50; $L__BB0_8: add.s32 %r14, %r1, -1; @%p8 bra $L__BB0_10; bra.uni $L__BB0_9; $L__BB0_10: max.s32 %r171, %r14, 0; bra.uni $L__BB0_11; $L__BB0_9: rem.s32 %r69, %r14, %r49; add.s32 %r70, %r69, %r49; rem.s32 %r171, %r70, %r49; $L__BB0_11: add.s32 %r18, %r171, %r5; add.s32 %r19, %r2, -1; add.s32 %r20, %r169, %r5; @%p9 bra $L__BB0_13; bra.uni $L__BB0_12; $L__BB0_13: max.s32 %r172, %r19, 0; bra.uni $L__BB0_14; $L__BB0_12: rem.s32 %r71, %r19, %r50; add.s32 %r72, %r71, %r50; rem.s32 %r172, %r72, %r50; $L__BB0_14: add.s32 %r73, %r172, %r4; mad.lo.s32 %r74, %r73, %r49, %r1; mul.wide.s32 %rd14, %r20, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f4, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f5, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f6, [%rd17]; add.s32 %r75, %r170, %r4; mad.lo.s32 %r76, %r75, %r49, %r1; mul.wide.s32 %rd18, %r76, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f7, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f8, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f9, [%rd21]; mul.wide.s32 %rd22, %r18, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f10, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f11, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f12, [%rd25]; mul.wide.s32 %rd26, %r74, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f13, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f14, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f15, [%rd29]; setp.ne.s16 %p12, %rs1, 0; setp.lt.s32 %p13, %r6, %r49; or.pred %p1, %p13, %p12; not.pred %p14, %p1; mov.f32 %f293, 0f00000000; @%p14 bra 
$L__BB0_28; setp.ge.s32 %p15, %r10, %r50; and.pred %p17, %p15, %p9; @%p17 bra $L__BB0_28; @%p9 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: add.s32 %r79, %r50, -1; min.s32 %r173, %r10, %r79; bra.uni $L__BB0_19; $L__BB0_17: rem.s32 %r77, %r10, %r50; add.s32 %r78, %r77, %r50; rem.s32 %r173, %r78, %r50; $L__BB0_19: @%p8 bra $L__BB0_21; bra.uni $L__BB0_20; $L__BB0_21: add.s32 %r82, %r49, -1; min.s32 %r174, %r6, %r82; bra.uni $L__BB0_22; $L__BB0_20: rem.s32 %r80, %r6, %r49; add.s32 %r81, %r80, %r49; rem.s32 %r174, %r81, %r49; $L__BB0_22: add.s32 %r83, %r173, %r4; mad.lo.s32 %r84, %r83, %r49, %r174; mul.wide.s32 %rd30, %r84, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f66, [%rd31]; ld.global.nc.f32 %f67, [%rd32]; mul.f32 %f68, %f67, %f67; fma.rn.f32 %f69, %f66, %f66, %f68; ld.global.nc.f32 %f70, [%rd33]; fma.rn.f32 %f16, %f70, %f70, %f69; mul.f32 %f71, %f6, %f8; mul.f32 %f72, %f5, %f9; sub.f32 %f73, %f72, %f71; mul.f32 %f74, %f4, %f9; mul.f32 %f75, %f6, %f7; sub.f32 %f76, %f75, %f74; mul.f32 %f77, %f5, %f7; mul.f32 %f78, %f4, %f8; sub.f32 %f79, %f78, %f77; mul.f32 %f80, %f2, %f76; fma.rn.f32 %f81, %f1, %f73, %f80; fma.rn.f32 %f17, %f3, %f79, %f81; mul.f32 %f82, %f2, %f5; fma.rn.f32 %f83, %f1, %f4, %f82; fma.rn.f32 %f84, %f3, %f6, %f83; add.f32 %f85, %f84, 0f3F800000; mul.f32 %f86, %f2, %f8; fma.rn.f32 %f87, %f1, %f7, %f86; fma.rn.f32 %f88, %f3, %f9, %f87; add.f32 %f89, %f85, %f88; mul.f32 %f90, %f5, %f8; fma.rn.f32 %f91, %f4, %f7, %f90; fma.rn.f32 %f92, %f6, %f9, %f91; add.f32 %f18, %f92, %f89; abs.f32 %f19, %f18; abs.f32 %f20, %f17; setp.eq.f32 %p20, %f19, 0f00000000; setp.eq.f32 %p21, %f20, 0f00000000; and.pred %p22, %p20, %p21; @%p22 bra $L__BB0_26; bra.uni $L__BB0_23; $L__BB0_26: mov.b32 %r95, %f18; shr.s32 %r96, %r95, 31; and.b32 %r97, %r96, 1078530011; mov.b32 %r98, %f17; and.b32 %r99, %r98, -2147483648; or.b32 %r100, %r99, %r97; mov.b32 %f288, %r100; bra.uni $L__BB0_27; $L__BB0_23: setp.eq.f32 %p23, 
%f19, 0f7F800000; setp.eq.f32 %p24, %f20, 0f7F800000; and.pred %p25, %p23, %p24; @%p25 bra $L__BB0_25; bra.uni $L__BB0_24; $L__BB0_25: mov.b32 %r90, %f18; setp.lt.s32 %p29, %r90, 0; selp.b32 %r91, 1075235812, 1061752795, %p29; mov.b32 %r92, %f17; and.b32 %r93, %r92, -2147483648; or.b32 %r94, %r93, %r91; mov.b32 %f288, %r94; bra.uni $L__BB0_27; $L__BB0_24: max.f32 %f93, %f20, %f19; min.f32 %f94, %f20, %f19; div.rn.f32 %f95, %f94, %f93; mul.rn.f32 %f96, %f95, %f95; mov.f32 %f97, 0fC0B59883; mov.f32 %f98, 0fBF52C7EA; fma.rn.f32 %f99, %f96, %f98, %f97; mov.f32 %f100, 0fC0D21907; fma.rn.f32 %f101, %f99, %f96, %f100; mul.f32 %f102, %f96, %f101; mul.f32 %f103, %f95, %f102; add.f32 %f104, %f96, 0f41355DC0; mov.f32 %f105, 0f41E6BD60; fma.rn.f32 %f106, %f104, %f96, %f105; mov.f32 %f107, 0f419D92C8; fma.rn.f32 %f108, %f106, %f96, %f107; rcp.rn.f32 %f109, %f108; fma.rn.f32 %f110, %f103, %f109, %f95; mov.f32 %f111, 0f3FC90FDB; sub.f32 %f112, %f111, %f110; setp.gt.f32 %p26, %f20, %f19; selp.f32 %f113, %f112, %f110, %p26; mov.b32 %r85, %f18; setp.lt.s32 %p27, %r85, 0; mov.f32 %f114, 0f40490FDB; sub.f32 %f115, %f114, %f113; selp.f32 %f116, %f115, %f113, %p27; mov.b32 %r86, %f116; mov.b32 %r87, %f17; and.b32 %r88, %r87, -2147483648; or.b32 %r89, %r88, %r86; mov.b32 %f117, %r89; add.f32 %f118, %f19, %f20; setp.le.f32 %p28, %f118, 0f7F800000; selp.f32 %f288, %f117, %f118, %p28; $L__BB0_27: add.f32 %f119, %f288, %f288; setp.eq.f32 %p30, %f16, 0f00000000; selp.f32 %f120, 0f3F800000, 0f3F000000, %p30; fma.rn.f32 %f293, %f120, %f119, 0f00000000; $L__BB0_28: setp.lt.s32 %p31, %r1, 1; and.pred %p33, %p31, %p8; @%p33 bra $L__BB0_55; setp.ge.s32 %p34, %r10, %r50; and.pred %p36, %p34, %p9; @%p36 bra $L__BB0_42; @%p9 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r103, %r50, -1; min.s32 %r175, %r10, %r103; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r101, %r10, %r50; add.s32 %r102, %r101, %r50; rem.s32 %r175, %r102, %r50; $L__BB0_33: @%p8 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: 
max.s32 %r176, %r14, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r104, %r14, %r49; add.s32 %r105, %r104, %r49; rem.s32 %r176, %r105, %r49; $L__BB0_36: add.s32 %r106, %r175, %r4; mad.lo.s32 %r107, %r106, %r49, %r176; mul.wide.s32 %rd34, %r107, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f121, [%rd35]; ld.global.nc.f32 %f122, [%rd36]; mul.f32 %f123, %f122, %f122; fma.rn.f32 %f124, %f121, %f121, %f123; ld.global.nc.f32 %f125, [%rd37]; fma.rn.f32 %f27, %f125, %f125, %f124; mul.f32 %f126, %f9, %f11; mul.f32 %f127, %f8, %f12; sub.f32 %f128, %f127, %f126; mul.f32 %f129, %f7, %f12; mul.f32 %f130, %f9, %f10; sub.f32 %f131, %f130, %f129; mul.f32 %f132, %f8, %f10; mul.f32 %f133, %f7, %f11; sub.f32 %f134, %f133, %f132; mul.f32 %f135, %f2, %f131; fma.rn.f32 %f136, %f1, %f128, %f135; fma.rn.f32 %f28, %f3, %f134, %f136; mul.f32 %f137, %f2, %f8; fma.rn.f32 %f138, %f1, %f7, %f137; fma.rn.f32 %f139, %f3, %f9, %f138; add.f32 %f140, %f139, 0f3F800000; mul.f32 %f141, %f2, %f11; fma.rn.f32 %f142, %f1, %f10, %f141; fma.rn.f32 %f143, %f3, %f12, %f142; add.f32 %f144, %f140, %f143; mul.f32 %f145, %f8, %f11; fma.rn.f32 %f146, %f7, %f10, %f145; fma.rn.f32 %f147, %f9, %f12, %f146; add.f32 %f29, %f147, %f144; abs.f32 %f30, %f29; abs.f32 %f31, %f28; setp.eq.f32 %p39, %f30, 0f00000000; setp.eq.f32 %p40, %f31, 0f00000000; and.pred %p41, %p39, %p40; @%p41 bra $L__BB0_40; bra.uni $L__BB0_37; $L__BB0_40: mov.b32 %r118, %f29; shr.s32 %r119, %r118, 31; and.b32 %r120, %r119, 1078530011; mov.b32 %r121, %f28; and.b32 %r122, %r121, -2147483648; or.b32 %r123, %r122, %r120; mov.b32 %f290, %r123; bra.uni $L__BB0_41; $L__BB0_37: setp.eq.f32 %p42, %f30, 0f7F800000; setp.eq.f32 %p43, %f31, 0f7F800000; and.pred %p44, %p42, %p43; @%p44 bra $L__BB0_39; bra.uni $L__BB0_38; $L__BB0_39: mov.b32 %r113, %f29; setp.lt.s32 %p48, %r113, 0; selp.b32 %r114, 1075235812, 1061752795, %p48; mov.b32 %r115, %f28; and.b32 %r116, %r115, -2147483648; or.b32 %r117, %r116, 
%r114; mov.b32 %f290, %r117; bra.uni $L__BB0_41; $L__BB0_38: max.f32 %f148, %f31, %f30; min.f32 %f149, %f31, %f30; div.rn.f32 %f150, %f149, %f148; mul.rn.f32 %f151, %f150, %f150; mov.f32 %f152, 0fC0B59883; mov.f32 %f153, 0fBF52C7EA; fma.rn.f32 %f154, %f151, %f153, %f152; mov.f32 %f155, 0fC0D21907; fma.rn.f32 %f156, %f154, %f151, %f155; mul.f32 %f157, %f151, %f156; mul.f32 %f158, %f150, %f157; add.f32 %f159, %f151, 0f41355DC0; mov.f32 %f160, 0f41E6BD60; fma.rn.f32 %f161, %f159, %f151, %f160; mov.f32 %f162, 0f419D92C8; fma.rn.f32 %f163, %f161, %f151, %f162; rcp.rn.f32 %f164, %f163; fma.rn.f32 %f165, %f158, %f164, %f150; mov.f32 %f166, 0f3FC90FDB; sub.f32 %f167, %f166, %f165; setp.gt.f32 %p45, %f31, %f30; selp.f32 %f168, %f167, %f165, %p45; mov.b32 %r108, %f29; setp.lt.s32 %p46, %r108, 0; mov.f32 %f169, 0f40490FDB; sub.f32 %f170, %f169, %f168; selp.f32 %f171, %f170, %f168, %p46; mov.b32 %r109, %f171; mov.b32 %r110, %f28; and.b32 %r111, %r110, -2147483648; or.b32 %r112, %r111, %r109; mov.b32 %f172, %r112; add.f32 %f173, %f30, %f31; setp.le.f32 %p47, %f173, 0f7F800000; selp.f32 %f290, %f172, %f173, %p47; $L__BB0_41: add.f32 %f174, %f290, %f290; setp.eq.f32 %p49, %f27, 0f00000000; selp.f32 %f175, 0f3F800000, 0f3F000000, %p49; fma.rn.f32 %f293, %f175, %f174, %f293; $L__BB0_42: setp.lt.s32 %p50, %r2, 1; and.pred %p52, %p50, %p9; @%p52 bra $L__BB0_55; @%p9 bra $L__BB0_45; bra.uni $L__BB0_44; $L__BB0_45: max.s32 %r177, %r19, 0; bra.uni $L__BB0_46; $L__BB0_44: rem.s32 %r124, %r19, %r50; add.s32 %r125, %r124, %r50; rem.s32 %r177, %r125, %r50; $L__BB0_46: @%p8 bra $L__BB0_48; bra.uni $L__BB0_47; $L__BB0_48: max.s32 %r178, %r14, 0; bra.uni $L__BB0_49; $L__BB0_47: rem.s32 %r126, %r14, %r49; add.s32 %r127, %r126, %r49; rem.s32 %r178, %r127, %r49; $L__BB0_49: add.s32 %r128, %r177, %r4; mad.lo.s32 %r129, %r128, %r49, %r178; mul.wide.s32 %rd38, %r129, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f176, [%rd39]; 
ld.global.nc.f32 %f177, [%rd40]; mul.f32 %f178, %f177, %f177; fma.rn.f32 %f179, %f176, %f176, %f178; ld.global.nc.f32 %f180, [%rd41]; fma.rn.f32 %f38, %f180, %f180, %f179; mul.f32 %f181, %f12, %f14; mul.f32 %f182, %f11, %f15; sub.f32 %f183, %f182, %f181; mul.f32 %f184, %f10, %f15; mul.f32 %f185, %f12, %f13; sub.f32 %f186, %f185, %f184; mul.f32 %f187, %f11, %f13; mul.f32 %f188, %f10, %f14; sub.f32 %f189, %f188, %f187; mul.f32 %f190, %f2, %f186; fma.rn.f32 %f191, %f1, %f183, %f190; fma.rn.f32 %f39, %f3, %f189, %f191; mul.f32 %f192, %f2, %f11; fma.rn.f32 %f193, %f1, %f10, %f192; fma.rn.f32 %f194, %f3, %f12, %f193; add.f32 %f195, %f194, 0f3F800000; mul.f32 %f196, %f2, %f14; fma.rn.f32 %f197, %f1, %f13, %f196; fma.rn.f32 %f198, %f3, %f15, %f197; add.f32 %f199, %f195, %f198; mul.f32 %f200, %f11, %f14; fma.rn.f32 %f201, %f10, %f13, %f200; fma.rn.f32 %f202, %f12, %f15, %f201; add.f32 %f40, %f202, %f199; abs.f32 %f41, %f40; abs.f32 %f42, %f39; setp.eq.f32 %p55, %f41, 0f00000000; setp.eq.f32 %p56, %f42, 0f00000000; and.pred %p57, %p55, %p56; @%p57 bra $L__BB0_53; bra.uni $L__BB0_50; $L__BB0_53: mov.b32 %r140, %f40; shr.s32 %r141, %r140, 31; and.b32 %r142, %r141, 1078530011; mov.b32 %r143, %f39; and.b32 %r144, %r143, -2147483648; or.b32 %r145, %r144, %r142; mov.b32 %f292, %r145; bra.uni $L__BB0_54; $L__BB0_50: setp.eq.f32 %p58, %f41, 0f7F800000; setp.eq.f32 %p59, %f42, 0f7F800000; and.pred %p60, %p58, %p59; @%p60 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: mov.b32 %r135, %f40; setp.lt.s32 %p64, %r135, 0; selp.b32 %r136, 1075235812, 1061752795, %p64; mov.b32 %r137, %f39; and.b32 %r138, %r137, -2147483648; or.b32 %r139, %r138, %r136; mov.b32 %f292, %r139; bra.uni $L__BB0_54; $L__BB0_51: max.f32 %f203, %f42, %f41; min.f32 %f204, %f42, %f41; div.rn.f32 %f205, %f204, %f203; mul.rn.f32 %f206, %f205, %f205; mov.f32 %f207, 0fC0B59883; mov.f32 %f208, 0fBF52C7EA; fma.rn.f32 %f209, %f206, %f208, %f207; mov.f32 %f210, 0fC0D21907; fma.rn.f32 %f211, %f209, %f206, %f210; mul.f32 %f212, 
%f206, %f211; mul.f32 %f213, %f205, %f212; add.f32 %f214, %f206, 0f41355DC0; mov.f32 %f215, 0f41E6BD60; fma.rn.f32 %f216, %f214, %f206, %f215; mov.f32 %f217, 0f419D92C8; fma.rn.f32 %f218, %f216, %f206, %f217; rcp.rn.f32 %f219, %f218; fma.rn.f32 %f220, %f213, %f219, %f205; mov.f32 %f221, 0f3FC90FDB; sub.f32 %f222, %f221, %f220; setp.gt.f32 %p61, %f42, %f41; selp.f32 %f223, %f222, %f220, %p61; mov.b32 %r130, %f40; setp.lt.s32 %p62, %r130, 0; mov.f32 %f224, 0f40490FDB; sub.f32 %f225, %f224, %f223; selp.f32 %f226, %f225, %f223, %p62; mov.b32 %r131, %f226; mov.b32 %r132, %f39; and.b32 %r133, %r132, -2147483648; or.b32 %r134, %r133, %r131; mov.b32 %f227, %r134; add.f32 %f228, %f41, %f42; setp.le.f32 %p63, %f228, 0f7F800000; selp.f32 %f292, %f227, %f228, %p63; $L__BB0_54: add.f32 %f229, %f292, %f292; setp.eq.f32 %p65, %f38, 0f00000000; selp.f32 %f230, 0f3F800000, 0f3F000000, %p65; fma.rn.f32 %f293, %f230, %f229, %f293; $L__BB0_55: @%p14 bra $L__BB0_69; setp.lt.s32 %p67, %r2, 1; and.pred %p69, %p67, %p9; @%p69 bra $L__BB0_69; @%p9 bra $L__BB0_59; bra.uni $L__BB0_58; $L__BB0_59: max.s32 %r179, %r19, 0; bra.uni $L__BB0_60; $L__BB0_58: rem.s32 %r146, %r19, %r50; add.s32 %r147, %r146, %r50; rem.s32 %r179, %r147, %r50; $L__BB0_60: add.s32 %r45, %r179, %r4; @%p8 bra $L__BB0_62; bra.uni $L__BB0_61; $L__BB0_62: add.s32 %r150, %r49, -1; min.s32 %r180, %r6, %r150; bra.uni $L__BB0_63; $L__BB0_61: rem.s32 %r148, %r6, %r49; add.s32 %r149, %r148, %r49; rem.s32 %r180, %r149, %r49; $L__BB0_63: mad.lo.s32 %r151, %r45, %r49, %r180; mul.wide.s32 %rd42, %r151, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f231, [%rd43]; ld.global.nc.f32 %f232, [%rd44]; mul.f32 %f233, %f232, %f232; fma.rn.f32 %f234, %f231, %f231, %f233; ld.global.nc.f32 %f235, [%rd45]; fma.rn.f32 %f49, %f235, %f235, %f234; mul.f32 %f236, %f5, %f15; mul.f32 %f237, %f6, %f14; sub.f32 %f238, %f237, %f236; mul.f32 %f239, %f6, %f13; mul.f32 %f240, %f4, %f15; sub.f32 %f241, 
%f240, %f239; mul.f32 %f242, %f4, %f14; mul.f32 %f243, %f5, %f13; sub.f32 %f244, %f243, %f242; mul.f32 %f245, %f2, %f241; fma.rn.f32 %f246, %f1, %f238, %f245; fma.rn.f32 %f50, %f3, %f244, %f246; mul.f32 %f247, %f2, %f14; fma.rn.f32 %f248, %f1, %f13, %f247; fma.rn.f32 %f249, %f3, %f15, %f248; add.f32 %f250, %f249, 0f3F800000; mul.f32 %f251, %f2, %f5; fma.rn.f32 %f252, %f1, %f4, %f251; fma.rn.f32 %f253, %f3, %f6, %f252; add.f32 %f254, %f253, %f250; mul.f32 %f255, %f5, %f14; fma.rn.f32 %f256, %f4, %f13, %f255; fma.rn.f32 %f257, %f6, %f15, %f256; add.f32 %f51, %f257, %f254; abs.f32 %f52, %f51; abs.f32 %f53, %f50; setp.eq.f32 %p72, %f52, 0f00000000; setp.eq.f32 %p73, %f53, 0f00000000; and.pred %p74, %p72, %p73; @%p74 bra $L__BB0_67; bra.uni $L__BB0_64; $L__BB0_67: mov.b32 %r162, %f51; shr.s32 %r163, %r162, 31; and.b32 %r164, %r163, 1078530011; mov.b32 %r165, %f50; and.b32 %r166, %r165, -2147483648; or.b32 %r167, %r164, %r166; mov.b32 %f294, %r167; bra.uni $L__BB0_68; $L__BB0_64: setp.eq.f32 %p75, %f52, 0f7F800000; setp.eq.f32 %p76, %f53, 0f7F800000; and.pred %p77, %p75, %p76; @%p77 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: mov.b32 %r157, %f51; setp.lt.s32 %p81, %r157, 0; selp.b32 %r158, 1075235812, 1061752795, %p81; mov.b32 %r159, %f50; and.b32 %r160, %r159, -2147483648; or.b32 %r161, %r158, %r160; mov.b32 %f294, %r161; bra.uni $L__BB0_68; $L__BB0_65: max.f32 %f258, %f53, %f52; min.f32 %f259, %f53, %f52; div.rn.f32 %f260, %f259, %f258; mul.rn.f32 %f261, %f260, %f260; mov.f32 %f262, 0fC0B59883; mov.f32 %f263, 0fBF52C7EA; fma.rn.f32 %f264, %f261, %f263, %f262; mov.f32 %f265, 0fC0D21907; fma.rn.f32 %f266, %f264, %f261, %f265; mul.f32 %f267, %f261, %f266; mul.f32 %f268, %f260, %f267; add.f32 %f269, %f261, 0f41355DC0; mov.f32 %f270, 0f41E6BD60; fma.rn.f32 %f271, %f269, %f261, %f270; mov.f32 %f272, 0f419D92C8; fma.rn.f32 %f273, %f271, %f261, %f272; rcp.rn.f32 %f274, %f273; fma.rn.f32 %f275, %f268, %f274, %f260; mov.f32 %f276, 0f3FC90FDB; sub.f32 %f277, %f276, %f275; 
setp.gt.f32 %p78, %f53, %f52; selp.f32 %f278, %f277, %f275, %p78; mov.b32 %r152, %f51; setp.lt.s32 %p79, %r152, 0; mov.f32 %f279, 0f40490FDB; sub.f32 %f280, %f279, %f278; selp.f32 %f281, %f280, %f278, %p79; mov.b32 %r153, %f281; mov.b32 %r154, %f50; and.b32 %r155, %r154, -2147483648; or.b32 %r156, %r155, %r153; mov.b32 %f282, %r156; add.f32 %f283, %f52, %f53; setp.le.f32 %p80, %f283, 0f7F800000; selp.f32 %f294, %f282, %f283, %p80; $L__BB0_68: add.f32 %f284, %f294, %f294; setp.eq.f32 %p82, %f49, 0f00000000; selp.f32 %f285, 0f3F800000, 0f3F000000, %p82; fma.rn.f32 %f293, %f285, %f284, %f293; $L__BB0_69: mul.f32 %f286, %f293, %f60; st.global.f32 [%rd4], %f286; $L__BB0_71: ret; } ` ) 3-3.11.1/cuda/uniaxialanisotropy2.cu000066400000000000000000000025431503346766200172500ustar00rootroot00000000000000#include #include "float3.h" #include "amul.h" // Add uniaxial magnetocrystalline anisotropy field to B. // http://www.southampton.ac.uk/~fangohr/software/oxs_uniaxial4.html extern "C" __global__ void adduniaxialanisotropy2(float* __restrict__ Bx, float* __restrict__ By, float* __restrict__ Bz, float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float* __restrict__ Ms_, float Ms_mul, float* __restrict__ K1_, float K1_mul, float* __restrict__ K2_, float K2_mul, float* __restrict__ ux_, float ux_mul, float* __restrict__ uy_, float uy_mul, float* __restrict__ uz_, float uz_mul, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { float3 u = normalized(vmul(ux_, uy_, uz_, ux_mul, uy_mul, uz_mul, i)); float invMs = inv_Msat(Ms_, Ms_mul, i); float K1 = amul(K1_, K1_mul, i) * invMs; float K2 = amul(K2_, K2_mul, i) * invMs; float3 m = {mx[i], my[i], mz[i]}; float mu = dot(m, u); float3 Ba = 2.0f*K1* (mu)*u+ 4.0f*K2*pow3(mu)*u; Bx[i] += Ba.x; By[i] += Ba.y; Bz[i] += Ba.z; } } 3-3.11.1/cuda/uniaxialanisotropy2_wrapper.go000066400000000000000000002423431503346766200210120ustar00rootroot00000000000000package cuda /* THIS FILE 
IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for adduniaxialanisotropy2 kernel
// It is left at its zero value here; k_adduniaxialanisotropy2_async fills it
// in through fatbinLoad the first time it finds the handle equal to 0.
var adduniaxialanisotropy2_code cu.Function

// Stores the arguments for adduniaxialanisotropy2 kernel invocation
// There is one arg_* field per kernel parameter, declared in the same order
// as the parameters of k_adduniaxialanisotropy2_async. argptr holds the
// address of each of those fields (filled in once by init) and is what gets
// passed to cu.LaunchKernel. The embedded sync.Mutex is held by the wrapper
// for the whole call.
type adduniaxialanisotropy2_args_t struct {
	arg_Bx     unsafe.Pointer
	arg_By     unsafe.Pointer
	arg_Bz     unsafe.Pointer
	arg_mx     unsafe.Pointer
	arg_my     unsafe.Pointer
	arg_mz     unsafe.Pointer
	arg_Ms_    unsafe.Pointer
	arg_Ms_mul float32
	arg_K1_    unsafe.Pointer
	arg_K1_mul float32
	arg_K2_    unsafe.Pointer
	arg_K2_mul float32
	arg_ux_    unsafe.Pointer
	arg_ux_mul float32
	arg_uy_    unsafe.Pointer
	arg_uy_mul float32
	arg_uz_    unsafe.Pointer
	arg_uz_mul float32
	// NOTE(review): the kernel declares N as a 32-bit value (`int N` in the
	// .cu source, `.param .u32` in the PTX) while this field is a Go int.
	// Presumably only the low 32 bits are consumed, which would rely on a
	// little-endian host — confirm before changing either side.
	arg_N  int
	argptr [19]unsafe.Pointer // one slot per arg_* field above, indices 0..18
	sync.Mutex
}

// Stores the arguments for adduniaxialanisotropy2 kernel invocation
// This is a single package-level instance, so every call of the wrapper
// writes into the same fields.
var adduniaxialanisotropy2_args adduniaxialanisotropy2_args_t

// init stores the address of every arg_* field in the matching argptr slot.
// Slot i corresponds to kernel parameter i; the indices must stay in the
// order of the kernel's parameter list.
func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	adduniaxialanisotropy2_args.argptr[0] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Bx)
	adduniaxialanisotropy2_args.argptr[1] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_By)
	adduniaxialanisotropy2_args.argptr[2] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Bz)
	adduniaxialanisotropy2_args.argptr[3] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_mx)
	adduniaxialanisotropy2_args.argptr[4] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_my)
	adduniaxialanisotropy2_args.argptr[5] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_mz)
	adduniaxialanisotropy2_args.argptr[6] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Ms_)
	adduniaxialanisotropy2_args.argptr[7] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Ms_mul)
	adduniaxialanisotropy2_args.argptr[8] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K1_)
	adduniaxialanisotropy2_args.argptr[9] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K1_mul)
	adduniaxialanisotropy2_args.argptr[10] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K2_)
	adduniaxialanisotropy2_args.argptr[11] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K2_mul)
	adduniaxialanisotropy2_args.argptr[12] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_ux_)
	adduniaxialanisotropy2_args.argptr[13] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_ux_mul)
	adduniaxialanisotropy2_args.argptr[14] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uy_)
	adduniaxialanisotropy2_args.argptr[15] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uy_mul)
	adduniaxialanisotropy2_args.argptr[16] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uz_)
	adduniaxialanisotropy2_args.argptr[17] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uz_mul)
	adduniaxialanisotropy2_args.argptr[18] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_N)
}

// Wrapper for adduniaxialanisotropy2 CUDA kernel, asynchronous.
//
// Bx..uz_mul and N are copied one-to-one into the shared argument struct and
// handed to the kernel; cfg supplies the grid and block dimensions for the
// launch. The wrapper itself only waits on the device (Sync) when the
// package-level Synchronous flag is set, in which case the launch is also
// timed under the name "adduniaxialanisotropy2".
func k_adduniaxialanisotropy2_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, K1_ unsafe.Pointer, K1_mul float32, K2_ unsafe.Pointer, K2_mul float32, ux_ unsafe.Pointer, ux_mul float32, uy_ unsafe.Pointer, uy_mul float32, uz_ unsafe.Pointer, uz_mul float32, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("adduniaxialanisotropy2")
	}

	// The argument struct is one shared package-level variable, so it is
	// locked while its fields are overwritten and the launch is issued.
	adduniaxialanisotropy2_args.Lock()
	defer adduniaxialanisotropy2_args.Unlock()

	// Load the kernel on first use; a zero handle means "not loaded yet".
	if adduniaxialanisotropy2_code == 0 {
		adduniaxialanisotropy2_code = fatbinLoad(adduniaxialanisotropy2_map, "adduniaxialanisotropy2")
	}

	// Writing the fields is enough: argptr already points at each of them.
	adduniaxialanisotropy2_args.arg_Bx = Bx
	adduniaxialanisotropy2_args.arg_By = By
	adduniaxialanisotropy2_args.arg_Bz = Bz
	adduniaxialanisotropy2_args.arg_mx = mx
	adduniaxialanisotropy2_args.arg_my = my
	adduniaxialanisotropy2_args.arg_mz = mz
	adduniaxialanisotropy2_args.arg_Ms_ = Ms_
	adduniaxialanisotropy2_args.arg_Ms_mul = Ms_mul
	adduniaxialanisotropy2_args.arg_K1_ = K1_
	adduniaxialanisotropy2_args.arg_K1_mul = K1_mul
	adduniaxialanisotropy2_args.arg_K2_ = K2_
	adduniaxialanisotropy2_args.arg_K2_mul = K2_mul
	adduniaxialanisotropy2_args.arg_ux_ = ux_
	adduniaxialanisotropy2_args.arg_ux_mul = ux_mul
	adduniaxialanisotropy2_args.arg_uy_ = uy_
	adduniaxialanisotropy2_args.arg_uy_mul = uy_mul
	adduniaxialanisotropy2_args.arg_uz_ = uz_
	adduniaxialanisotropy2_args.arg_uz_mul = uz_mul
	adduniaxialanisotropy2_args.arg_N = N

	// NOTE(review): the literal 0 passed between the block dimensions and
	// stream0 is presumably the dynamic shared-memory size — confirm against
	// cu.LaunchKernel.
	args := adduniaxialanisotropy2_args.argptr[:]
	cu.LaunchKernel(adduniaxialanisotropy2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("adduniaxialanisotropy2")
	}
}

// maps compute capability on PTX code for adduniaxialanisotropy2 kernel.
// Key 0 maps to the empty string; every other key selects the
// adduniaxialanisotropy2_ptx_NN constant with the same number. The map is
// passed to fatbinLoad by the wrapper above.
var adduniaxialanisotropy2_map = map[int]string{0: "", 50: adduniaxialanisotropy2_ptx_50, 52: adduniaxialanisotropy2_ptx_52, 53: adduniaxialanisotropy2_ptx_53, 60: adduniaxialanisotropy2_ptx_60, 61: adduniaxialanisotropy2_ptx_61, 62: adduniaxialanisotropy2_ptx_62, 70: adduniaxialanisotropy2_ptx_70, 72: adduniaxialanisotropy2_ptx_72, 75: adduniaxialanisotropy2_ptx_75, 80: adduniaxialanisotropy2_ptx_80, 86: adduniaxialanisotropy2_ptx_86, 87: adduniaxialanisotropy2_ptx_87, 89: adduniaxialanisotropy2_ptx_89, 90: adduniaxialanisotropy2_ptx_90}

// adduniaxialanisotropy2 PTX code for various compute capabilities.
const ( adduniaxialanisotropy2_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; 
ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; $L__BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; $L__BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; $L__BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; setp.eq.f32 %p5, %f7, 0f00000000; mov.f32 %f67, 0f00000000; @%p5 bra $L__BB0_9; rcp.rn.f32 %f67, %f7; $L__BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; $L__BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra $L__BB0_13; rcp.rn.f32 %f69, %f68; $L__BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; $L__BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; $L__BB0_17: 
cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; $L__BB0_18: ret; } ` adduniaxialanisotropy2_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param 
.f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; $L__BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; $L__BB0_5: 
setp.eq.s64 %p4, %rd12, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; $L__BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; setp.eq.f32 %p5, %f7, 0f00000000; mov.f32 %f67, 0f00000000; @%p5 bra $L__BB0_9; rcp.rn.f32 %f67, %f7; $L__BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; $L__BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra $L__BB0_13; rcp.rn.f32 %f69, %f68; $L__BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; $L__BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; $L__BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; 
fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; $L__BB0_18: ret; } ` adduniaxialanisotropy2_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, 
[adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; $L__BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; $L__BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; $L__BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; setp.eq.f32 %p5, %f7, 0f00000000; mov.f32 %f67, 0f00000000; @%p5 bra $L__BB0_9; rcp.rn.f32 %f67, %f7; $L__BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; $L__BB0_11: setp.eq.f32 
%p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra $L__BB0_13; rcp.rn.f32 %f69, %f68; $L__BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; $L__BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; $L__BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; $L__BB0_18: ret; } ` adduniaxialanisotropy2_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 
adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, 
%ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; $L__BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; $L__BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; $L__BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; setp.eq.f32 %p5, %f7, 0f00000000; mov.f32 %f67, 0f00000000; @%p5 bra $L__BB0_9; rcp.rn.f32 %f67, %f7; $L__BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; $L__BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra $L__BB0_13; rcp.rn.f32 %f69, %f68; $L__BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; $L__BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; $L__BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 
%f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; $L__BB0_18: ret; } ` adduniaxialanisotropy2_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 
adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; $L__BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; $L__BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; $L__BB0_7: mul.f32 %f31, %f65, %f65; 
fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; setp.eq.f32 %p5, %f7, 0f00000000; mov.f32 %f67, 0f00000000; @%p5 bra $L__BB0_9; rcp.rn.f32 %f67, %f7; $L__BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; $L__BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra $L__BB0_13; rcp.rn.f32 %f69, %f68; $L__BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; $L__BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; $L__BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, 
%rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; $L__BB0_18: ret; } ` adduniaxialanisotropy2_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; 
ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; $L__BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; $L__BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; $L__BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; setp.eq.f32 %p5, %f7, 0f00000000; mov.f32 %f67, 0f00000000; @%p5 bra $L__BB0_9; rcp.rn.f32 %f67, %f7; $L__BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; $L__BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra $L__BB0_13; rcp.rn.f32 %f69, %f68; $L__BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; 
ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; $L__BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; $L__BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; $L__BB0_18: ret; } ` adduniaxialanisotropy2_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, 
.param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; 
ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; $L__BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; $L__BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; $L__BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; setp.eq.f32 %p5, %f7, 0f00000000; mov.f32 %f67, 0f00000000; @%p5 bra $L__BB0_9; rcp.rn.f32 %f67, %f7; $L__BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; $L__BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra $L__BB0_13; rcp.rn.f32 %f69, %f68; $L__BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; $L__BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; $L__BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 
%f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; $L__BB0_18: ret; } ` adduniaxialanisotropy2_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, 
[adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; $L__BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; $L__BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; $L__BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; setp.eq.f32 %p5, %f7, 0f00000000; mov.f32 %f67, 0f00000000; @%p5 bra $L__BB0_9; rcp.rn.f32 %f67, %f7; $L__BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 
%f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; $L__BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra $L__BB0_13; rcp.rn.f32 %f69, %f68; $L__BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; $L__BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; $L__BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; 
st.global.f32 [%rd43], %f63; $L__BB0_18: ret; } ` adduniaxialanisotropy2_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, 
[adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; $L__BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; $L__BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; $L__BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; setp.eq.f32 %p5, %f7, 0f00000000; mov.f32 %f67, 0f00000000; @%p5 bra $L__BB0_9; rcp.rn.f32 %f67, %f7; $L__BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; $L__BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra $L__BB0_13; rcp.rn.f32 %f69, %f68; $L__BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; $L__BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 
%f71, %f37, %f71; $L__BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; $L__BB0_18: ret; } ` adduniaxialanisotropy2_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 
adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; $L__BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; 
mul.f32 %f65, %f28, %f65; $L__BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; $L__BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; setp.eq.f32 %p5, %f7, 0f00000000; mov.f32 %f67, 0f00000000; @%p5 bra $L__BB0_9; rcp.rn.f32 %f67, %f7; $L__BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; $L__BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra $L__BB0_13; rcp.rn.f32 %f69, %f68; $L__BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; $L__BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; $L__BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; 
fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; $L__BB0_18: ret; } ` adduniaxialanisotropy2_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, 
[adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; $L__BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; $L__BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; $L__BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; setp.eq.f32 %p5, %f7, 0f00000000; mov.f32 %f67, 0f00000000; @%p5 bra $L__BB0_9; rcp.rn.f32 %f67, %f7; $L__BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; 
mul.f32 %f68, %f34, %f68; $L__BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra $L__BB0_13; rcp.rn.f32 %f69, %f68; $L__BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; $L__BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; $L__BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; $L__BB0_18: ret; } ` adduniaxialanisotropy2_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 
adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; 
mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; $L__BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; $L__BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; $L__BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; setp.eq.f32 %p5, %f7, 0f00000000; mov.f32 %f67, 0f00000000; @%p5 bra $L__BB0_9; rcp.rn.f32 %f67, %f7; $L__BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; $L__BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra $L__BB0_13; rcp.rn.f32 %f69, %f68; $L__BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; $L__BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; $L__BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; 
add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; $L__BB0_18: ret; } ` adduniaxialanisotropy2_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param 
.f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; $L__BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; $L__BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, 
%f29, %f66; $L__BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; setp.eq.f32 %p5, %f7, 0f00000000; mov.f32 %f67, 0f00000000; @%p5 bra $L__BB0_9; rcp.rn.f32 %f67, %f7; $L__BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; $L__BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra $L__BB0_13; rcp.rn.f32 %f69, %f68; $L__BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; $L__BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; $L__BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; 
st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; $L__BB0_18: ret; } ` adduniaxialanisotropy2_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; 
ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; $L__BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra $L__BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; $L__BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra $L__BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; $L__BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; setp.eq.f32 %p5, %f7, 0f00000000; mov.f32 %f67, 0f00000000; @%p5 bra $L__BB0_9; rcp.rn.f32 %f67, %f7; $L__BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra $L__BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; $L__BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra $L__BB0_13; rcp.rn.f32 %f69, %f68; $L__BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra $L__BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 
%rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; $L__BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra $L__BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; $L__BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; $L__BB0_18: ret; } ` ) 3-3.11.1/cuda/util.go000066400000000000000000000024471503346766200142020ustar00rootroot00000000000000package cuda import ( "fmt" "github.com/mumax/3/cuda/cu" ) // CUDA Launch parameters. // there might be better choices for recent hardware, // but it barely makes a difference in the end. 
const ( BlockSize = 512 TileX, TileY = 32, 32 MaxGridSize = 65535 ) // cuda launch configuration type config struct { Grid, Block cu.Dim3 } // Make a 1D kernel launch configuration suited for N threads. func make1DConf(N int) *config { bl := cu.Dim3{X: BlockSize, Y: 1, Z: 1} n2 := divUp(N, BlockSize) // N2 blocks left nx := divUp(n2, MaxGridSize) ny := divUp(n2, nx) gr := cu.Dim3{X: nx, Y: ny, Z: 1} return &config{gr, bl} } // Make a 3D kernel launch configuration suited for N threads. func make3DConf(N [3]int) *config { bl := cu.Dim3{X: TileX, Y: TileY, Z: 1} nx := divUp(N[X], TileX) ny := divUp(N[Y], TileY) gr := cu.Dim3{X: nx, Y: ny, Z: N[Z]} return &config{gr, bl} } // integer minimum func iMin(a, b int) int { if a < b { return a } return b } // Integer division rounded up. func divUp(x, y int) int { return ((x - 1) / y) + 1 } const ( X = 0 Y = 1 Z = 2 ) func checkSize(a interface { Size() [3]int }, b ...interface { Size() [3]int }) { sa := a.Size() for _, b := range b { if b.Size() != sa { panic(fmt.Sprintf("size mismatch: %v != %v", sa, b.Size())) } } } 3-3.11.1/cuda/zeromask.cu000066400000000000000000000005611503346766200150550ustar00rootroot00000000000000#include #include "float3.h" // set dst to zero in cells where mask != 0 extern "C" __global__ void zeromask(float* __restrict__ dst, float* maskLUT, uint8_t* regions, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { if (maskLUT[regions[i]] != 0) { dst[i] = 0; } } } 3-3.11.1/cuda/zeromask.go000066400000000000000000000011251503346766200150500ustar00rootroot00000000000000package cuda import ( "unsafe" "github.com/mumax/3/data" ) // Sets vector dst to zero where mask != 0. func ZeroMask(dst *data.Slice, mask LUTPtr, regions *Bytes) { N := dst.Len() cfg := make1DConf(N) for c := 0; c < dst.NComp(); c++ { k_zeromask_async(dst.DevPtr(c), unsafe.Pointer(mask), regions.Ptr, N, cfg) } } // Sets vector dst to zero where mask == 0. 
func ZeroMaskInv(dst *data.Slice, mask LUTPtr, regions *Bytes) {
	N := dst.Len()
	// One 1D launch configuration, built for N threads, is reused for every component.
	cfg := make1DConf(N)
	// Launch the zeromaskinv kernel once per component of dst.
	for c := 0; c < dst.NComp(); c++ {
		k_zeromaskinv_async(dst.DevPtr(c), unsafe.Pointer(mask), regions.Ptr, N, cfg)
	}
}
3-3.11.1/cuda/zeromask_wrapper.go000066400000000000000000000465261503346766200166260ustar00rootroot00000000000000package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for zeromask kernel.
// Starts out zero; k_zeromask_async fills it in via fatbinLoad on first use.
var zeromask_code cu.Function

// Stores the arguments for zeromask kernel invocation.
// The arg_ fields hold the values of one call; argptr holds pointers to
// those fields (set up once in init) and is what gets passed to the launch.
// The embedded mutex is locked by k_zeromask_async while the fields are
// written and the kernel is launched.
type zeromask_args_t struct {
	arg_dst     unsafe.Pointer
	arg_maskLUT unsafe.Pointer
	arg_regions unsafe.Pointer
	arg_N       int
	argptr      [4]unsafe.Pointer
	sync.Mutex
}

// Stores the arguments for zeromask kernel invocation.
// A single package-level instance shared by all calls of k_zeromask_async.
var zeromask_args zeromask_args_t

func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	// The order matches the kernel's parameter order: dst, maskLUT, regions, N.
	zeromask_args.argptr[0] = unsafe.Pointer(&zeromask_args.arg_dst)
	zeromask_args.argptr[1] = unsafe.Pointer(&zeromask_args.arg_maskLUT)
	zeromask_args.argptr[2] = unsafe.Pointer(&zeromask_args.arg_regions)
	zeromask_args.argptr[3] = unsafe.Pointer(&zeromask_args.arg_N)
}

// Wrapper for zeromask CUDA kernel, asynchronous.
// It stores the arguments in zeromask_args and launches the kernel on
// stream0 with the grid and block dimensions from cfg. When the package
// flag Synchronous is set, it additionally calls Sync() before and after
// the launch and brackets the call with timer.Start/timer.Stop("zeromask").
func k_zeromask_async(dst unsafe.Pointer, maskLUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("zeromask")
	}

	// zeromask_args is shared, so calls are serialized. The deferred Unlock
	// runs when this function returns, i.e. after the trailing Sync/timer.Stop
	// in Synchronous mode.
	zeromask_args.Lock()
	defer zeromask_args.Unlock()

	// Load the kernel on first use from the per-compute-capability PTX map.
	if zeromask_code == 0 {
		zeromask_code = fatbinLoad(zeromask_map, "zeromask")
	}

	// argptr already points at these fields (see init), so writing them
	// is all that is needed to update the launch arguments.
	zeromask_args.arg_dst = dst
	zeromask_args.arg_maskLUT = maskLUT
	zeromask_args.arg_regions = regions
	zeromask_args.arg_N = N

	args := zeromask_args.argptr[:]
	// NOTE(review): the literal 0 is presumably the dynamic shared-memory
	// size in bytes; confirm against cu.LaunchKernel.
	cu.LaunchKernel(zeromask_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("zeromask")
	}
}

// maps compute capability on PTX code for zeromask kernel.
var zeromask_map = map[int]string{0: "", 50: zeromask_ptx_50, 52: zeromask_ptx_52, 53: zeromask_ptx_53, 60: zeromask_ptx_60, 61: zeromask_ptx_61, 62: zeromask_ptx_62, 70: zeromask_ptx_70, 72: zeromask_ptx_72, 75: zeromask_ptx_75, 80: zeromask_ptx_80, 86: zeromask_ptx_86, 87: zeromask_ptx_87, 89: zeromask_ptx_89, 90: zeromask_ptx_90} // zeromask PTX code for various compute capabilities. const ( zeromask_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromask_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, 
[zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromask_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromask_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // 
.globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromask_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; 
mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromask_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromask_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 
%r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromask_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromask_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param 
.u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromask_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 
0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromask_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromask_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, 
%r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromask_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromask_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; 
.reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` ) 3-3.11.1/cuda/zeromaskinv.cu000066400000000000000000000005641503346766200155750ustar00rootroot00000000000000#include #include "float3.h" // set dst to zero in cells where mask == 0 extern "C" __global__ void zeromaskinv(float* __restrict__ dst, float* maskLUT, uint8_t* regions, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { if (maskLUT[regions[i]] == 0) { dst[i] = 0; } } } 3-3.11.1/cuda/zeromaskinv_wrapper.go000066400000000000000000000477021503346766200173400ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for zeromaskinv kernel.
// Starts out zero; k_zeromaskinv_async fills it in via fatbinLoad on first use.
var zeromaskinv_code cu.Function

// Stores the arguments for zeromaskinv kernel invocation.
// The arg_ fields hold the values of one call; argptr holds pointers to
// those fields (set up once in init) and is what gets passed to the launch.
// The embedded mutex is locked by k_zeromaskinv_async while the fields are
// written and the kernel is launched.
type zeromaskinv_args_t struct {
	arg_dst     unsafe.Pointer
	arg_maskLUT unsafe.Pointer
	arg_regions unsafe.Pointer
	arg_N       int
	argptr      [4]unsafe.Pointer
	sync.Mutex
}

// Stores the arguments for zeromaskinv kernel invocation.
// A single package-level instance shared by all calls of k_zeromaskinv_async.
var zeromaskinv_args zeromaskinv_args_t

func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	// The order matches the kernel's parameter order: dst, maskLUT, regions, N.
	zeromaskinv_args.argptr[0] = unsafe.Pointer(&zeromaskinv_args.arg_dst)
	zeromaskinv_args.argptr[1] = unsafe.Pointer(&zeromaskinv_args.arg_maskLUT)
	zeromaskinv_args.argptr[2] = unsafe.Pointer(&zeromaskinv_args.arg_regions)
	zeromaskinv_args.argptr[3] = unsafe.Pointer(&zeromaskinv_args.arg_N)
}

// Wrapper for zeromaskinv CUDA kernel, asynchronous.
// It stores the arguments in zeromaskinv_args and launches the kernel on
// stream0 with the grid and block dimensions from cfg. When the package
// flag Synchronous is set, it additionally calls Sync() before and after
// the launch and brackets the call with timer.Start/timer.Stop("zeromaskinv").
func k_zeromaskinv_async(dst unsafe.Pointer, maskLUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("zeromaskinv")
	}

	// zeromaskinv_args is shared, so calls are serialized. The deferred
	// Unlock runs when this function returns, i.e. after the trailing
	// Sync/timer.Stop in Synchronous mode.
	zeromaskinv_args.Lock()
	defer zeromaskinv_args.Unlock()

	// Load the kernel on first use from the per-compute-capability PTX map.
	if zeromaskinv_code == 0 {
		zeromaskinv_code = fatbinLoad(zeromaskinv_map, "zeromaskinv")
	}

	// argptr already points at these fields (see init), so writing them
	// is all that is needed to update the launch arguments.
	zeromaskinv_args.arg_dst = dst
	zeromaskinv_args.arg_maskLUT = maskLUT
	zeromaskinv_args.arg_regions = regions
	zeromaskinv_args.arg_N = N

	args := zeromaskinv_args.argptr[:]
	// NOTE(review): the literal 0 is presumably the dynamic shared-memory
	// size in bytes; confirm against cu.LaunchKernel.
	cu.LaunchKernel(zeromaskinv_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("zeromaskinv")
	}
}

// maps compute capability on PTX code for zeromaskinv kernel.
var zeromaskinv_map = map[int]string{0: "", 50: zeromaskinv_ptx_50, 52: zeromaskinv_ptx_52, 53: zeromaskinv_ptx_53, 60: zeromaskinv_ptx_60, 61: zeromaskinv_ptx_61, 62: zeromaskinv_ptx_62, 70: zeromaskinv_ptx_70, 72: zeromaskinv_ptx_72, 75: zeromaskinv_ptx_75, 80: zeromaskinv_ptx_80, 86: zeromaskinv_ptx_86, 87: zeromaskinv_ptx_87, 89: zeromaskinv_ptx_89, 90: zeromaskinv_ptx_90} // zeromaskinv PTX code for various compute capabilities. const ( zeromaskinv_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl zeromaskinv .visible .entry zeromaskinv( .param .u64 zeromaskinv_param_0, .param .u64 zeromaskinv_param_1, .param .u64 zeromaskinv_param_2, .param .u32 zeromaskinv_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromaskinv_param_0]; ld.param.u64 %rd3, [zeromaskinv_param_1]; ld.param.u64 %rd4, [zeromaskinv_param_2]; ld.param.u32 %r2, [zeromaskinv_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.neu.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromaskinv_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl zeromaskinv .visible .entry zeromaskinv( .param .u64 zeromaskinv_param_0, .param .u64 zeromaskinv_param_1, .param .u64 zeromaskinv_param_2, .param .u32 zeromaskinv_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; 
ld.param.u64 %rd2, [zeromaskinv_param_0]; ld.param.u64 %rd3, [zeromaskinv_param_1]; ld.param.u64 %rd4, [zeromaskinv_param_2]; ld.param.u32 %r2, [zeromaskinv_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.neu.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromaskinv_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl zeromaskinv .visible .entry zeromaskinv( .param .u64 zeromaskinv_param_0, .param .u64 zeromaskinv_param_1, .param .u64 zeromaskinv_param_2, .param .u32 zeromaskinv_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromaskinv_param_0]; ld.param.u64 %rd3, [zeromaskinv_param_1]; ld.param.u64 %rd4, [zeromaskinv_param_2]; ld.param.u32 %r2, [zeromaskinv_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.neu.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, 
%rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromaskinv_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl zeromaskinv .visible .entry zeromaskinv( .param .u64 zeromaskinv_param_0, .param .u64 zeromaskinv_param_1, .param .u64 zeromaskinv_param_2, .param .u32 zeromaskinv_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromaskinv_param_0]; ld.param.u64 %rd3, [zeromaskinv_param_1]; ld.param.u64 %rd4, [zeromaskinv_param_2]; ld.param.u32 %r2, [zeromaskinv_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.neu.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromaskinv_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl zeromaskinv .visible .entry zeromaskinv( .param .u64 zeromaskinv_param_0, .param .u64 zeromaskinv_param_1, .param .u64 zeromaskinv_param_2, .param .u32 zeromaskinv_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromaskinv_param_0]; ld.param.u64 %rd3, [zeromaskinv_param_1]; ld.param.u64 %rd4, [zeromaskinv_param_2]; ld.param.u32 %r2, [zeromaskinv_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, 
%r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.neu.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromaskinv_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl zeromaskinv .visible .entry zeromaskinv( .param .u64 zeromaskinv_param_0, .param .u64 zeromaskinv_param_1, .param .u64 zeromaskinv_param_2, .param .u32 zeromaskinv_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromaskinv_param_0]; ld.param.u64 %rd3, [zeromaskinv_param_1]; ld.param.u64 %rd4, [zeromaskinv_param_2]; ld.param.u32 %r2, [zeromaskinv_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.neu.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromaskinv_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl zeromaskinv .visible .entry zeromaskinv( .param .u64 zeromaskinv_param_0, .param .u64 zeromaskinv_param_1, .param .u64 zeromaskinv_param_2, .param .u32 zeromaskinv_param_3 ) { .reg .pred %p<3>; .reg .b16 
%rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromaskinv_param_0]; ld.param.u64 %rd3, [zeromaskinv_param_1]; ld.param.u64 %rd4, [zeromaskinv_param_2]; ld.param.u32 %r2, [zeromaskinv_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.neu.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromaskinv_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl zeromaskinv .visible .entry zeromaskinv( .param .u64 zeromaskinv_param_0, .param .u64 zeromaskinv_param_1, .param .u64 zeromaskinv_param_2, .param .u32 zeromaskinv_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromaskinv_param_0]; ld.param.u64 %rd3, [zeromaskinv_param_1]; ld.param.u64 %rd4, [zeromaskinv_param_2]; ld.param.u32 %r2, [zeromaskinv_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.neu.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; 
cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromaskinv_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl zeromaskinv .visible .entry zeromaskinv( .param .u64 zeromaskinv_param_0, .param .u64 zeromaskinv_param_1, .param .u64 zeromaskinv_param_2, .param .u32 zeromaskinv_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromaskinv_param_0]; ld.param.u64 %rd3, [zeromaskinv_param_1]; ld.param.u64 %rd4, [zeromaskinv_param_2]; ld.param.u32 %r2, [zeromaskinv_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.neu.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromaskinv_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl zeromaskinv .visible .entry zeromaskinv( .param .u64 zeromaskinv_param_0, .param .u64 zeromaskinv_param_1, .param .u64 zeromaskinv_param_2, .param .u32 zeromaskinv_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromaskinv_param_0]; ld.param.u64 %rd3, [zeromaskinv_param_1]; ld.param.u64 %rd4, [zeromaskinv_param_2]; ld.param.u32 %r2, [zeromaskinv_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; 
mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.neu.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromaskinv_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl zeromaskinv .visible .entry zeromaskinv( .param .u64 zeromaskinv_param_0, .param .u64 zeromaskinv_param_1, .param .u64 zeromaskinv_param_2, .param .u32 zeromaskinv_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromaskinv_param_0]; ld.param.u64 %rd3, [zeromaskinv_param_1]; ld.param.u64 %rd4, [zeromaskinv_param_2]; ld.param.u32 %r2, [zeromaskinv_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.neu.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromaskinv_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl zeromaskinv .visible .entry zeromaskinv( .param .u64 zeromaskinv_param_0, .param .u64 zeromaskinv_param_1, .param .u64 
zeromaskinv_param_2, .param .u32 zeromaskinv_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromaskinv_param_0]; ld.param.u64 %rd3, [zeromaskinv_param_1]; ld.param.u64 %rd4, [zeromaskinv_param_2]; ld.param.u32 %r2, [zeromaskinv_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.neu.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromaskinv_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl zeromaskinv .visible .entry zeromaskinv( .param .u64 zeromaskinv_param_0, .param .u64 zeromaskinv_param_1, .param .u64 zeromaskinv_param_2, .param .u32 zeromaskinv_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromaskinv_param_0]; ld.param.u64 %rd3, [zeromaskinv_param_1]; ld.param.u64 %rd4, [zeromaskinv_param_2]; ld.param.u32 %r2, [zeromaskinv_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; 
ld.global.nc.f32 %f1, [%rd9]; setp.neu.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` zeromaskinv_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl zeromaskinv .visible .entry zeromaskinv( .param .u64 zeromaskinv_param_0, .param .u64 zeromaskinv_param_1, .param .u64 zeromaskinv_param_2, .param .u32 zeromaskinv_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromaskinv_param_0]; ld.param.u64 %rd3, [zeromaskinv_param_1]; ld.param.u64 %rd4, [zeromaskinv_param_2]; ld.param.u32 %r2, [zeromaskinv_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra $L__BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.neu.f32 %p2, %f1, 0f00000000; @%p2 bra $L__BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; $L__BB0_3: ret; } ` ) 3-3.11.1/cuda/zhangli.go000066400000000000000000000012571503346766200146570ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" ) // Add Zhang-Li ST torque (Tesla) to torque. 
// see zhangli2.cu func AddZhangLiTorque(torque, m *data.Slice, Msat, J, alpha, xi, pol MSlice, mesh *data.Mesh) { c := mesh.CellSize() N := mesh.Size() cfg := make3DConf(N) k_addzhanglitorque2_async( torque.DevPtr(X), torque.DevPtr(Y), torque.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), Msat.DevPtr(0), Msat.Mul(0), J.DevPtr(X), J.Mul(X), J.DevPtr(Y), J.Mul(Y), J.DevPtr(Z), J.Mul(Z), alpha.DevPtr(0), alpha.Mul(0), xi.DevPtr(0), xi.Mul(0), pol.DevPtr(0), pol.Mul(0), float32(c[X]), float32(c[Y]), float32(c[Z]), N[X], N[Y], N[Z], mesh.PBC_code(), cfg) } 3-3.11.1/cuda/zhangli2.cu000066400000000000000000000046401503346766200147420ustar00rootroot00000000000000#include "amul.h" #include "constants.h" #include "float3.h" #include "stencil.h" #include #define PREFACTOR ((MUB) / (2 * QE * GAMMA0)) // spatial derivatives without dividing by cell size #define deltax(in) (in[idx(hclampx(ix+1), iy, iz)] - in[idx(lclampx(ix-1), iy, iz)]) #define deltay(in) (in[idx(ix, hclampy(iy+1), iz)] - in[idx(ix, lclampy(iy-1), iz)]) #define deltaz(in) (in[idx(ix, iy, hclampz(iz+1))] - in[idx(ix, iy, lclampz(iz-1))]) extern "C" __global__ void addzhanglitorque2(float* __restrict__ tx, float* __restrict__ ty, float* __restrict__ tz, float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float* __restrict__ Ms_, float Ms_mul, float* __restrict__ jx_, float jx_mul, float* __restrict__ jy_, float jy_mul, float* __restrict__ jz_, float jz_mul, float* __restrict__ alpha_, float alpha_mul, float* __restrict__ xi_, float xi_mul, float* __restrict__ pol_, float pol_mul, float cx, float cy, float cz, int Nx, int Ny, int Nz, uint8_t PBC) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix >= Nx || iy >= Ny || iz >= Nz) { return; } int i = idx(ix, iy, iz); float alpha = amul(alpha_, alpha_mul, i); float xi = amul(xi_, xi_mul, i); float pol = amul(pol_, pol_mul, i); float invMs = 
inv_Msat(Ms_, Ms_mul, i); float b = invMs * PREFACTOR / (1.0f + xi*xi); float3 J = pol*vmul(jx_, jy_, jz_, jx_mul, jy_mul, jz_mul, i); float3 hspin = make_float3(0.0f, 0.0f, 0.0f); // (u·∇)m if (J.x != 0.0f) { hspin += (b/cx)*J.x * make_float3(deltax(mx), deltax(my), deltax(mz)); } if (J.y != 0.0f) { hspin += (b/cy)*J.y * make_float3(deltay(mx), deltay(my), deltay(mz)); } if (J.z != 0.0f) { hspin += (b/cz)*J.z * make_float3(deltaz(mx), deltaz(my), deltaz(mz)); } float3 m = make_float3(mx[i], my[i], mz[i]); float3 torque = (-1.0f/(1.0f + alpha*alpha)) * ( (1.0f+xi*alpha) * cross(m, cross(m, hspin)) +( xi-alpha) * cross(m, hspin) ); // write back, adding to torque tx[i] += torque.x; ty[i] += torque.y; tz[i] += torque.z; } 3-3.11.1/cuda/zhangli2_wrapper.go000066400000000000000000006656001503346766200165110ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for addzhanglitorque2 kernel var addzhanglitorque2_code cu.Function // Stores the arguments for addzhanglitorque2 kernel invocation type addzhanglitorque2_args_t struct { arg_tx unsafe.Pointer arg_ty unsafe.Pointer arg_tz unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_Ms_ unsafe.Pointer arg_Ms_mul float32 arg_jx_ unsafe.Pointer arg_jx_mul float32 arg_jy_ unsafe.Pointer arg_jy_mul float32 arg_jz_ unsafe.Pointer arg_jz_mul float32 arg_alpha_ unsafe.Pointer arg_alpha_mul float32 arg_xi_ unsafe.Pointer arg_xi_mul float32 arg_pol_ unsafe.Pointer arg_pol_mul float32 arg_cx float32 arg_cy float32 arg_cz float32 arg_Nx int arg_Ny int arg_Nz int arg_PBC byte argptr [27]unsafe.Pointer sync.Mutex } // Stores the arguments for addzhanglitorque2 kernel invocation var addzhanglitorque2_args addzhanglitorque2_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
addzhanglitorque2_args.argptr[0] = unsafe.Pointer(&addzhanglitorque2_args.arg_tx) addzhanglitorque2_args.argptr[1] = unsafe.Pointer(&addzhanglitorque2_args.arg_ty) addzhanglitorque2_args.argptr[2] = unsafe.Pointer(&addzhanglitorque2_args.arg_tz) addzhanglitorque2_args.argptr[3] = unsafe.Pointer(&addzhanglitorque2_args.arg_mx) addzhanglitorque2_args.argptr[4] = unsafe.Pointer(&addzhanglitorque2_args.arg_my) addzhanglitorque2_args.argptr[5] = unsafe.Pointer(&addzhanglitorque2_args.arg_mz) addzhanglitorque2_args.argptr[6] = unsafe.Pointer(&addzhanglitorque2_args.arg_Ms_) addzhanglitorque2_args.argptr[7] = unsafe.Pointer(&addzhanglitorque2_args.arg_Ms_mul) addzhanglitorque2_args.argptr[8] = unsafe.Pointer(&addzhanglitorque2_args.arg_jx_) addzhanglitorque2_args.argptr[9] = unsafe.Pointer(&addzhanglitorque2_args.arg_jx_mul) addzhanglitorque2_args.argptr[10] = unsafe.Pointer(&addzhanglitorque2_args.arg_jy_) addzhanglitorque2_args.argptr[11] = unsafe.Pointer(&addzhanglitorque2_args.arg_jy_mul) addzhanglitorque2_args.argptr[12] = unsafe.Pointer(&addzhanglitorque2_args.arg_jz_) addzhanglitorque2_args.argptr[13] = unsafe.Pointer(&addzhanglitorque2_args.arg_jz_mul) addzhanglitorque2_args.argptr[14] = unsafe.Pointer(&addzhanglitorque2_args.arg_alpha_) addzhanglitorque2_args.argptr[15] = unsafe.Pointer(&addzhanglitorque2_args.arg_alpha_mul) addzhanglitorque2_args.argptr[16] = unsafe.Pointer(&addzhanglitorque2_args.arg_xi_) addzhanglitorque2_args.argptr[17] = unsafe.Pointer(&addzhanglitorque2_args.arg_xi_mul) addzhanglitorque2_args.argptr[18] = unsafe.Pointer(&addzhanglitorque2_args.arg_pol_) addzhanglitorque2_args.argptr[19] = unsafe.Pointer(&addzhanglitorque2_args.arg_pol_mul) addzhanglitorque2_args.argptr[20] = unsafe.Pointer(&addzhanglitorque2_args.arg_cx) addzhanglitorque2_args.argptr[21] = unsafe.Pointer(&addzhanglitorque2_args.arg_cy) addzhanglitorque2_args.argptr[22] = unsafe.Pointer(&addzhanglitorque2_args.arg_cz) addzhanglitorque2_args.argptr[23] = 
unsafe.Pointer(&addzhanglitorque2_args.arg_Nx) addzhanglitorque2_args.argptr[24] = unsafe.Pointer(&addzhanglitorque2_args.arg_Ny) addzhanglitorque2_args.argptr[25] = unsafe.Pointer(&addzhanglitorque2_args.arg_Nz) addzhanglitorque2_args.argptr[26] = unsafe.Pointer(&addzhanglitorque2_args.arg_PBC) } // Wrapper for addzhanglitorque2 CUDA kernel, asynchronous. func k_addzhanglitorque2_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, jx_ unsafe.Pointer, jx_mul float32, jy_ unsafe.Pointer, jy_mul float32, jz_ unsafe.Pointer, jz_mul float32, alpha_ unsafe.Pointer, alpha_mul float32, xi_ unsafe.Pointer, xi_mul float32, pol_ unsafe.Pointer, pol_mul float32, cx float32, cy float32, cz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("addzhanglitorque2") } addzhanglitorque2_args.Lock() defer addzhanglitorque2_args.Unlock() if addzhanglitorque2_code == 0 { addzhanglitorque2_code = fatbinLoad(addzhanglitorque2_map, "addzhanglitorque2") } addzhanglitorque2_args.arg_tx = tx addzhanglitorque2_args.arg_ty = ty addzhanglitorque2_args.arg_tz = tz addzhanglitorque2_args.arg_mx = mx addzhanglitorque2_args.arg_my = my addzhanglitorque2_args.arg_mz = mz addzhanglitorque2_args.arg_Ms_ = Ms_ addzhanglitorque2_args.arg_Ms_mul = Ms_mul addzhanglitorque2_args.arg_jx_ = jx_ addzhanglitorque2_args.arg_jx_mul = jx_mul addzhanglitorque2_args.arg_jy_ = jy_ addzhanglitorque2_args.arg_jy_mul = jy_mul addzhanglitorque2_args.arg_jz_ = jz_ addzhanglitorque2_args.arg_jz_mul = jz_mul addzhanglitorque2_args.arg_alpha_ = alpha_ addzhanglitorque2_args.arg_alpha_mul = alpha_mul addzhanglitorque2_args.arg_xi_ = xi_ addzhanglitorque2_args.arg_xi_mul = xi_mul addzhanglitorque2_args.arg_pol_ = pol_ addzhanglitorque2_args.arg_pol_mul = pol_mul addzhanglitorque2_args.arg_cx = cx addzhanglitorque2_args.arg_cy = cy addzhanglitorque2_args.arg_cz = cz 
addzhanglitorque2_args.arg_Nx = Nx addzhanglitorque2_args.arg_Ny = Ny addzhanglitorque2_args.arg_Nz = Nz addzhanglitorque2_args.arg_PBC = PBC args := addzhanglitorque2_args.argptr[:] cu.LaunchKernel(addzhanglitorque2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("addzhanglitorque2") } } // maps compute capability on PTX code for addzhanglitorque2 kernel. var addzhanglitorque2_map = map[int]string{0: "", 50: addzhanglitorque2_ptx_50, 52: addzhanglitorque2_ptx_52, 53: addzhanglitorque2_ptx_53, 60: addzhanglitorque2_ptx_60, 61: addzhanglitorque2_ptx_61, 62: addzhanglitorque2_ptx_62, 70: addzhanglitorque2_ptx_70, 72: addzhanglitorque2_ptx_72, 75: addzhanglitorque2_ptx_75, 80: addzhanglitorque2_ptx_80, 86: addzhanglitorque2_ptx_86, 87: addzhanglitorque2_ptx_87, 89: addzhanglitorque2_ptx_89, 90: addzhanglitorque2_ptx_90} // addzhanglitorque2 PTX code for various compute capabilities. const ( addzhanglitorque2_ptx_50 = ` .version 8.5 .target sm_50 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 
addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<5>; .reg .f32 %f<149>; .reg .b32 %r<173>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r71, %r70, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r74, %r73, %r75; mov.u32 %r76, %ntid.z; 
mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r77, %r76, %r78; setp.ge.s32 %p1, %r1, %r67; setp.ge.s32 %p2, %r2, %r68; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; $L__BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; $L__BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; $L__BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra $L__BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; $L__BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra $L__BB0_11; rcp.rn.f32 %f136, %f135; $L__BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; $L__BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; $L__BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, 
%r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; $L__BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f143, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra $L__BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r82, %r67, -1; min.s32 %r155, %r7, %r82; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r155, %r81, %r67; $L__BB0_21: add.s32 %r83, %r155, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra $L__BB0_23; bra.uni $L__BB0_22; $L__BB0_23: max.s32 %r156, %r11, 0; bra.uni $L__BB0_24; $L__BB0_22: rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r156, %r85, %r67; $L__BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f79, %f20; add.s32 %r86, %r156, %r5; mul.wide.s32 %rd40, %r86, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: add.s32 %r89, %r67, -1; min.s32 %r157, %r7, %r89; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r87, %r7, %r67; add.s32 %r88, %r87, %r67; rem.s32 %r157, %r88, %r67; $L__BB0_27: add.s32 %r90, %r157, %r5; mul.wide.s32 %rd42, %r90, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: max.s32 %r158, %r11, 0; bra.uni $L__BB0_30; $L__BB0_28: rem.s32 %r91, %r11, %r67; add.s32 %r92, %r91, %r67; rem.s32 %r158, %r92, %r67; $L__BB0_30: add.s32 %r93, %r158, %r5; mul.wide.s32 %rd44, %r93, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r96, %r67, -1; min.s32 %r159, %r7, %r96; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r94, %r7, %r67; 
add.s32 %r95, %r94, %r67; rem.s32 %r159, %r95, %r67; $L__BB0_33: add.s32 %r97, %r159, %r5; mul.wide.s32 %rd46, %r97, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r160, %r11, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r98, %r11, %r67; add.s32 %r99, %r98, %r67; rem.s32 %r160, %r99, %r67; $L__BB0_36: add.s32 %r100, %r160, %r5; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; $L__BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra $L__BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra $L__BB0_40; bra.uni $L__BB0_39; $L__BB0_40: add.s32 %r103, %r68, -1; min.s32 %r161, %r27, %r103; bra.uni $L__BB0_41; $L__BB0_39: rem.s32 %r101, %r27, %r68; add.s32 %r102, %r101, %r68; rem.s32 %r161, %r102, %r68; $L__BB0_41: add.s32 %r104, %r161, %r4; mad.lo.s32 %r105, %r104, %r67, %r1; mul.wide.s32 %rd50, %r105, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r162, %r31, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r106, %r31, %r68; add.s32 %r107, %r106, %r68; rem.s32 %r162, %r107, %r68; $L__BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f84, %f18; add.s32 %r108, %r162, %r4; mad.lo.s32 %r109, %r108, %r67, %r1; mul.wide.s32 %rd52, %r109, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra $L__BB0_46; bra.uni $L__BB0_45; $L__BB0_46: add.s32 %r112, %r68, -1; min.s32 %r163, %r27, %r112; bra.uni $L__BB0_47; $L__BB0_45: rem.s32 %r110, %r27, %r68; add.s32 %r111, %r110, %r68; rem.s32 %r163, %r111, %r68; $L__BB0_47: add.s32 %r113, %r163, %r4; mad.lo.s32 %r114, %r113, %r67, %r1; mul.wide.s32 %rd54, %r114, 4; add.s64 %rd55, %rd2, %rd54; 
ld.global.nc.f32 %f36, [%rd55]; @%p22 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r164, %r31, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r115, %r31, %r68; add.s32 %r116, %r115, %r68; rem.s32 %r164, %r116, %r68; $L__BB0_50: add.s32 %r117, %r164, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r121, %r68, -1; min.s32 %r165, %r27, %r121; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r119, %r27, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r165, %r120, %r68; $L__BB0_53: add.s32 %r122, %r165, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd58, %r123, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra $L__BB0_55; bra.uni $L__BB0_54; $L__BB0_55: max.s32 %r166, %r31, 0; bra.uni $L__BB0_56; $L__BB0_54: rem.s32 %r124, %r31, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r166, %r125, %r68; $L__BB0_56: add.s32 %r126, %r166, %r4; mad.lo.s32 %r127, %r126, %r67, %r1; mul.wide.s32 %rd60, %r127, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; $L__BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra $L__BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f89, %f19; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r130, %r69, -1; min.s32 %r167, %r47, %r130; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r128, %r47, %r69; add.s32 %r129, %r128, %r69; rem.s32 %r167, %r129, %r69; $L__BB0_61: mad.lo.s32 %r131, %r167, %r68, %r2; mad.lo.s32 %r132, %r131, %r67, %r1; mul.wide.s32 %rd62, %r132, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: max.s32 %r168, %r51, 0; bra.uni 
$L__BB0_64; $L__BB0_62: rem.s32 %r133, %r51, %r69; add.s32 %r134, %r133, %r69; rem.s32 %r168, %r134, %r69; $L__BB0_64: mad.lo.s32 %r135, %r168, %r68, %r2; mad.lo.s32 %r136, %r135, %r67, %r1; mul.wide.s32 %rd64, %r136, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: add.s32 %r139, %r69, -1; min.s32 %r169, %r47, %r139; bra.uni $L__BB0_67; $L__BB0_65: rem.s32 %r137, %r47, %r69; add.s32 %r138, %r137, %r69; rem.s32 %r169, %r138, %r69; $L__BB0_67: mad.lo.s32 %r140, %r169, %r68, %r2; mad.lo.s32 %r141, %r140, %r67, %r1; mul.wide.s32 %rd66, %r141, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra $L__BB0_69; bra.uni $L__BB0_68; $L__BB0_69: max.s32 %r170, %r51, 0; bra.uni $L__BB0_70; $L__BB0_68: rem.s32 %r142, %r51, %r69; add.s32 %r143, %r142, %r69; rem.s32 %r170, %r143, %r69; $L__BB0_70: mad.lo.s32 %r144, %r170, %r68, %r2; mad.lo.s32 %r145, %r144, %r67, %r1; mul.wide.s32 %rd68, %r145, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: add.s32 %r148, %r69, -1; min.s32 %r171, %r47, %r148; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r146, %r47, %r69; add.s32 %r147, %r146, %r69; rem.s32 %r171, %r147, %r69; $L__BB0_73: mad.lo.s32 %r149, %r171, %r68, %r2; mad.lo.s32 %r150, %r149, %r67, %r1; mul.wide.s32 %rd70, %r150, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra $L__BB0_75; bra.uni $L__BB0_74; $L__BB0_75: max.s32 %r172, %r51, 0; bra.uni $L__BB0_76; $L__BB0_74: rem.s32 %r151, %r51, %r69; add.s32 %r152, %r151, %r69; rem.s32 %r172, %r152, %r69; $L__BB0_76: mad.lo.s32 %r153, %r172, %r68, %r2; mad.lo.s32 %r154, %r153, %r67, %r1; mul.wide.s32 %rd72, %r154, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; 
$L__BB0_77: mul.wide.s32 %rd74, %r6, 4; add.s64 %rd75, %rd3, %rd74; add.s64 %rd76, %rd2, %rd74; add.s64 %rd77, %rd1, %rd74; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd76]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd77]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd75]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; cvta.to.global.u64 %rd78, %rd4; add.s64 %rd79, %rd78, %rd74; ld.global.f32 %f126, [%rd79]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd79], %f127; cvta.to.global.u64 %rd80, %rd5; add.s64 %rd81, %rd80, %rd74; ld.global.f32 %f128, [%rd81]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd81], %f129; cvta.to.global.u64 %rd82, %rd6; add.s64 %rd83, %rd82, %rd74; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; $L__BB0_78: ret; } ` addzhanglitorque2_ptx_52 = ` .version 8.5 .target sm_52 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 
addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<5>; .reg .f32 %f<149>; .reg .b32 %r<173>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, 
[addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r71, %r70, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r74, %r73, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r77, %r76, %r78; setp.ge.s32 %p1, %r1, %r67; setp.ge.s32 %p2, %r2, %r68; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; $L__BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; $L__BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; $L__BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra $L__BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; $L__BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra $L__BB0_11; rcp.rn.f32 %f136, %f135; $L__BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; 
cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; $L__BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; $L__BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; $L__BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f143, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra $L__BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r82, %r67, -1; min.s32 %r155, %r7, %r82; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r155, %r81, %r67; $L__BB0_21: add.s32 %r83, %r155, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra $L__BB0_23; bra.uni $L__BB0_22; $L__BB0_23: max.s32 %r156, %r11, 0; bra.uni $L__BB0_24; $L__BB0_22: rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r156, %r85, %r67; $L__BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f79, %f20; add.s32 %r86, %r156, %r5; mul.wide.s32 %rd40, %r86, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: add.s32 %r89, %r67, -1; min.s32 %r157, %r7, %r89; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r87, %r7, %r67; add.s32 %r88, %r87, %r67; rem.s32 %r157, %r88, %r67; $L__BB0_27: add.s32 %r90, 
%r157, %r5; mul.wide.s32 %rd42, %r90, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: max.s32 %r158, %r11, 0; bra.uni $L__BB0_30; $L__BB0_28: rem.s32 %r91, %r11, %r67; add.s32 %r92, %r91, %r67; rem.s32 %r158, %r92, %r67; $L__BB0_30: add.s32 %r93, %r158, %r5; mul.wide.s32 %rd44, %r93, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r96, %r67, -1; min.s32 %r159, %r7, %r96; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r94, %r7, %r67; add.s32 %r95, %r94, %r67; rem.s32 %r159, %r95, %r67; $L__BB0_33: add.s32 %r97, %r159, %r5; mul.wide.s32 %rd46, %r97, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r160, %r11, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r98, %r11, %r67; add.s32 %r99, %r98, %r67; rem.s32 %r160, %r99, %r67; $L__BB0_36: add.s32 %r100, %r160, %r5; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; $L__BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra $L__BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra $L__BB0_40; bra.uni $L__BB0_39; $L__BB0_40: add.s32 %r103, %r68, -1; min.s32 %r161, %r27, %r103; bra.uni $L__BB0_41; $L__BB0_39: rem.s32 %r101, %r27, %r68; add.s32 %r102, %r101, %r68; rem.s32 %r161, %r102, %r68; $L__BB0_41: add.s32 %r104, %r161, %r4; mad.lo.s32 %r105, %r104, %r67, %r1; mul.wide.s32 %rd50, %r105, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r162, %r31, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r106, %r31, %r68; add.s32 %r107, %r106, %r68; rem.s32 %r162, %r107, %r68; 
$L__BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f84, %f18; add.s32 %r108, %r162, %r4; mad.lo.s32 %r109, %r108, %r67, %r1; mul.wide.s32 %rd52, %r109, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra $L__BB0_46; bra.uni $L__BB0_45; $L__BB0_46: add.s32 %r112, %r68, -1; min.s32 %r163, %r27, %r112; bra.uni $L__BB0_47; $L__BB0_45: rem.s32 %r110, %r27, %r68; add.s32 %r111, %r110, %r68; rem.s32 %r163, %r111, %r68; $L__BB0_47: add.s32 %r113, %r163, %r4; mad.lo.s32 %r114, %r113, %r67, %r1; mul.wide.s32 %rd54, %r114, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r164, %r31, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r115, %r31, %r68; add.s32 %r116, %r115, %r68; rem.s32 %r164, %r116, %r68; $L__BB0_50: add.s32 %r117, %r164, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r121, %r68, -1; min.s32 %r165, %r27, %r121; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r119, %r27, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r165, %r120, %r68; $L__BB0_53: add.s32 %r122, %r165, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd58, %r123, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra $L__BB0_55; bra.uni $L__BB0_54; $L__BB0_55: max.s32 %r166, %r31, 0; bra.uni $L__BB0_56; $L__BB0_54: rem.s32 %r124, %r31, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r166, %r125, %r68; $L__BB0_56: add.s32 %r126, %r166, %r4; mad.lo.s32 %r127, %r126, %r67, %r1; mul.wide.s32 %rd60, %r127, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; $L__BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra $L__BB0_77; div.rn.f32 %f89, %f11, %f66; 
mul.f32 %f45, %f89, %f19; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r130, %r69, -1; min.s32 %r167, %r47, %r130; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r128, %r47, %r69; add.s32 %r129, %r128, %r69; rem.s32 %r167, %r129, %r69; $L__BB0_61: mad.lo.s32 %r131, %r167, %r68, %r2; mad.lo.s32 %r132, %r131, %r67, %r1; mul.wide.s32 %rd62, %r132, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: max.s32 %r168, %r51, 0; bra.uni $L__BB0_64; $L__BB0_62: rem.s32 %r133, %r51, %r69; add.s32 %r134, %r133, %r69; rem.s32 %r168, %r134, %r69; $L__BB0_64: mad.lo.s32 %r135, %r168, %r68, %r2; mad.lo.s32 %r136, %r135, %r67, %r1; mul.wide.s32 %rd64, %r136, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: add.s32 %r139, %r69, -1; min.s32 %r169, %r47, %r139; bra.uni $L__BB0_67; $L__BB0_65: rem.s32 %r137, %r47, %r69; add.s32 %r138, %r137, %r69; rem.s32 %r169, %r138, %r69; $L__BB0_67: mad.lo.s32 %r140, %r169, %r68, %r2; mad.lo.s32 %r141, %r140, %r67, %r1; mul.wide.s32 %rd66, %r141, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra $L__BB0_69; bra.uni $L__BB0_68; $L__BB0_69: max.s32 %r170, %r51, 0; bra.uni $L__BB0_70; $L__BB0_68: rem.s32 %r142, %r51, %r69; add.s32 %r143, %r142, %r69; rem.s32 %r170, %r143, %r69; $L__BB0_70: mad.lo.s32 %r144, %r170, %r68, %r2; mad.lo.s32 %r145, %r144, %r67, %r1; mul.wide.s32 %rd68, %r145, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: add.s32 %r148, %r69, -1; min.s32 %r171, %r47, %r148; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r146, %r47, %r69; add.s32 %r147, %r146, %r69; rem.s32 %r171, %r147, %r69; $L__BB0_73: mad.lo.s32 %r149, %r171, %r68, %r2; mad.lo.s32 %r150, %r149, %r67, 
%r1; mul.wide.s32 %rd70, %r150, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra $L__BB0_75; bra.uni $L__BB0_74; $L__BB0_75: max.s32 %r172, %r51, 0; bra.uni $L__BB0_76; $L__BB0_74: rem.s32 %r151, %r51, %r69; add.s32 %r152, %r151, %r69; rem.s32 %r172, %r152, %r69; $L__BB0_76: mad.lo.s32 %r153, %r172, %r68, %r2; mad.lo.s32 %r154, %r153, %r67, %r1; mul.wide.s32 %rd72, %r154, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; $L__BB0_77: mul.wide.s32 %rd74, %r6, 4; add.s64 %rd75, %rd3, %rd74; add.s64 %rd76, %rd2, %rd74; add.s64 %rd77, %rd1, %rd74; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd76]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd77]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd75]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; cvta.to.global.u64 %rd78, %rd4; add.s64 %rd79, %rd78, %rd74; ld.global.f32 %f126, [%rd79]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd79], %f127; cvta.to.global.u64 %rd80, %rd5; add.s64 %rd81, %rd80, %rd74; ld.global.f32 %f128, [%rd81]; fma.rn.f32 %f129, %f96, %f124, %f128; 
st.global.f32 [%rd81], %f129; cvta.to.global.u64 %rd82, %rd6; add.s64 %rd83, %rd82, %rd74; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; $L__BB0_78: ret; } ` addzhanglitorque2_ptx_53 = ` .version 8.5 .target sm_53 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<5>; .reg .f32 %f<149>; .reg .b32 %r<173>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, 
[addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r71, %r70, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r74, %r73, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r77, %r76, %r78; setp.ge.s32 %p1, %r1, %r67; setp.ge.s32 %p2, %r2, %r68; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; $L__BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; $L__BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_7; 
cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; $L__BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra $L__BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; $L__BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra $L__BB0_11; rcp.rn.f32 %f136, %f135; $L__BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; $L__BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; $L__BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; $L__BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f143, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra $L__BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r82, %r67, -1; min.s32 %r155, %r7, %r82; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r155, %r81, %r67; $L__BB0_21: add.s32 %r83, %r155, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra $L__BB0_23; bra.uni $L__BB0_22; 
$L__BB0_23: max.s32 %r156, %r11, 0; bra.uni $L__BB0_24; $L__BB0_22: rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r156, %r85, %r67; $L__BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f79, %f20; add.s32 %r86, %r156, %r5; mul.wide.s32 %rd40, %r86, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: add.s32 %r89, %r67, -1; min.s32 %r157, %r7, %r89; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r87, %r7, %r67; add.s32 %r88, %r87, %r67; rem.s32 %r157, %r88, %r67; $L__BB0_27: add.s32 %r90, %r157, %r5; mul.wide.s32 %rd42, %r90, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: max.s32 %r158, %r11, 0; bra.uni $L__BB0_30; $L__BB0_28: rem.s32 %r91, %r11, %r67; add.s32 %r92, %r91, %r67; rem.s32 %r158, %r92, %r67; $L__BB0_30: add.s32 %r93, %r158, %r5; mul.wide.s32 %rd44, %r93, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r96, %r67, -1; min.s32 %r159, %r7, %r96; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r94, %r7, %r67; add.s32 %r95, %r94, %r67; rem.s32 %r159, %r95, %r67; $L__BB0_33: add.s32 %r97, %r159, %r5; mul.wide.s32 %rd46, %r97, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r160, %r11, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r98, %r11, %r67; add.s32 %r99, %r98, %r67; rem.s32 %r160, %r99, %r67; $L__BB0_36: add.s32 %r100, %r160, %r5; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; $L__BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra $L__BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 
bra $L__BB0_40; bra.uni $L__BB0_39; $L__BB0_40: add.s32 %r103, %r68, -1; min.s32 %r161, %r27, %r103; bra.uni $L__BB0_41; $L__BB0_39: rem.s32 %r101, %r27, %r68; add.s32 %r102, %r101, %r68; rem.s32 %r161, %r102, %r68; $L__BB0_41: add.s32 %r104, %r161, %r4; mad.lo.s32 %r105, %r104, %r67, %r1; mul.wide.s32 %rd50, %r105, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r162, %r31, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r106, %r31, %r68; add.s32 %r107, %r106, %r68; rem.s32 %r162, %r107, %r68; $L__BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f84, %f18; add.s32 %r108, %r162, %r4; mad.lo.s32 %r109, %r108, %r67, %r1; mul.wide.s32 %rd52, %r109, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra $L__BB0_46; bra.uni $L__BB0_45; $L__BB0_46: add.s32 %r112, %r68, -1; min.s32 %r163, %r27, %r112; bra.uni $L__BB0_47; $L__BB0_45: rem.s32 %r110, %r27, %r68; add.s32 %r111, %r110, %r68; rem.s32 %r163, %r111, %r68; $L__BB0_47: add.s32 %r113, %r163, %r4; mad.lo.s32 %r114, %r113, %r67, %r1; mul.wide.s32 %rd54, %r114, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r164, %r31, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r115, %r31, %r68; add.s32 %r116, %r115, %r68; rem.s32 %r164, %r116, %r68; $L__BB0_50: add.s32 %r117, %r164, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r121, %r68, -1; min.s32 %r165, %r27, %r121; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r119, %r27, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r165, %r120, %r68; $L__BB0_53: add.s32 %r122, %r165, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd58, %r123, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, 
[%rd59]; @%p22 bra $L__BB0_55; bra.uni $L__BB0_54; $L__BB0_55: max.s32 %r166, %r31, 0; bra.uni $L__BB0_56; $L__BB0_54: rem.s32 %r124, %r31, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r166, %r125, %r68; $L__BB0_56: add.s32 %r126, %r166, %r4; mad.lo.s32 %r127, %r126, %r67, %r1; mul.wide.s32 %rd60, %r127, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; $L__BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra $L__BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f89, %f19; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r130, %r69, -1; min.s32 %r167, %r47, %r130; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r128, %r47, %r69; add.s32 %r129, %r128, %r69; rem.s32 %r167, %r129, %r69; $L__BB0_61: mad.lo.s32 %r131, %r167, %r68, %r2; mad.lo.s32 %r132, %r131, %r67, %r1; mul.wide.s32 %rd62, %r132, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: max.s32 %r168, %r51, 0; bra.uni $L__BB0_64; $L__BB0_62: rem.s32 %r133, %r51, %r69; add.s32 %r134, %r133, %r69; rem.s32 %r168, %r134, %r69; $L__BB0_64: mad.lo.s32 %r135, %r168, %r68, %r2; mad.lo.s32 %r136, %r135, %r67, %r1; mul.wide.s32 %rd64, %r136, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: add.s32 %r139, %r69, -1; min.s32 %r169, %r47, %r139; bra.uni $L__BB0_67; $L__BB0_65: rem.s32 %r137, %r47, %r69; add.s32 %r138, %r137, %r69; rem.s32 %r169, %r138, %r69; $L__BB0_67: mad.lo.s32 %r140, %r169, %r68, %r2; mad.lo.s32 %r141, %r140, %r67, %r1; mul.wide.s32 %rd66, %r141, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra $L__BB0_69; bra.uni $L__BB0_68; $L__BB0_69: max.s32 %r170, %r51, 0; bra.uni 
$L__BB0_70; $L__BB0_68: rem.s32 %r142, %r51, %r69; add.s32 %r143, %r142, %r69; rem.s32 %r170, %r143, %r69; $L__BB0_70: mad.lo.s32 %r144, %r170, %r68, %r2; mad.lo.s32 %r145, %r144, %r67, %r1; mul.wide.s32 %rd68, %r145, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: add.s32 %r148, %r69, -1; min.s32 %r171, %r47, %r148; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r146, %r47, %r69; add.s32 %r147, %r146, %r69; rem.s32 %r171, %r147, %r69; $L__BB0_73: mad.lo.s32 %r149, %r171, %r68, %r2; mad.lo.s32 %r150, %r149, %r67, %r1; mul.wide.s32 %rd70, %r150, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra $L__BB0_75; bra.uni $L__BB0_74; $L__BB0_75: max.s32 %r172, %r51, 0; bra.uni $L__BB0_76; $L__BB0_74: rem.s32 %r151, %r51, %r69; add.s32 %r152, %r151, %r69; rem.s32 %r172, %r152, %r69; $L__BB0_76: mad.lo.s32 %r153, %r172, %r68, %r2; mad.lo.s32 %r154, %r153, %r67, %r1; mul.wide.s32 %rd72, %r154, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; $L__BB0_77: mul.wide.s32 %rd74, %r6, 4; add.s64 %rd75, %rd3, %rd74; add.s64 %rd76, %rd2, %rd74; add.s64 %rd77, %rd1, %rd74; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd76]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd77]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd75]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; 
mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; cvta.to.global.u64 %rd78, %rd4; add.s64 %rd79, %rd78, %rd74; ld.global.f32 %f126, [%rd79]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd79], %f127; cvta.to.global.u64 %rd80, %rd5; add.s64 %rd81, %rd80, %rd74; ld.global.f32 %f128, [%rd81]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd81], %f129; cvta.to.global.u64 %rd82, %rd6; add.s64 %rd83, %rd82, %rd74; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; $L__BB0_78: ret; } ` addzhanglitorque2_ptx_60 = ` .version 8.5 .target sm_60 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 
addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<5>; .reg .f32 %f<149>; .reg .b32 %r<173>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r71, %r70, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r74, %r73, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r77, %r76, %r78; setp.ge.s32 %p1, %r1, %r67; setp.ge.s32 %p2, %r2, %r68; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, 
%p4; @%p5 bra $L__BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; $L__BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; $L__BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; $L__BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra $L__BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; $L__BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra $L__BB0_11; rcp.rn.f32 %f136, %f135; $L__BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; $L__BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; $L__BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; $L__BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; setp.eq.f32 
%p14, %f20, 0f00000000; mov.f32 %f143, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra $L__BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r82, %r67, -1; min.s32 %r155, %r7, %r82; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r155, %r81, %r67; $L__BB0_21: add.s32 %r83, %r155, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra $L__BB0_23; bra.uni $L__BB0_22; $L__BB0_23: max.s32 %r156, %r11, 0; bra.uni $L__BB0_24; $L__BB0_22: rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r156, %r85, %r67; $L__BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f79, %f20; add.s32 %r86, %r156, %r5; mul.wide.s32 %rd40, %r86, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: add.s32 %r89, %r67, -1; min.s32 %r157, %r7, %r89; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r87, %r7, %r67; add.s32 %r88, %r87, %r67; rem.s32 %r157, %r88, %r67; $L__BB0_27: add.s32 %r90, %r157, %r5; mul.wide.s32 %rd42, %r90, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: max.s32 %r158, %r11, 0; bra.uni $L__BB0_30; $L__BB0_28: rem.s32 %r91, %r11, %r67; add.s32 %r92, %r91, %r67; rem.s32 %r158, %r92, %r67; $L__BB0_30: add.s32 %r93, %r158, %r5; mul.wide.s32 %rd44, %r93, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r96, %r67, -1; min.s32 %r159, %r7, %r96; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r94, %r7, %r67; add.s32 %r95, %r94, %r67; rem.s32 %r159, %r95, %r67; $L__BB0_33: add.s32 %r97, %r159, %r5; mul.wide.s32 %rd46, %r97, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra $L__BB0_35; 
bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r160, %r11, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r98, %r11, %r67; add.s32 %r99, %r98, %r67; rem.s32 %r160, %r99, %r67; $L__BB0_36: add.s32 %r100, %r160, %r5; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; $L__BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra $L__BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra $L__BB0_40; bra.uni $L__BB0_39; $L__BB0_40: add.s32 %r103, %r68, -1; min.s32 %r161, %r27, %r103; bra.uni $L__BB0_41; $L__BB0_39: rem.s32 %r101, %r27, %r68; add.s32 %r102, %r101, %r68; rem.s32 %r161, %r102, %r68; $L__BB0_41: add.s32 %r104, %r161, %r4; mad.lo.s32 %r105, %r104, %r67, %r1; mul.wide.s32 %rd50, %r105, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r162, %r31, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r106, %r31, %r68; add.s32 %r107, %r106, %r68; rem.s32 %r162, %r107, %r68; $L__BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f84, %f18; add.s32 %r108, %r162, %r4; mad.lo.s32 %r109, %r108, %r67, %r1; mul.wide.s32 %rd52, %r109, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra $L__BB0_46; bra.uni $L__BB0_45; $L__BB0_46: add.s32 %r112, %r68, -1; min.s32 %r163, %r27, %r112; bra.uni $L__BB0_47; $L__BB0_45: rem.s32 %r110, %r27, %r68; add.s32 %r111, %r110, %r68; rem.s32 %r163, %r111, %r68; $L__BB0_47: add.s32 %r113, %r163, %r4; mad.lo.s32 %r114, %r113, %r67, %r1; mul.wide.s32 %rd54, %r114, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r164, %r31, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r115, %r31, %r68; add.s32 %r116, %r115, %r68; rem.s32 
%r164, %r116, %r68; $L__BB0_50: add.s32 %r117, %r164, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r121, %r68, -1; min.s32 %r165, %r27, %r121; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r119, %r27, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r165, %r120, %r68; $L__BB0_53: add.s32 %r122, %r165, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd58, %r123, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra $L__BB0_55; bra.uni $L__BB0_54; $L__BB0_55: max.s32 %r166, %r31, 0; bra.uni $L__BB0_56; $L__BB0_54: rem.s32 %r124, %r31, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r166, %r125, %r68; $L__BB0_56: add.s32 %r126, %r166, %r4; mad.lo.s32 %r127, %r126, %r67, %r1; mul.wide.s32 %rd60, %r127, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; $L__BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra $L__BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f89, %f19; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r130, %r69, -1; min.s32 %r167, %r47, %r130; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r128, %r47, %r69; add.s32 %r129, %r128, %r69; rem.s32 %r167, %r129, %r69; $L__BB0_61: mad.lo.s32 %r131, %r167, %r68, %r2; mad.lo.s32 %r132, %r131, %r67, %r1; mul.wide.s32 %rd62, %r132, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: max.s32 %r168, %r51, 0; bra.uni $L__BB0_64; $L__BB0_62: rem.s32 %r133, %r51, %r69; add.s32 %r134, %r133, %r69; rem.s32 %r168, %r134, %r69; $L__BB0_64: mad.lo.s32 %r135, %r168, %r68, %r2; mad.lo.s32 %r136, %r135, %r67, %r1; mul.wide.s32 
%rd64, %r136, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: add.s32 %r139, %r69, -1; min.s32 %r169, %r47, %r139; bra.uni $L__BB0_67; $L__BB0_65: rem.s32 %r137, %r47, %r69; add.s32 %r138, %r137, %r69; rem.s32 %r169, %r138, %r69; $L__BB0_67: mad.lo.s32 %r140, %r169, %r68, %r2; mad.lo.s32 %r141, %r140, %r67, %r1; mul.wide.s32 %rd66, %r141, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra $L__BB0_69; bra.uni $L__BB0_68; $L__BB0_69: max.s32 %r170, %r51, 0; bra.uni $L__BB0_70; $L__BB0_68: rem.s32 %r142, %r51, %r69; add.s32 %r143, %r142, %r69; rem.s32 %r170, %r143, %r69; $L__BB0_70: mad.lo.s32 %r144, %r170, %r68, %r2; mad.lo.s32 %r145, %r144, %r67, %r1; mul.wide.s32 %rd68, %r145, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: add.s32 %r148, %r69, -1; min.s32 %r171, %r47, %r148; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r146, %r47, %r69; add.s32 %r147, %r146, %r69; rem.s32 %r171, %r147, %r69; $L__BB0_73: mad.lo.s32 %r149, %r171, %r68, %r2; mad.lo.s32 %r150, %r149, %r67, %r1; mul.wide.s32 %rd70, %r150, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra $L__BB0_75; bra.uni $L__BB0_74; $L__BB0_75: max.s32 %r172, %r51, 0; bra.uni $L__BB0_76; $L__BB0_74: rem.s32 %r151, %r51, %r69; add.s32 %r152, %r151, %r69; rem.s32 %r172, %r152, %r69; $L__BB0_76: mad.lo.s32 %r153, %r172, %r68, %r2; mad.lo.s32 %r154, %r153, %r67, %r1; mul.wide.s32 %rd72, %r154, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; $L__BB0_77: mul.wide.s32 %rd74, %r6, 4; add.s64 %rd75, %rd3, %rd74; add.s64 %rd76, %rd2, %rd74; add.s64 %rd77, %rd1, %rd74; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 
%f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd76]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd77]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd75]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; cvta.to.global.u64 %rd78, %rd4; add.s64 %rd79, %rd78, %rd74; ld.global.f32 %f126, [%rd79]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd79], %f127; cvta.to.global.u64 %rd80, %rd5; add.s64 %rd81, %rd80, %rd74; ld.global.f32 %f128, [%rd81]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd81], %f129; cvta.to.global.u64 %rd82, %rd6; add.s64 %rd83, %rd82, %rd74; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; $L__BB0_78: ret; } ` addzhanglitorque2_ptx_61 = ` .version 8.5 .target sm_61 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 
addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<5>; .reg .f32 %f<149>; .reg .b32 %r<173>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, 
[addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r71, %r70, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r74, %r73, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r77, %r76, %r78; setp.ge.s32 %p1, %r1, %r67; setp.ge.s32 %p2, %r2, %r68; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; $L__BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; $L__BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; $L__BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra $L__BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; $L__BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra $L__BB0_11; rcp.rn.f32 %f136, %f135; $L__BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 
%rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; $L__BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; $L__BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; $L__BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f143, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra $L__BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r82, %r67, -1; min.s32 %r155, %r7, %r82; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r155, %r81, %r67; $L__BB0_21: add.s32 %r83, %r155, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra $L__BB0_23; bra.uni $L__BB0_22; $L__BB0_23: max.s32 %r156, %r11, 0; bra.uni $L__BB0_24; $L__BB0_22: rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r156, %r85, %r67; $L__BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f79, %f20; add.s32 %r86, %r156, %r5; mul.wide.s32 %rd40, %r86, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: add.s32 %r89, %r67, -1; min.s32 %r157, %r7, %r89; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r87, %r7, %r67; add.s32 %r88, %r87, %r67; rem.s32 %r157, %r88, %r67; $L__BB0_27: add.s32 %r90, %r157, %r5; mul.wide.s32 %rd42, %r90, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: max.s32 %r158, %r11, 0; bra.uni $L__BB0_30; 
$L__BB0_28: rem.s32 %r91, %r11, %r67; add.s32 %r92, %r91, %r67; rem.s32 %r158, %r92, %r67; $L__BB0_30: add.s32 %r93, %r158, %r5; mul.wide.s32 %rd44, %r93, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r96, %r67, -1; min.s32 %r159, %r7, %r96; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r94, %r7, %r67; add.s32 %r95, %r94, %r67; rem.s32 %r159, %r95, %r67; $L__BB0_33: add.s32 %r97, %r159, %r5; mul.wide.s32 %rd46, %r97, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r160, %r11, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r98, %r11, %r67; add.s32 %r99, %r98, %r67; rem.s32 %r160, %r99, %r67; $L__BB0_36: add.s32 %r100, %r160, %r5; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; $L__BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra $L__BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra $L__BB0_40; bra.uni $L__BB0_39; $L__BB0_40: add.s32 %r103, %r68, -1; min.s32 %r161, %r27, %r103; bra.uni $L__BB0_41; $L__BB0_39: rem.s32 %r101, %r27, %r68; add.s32 %r102, %r101, %r68; rem.s32 %r161, %r102, %r68; $L__BB0_41: add.s32 %r104, %r161, %r4; mad.lo.s32 %r105, %r104, %r67, %r1; mul.wide.s32 %rd50, %r105, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r162, %r31, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r106, %r31, %r68; add.s32 %r107, %r106, %r68; rem.s32 %r162, %r107, %r68; $L__BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f84, %f18; add.s32 %r108, %r162, %r4; mad.lo.s32 %r109, %r108, %r67, %r1; mul.wide.s32 %rd52, %r109, 4; add.s64 %rd53, %rd3, %rd52; 
ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra $L__BB0_46; bra.uni $L__BB0_45; $L__BB0_46: add.s32 %r112, %r68, -1; min.s32 %r163, %r27, %r112; bra.uni $L__BB0_47; $L__BB0_45: rem.s32 %r110, %r27, %r68; add.s32 %r111, %r110, %r68; rem.s32 %r163, %r111, %r68; $L__BB0_47: add.s32 %r113, %r163, %r4; mad.lo.s32 %r114, %r113, %r67, %r1; mul.wide.s32 %rd54, %r114, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r164, %r31, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r115, %r31, %r68; add.s32 %r116, %r115, %r68; rem.s32 %r164, %r116, %r68; $L__BB0_50: add.s32 %r117, %r164, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r121, %r68, -1; min.s32 %r165, %r27, %r121; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r119, %r27, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r165, %r120, %r68; $L__BB0_53: add.s32 %r122, %r165, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd58, %r123, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra $L__BB0_55; bra.uni $L__BB0_54; $L__BB0_55: max.s32 %r166, %r31, 0; bra.uni $L__BB0_56; $L__BB0_54: rem.s32 %r124, %r31, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r166, %r125, %r68; $L__BB0_56: add.s32 %r126, %r166, %r4; mad.lo.s32 %r127, %r126, %r67, %r1; mul.wide.s32 %rd60, %r127, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; $L__BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra $L__BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f89, %f19; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r130, %r69, -1; min.s32 
%r167, %r47, %r130; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r128, %r47, %r69; add.s32 %r129, %r128, %r69; rem.s32 %r167, %r129, %r69; $L__BB0_61: mad.lo.s32 %r131, %r167, %r68, %r2; mad.lo.s32 %r132, %r131, %r67, %r1; mul.wide.s32 %rd62, %r132, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: max.s32 %r168, %r51, 0; bra.uni $L__BB0_64; $L__BB0_62: rem.s32 %r133, %r51, %r69; add.s32 %r134, %r133, %r69; rem.s32 %r168, %r134, %r69; $L__BB0_64: mad.lo.s32 %r135, %r168, %r68, %r2; mad.lo.s32 %r136, %r135, %r67, %r1; mul.wide.s32 %rd64, %r136, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: add.s32 %r139, %r69, -1; min.s32 %r169, %r47, %r139; bra.uni $L__BB0_67; $L__BB0_65: rem.s32 %r137, %r47, %r69; add.s32 %r138, %r137, %r69; rem.s32 %r169, %r138, %r69; $L__BB0_67: mad.lo.s32 %r140, %r169, %r68, %r2; mad.lo.s32 %r141, %r140, %r67, %r1; mul.wide.s32 %rd66, %r141, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra $L__BB0_69; bra.uni $L__BB0_68; $L__BB0_69: max.s32 %r170, %r51, 0; bra.uni $L__BB0_70; $L__BB0_68: rem.s32 %r142, %r51, %r69; add.s32 %r143, %r142, %r69; rem.s32 %r170, %r143, %r69; $L__BB0_70: mad.lo.s32 %r144, %r170, %r68, %r2; mad.lo.s32 %r145, %r144, %r67, %r1; mul.wide.s32 %rd68, %r145, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: add.s32 %r148, %r69, -1; min.s32 %r171, %r47, %r148; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r146, %r47, %r69; add.s32 %r147, %r146, %r69; rem.s32 %r171, %r147, %r69; $L__BB0_73: mad.lo.s32 %r149, %r171, %r68, %r2; mad.lo.s32 %r150, %r149, %r67, %r1; mul.wide.s32 %rd70, %r150, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra $L__BB0_75; bra.uni $L__BB0_74; $L__BB0_75: max.s32 %r172, %r51, 0; bra.uni 
$L__BB0_76; $L__BB0_74: rem.s32 %r151, %r51, %r69; add.s32 %r152, %r151, %r69; rem.s32 %r172, %r152, %r69; $L__BB0_76: mad.lo.s32 %r153, %r172, %r68, %r2; mad.lo.s32 %r154, %r153, %r67, %r1; mul.wide.s32 %rd72, %r154, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; $L__BB0_77: mul.wide.s32 %rd74, %r6, 4; add.s64 %rd75, %rd3, %rd74; add.s64 %rd76, %rd2, %rd74; add.s64 %rd77, %rd1, %rd74; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd76]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd77]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd75]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; cvta.to.global.u64 %rd78, %rd4; add.s64 %rd79, %rd78, %rd74; ld.global.f32 %f126, [%rd79]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd79], %f127; cvta.to.global.u64 %rd80, %rd5; add.s64 %rd81, %rd80, %rd74; ld.global.f32 %f128, [%rd81]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd81], %f129; cvta.to.global.u64 %rd82, %rd6; add.s64 %rd83, %rd82, %rd74; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], 
%f131; $L__BB0_78: ret; } ` addzhanglitorque2_ptx_62 = ` .version 8.5 .target sm_62 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<5>; .reg .f32 %f<149>; .reg .b32 %r<173>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, 
[addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r71, %r70, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r74, %r73, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r77, %r76, %r78; setp.ge.s32 %p1, %r1, %r67; setp.ge.s32 %p2, %r2, %r68; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; $L__BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; $L__BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; $L__BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra 
$L__BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; $L__BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra $L__BB0_11; rcp.rn.f32 %f136, %f135; $L__BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; $L__BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; $L__BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; $L__BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f143, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra $L__BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r82, %r67, -1; min.s32 %r155, %r7, %r82; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r155, %r81, %r67; $L__BB0_21: add.s32 %r83, %r155, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra $L__BB0_23; bra.uni $L__BB0_22; $L__BB0_23: max.s32 %r156, %r11, 0; bra.uni $L__BB0_24; $L__BB0_22: rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r156, %r85, %r67; $L__BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 
%f22, %f79, %f20; add.s32 %r86, %r156, %r5; mul.wide.s32 %rd40, %r86, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: add.s32 %r89, %r67, -1; min.s32 %r157, %r7, %r89; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r87, %r7, %r67; add.s32 %r88, %r87, %r67; rem.s32 %r157, %r88, %r67; $L__BB0_27: add.s32 %r90, %r157, %r5; mul.wide.s32 %rd42, %r90, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: max.s32 %r158, %r11, 0; bra.uni $L__BB0_30; $L__BB0_28: rem.s32 %r91, %r11, %r67; add.s32 %r92, %r91, %r67; rem.s32 %r158, %r92, %r67; $L__BB0_30: add.s32 %r93, %r158, %r5; mul.wide.s32 %rd44, %r93, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r96, %r67, -1; min.s32 %r159, %r7, %r96; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r94, %r7, %r67; add.s32 %r95, %r94, %r67; rem.s32 %r159, %r95, %r67; $L__BB0_33: add.s32 %r97, %r159, %r5; mul.wide.s32 %rd46, %r97, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r160, %r11, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r98, %r11, %r67; add.s32 %r99, %r98, %r67; rem.s32 %r160, %r99, %r67; $L__BB0_36: add.s32 %r100, %r160, %r5; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; $L__BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra $L__BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra $L__BB0_40; bra.uni $L__BB0_39; $L__BB0_40: add.s32 %r103, %r68, -1; min.s32 %r161, %r27, %r103; bra.uni $L__BB0_41; $L__BB0_39: rem.s32 %r101, %r27, %r68; add.s32 %r102, %r101, %r68; rem.s32 
%r161, %r102, %r68; $L__BB0_41: add.s32 %r104, %r161, %r4; mad.lo.s32 %r105, %r104, %r67, %r1; mul.wide.s32 %rd50, %r105, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r162, %r31, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r106, %r31, %r68; add.s32 %r107, %r106, %r68; rem.s32 %r162, %r107, %r68; $L__BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f84, %f18; add.s32 %r108, %r162, %r4; mad.lo.s32 %r109, %r108, %r67, %r1; mul.wide.s32 %rd52, %r109, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra $L__BB0_46; bra.uni $L__BB0_45; $L__BB0_46: add.s32 %r112, %r68, -1; min.s32 %r163, %r27, %r112; bra.uni $L__BB0_47; $L__BB0_45: rem.s32 %r110, %r27, %r68; add.s32 %r111, %r110, %r68; rem.s32 %r163, %r111, %r68; $L__BB0_47: add.s32 %r113, %r163, %r4; mad.lo.s32 %r114, %r113, %r67, %r1; mul.wide.s32 %rd54, %r114, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r164, %r31, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r115, %r31, %r68; add.s32 %r116, %r115, %r68; rem.s32 %r164, %r116, %r68; $L__BB0_50: add.s32 %r117, %r164, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r121, %r68, -1; min.s32 %r165, %r27, %r121; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r119, %r27, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r165, %r120, %r68; $L__BB0_53: add.s32 %r122, %r165, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd58, %r123, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra $L__BB0_55; bra.uni $L__BB0_54; $L__BB0_55: max.s32 %r166, %r31, 0; bra.uni $L__BB0_56; $L__BB0_54: rem.s32 %r124, %r31, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r166, %r125, %r68; 
$L__BB0_56: add.s32 %r126, %r166, %r4; mad.lo.s32 %r127, %r126, %r67, %r1; mul.wide.s32 %rd60, %r127, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; $L__BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra $L__BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f89, %f19; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r130, %r69, -1; min.s32 %r167, %r47, %r130; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r128, %r47, %r69; add.s32 %r129, %r128, %r69; rem.s32 %r167, %r129, %r69; $L__BB0_61: mad.lo.s32 %r131, %r167, %r68, %r2; mad.lo.s32 %r132, %r131, %r67, %r1; mul.wide.s32 %rd62, %r132, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: max.s32 %r168, %r51, 0; bra.uni $L__BB0_64; $L__BB0_62: rem.s32 %r133, %r51, %r69; add.s32 %r134, %r133, %r69; rem.s32 %r168, %r134, %r69; $L__BB0_64: mad.lo.s32 %r135, %r168, %r68, %r2; mad.lo.s32 %r136, %r135, %r67, %r1; mul.wide.s32 %rd64, %r136, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: add.s32 %r139, %r69, -1; min.s32 %r169, %r47, %r139; bra.uni $L__BB0_67; $L__BB0_65: rem.s32 %r137, %r47, %r69; add.s32 %r138, %r137, %r69; rem.s32 %r169, %r138, %r69; $L__BB0_67: mad.lo.s32 %r140, %r169, %r68, %r2; mad.lo.s32 %r141, %r140, %r67, %r1; mul.wide.s32 %rd66, %r141, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra $L__BB0_69; bra.uni $L__BB0_68; $L__BB0_69: max.s32 %r170, %r51, 0; bra.uni $L__BB0_70; $L__BB0_68: rem.s32 %r142, %r51, %r69; add.s32 %r143, %r142, %r69; rem.s32 %r170, %r143, %r69; $L__BB0_70: mad.lo.s32 %r144, %r170, %r68, %r2; mad.lo.s32 %r145, %r144, %r67, %r1; mul.wide.s32 
%rd68, %r145, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: add.s32 %r148, %r69, -1; min.s32 %r171, %r47, %r148; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r146, %r47, %r69; add.s32 %r147, %r146, %r69; rem.s32 %r171, %r147, %r69; $L__BB0_73: mad.lo.s32 %r149, %r171, %r68, %r2; mad.lo.s32 %r150, %r149, %r67, %r1; mul.wide.s32 %rd70, %r150, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra $L__BB0_75; bra.uni $L__BB0_74; $L__BB0_75: max.s32 %r172, %r51, 0; bra.uni $L__BB0_76; $L__BB0_74: rem.s32 %r151, %r51, %r69; add.s32 %r152, %r151, %r69; rem.s32 %r172, %r152, %r69; $L__BB0_76: mad.lo.s32 %r153, %r172, %r68, %r2; mad.lo.s32 %r154, %r153, %r67, %r1; mul.wide.s32 %rd72, %r154, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; $L__BB0_77: mul.wide.s32 %rd74, %r6, 4; add.s64 %rd75, %rd3, %rd74; add.s64 %rd76, %rd2, %rd74; add.s64 %rd77, %rd1, %rd74; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd76]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd77]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd75]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; 
fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; cvta.to.global.u64 %rd78, %rd4; add.s64 %rd79, %rd78, %rd74; ld.global.f32 %f126, [%rd79]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd79], %f127; cvta.to.global.u64 %rd80, %rd5; add.s64 %rd81, %rd80, %rd74; ld.global.f32 %f128, [%rd81]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd81], %f129; cvta.to.global.u64 %rd82, %rd6; add.s64 %rd83, %rd82, %rd74; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; $L__BB0_78: ret; } ` addzhanglitorque2_ptx_70 = ` .version 8.5 .target sm_70 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<5>; .reg .f32 %f<149>; .reg .b32 %r<173>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; ld.param.u64 
%rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r71, %r70, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r74, %r73, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r77, %r76, %r78; setp.ge.s32 %p1, %r1, %r67; setp.ge.s32 %p2, %r2, %r68; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd17, 
%rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; $L__BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; $L__BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; $L__BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra $L__BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; $L__BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra $L__BB0_11; rcp.rn.f32 %f136, %f135; $L__BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; $L__BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; $L__BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; $L__BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f143, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra $L__BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra 
$L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r82, %r67, -1; min.s32 %r155, %r7, %r82; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r155, %r81, %r67; $L__BB0_21: add.s32 %r83, %r155, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra $L__BB0_23; bra.uni $L__BB0_22; $L__BB0_23: max.s32 %r156, %r11, 0; bra.uni $L__BB0_24; $L__BB0_22: rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r156, %r85, %r67; $L__BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f79, %f20; add.s32 %r86, %r156, %r5; mul.wide.s32 %rd40, %r86, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: add.s32 %r89, %r67, -1; min.s32 %r157, %r7, %r89; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r87, %r7, %r67; add.s32 %r88, %r87, %r67; rem.s32 %r157, %r88, %r67; $L__BB0_27: add.s32 %r90, %r157, %r5; mul.wide.s32 %rd42, %r90, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: max.s32 %r158, %r11, 0; bra.uni $L__BB0_30; $L__BB0_28: rem.s32 %r91, %r11, %r67; add.s32 %r92, %r91, %r67; rem.s32 %r158, %r92, %r67; $L__BB0_30: add.s32 %r93, %r158, %r5; mul.wide.s32 %rd44, %r93, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r96, %r67, -1; min.s32 %r159, %r7, %r96; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r94, %r7, %r67; add.s32 %r95, %r94, %r67; rem.s32 %r159, %r95, %r67; $L__BB0_33: add.s32 %r97, %r159, %r5; mul.wide.s32 %rd46, %r97, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r160, %r11, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r98, %r11, %r67; add.s32 %r99, %r98, %r67; rem.s32 %r160, %r99, %r67; $L__BB0_36: add.s32 %r100, %r160, 
%r5; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; $L__BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra $L__BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra $L__BB0_40; bra.uni $L__BB0_39; $L__BB0_40: add.s32 %r103, %r68, -1; min.s32 %r161, %r27, %r103; bra.uni $L__BB0_41; $L__BB0_39: rem.s32 %r101, %r27, %r68; add.s32 %r102, %r101, %r68; rem.s32 %r161, %r102, %r68; $L__BB0_41: add.s32 %r104, %r161, %r4; mad.lo.s32 %r105, %r104, %r67, %r1; mul.wide.s32 %rd50, %r105, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r162, %r31, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r106, %r31, %r68; add.s32 %r107, %r106, %r68; rem.s32 %r162, %r107, %r68; $L__BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f84, %f18; add.s32 %r108, %r162, %r4; mad.lo.s32 %r109, %r108, %r67, %r1; mul.wide.s32 %rd52, %r109, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra $L__BB0_46; bra.uni $L__BB0_45; $L__BB0_46: add.s32 %r112, %r68, -1; min.s32 %r163, %r27, %r112; bra.uni $L__BB0_47; $L__BB0_45: rem.s32 %r110, %r27, %r68; add.s32 %r111, %r110, %r68; rem.s32 %r163, %r111, %r68; $L__BB0_47: add.s32 %r113, %r163, %r4; mad.lo.s32 %r114, %r113, %r67, %r1; mul.wide.s32 %rd54, %r114, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r164, %r31, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r115, %r31, %r68; add.s32 %r116, %r115, %r68; rem.s32 %r164, %r116, %r68; $L__BB0_50: add.s32 %r117, %r164, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, 
%f36, %f86; @%p22 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r121, %r68, -1; min.s32 %r165, %r27, %r121; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r119, %r27, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r165, %r120, %r68; $L__BB0_53: add.s32 %r122, %r165, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd58, %r123, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra $L__BB0_55; bra.uni $L__BB0_54; $L__BB0_55: max.s32 %r166, %r31, 0; bra.uni $L__BB0_56; $L__BB0_54: rem.s32 %r124, %r31, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r166, %r125, %r68; $L__BB0_56: add.s32 %r126, %r166, %r4; mad.lo.s32 %r127, %r126, %r67, %r1; mul.wide.s32 %rd60, %r127, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; $L__BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra $L__BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f89, %f19; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r130, %r69, -1; min.s32 %r167, %r47, %r130; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r128, %r47, %r69; add.s32 %r129, %r128, %r69; rem.s32 %r167, %r129, %r69; $L__BB0_61: mad.lo.s32 %r131, %r167, %r68, %r2; mad.lo.s32 %r132, %r131, %r67, %r1; mul.wide.s32 %rd62, %r132, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: max.s32 %r168, %r51, 0; bra.uni $L__BB0_64; $L__BB0_62: rem.s32 %r133, %r51, %r69; add.s32 %r134, %r133, %r69; rem.s32 %r168, %r134, %r69; $L__BB0_64: mad.lo.s32 %r135, %r168, %r68, %r2; mad.lo.s32 %r136, %r135, %r67, %r1; mul.wide.s32 %rd64, %r136, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: add.s32 %r139, %r69, -1; min.s32 %r169, 
%r47, %r139; bra.uni $L__BB0_67; $L__BB0_65: rem.s32 %r137, %r47, %r69; add.s32 %r138, %r137, %r69; rem.s32 %r169, %r138, %r69; $L__BB0_67: mad.lo.s32 %r140, %r169, %r68, %r2; mad.lo.s32 %r141, %r140, %r67, %r1; mul.wide.s32 %rd66, %r141, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra $L__BB0_69; bra.uni $L__BB0_68; $L__BB0_69: max.s32 %r170, %r51, 0; bra.uni $L__BB0_70; $L__BB0_68: rem.s32 %r142, %r51, %r69; add.s32 %r143, %r142, %r69; rem.s32 %r170, %r143, %r69; $L__BB0_70: mad.lo.s32 %r144, %r170, %r68, %r2; mad.lo.s32 %r145, %r144, %r67, %r1; mul.wide.s32 %rd68, %r145, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: add.s32 %r148, %r69, -1; min.s32 %r171, %r47, %r148; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r146, %r47, %r69; add.s32 %r147, %r146, %r69; rem.s32 %r171, %r147, %r69; $L__BB0_73: mad.lo.s32 %r149, %r171, %r68, %r2; mad.lo.s32 %r150, %r149, %r67, %r1; mul.wide.s32 %rd70, %r150, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra $L__BB0_75; bra.uni $L__BB0_74; $L__BB0_75: max.s32 %r172, %r51, 0; bra.uni $L__BB0_76; $L__BB0_74: rem.s32 %r151, %r51, %r69; add.s32 %r152, %r151, %r69; rem.s32 %r172, %r152, %r69; $L__BB0_76: mad.lo.s32 %r153, %r172, %r68, %r2; mad.lo.s32 %r154, %r153, %r67, %r1; mul.wide.s32 %rd72, %r154, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; $L__BB0_77: mul.wide.s32 %rd74, %r6, 4; add.s64 %rd75, %rd3, %rd74; add.s64 %rd76, %rd2, %rd74; add.s64 %rd77, %rd1, %rd74; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd76]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd77]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, 
%f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd75]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; cvta.to.global.u64 %rd78, %rd4; add.s64 %rd79, %rd78, %rd74; ld.global.f32 %f126, [%rd79]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd79], %f127; cvta.to.global.u64 %rd80, %rd5; add.s64 %rd81, %rd80, %rd74; ld.global.f32 %f128, [%rd81]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd81], %f129; cvta.to.global.u64 %rd82, %rd6; add.s64 %rd83, %rd82, %rd74; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; $L__BB0_78: ret; } ` addzhanglitorque2_ptx_72 = ` .version 8.5 .target sm_72 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 
addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<5>; .reg .f32 %f<149>; .reg .b32 %r<173>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 
%r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r71, %r70, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r74, %r73, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r77, %r76, %r78; setp.ge.s32 %p1, %r1, %r67; setp.ge.s32 %p2, %r2, %r68; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; $L__BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; $L__BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; $L__BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra $L__BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; $L__BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra $L__BB0_11; rcp.rn.f32 %f136, %f135; $L__BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; $L__BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 
%rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; $L__BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; $L__BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f143, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra $L__BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r82, %r67, -1; min.s32 %r155, %r7, %r82; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r155, %r81, %r67; $L__BB0_21: add.s32 %r83, %r155, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra $L__BB0_23; bra.uni $L__BB0_22; $L__BB0_23: max.s32 %r156, %r11, 0; bra.uni $L__BB0_24; $L__BB0_22: rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r156, %r85, %r67; $L__BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f79, %f20; add.s32 %r86, %r156, %r5; mul.wide.s32 %rd40, %r86, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: add.s32 %r89, %r67, -1; min.s32 %r157, %r7, %r89; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r87, %r7, %r67; add.s32 %r88, %r87, %r67; rem.s32 %r157, %r88, %r67; $L__BB0_27: add.s32 %r90, %r157, %r5; mul.wide.s32 %rd42, %r90, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: max.s32 %r158, %r11, 0; bra.uni $L__BB0_30; $L__BB0_28: rem.s32 %r91, %r11, %r67; add.s32 %r92, %r91, %r67; rem.s32 %r158, %r92, %r67; $L__BB0_30: add.s32 %r93, %r158, %r5; mul.wide.s32 %rd44, %r93, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, 
[%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r96, %r67, -1; min.s32 %r159, %r7, %r96; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r94, %r7, %r67; add.s32 %r95, %r94, %r67; rem.s32 %r159, %r95, %r67; $L__BB0_33: add.s32 %r97, %r159, %r5; mul.wide.s32 %rd46, %r97, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r160, %r11, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r98, %r11, %r67; add.s32 %r99, %r98, %r67; rem.s32 %r160, %r99, %r67; $L__BB0_36: add.s32 %r100, %r160, %r5; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; $L__BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra $L__BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra $L__BB0_40; bra.uni $L__BB0_39; $L__BB0_40: add.s32 %r103, %r68, -1; min.s32 %r161, %r27, %r103; bra.uni $L__BB0_41; $L__BB0_39: rem.s32 %r101, %r27, %r68; add.s32 %r102, %r101, %r68; rem.s32 %r161, %r102, %r68; $L__BB0_41: add.s32 %r104, %r161, %r4; mad.lo.s32 %r105, %r104, %r67, %r1; mul.wide.s32 %rd50, %r105, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r162, %r31, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r106, %r31, %r68; add.s32 %r107, %r106, %r68; rem.s32 %r162, %r107, %r68; $L__BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f84, %f18; add.s32 %r108, %r162, %r4; mad.lo.s32 %r109, %r108, %r67, %r1; mul.wide.s32 %rd52, %r109, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra $L__BB0_46; bra.uni $L__BB0_45; $L__BB0_46: add.s32 %r112, %r68, -1; min.s32 %r163, %r27, %r112; bra.uni $L__BB0_47; $L__BB0_45: rem.s32 %r110, %r27, 
%r68; add.s32 %r111, %r110, %r68; rem.s32 %r163, %r111, %r68; $L__BB0_47: add.s32 %r113, %r163, %r4; mad.lo.s32 %r114, %r113, %r67, %r1; mul.wide.s32 %rd54, %r114, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r164, %r31, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r115, %r31, %r68; add.s32 %r116, %r115, %r68; rem.s32 %r164, %r116, %r68; $L__BB0_50: add.s32 %r117, %r164, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r121, %r68, -1; min.s32 %r165, %r27, %r121; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r119, %r27, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r165, %r120, %r68; $L__BB0_53: add.s32 %r122, %r165, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd58, %r123, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra $L__BB0_55; bra.uni $L__BB0_54; $L__BB0_55: max.s32 %r166, %r31, 0; bra.uni $L__BB0_56; $L__BB0_54: rem.s32 %r124, %r31, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r166, %r125, %r68; $L__BB0_56: add.s32 %r126, %r166, %r4; mad.lo.s32 %r127, %r126, %r67, %r1; mul.wide.s32 %rd60, %r127, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; $L__BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra $L__BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f89, %f19; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r130, %r69, -1; min.s32 %r167, %r47, %r130; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r128, %r47, %r69; add.s32 %r129, %r128, %r69; rem.s32 %r167, %r129, %r69; $L__BB0_61: mad.lo.s32 %r131, %r167, %r68, %r2; mad.lo.s32 %r132, %r131, %r67, %r1; 
mul.wide.s32 %rd62, %r132, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: max.s32 %r168, %r51, 0; bra.uni $L__BB0_64; $L__BB0_62: rem.s32 %r133, %r51, %r69; add.s32 %r134, %r133, %r69; rem.s32 %r168, %r134, %r69; $L__BB0_64: mad.lo.s32 %r135, %r168, %r68, %r2; mad.lo.s32 %r136, %r135, %r67, %r1; mul.wide.s32 %rd64, %r136, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: add.s32 %r139, %r69, -1; min.s32 %r169, %r47, %r139; bra.uni $L__BB0_67; $L__BB0_65: rem.s32 %r137, %r47, %r69; add.s32 %r138, %r137, %r69; rem.s32 %r169, %r138, %r69; $L__BB0_67: mad.lo.s32 %r140, %r169, %r68, %r2; mad.lo.s32 %r141, %r140, %r67, %r1; mul.wide.s32 %rd66, %r141, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra $L__BB0_69; bra.uni $L__BB0_68; $L__BB0_69: max.s32 %r170, %r51, 0; bra.uni $L__BB0_70; $L__BB0_68: rem.s32 %r142, %r51, %r69; add.s32 %r143, %r142, %r69; rem.s32 %r170, %r143, %r69; $L__BB0_70: mad.lo.s32 %r144, %r170, %r68, %r2; mad.lo.s32 %r145, %r144, %r67, %r1; mul.wide.s32 %rd68, %r145, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: add.s32 %r148, %r69, -1; min.s32 %r171, %r47, %r148; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r146, %r47, %r69; add.s32 %r147, %r146, %r69; rem.s32 %r171, %r147, %r69; $L__BB0_73: mad.lo.s32 %r149, %r171, %r68, %r2; mad.lo.s32 %r150, %r149, %r67, %r1; mul.wide.s32 %rd70, %r150, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra $L__BB0_75; bra.uni $L__BB0_74; $L__BB0_75: max.s32 %r172, %r51, 0; bra.uni $L__BB0_76; $L__BB0_74: rem.s32 %r151, %r51, %r69; add.s32 %r152, %r151, %r69; rem.s32 %r172, %r152, %r69; $L__BB0_76: mad.lo.s32 %r153, %r172, %r68, %r2; mad.lo.s32 %r154, %r153, %r67, %r1; mul.wide.s32 %rd72, %r154, 4; 
add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; $L__BB0_77: mul.wide.s32 %rd74, %r6, 4; add.s64 %rd75, %rd3, %rd74; add.s64 %rd76, %rd2, %rd74; add.s64 %rd77, %rd1, %rd74; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd76]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd77]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd75]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; cvta.to.global.u64 %rd78, %rd4; add.s64 %rd79, %rd78, %rd74; ld.global.f32 %f126, [%rd79]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd79], %f127; cvta.to.global.u64 %rd80, %rd5; add.s64 %rd81, %rd80, %rd74; ld.global.f32 %f128, [%rd81]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd81], %f129; cvta.to.global.u64 %rd82, %rd6; add.s64 %rd83, %rd82, %rd74; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; $L__BB0_78: ret; } ` addzhanglitorque2_ptx_75 = ` .version 8.5 .target sm_75 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 
addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<5>; .reg .f32 %f<149>; .reg .b32 %r<173>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, 
[addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r71, %r70, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r74, %r73, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r77, %r76, %r78; setp.ge.s32 %p1, %r1, %r67; setp.ge.s32 %p2, %r2, %r68; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; $L__BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; $L__BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; $L__BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra $L__BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; $L__BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; 
mov.f32 %f136, 0f00000000; @%p10 bra $L__BB0_11; rcp.rn.f32 %f136, %f135; $L__BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; $L__BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; $L__BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; $L__BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f143, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra $L__BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r82, %r67, -1; min.s32 %r155, %r7, %r82; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r155, %r81, %r67; $L__BB0_21: add.s32 %r83, %r155, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra $L__BB0_23; bra.uni $L__BB0_22; $L__BB0_23: max.s32 %r156, %r11, 0; bra.uni $L__BB0_24; $L__BB0_22: rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r156, %r85, %r67; $L__BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f79, %f20; add.s32 %r86, %r156, %r5; mul.wide.s32 %rd40, %r86, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra $L__BB0_26; bra.uni $L__BB0_25; 
$L__BB0_26: add.s32 %r89, %r67, -1; min.s32 %r157, %r7, %r89; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r87, %r7, %r67; add.s32 %r88, %r87, %r67; rem.s32 %r157, %r88, %r67; $L__BB0_27: add.s32 %r90, %r157, %r5; mul.wide.s32 %rd42, %r90, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: max.s32 %r158, %r11, 0; bra.uni $L__BB0_30; $L__BB0_28: rem.s32 %r91, %r11, %r67; add.s32 %r92, %r91, %r67; rem.s32 %r158, %r92, %r67; $L__BB0_30: add.s32 %r93, %r158, %r5; mul.wide.s32 %rd44, %r93, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r96, %r67, -1; min.s32 %r159, %r7, %r96; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r94, %r7, %r67; add.s32 %r95, %r94, %r67; rem.s32 %r159, %r95, %r67; $L__BB0_33: add.s32 %r97, %r159, %r5; mul.wide.s32 %rd46, %r97, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r160, %r11, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r98, %r11, %r67; add.s32 %r99, %r98, %r67; rem.s32 %r160, %r99, %r67; $L__BB0_36: add.s32 %r100, %r160, %r5; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; $L__BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra $L__BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra $L__BB0_40; bra.uni $L__BB0_39; $L__BB0_40: add.s32 %r103, %r68, -1; min.s32 %r161, %r27, %r103; bra.uni $L__BB0_41; $L__BB0_39: rem.s32 %r101, %r27, %r68; add.s32 %r102, %r101, %r68; rem.s32 %r161, %r102, %r68; $L__BB0_41: add.s32 %r104, %r161, %r4; mad.lo.s32 %r105, %r104, %r67, %r1; mul.wide.s32 %rd50, %r105, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, 
-1; @%p22 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r162, %r31, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r106, %r31, %r68; add.s32 %r107, %r106, %r68; rem.s32 %r162, %r107, %r68; $L__BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f84, %f18; add.s32 %r108, %r162, %r4; mad.lo.s32 %r109, %r108, %r67, %r1; mul.wide.s32 %rd52, %r109, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra $L__BB0_46; bra.uni $L__BB0_45; $L__BB0_46: add.s32 %r112, %r68, -1; min.s32 %r163, %r27, %r112; bra.uni $L__BB0_47; $L__BB0_45: rem.s32 %r110, %r27, %r68; add.s32 %r111, %r110, %r68; rem.s32 %r163, %r111, %r68; $L__BB0_47: add.s32 %r113, %r163, %r4; mad.lo.s32 %r114, %r113, %r67, %r1; mul.wide.s32 %rd54, %r114, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r164, %r31, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r115, %r31, %r68; add.s32 %r116, %r115, %r68; rem.s32 %r164, %r116, %r68; $L__BB0_50: add.s32 %r117, %r164, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r121, %r68, -1; min.s32 %r165, %r27, %r121; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r119, %r27, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r165, %r120, %r68; $L__BB0_53: add.s32 %r122, %r165, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd58, %r123, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra $L__BB0_55; bra.uni $L__BB0_54; $L__BB0_55: max.s32 %r166, %r31, 0; bra.uni $L__BB0_56; $L__BB0_54: rem.s32 %r124, %r31, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r166, %r125, %r68; $L__BB0_56: add.s32 %r126, %r166, %r4; mad.lo.s32 %r127, %r126, %r67, %r1; mul.wide.s32 %rd60, %r127, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 
%f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; $L__BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra $L__BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f89, %f19; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r130, %r69, -1; min.s32 %r167, %r47, %r130; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r128, %r47, %r69; add.s32 %r129, %r128, %r69; rem.s32 %r167, %r129, %r69; $L__BB0_61: mad.lo.s32 %r131, %r167, %r68, %r2; mad.lo.s32 %r132, %r131, %r67, %r1; mul.wide.s32 %rd62, %r132, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: max.s32 %r168, %r51, 0; bra.uni $L__BB0_64; $L__BB0_62: rem.s32 %r133, %r51, %r69; add.s32 %r134, %r133, %r69; rem.s32 %r168, %r134, %r69; $L__BB0_64: mad.lo.s32 %r135, %r168, %r68, %r2; mad.lo.s32 %r136, %r135, %r67, %r1; mul.wide.s32 %rd64, %r136, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: add.s32 %r139, %r69, -1; min.s32 %r169, %r47, %r139; bra.uni $L__BB0_67; $L__BB0_65: rem.s32 %r137, %r47, %r69; add.s32 %r138, %r137, %r69; rem.s32 %r169, %r138, %r69; $L__BB0_67: mad.lo.s32 %r140, %r169, %r68, %r2; mad.lo.s32 %r141, %r140, %r67, %r1; mul.wide.s32 %rd66, %r141, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra $L__BB0_69; bra.uni $L__BB0_68; $L__BB0_69: max.s32 %r170, %r51, 0; bra.uni $L__BB0_70; $L__BB0_68: rem.s32 %r142, %r51, %r69; add.s32 %r143, %r142, %r69; rem.s32 %r170, %r143, %r69; $L__BB0_70: mad.lo.s32 %r144, %r170, %r68, %r2; mad.lo.s32 %r145, %r144, %r67, %r1; mul.wide.s32 %rd68, %r145, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: add.s32 %r148, %r69, -1; min.s32 %r171, %r47, 
%r148; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r146, %r47, %r69; add.s32 %r147, %r146, %r69; rem.s32 %r171, %r147, %r69; $L__BB0_73: mad.lo.s32 %r149, %r171, %r68, %r2; mad.lo.s32 %r150, %r149, %r67, %r1; mul.wide.s32 %rd70, %r150, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra $L__BB0_75; bra.uni $L__BB0_74; $L__BB0_75: max.s32 %r172, %r51, 0; bra.uni $L__BB0_76; $L__BB0_74: rem.s32 %r151, %r51, %r69; add.s32 %r152, %r151, %r69; rem.s32 %r172, %r152, %r69; $L__BB0_76: mad.lo.s32 %r153, %r172, %r68, %r2; mad.lo.s32 %r154, %r153, %r67, %r1; mul.wide.s32 %rd72, %r154, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; $L__BB0_77: mul.wide.s32 %rd74, %r6, 4; add.s64 %rd75, %rd3, %rd74; add.s64 %rd76, %rd2, %rd74; add.s64 %rd77, %rd1, %rd74; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd76]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd77]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd75]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; cvta.to.global.u64 %rd78, %rd4; add.s64 %rd79, %rd78, %rd74; ld.global.f32 %f126, [%rd79]; 
fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd79], %f127; cvta.to.global.u64 %rd80, %rd5; add.s64 %rd81, %rd80, %rd74; ld.global.f32 %f128, [%rd81]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd81], %f129; cvta.to.global.u64 %rd82, %rd6; add.s64 %rd83, %rd82, %rd74; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; $L__BB0_78: ret; } ` addzhanglitorque2_ptx_80 = ` .version 8.5 .target sm_80 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<5>; .reg .f32 %f<149>; .reg .b32 %r<173>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, 
[addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r71, %r70, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r74, %r73, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r77, %r76, %r78; setp.ge.s32 %p1, %r1, %r67; setp.ge.s32 %p2, %r2, %r68; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; $L__BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 
%rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; $L__BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; $L__BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra $L__BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; $L__BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra $L__BB0_11; rcp.rn.f32 %f136, %f135; $L__BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; $L__BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; $L__BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; $L__BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f143, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra $L__BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r82, %r67, -1; min.s32 %r155, %r7, %r82; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r155, %r81, %r67; 
$L__BB0_21: add.s32 %r83, %r155, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra $L__BB0_23; bra.uni $L__BB0_22; $L__BB0_23: max.s32 %r156, %r11, 0; bra.uni $L__BB0_24; $L__BB0_22: rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r156, %r85, %r67; $L__BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f79, %f20; add.s32 %r86, %r156, %r5; mul.wide.s32 %rd40, %r86, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: add.s32 %r89, %r67, -1; min.s32 %r157, %r7, %r89; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r87, %r7, %r67; add.s32 %r88, %r87, %r67; rem.s32 %r157, %r88, %r67; $L__BB0_27: add.s32 %r90, %r157, %r5; mul.wide.s32 %rd42, %r90, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: max.s32 %r158, %r11, 0; bra.uni $L__BB0_30; $L__BB0_28: rem.s32 %r91, %r11, %r67; add.s32 %r92, %r91, %r67; rem.s32 %r158, %r92, %r67; $L__BB0_30: add.s32 %r93, %r158, %r5; mul.wide.s32 %rd44, %r93, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r96, %r67, -1; min.s32 %r159, %r7, %r96; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r94, %r7, %r67; add.s32 %r95, %r94, %r67; rem.s32 %r159, %r95, %r67; $L__BB0_33: add.s32 %r97, %r159, %r5; mul.wide.s32 %rd46, %r97, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r160, %r11, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r98, %r11, %r67; add.s32 %r99, %r98, %r67; rem.s32 %r160, %r99, %r67; $L__BB0_36: add.s32 %r100, %r160, %r5; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; 
fma.rn.f32 %f145, %f22, %f83, 0f00000000; $L__BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra $L__BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra $L__BB0_40; bra.uni $L__BB0_39; $L__BB0_40: add.s32 %r103, %r68, -1; min.s32 %r161, %r27, %r103; bra.uni $L__BB0_41; $L__BB0_39: rem.s32 %r101, %r27, %r68; add.s32 %r102, %r101, %r68; rem.s32 %r161, %r102, %r68; $L__BB0_41: add.s32 %r104, %r161, %r4; mad.lo.s32 %r105, %r104, %r67, %r1; mul.wide.s32 %rd50, %r105, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r162, %r31, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r106, %r31, %r68; add.s32 %r107, %r106, %r68; rem.s32 %r162, %r107, %r68; $L__BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f84, %f18; add.s32 %r108, %r162, %r4; mad.lo.s32 %r109, %r108, %r67, %r1; mul.wide.s32 %rd52, %r109, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra $L__BB0_46; bra.uni $L__BB0_45; $L__BB0_46: add.s32 %r112, %r68, -1; min.s32 %r163, %r27, %r112; bra.uni $L__BB0_47; $L__BB0_45: rem.s32 %r110, %r27, %r68; add.s32 %r111, %r110, %r68; rem.s32 %r163, %r111, %r68; $L__BB0_47: add.s32 %r113, %r163, %r4; mad.lo.s32 %r114, %r113, %r67, %r1; mul.wide.s32 %rd54, %r114, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r164, %r31, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r115, %r31, %r68; add.s32 %r116, %r115, %r68; rem.s32 %r164, %r116, %r68; $L__BB0_50: add.s32 %r117, %r164, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r121, %r68, -1; min.s32 %r165, %r27, %r121; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r119, %r27, %r68; add.s32 %r120, %r119, %r68; 
rem.s32 %r165, %r120, %r68; $L__BB0_53: add.s32 %r122, %r165, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd58, %r123, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra $L__BB0_55; bra.uni $L__BB0_54; $L__BB0_55: max.s32 %r166, %r31, 0; bra.uni $L__BB0_56; $L__BB0_54: rem.s32 %r124, %r31, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r166, %r125, %r68; $L__BB0_56: add.s32 %r126, %r166, %r4; mad.lo.s32 %r127, %r126, %r67, %r1; mul.wide.s32 %rd60, %r127, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; $L__BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra $L__BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f89, %f19; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r130, %r69, -1; min.s32 %r167, %r47, %r130; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r128, %r47, %r69; add.s32 %r129, %r128, %r69; rem.s32 %r167, %r129, %r69; $L__BB0_61: mad.lo.s32 %r131, %r167, %r68, %r2; mad.lo.s32 %r132, %r131, %r67, %r1; mul.wide.s32 %rd62, %r132, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: max.s32 %r168, %r51, 0; bra.uni $L__BB0_64; $L__BB0_62: rem.s32 %r133, %r51, %r69; add.s32 %r134, %r133, %r69; rem.s32 %r168, %r134, %r69; $L__BB0_64: mad.lo.s32 %r135, %r168, %r68, %r2; mad.lo.s32 %r136, %r135, %r67, %r1; mul.wide.s32 %rd64, %r136, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: add.s32 %r139, %r69, -1; min.s32 %r169, %r47, %r139; bra.uni $L__BB0_67; $L__BB0_65: rem.s32 %r137, %r47, %r69; add.s32 %r138, %r137, %r69; rem.s32 %r169, %r138, %r69; $L__BB0_67: mad.lo.s32 %r140, %r169, %r68, %r2; mad.lo.s32 %r141, %r140, %r67, 
%r1; mul.wide.s32 %rd66, %r141, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra $L__BB0_69; bra.uni $L__BB0_68; $L__BB0_69: max.s32 %r170, %r51, 0; bra.uni $L__BB0_70; $L__BB0_68: rem.s32 %r142, %r51, %r69; add.s32 %r143, %r142, %r69; rem.s32 %r170, %r143, %r69; $L__BB0_70: mad.lo.s32 %r144, %r170, %r68, %r2; mad.lo.s32 %r145, %r144, %r67, %r1; mul.wide.s32 %rd68, %r145, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: add.s32 %r148, %r69, -1; min.s32 %r171, %r47, %r148; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r146, %r47, %r69; add.s32 %r147, %r146, %r69; rem.s32 %r171, %r147, %r69; $L__BB0_73: mad.lo.s32 %r149, %r171, %r68, %r2; mad.lo.s32 %r150, %r149, %r67, %r1; mul.wide.s32 %rd70, %r150, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra $L__BB0_75; bra.uni $L__BB0_74; $L__BB0_75: max.s32 %r172, %r51, 0; bra.uni $L__BB0_76; $L__BB0_74: rem.s32 %r151, %r51, %r69; add.s32 %r152, %r151, %r69; rem.s32 %r172, %r152, %r69; $L__BB0_76: mad.lo.s32 %r153, %r172, %r68, %r2; mad.lo.s32 %r154, %r153, %r67, %r1; mul.wide.s32 %rd72, %r154, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; $L__BB0_77: mul.wide.s32 %rd74, %r6, 4; add.s64 %rd75, %rd3, %rd74; add.s64 %rd76, %rd2, %rd74; add.s64 %rd77, %rd1, %rd74; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd76]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd77]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd75]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, 
%f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; cvta.to.global.u64 %rd78, %rd4; add.s64 %rd79, %rd78, %rd74; ld.global.f32 %f126, [%rd79]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd79], %f127; cvta.to.global.u64 %rd80, %rd5; add.s64 %rd81, %rd80, %rd74; ld.global.f32 %f128, [%rd81]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd81], %f129; cvta.to.global.u64 %rd82, %rd6; add.s64 %rd83, %rd82, %rd74; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; $L__BB0_78: ret; } ` addzhanglitorque2_ptx_86 = ` .version 8.5 .target sm_86 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 
addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<5>; .reg .f32 %f<149>; .reg .b32 %r<173>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r71, %r70, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r74, %r73, %r75; mov.u32 %r76, %ntid.z; 
mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r77, %r76, %r78; setp.ge.s32 %p1, %r1, %r67; setp.ge.s32 %p2, %r2, %r68; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; $L__BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; $L__BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; $L__BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra $L__BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; $L__BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra $L__BB0_11; rcp.rn.f32 %f136, %f135; $L__BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; $L__BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; $L__BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, 
%r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; $L__BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f143, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra $L__BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r82, %r67, -1; min.s32 %r155, %r7, %r82; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r155, %r81, %r67; $L__BB0_21: add.s32 %r83, %r155, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra $L__BB0_23; bra.uni $L__BB0_22; $L__BB0_23: max.s32 %r156, %r11, 0; bra.uni $L__BB0_24; $L__BB0_22: rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r156, %r85, %r67; $L__BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f79, %f20; add.s32 %r86, %r156, %r5; mul.wide.s32 %rd40, %r86, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: add.s32 %r89, %r67, -1; min.s32 %r157, %r7, %r89; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r87, %r7, %r67; add.s32 %r88, %r87, %r67; rem.s32 %r157, %r88, %r67; $L__BB0_27: add.s32 %r90, %r157, %r5; mul.wide.s32 %rd42, %r90, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: max.s32 %r158, %r11, 0; bra.uni $L__BB0_30; $L__BB0_28: rem.s32 %r91, %r11, %r67; add.s32 %r92, %r91, %r67; rem.s32 %r158, %r92, %r67; $L__BB0_30: add.s32 %r93, %r158, %r5; mul.wide.s32 %rd44, %r93, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r96, %r67, -1; min.s32 %r159, %r7, %r96; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r94, %r7, %r67; 
add.s32 %r95, %r94, %r67; rem.s32 %r159, %r95, %r67; $L__BB0_33: add.s32 %r97, %r159, %r5; mul.wide.s32 %rd46, %r97, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r160, %r11, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r98, %r11, %r67; add.s32 %r99, %r98, %r67; rem.s32 %r160, %r99, %r67; $L__BB0_36: add.s32 %r100, %r160, %r5; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; $L__BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra $L__BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra $L__BB0_40; bra.uni $L__BB0_39; $L__BB0_40: add.s32 %r103, %r68, -1; min.s32 %r161, %r27, %r103; bra.uni $L__BB0_41; $L__BB0_39: rem.s32 %r101, %r27, %r68; add.s32 %r102, %r101, %r68; rem.s32 %r161, %r102, %r68; $L__BB0_41: add.s32 %r104, %r161, %r4; mad.lo.s32 %r105, %r104, %r67, %r1; mul.wide.s32 %rd50, %r105, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r162, %r31, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r106, %r31, %r68; add.s32 %r107, %r106, %r68; rem.s32 %r162, %r107, %r68; $L__BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f84, %f18; add.s32 %r108, %r162, %r4; mad.lo.s32 %r109, %r108, %r67, %r1; mul.wide.s32 %rd52, %r109, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra $L__BB0_46; bra.uni $L__BB0_45; $L__BB0_46: add.s32 %r112, %r68, -1; min.s32 %r163, %r27, %r112; bra.uni $L__BB0_47; $L__BB0_45: rem.s32 %r110, %r27, %r68; add.s32 %r111, %r110, %r68; rem.s32 %r163, %r111, %r68; $L__BB0_47: add.s32 %r113, %r163, %r4; mad.lo.s32 %r114, %r113, %r67, %r1; mul.wide.s32 %rd54, %r114, 4; add.s64 %rd55, %rd2, %rd54; 
ld.global.nc.f32 %f36, [%rd55]; @%p22 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r164, %r31, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r115, %r31, %r68; add.s32 %r116, %r115, %r68; rem.s32 %r164, %r116, %r68; $L__BB0_50: add.s32 %r117, %r164, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r121, %r68, -1; min.s32 %r165, %r27, %r121; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r119, %r27, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r165, %r120, %r68; $L__BB0_53: add.s32 %r122, %r165, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd58, %r123, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra $L__BB0_55; bra.uni $L__BB0_54; $L__BB0_55: max.s32 %r166, %r31, 0; bra.uni $L__BB0_56; $L__BB0_54: rem.s32 %r124, %r31, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r166, %r125, %r68; $L__BB0_56: add.s32 %r126, %r166, %r4; mad.lo.s32 %r127, %r126, %r67, %r1; mul.wide.s32 %rd60, %r127, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; $L__BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra $L__BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f89, %f19; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r130, %r69, -1; min.s32 %r167, %r47, %r130; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r128, %r47, %r69; add.s32 %r129, %r128, %r69; rem.s32 %r167, %r129, %r69; $L__BB0_61: mad.lo.s32 %r131, %r167, %r68, %r2; mad.lo.s32 %r132, %r131, %r67, %r1; mul.wide.s32 %rd62, %r132, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: max.s32 %r168, %r51, 0; bra.uni 
$L__BB0_64; $L__BB0_62: rem.s32 %r133, %r51, %r69; add.s32 %r134, %r133, %r69; rem.s32 %r168, %r134, %r69; $L__BB0_64: mad.lo.s32 %r135, %r168, %r68, %r2; mad.lo.s32 %r136, %r135, %r67, %r1; mul.wide.s32 %rd64, %r136, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: add.s32 %r139, %r69, -1; min.s32 %r169, %r47, %r139; bra.uni $L__BB0_67; $L__BB0_65: rem.s32 %r137, %r47, %r69; add.s32 %r138, %r137, %r69; rem.s32 %r169, %r138, %r69; $L__BB0_67: mad.lo.s32 %r140, %r169, %r68, %r2; mad.lo.s32 %r141, %r140, %r67, %r1; mul.wide.s32 %rd66, %r141, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra $L__BB0_69; bra.uni $L__BB0_68; $L__BB0_69: max.s32 %r170, %r51, 0; bra.uni $L__BB0_70; $L__BB0_68: rem.s32 %r142, %r51, %r69; add.s32 %r143, %r142, %r69; rem.s32 %r170, %r143, %r69; $L__BB0_70: mad.lo.s32 %r144, %r170, %r68, %r2; mad.lo.s32 %r145, %r144, %r67, %r1; mul.wide.s32 %rd68, %r145, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: add.s32 %r148, %r69, -1; min.s32 %r171, %r47, %r148; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r146, %r47, %r69; add.s32 %r147, %r146, %r69; rem.s32 %r171, %r147, %r69; $L__BB0_73: mad.lo.s32 %r149, %r171, %r68, %r2; mad.lo.s32 %r150, %r149, %r67, %r1; mul.wide.s32 %rd70, %r150, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra $L__BB0_75; bra.uni $L__BB0_74; $L__BB0_75: max.s32 %r172, %r51, 0; bra.uni $L__BB0_76; $L__BB0_74: rem.s32 %r151, %r51, %r69; add.s32 %r152, %r151, %r69; rem.s32 %r172, %r152, %r69; $L__BB0_76: mad.lo.s32 %r153, %r172, %r68, %r2; mad.lo.s32 %r154, %r153, %r67, %r1; mul.wide.s32 %rd72, %r154, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; 
$L__BB0_77: mul.wide.s32 %rd74, %r6, 4; add.s64 %rd75, %rd3, %rd74; add.s64 %rd76, %rd2, %rd74; add.s64 %rd77, %rd1, %rd74; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd76]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd77]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd75]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; cvta.to.global.u64 %rd78, %rd4; add.s64 %rd79, %rd78, %rd74; ld.global.f32 %f126, [%rd79]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd79], %f127; cvta.to.global.u64 %rd80, %rd5; add.s64 %rd81, %rd80, %rd74; ld.global.f32 %f128, [%rd81]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd81], %f129; cvta.to.global.u64 %rd82, %rd6; add.s64 %rd83, %rd82, %rd74; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; $L__BB0_78: ret; } ` addzhanglitorque2_ptx_87 = ` .version 8.5 .target sm_87 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 
addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<5>; .reg .f32 %f<149>; .reg .b32 %r<173>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, 
[addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r71, %r70, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r74, %r73, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r77, %r76, %r78; setp.ge.s32 %p1, %r1, %r67; setp.ge.s32 %p2, %r2, %r68; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; $L__BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; $L__BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; $L__BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra $L__BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; $L__BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra $L__BB0_11; rcp.rn.f32 %f136, %f135; $L__BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; 
cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; $L__BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; $L__BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; $L__BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f143, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra $L__BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r82, %r67, -1; min.s32 %r155, %r7, %r82; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r155, %r81, %r67; $L__BB0_21: add.s32 %r83, %r155, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra $L__BB0_23; bra.uni $L__BB0_22; $L__BB0_23: max.s32 %r156, %r11, 0; bra.uni $L__BB0_24; $L__BB0_22: rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r156, %r85, %r67; $L__BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f79, %f20; add.s32 %r86, %r156, %r5; mul.wide.s32 %rd40, %r86, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: add.s32 %r89, %r67, -1; min.s32 %r157, %r7, %r89; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r87, %r7, %r67; add.s32 %r88, %r87, %r67; rem.s32 %r157, %r88, %r67; $L__BB0_27: add.s32 %r90, 
%r157, %r5; mul.wide.s32 %rd42, %r90, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: max.s32 %r158, %r11, 0; bra.uni $L__BB0_30; $L__BB0_28: rem.s32 %r91, %r11, %r67; add.s32 %r92, %r91, %r67; rem.s32 %r158, %r92, %r67; $L__BB0_30: add.s32 %r93, %r158, %r5; mul.wide.s32 %rd44, %r93, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r96, %r67, -1; min.s32 %r159, %r7, %r96; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r94, %r7, %r67; add.s32 %r95, %r94, %r67; rem.s32 %r159, %r95, %r67; $L__BB0_33: add.s32 %r97, %r159, %r5; mul.wide.s32 %rd46, %r97, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r160, %r11, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r98, %r11, %r67; add.s32 %r99, %r98, %r67; rem.s32 %r160, %r99, %r67; $L__BB0_36: add.s32 %r100, %r160, %r5; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; $L__BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra $L__BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra $L__BB0_40; bra.uni $L__BB0_39; $L__BB0_40: add.s32 %r103, %r68, -1; min.s32 %r161, %r27, %r103; bra.uni $L__BB0_41; $L__BB0_39: rem.s32 %r101, %r27, %r68; add.s32 %r102, %r101, %r68; rem.s32 %r161, %r102, %r68; $L__BB0_41: add.s32 %r104, %r161, %r4; mad.lo.s32 %r105, %r104, %r67, %r1; mul.wide.s32 %rd50, %r105, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r162, %r31, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r106, %r31, %r68; add.s32 %r107, %r106, %r68; rem.s32 %r162, %r107, %r68; 
$L__BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f84, %f18; add.s32 %r108, %r162, %r4; mad.lo.s32 %r109, %r108, %r67, %r1; mul.wide.s32 %rd52, %r109, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra $L__BB0_46; bra.uni $L__BB0_45; $L__BB0_46: add.s32 %r112, %r68, -1; min.s32 %r163, %r27, %r112; bra.uni $L__BB0_47; $L__BB0_45: rem.s32 %r110, %r27, %r68; add.s32 %r111, %r110, %r68; rem.s32 %r163, %r111, %r68; $L__BB0_47: add.s32 %r113, %r163, %r4; mad.lo.s32 %r114, %r113, %r67, %r1; mul.wide.s32 %rd54, %r114, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r164, %r31, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r115, %r31, %r68; add.s32 %r116, %r115, %r68; rem.s32 %r164, %r116, %r68; $L__BB0_50: add.s32 %r117, %r164, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r121, %r68, -1; min.s32 %r165, %r27, %r121; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r119, %r27, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r165, %r120, %r68; $L__BB0_53: add.s32 %r122, %r165, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd58, %r123, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra $L__BB0_55; bra.uni $L__BB0_54; $L__BB0_55: max.s32 %r166, %r31, 0; bra.uni $L__BB0_56; $L__BB0_54: rem.s32 %r124, %r31, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r166, %r125, %r68; $L__BB0_56: add.s32 %r126, %r166, %r4; mad.lo.s32 %r127, %r126, %r67, %r1; mul.wide.s32 %rd60, %r127, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; $L__BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra $L__BB0_77; div.rn.f32 %f89, %f11, %f66; 
mul.f32 %f45, %f89, %f19; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r130, %r69, -1; min.s32 %r167, %r47, %r130; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r128, %r47, %r69; add.s32 %r129, %r128, %r69; rem.s32 %r167, %r129, %r69; $L__BB0_61: mad.lo.s32 %r131, %r167, %r68, %r2; mad.lo.s32 %r132, %r131, %r67, %r1; mul.wide.s32 %rd62, %r132, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: max.s32 %r168, %r51, 0; bra.uni $L__BB0_64; $L__BB0_62: rem.s32 %r133, %r51, %r69; add.s32 %r134, %r133, %r69; rem.s32 %r168, %r134, %r69; $L__BB0_64: mad.lo.s32 %r135, %r168, %r68, %r2; mad.lo.s32 %r136, %r135, %r67, %r1; mul.wide.s32 %rd64, %r136, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: add.s32 %r139, %r69, -1; min.s32 %r169, %r47, %r139; bra.uni $L__BB0_67; $L__BB0_65: rem.s32 %r137, %r47, %r69; add.s32 %r138, %r137, %r69; rem.s32 %r169, %r138, %r69; $L__BB0_67: mad.lo.s32 %r140, %r169, %r68, %r2; mad.lo.s32 %r141, %r140, %r67, %r1; mul.wide.s32 %rd66, %r141, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra $L__BB0_69; bra.uni $L__BB0_68; $L__BB0_69: max.s32 %r170, %r51, 0; bra.uni $L__BB0_70; $L__BB0_68: rem.s32 %r142, %r51, %r69; add.s32 %r143, %r142, %r69; rem.s32 %r170, %r143, %r69; $L__BB0_70: mad.lo.s32 %r144, %r170, %r68, %r2; mad.lo.s32 %r145, %r144, %r67, %r1; mul.wide.s32 %rd68, %r145, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: add.s32 %r148, %r69, -1; min.s32 %r171, %r47, %r148; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r146, %r47, %r69; add.s32 %r147, %r146, %r69; rem.s32 %r171, %r147, %r69; $L__BB0_73: mad.lo.s32 %r149, %r171, %r68, %r2; mad.lo.s32 %r150, %r149, %r67, 
%r1; mul.wide.s32 %rd70, %r150, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra $L__BB0_75; bra.uni $L__BB0_74; $L__BB0_75: max.s32 %r172, %r51, 0; bra.uni $L__BB0_76; $L__BB0_74: rem.s32 %r151, %r51, %r69; add.s32 %r152, %r151, %r69; rem.s32 %r172, %r152, %r69; $L__BB0_76: mad.lo.s32 %r153, %r172, %r68, %r2; mad.lo.s32 %r154, %r153, %r67, %r1; mul.wide.s32 %rd72, %r154, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; $L__BB0_77: mul.wide.s32 %rd74, %r6, 4; add.s64 %rd75, %rd3, %rd74; add.s64 %rd76, %rd2, %rd74; add.s64 %rd77, %rd1, %rd74; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd76]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd77]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd75]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; cvta.to.global.u64 %rd78, %rd4; add.s64 %rd79, %rd78, %rd74; ld.global.f32 %f126, [%rd79]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd79], %f127; cvta.to.global.u64 %rd80, %rd5; add.s64 %rd81, %rd80, %rd74; ld.global.f32 %f128, [%rd81]; fma.rn.f32 %f129, %f96, %f124, %f128; 
st.global.f32 [%rd81], %f129; cvta.to.global.u64 %rd82, %rd6; add.s64 %rd83, %rd82, %rd74; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; $L__BB0_78: ret; } ` addzhanglitorque2_ptx_89 = ` .version 8.5 .target sm_89 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<5>; .reg .f32 %f<149>; .reg .b32 %r<173>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, 
[addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r71, %r70, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r74, %r73, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r77, %r76, %r78; setp.ge.s32 %p1, %r1, %r67; setp.ge.s32 %p2, %r2, %r68; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra $L__BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; $L__BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; $L__BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_7; 
cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; $L__BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra $L__BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; $L__BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra $L__BB0_11; rcp.rn.f32 %f136, %f135; $L__BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; $L__BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; $L__BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; $L__BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f143, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra $L__BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r82, %r67, -1; min.s32 %r155, %r7, %r82; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r155, %r81, %r67; $L__BB0_21: add.s32 %r83, %r155, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra $L__BB0_23; bra.uni $L__BB0_22; 
$L__BB0_23: max.s32 %r156, %r11, 0; bra.uni $L__BB0_24; $L__BB0_22: rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r156, %r85, %r67; $L__BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f79, %f20; add.s32 %r86, %r156, %r5; mul.wide.s32 %rd40, %r86, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: add.s32 %r89, %r67, -1; min.s32 %r157, %r7, %r89; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r87, %r7, %r67; add.s32 %r88, %r87, %r67; rem.s32 %r157, %r88, %r67; $L__BB0_27: add.s32 %r90, %r157, %r5; mul.wide.s32 %rd42, %r90, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: max.s32 %r158, %r11, 0; bra.uni $L__BB0_30; $L__BB0_28: rem.s32 %r91, %r11, %r67; add.s32 %r92, %r91, %r67; rem.s32 %r158, %r92, %r67; $L__BB0_30: add.s32 %r93, %r158, %r5; mul.wide.s32 %rd44, %r93, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r96, %r67, -1; min.s32 %r159, %r7, %r96; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r94, %r7, %r67; add.s32 %r95, %r94, %r67; rem.s32 %r159, %r95, %r67; $L__BB0_33: add.s32 %r97, %r159, %r5; mul.wide.s32 %rd46, %r97, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra $L__BB0_35; bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r160, %r11, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r98, %r11, %r67; add.s32 %r99, %r98, %r67; rem.s32 %r160, %r99, %r67; $L__BB0_36: add.s32 %r100, %r160, %r5; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; $L__BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra $L__BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 
bra $L__BB0_40; bra.uni $L__BB0_39; $L__BB0_40: add.s32 %r103, %r68, -1; min.s32 %r161, %r27, %r103; bra.uni $L__BB0_41; $L__BB0_39: rem.s32 %r101, %r27, %r68; add.s32 %r102, %r101, %r68; rem.s32 %r161, %r102, %r68; $L__BB0_41: add.s32 %r104, %r161, %r4; mad.lo.s32 %r105, %r104, %r67, %r1; mul.wide.s32 %rd50, %r105, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r162, %r31, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r106, %r31, %r68; add.s32 %r107, %r106, %r68; rem.s32 %r162, %r107, %r68; $L__BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f84, %f18; add.s32 %r108, %r162, %r4; mad.lo.s32 %r109, %r108, %r67, %r1; mul.wide.s32 %rd52, %r109, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra $L__BB0_46; bra.uni $L__BB0_45; $L__BB0_46: add.s32 %r112, %r68, -1; min.s32 %r163, %r27, %r112; bra.uni $L__BB0_47; $L__BB0_45: rem.s32 %r110, %r27, %r68; add.s32 %r111, %r110, %r68; rem.s32 %r163, %r111, %r68; $L__BB0_47: add.s32 %r113, %r163, %r4; mad.lo.s32 %r114, %r113, %r67, %r1; mul.wide.s32 %rd54, %r114, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r164, %r31, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r115, %r31, %r68; add.s32 %r116, %r115, %r68; rem.s32 %r164, %r116, %r68; $L__BB0_50: add.s32 %r117, %r164, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r121, %r68, -1; min.s32 %r165, %r27, %r121; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r119, %r27, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r165, %r120, %r68; $L__BB0_53: add.s32 %r122, %r165, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd58, %r123, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, 
[%rd59]; @%p22 bra $L__BB0_55; bra.uni $L__BB0_54; $L__BB0_55: max.s32 %r166, %r31, 0; bra.uni $L__BB0_56; $L__BB0_54: rem.s32 %r124, %r31, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r166, %r125, %r68; $L__BB0_56: add.s32 %r126, %r166, %r4; mad.lo.s32 %r127, %r126, %r67, %r1; mul.wide.s32 %rd60, %r127, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; $L__BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra $L__BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f89, %f19; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r130, %r69, -1; min.s32 %r167, %r47, %r130; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r128, %r47, %r69; add.s32 %r129, %r128, %r69; rem.s32 %r167, %r129, %r69; $L__BB0_61: mad.lo.s32 %r131, %r167, %r68, %r2; mad.lo.s32 %r132, %r131, %r67, %r1; mul.wide.s32 %rd62, %r132, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: max.s32 %r168, %r51, 0; bra.uni $L__BB0_64; $L__BB0_62: rem.s32 %r133, %r51, %r69; add.s32 %r134, %r133, %r69; rem.s32 %r168, %r134, %r69; $L__BB0_64: mad.lo.s32 %r135, %r168, %r68, %r2; mad.lo.s32 %r136, %r135, %r67, %r1; mul.wide.s32 %rd64, %r136, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: add.s32 %r139, %r69, -1; min.s32 %r169, %r47, %r139; bra.uni $L__BB0_67; $L__BB0_65: rem.s32 %r137, %r47, %r69; add.s32 %r138, %r137, %r69; rem.s32 %r169, %r138, %r69; $L__BB0_67: mad.lo.s32 %r140, %r169, %r68, %r2; mad.lo.s32 %r141, %r140, %r67, %r1; mul.wide.s32 %rd66, %r141, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra $L__BB0_69; bra.uni $L__BB0_68; $L__BB0_69: max.s32 %r170, %r51, 0; bra.uni 
$L__BB0_70; $L__BB0_68: rem.s32 %r142, %r51, %r69; add.s32 %r143, %r142, %r69; rem.s32 %r170, %r143, %r69; $L__BB0_70: mad.lo.s32 %r144, %r170, %r68, %r2; mad.lo.s32 %r145, %r144, %r67, %r1; mul.wide.s32 %rd68, %r145, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: add.s32 %r148, %r69, -1; min.s32 %r171, %r47, %r148; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r146, %r47, %r69; add.s32 %r147, %r146, %r69; rem.s32 %r171, %r147, %r69; $L__BB0_73: mad.lo.s32 %r149, %r171, %r68, %r2; mad.lo.s32 %r150, %r149, %r67, %r1; mul.wide.s32 %rd70, %r150, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra $L__BB0_75; bra.uni $L__BB0_74; $L__BB0_75: max.s32 %r172, %r51, 0; bra.uni $L__BB0_76; $L__BB0_74: rem.s32 %r151, %r51, %r69; add.s32 %r152, %r151, %r69; rem.s32 %r172, %r152, %r69; $L__BB0_76: mad.lo.s32 %r153, %r172, %r68, %r2; mad.lo.s32 %r154, %r153, %r67, %r1; mul.wide.s32 %rd72, %r154, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; $L__BB0_77: mul.wide.s32 %rd74, %r6, 4; add.s64 %rd75, %rd3, %rd74; add.s64 %rd76, %rd2, %rd74; add.s64 %rd77, %rd1, %rd74; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd76]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd77]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd75]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; 
mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; cvta.to.global.u64 %rd78, %rd4; add.s64 %rd79, %rd78, %rd74; ld.global.f32 %f126, [%rd79]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd79], %f127; cvta.to.global.u64 %rd80, %rd5; add.s64 %rd81, %rd80, %rd74; ld.global.f32 %f128, [%rd81]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd81], %f129; cvta.to.global.u64 %rd82, %rd6; add.s64 %rd83, %rd82, %rd74; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; $L__BB0_78: ret; } ` addzhanglitorque2_ptx_90 = ` .version 8.5 .target sm_90 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 
addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<5>; .reg .f32 %f<149>; .reg .b32 %r<173>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r71, %r70, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r74, %r73, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r77, %r76, %r78; setp.ge.s32 %p1, %r1, %r67; setp.ge.s32 %p2, %r2, %r68; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, 
%p4; @%p5 bra $L__BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra $L__BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; $L__BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra $L__BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; $L__BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra $L__BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; $L__BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra $L__BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; $L__BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra $L__BB0_11; rcp.rn.f32 %f136, %f135; $L__BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra $L__BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; $L__BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra $L__BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; $L__BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra $L__BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; $L__BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; setp.eq.f32 
%p14, %f20, 0f00000000; mov.f32 %f143, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra $L__BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: add.s32 %r82, %r67, -1; min.s32 %r155, %r7, %r82; bra.uni $L__BB0_21; $L__BB0_19: rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r155, %r81, %r67; $L__BB0_21: add.s32 %r83, %r155, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra $L__BB0_23; bra.uni $L__BB0_22; $L__BB0_23: max.s32 %r156, %r11, 0; bra.uni $L__BB0_24; $L__BB0_22: rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r156, %r85, %r67; $L__BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f79, %f20; add.s32 %r86, %r156, %r5; mul.wide.s32 %rd40, %r86, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: add.s32 %r89, %r67, -1; min.s32 %r157, %r7, %r89; bra.uni $L__BB0_27; $L__BB0_25: rem.s32 %r87, %r7, %r67; add.s32 %r88, %r87, %r67; rem.s32 %r157, %r88, %r67; $L__BB0_27: add.s32 %r90, %r157, %r5; mul.wide.s32 %rd42, %r90, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: max.s32 %r158, %r11, 0; bra.uni $L__BB0_30; $L__BB0_28: rem.s32 %r91, %r11, %r67; add.s32 %r92, %r91, %r67; rem.s32 %r158, %r92, %r67; $L__BB0_30: add.s32 %r93, %r158, %r5; mul.wide.s32 %rd44, %r93, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra $L__BB0_32; bra.uni $L__BB0_31; $L__BB0_32: add.s32 %r96, %r67, -1; min.s32 %r159, %r7, %r96; bra.uni $L__BB0_33; $L__BB0_31: rem.s32 %r94, %r7, %r67; add.s32 %r95, %r94, %r67; rem.s32 %r159, %r95, %r67; $L__BB0_33: add.s32 %r97, %r159, %r5; mul.wide.s32 %rd46, %r97, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra $L__BB0_35; 
bra.uni $L__BB0_34; $L__BB0_35: max.s32 %r160, %r11, 0; bra.uni $L__BB0_36; $L__BB0_34: rem.s32 %r98, %r11, %r67; add.s32 %r99, %r98, %r67; rem.s32 %r160, %r99, %r67; $L__BB0_36: add.s32 %r100, %r160, %r5; mul.wide.s32 %rd48, %r100, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; $L__BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra $L__BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra $L__BB0_40; bra.uni $L__BB0_39; $L__BB0_40: add.s32 %r103, %r68, -1; min.s32 %r161, %r27, %r103; bra.uni $L__BB0_41; $L__BB0_39: rem.s32 %r101, %r27, %r68; add.s32 %r102, %r101, %r68; rem.s32 %r161, %r102, %r68; $L__BB0_41: add.s32 %r104, %r161, %r4; mad.lo.s32 %r105, %r104, %r67, %r1; mul.wide.s32 %rd50, %r105, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra $L__BB0_43; bra.uni $L__BB0_42; $L__BB0_43: max.s32 %r162, %r31, 0; bra.uni $L__BB0_44; $L__BB0_42: rem.s32 %r106, %r31, %r68; add.s32 %r107, %r106, %r68; rem.s32 %r162, %r107, %r68; $L__BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f84, %f18; add.s32 %r108, %r162, %r4; mad.lo.s32 %r109, %r108, %r67, %r1; mul.wide.s32 %rd52, %r109, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra $L__BB0_46; bra.uni $L__BB0_45; $L__BB0_46: add.s32 %r112, %r68, -1; min.s32 %r163, %r27, %r112; bra.uni $L__BB0_47; $L__BB0_45: rem.s32 %r110, %r27, %r68; add.s32 %r111, %r110, %r68; rem.s32 %r163, %r111, %r68; $L__BB0_47: add.s32 %r113, %r163, %r4; mad.lo.s32 %r114, %r113, %r67, %r1; mul.wide.s32 %rd54, %r114, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra $L__BB0_49; bra.uni $L__BB0_48; $L__BB0_49: max.s32 %r164, %r31, 0; bra.uni $L__BB0_50; $L__BB0_48: rem.s32 %r115, %r31, %r68; add.s32 %r116, %r115, %r68; rem.s32 
%r164, %r116, %r68; $L__BB0_50: add.s32 %r117, %r164, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra $L__BB0_52; bra.uni $L__BB0_51; $L__BB0_52: add.s32 %r121, %r68, -1; min.s32 %r165, %r27, %r121; bra.uni $L__BB0_53; $L__BB0_51: rem.s32 %r119, %r27, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r165, %r120, %r68; $L__BB0_53: add.s32 %r122, %r165, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd58, %r123, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra $L__BB0_55; bra.uni $L__BB0_54; $L__BB0_55: max.s32 %r166, %r31, 0; bra.uni $L__BB0_56; $L__BB0_54: rem.s32 %r124, %r31, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r166, %r125, %r68; $L__BB0_56: add.s32 %r126, %r166, %r4; mad.lo.s32 %r127, %r126, %r67, %r1; mul.wide.s32 %rd60, %r127, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; $L__BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra $L__BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f89, %f19; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra $L__BB0_60; bra.uni $L__BB0_59; $L__BB0_60: add.s32 %r130, %r69, -1; min.s32 %r167, %r47, %r130; bra.uni $L__BB0_61; $L__BB0_59: rem.s32 %r128, %r47, %r69; add.s32 %r129, %r128, %r69; rem.s32 %r167, %r129, %r69; $L__BB0_61: mad.lo.s32 %r131, %r167, %r68, %r2; mad.lo.s32 %r132, %r131, %r67, %r1; mul.wide.s32 %rd62, %r132, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra $L__BB0_63; bra.uni $L__BB0_62; $L__BB0_63: max.s32 %r168, %r51, 0; bra.uni $L__BB0_64; $L__BB0_62: rem.s32 %r133, %r51, %r69; add.s32 %r134, %r133, %r69; rem.s32 %r168, %r134, %r69; $L__BB0_64: mad.lo.s32 %r135, %r168, %r68, %r2; mad.lo.s32 %r136, %r135, %r67, %r1; mul.wide.s32 
%rd64, %r136, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra $L__BB0_66; bra.uni $L__BB0_65; $L__BB0_66: add.s32 %r139, %r69, -1; min.s32 %r169, %r47, %r139; bra.uni $L__BB0_67; $L__BB0_65: rem.s32 %r137, %r47, %r69; add.s32 %r138, %r137, %r69; rem.s32 %r169, %r138, %r69; $L__BB0_67: mad.lo.s32 %r140, %r169, %r68, %r2; mad.lo.s32 %r141, %r140, %r67, %r1; mul.wide.s32 %rd66, %r141, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra $L__BB0_69; bra.uni $L__BB0_68; $L__BB0_69: max.s32 %r170, %r51, 0; bra.uni $L__BB0_70; $L__BB0_68: rem.s32 %r142, %r51, %r69; add.s32 %r143, %r142, %r69; rem.s32 %r170, %r143, %r69; $L__BB0_70: mad.lo.s32 %r144, %r170, %r68, %r2; mad.lo.s32 %r145, %r144, %r67, %r1; mul.wide.s32 %rd68, %r145, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra $L__BB0_72; bra.uni $L__BB0_71; $L__BB0_72: add.s32 %r148, %r69, -1; min.s32 %r171, %r47, %r148; bra.uni $L__BB0_73; $L__BB0_71: rem.s32 %r146, %r47, %r69; add.s32 %r147, %r146, %r69; rem.s32 %r171, %r147, %r69; $L__BB0_73: mad.lo.s32 %r149, %r171, %r68, %r2; mad.lo.s32 %r150, %r149, %r67, %r1; mul.wide.s32 %rd70, %r150, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra $L__BB0_75; bra.uni $L__BB0_74; $L__BB0_75: max.s32 %r172, %r51, 0; bra.uni $L__BB0_76; $L__BB0_74: rem.s32 %r151, %r51, %r69; add.s32 %r152, %r151, %r69; rem.s32 %r172, %r152, %r69; $L__BB0_76: mad.lo.s32 %r153, %r172, %r68, %r2; mad.lo.s32 %r154, %r153, %r67, %r1; mul.wide.s32 %rd72, %r154, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; $L__BB0_77: mul.wide.s32 %rd74, %r6, 4; add.s64 %rd75, %rd3, %rd74; add.s64 %rd76, %rd2, %rd74; add.s64 %rd77, %rd1, %rd74; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 
%f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd76]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd77]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd75]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; cvta.to.global.u64 %rd78, %rd4; add.s64 %rd79, %rd78, %rd74; ld.global.f32 %f126, [%rd79]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd79], %f127; cvta.to.global.u64 %rd80, %rd5; add.s64 %rd81, %rd80, %rd74; ld.global.f32 %f128, [%rd81]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd81], %f129; cvta.to.global.u64 %rd82, %rd6; add.s64 %rd83, %rd82, %rd74; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; $L__BB0_78: ret; } ` ) 3-3.11.1/data/000077500000000000000000000000001503346766200126645ustar00rootroot000000000000003-3.11.1/data/Makefile000066400000000000000000000000241503346766200143200ustar00rootroot00000000000000all: go install -v 3-3.11.1/data/crop.go000066400000000000000000000007461503346766200141650ustar00rootroot00000000000000package data // Cut-out a piece between given bounds (incl, excl) func Crop(in *Slice, x1, x2, y1, y2, z1, z2 int) *Slice { Nx := x2 - x1 Ny := y2 - y1 Nz := z2 - z1 size := [3]int{Nx, Ny, Nz} ncomp := in.NComp() out := NewSlice(ncomp, size) a := in.Tensors() b := out.Tensors() 
for c := 0; c < ncomp; c++ { for z := 0; z < Nz; z++ { for y := 0; y < Ny; y++ { for x := 0; x < Nx; x++ { b[c][z][y][x] = a[c][z+z1][y+y1][x+x1] } } } } return out } 3-3.11.1/data/doc.go000066400000000000000000000001521503346766200137560ustar00rootroot00000000000000/* Package data provides structures to store arrays in a hardware-agnostic (GPU-CPU) way. */ package data 3-3.11.1/data/mesh.go000066400000000000000000000043751503346766200141600ustar00rootroot00000000000000package data import ( "fmt" "log" ) // Mesh stores info of a finite-difference mesh. type Mesh struct { gridSize [3]int cellSize [3]float64 pbc [3]int Unit string // unit of cellSize, default: "m" } // Returns a new mesh with N0 x N1 x N2 cells of size cellx x celly x cellz. // Optional periodic boundary conditions (pbc): number of repetitions // in X, Y, Z direction. 0,0,0 means no periodicity. func NewMesh(N0, N1, N2 int, cellx, celly, cellz float64, pbc ...int) *Mesh { var pbc3 [3]int if len(pbc) == 3 { copy(pbc3[:], pbc) } else { if len(pbc) != 0 { log.Panic("mesh: need 0 or 3 PBC arguments, got:", pbc) } } size := [3]int{N0, N1, N2} return &Mesh{size, [3]float64{cellx, celly, cellz}, pbc3, "m"} } // Returns N0, N1, N2, as passed to constructor. func (m *Mesh) Size() [3]int { if m == nil { return [3]int{0, 0, 0} } else { return m.gridSize } } // Returns cellx, celly, cellz, as passed to constructor. func (m *Mesh) CellSize() [3]float64 { return m.cellSize } // Returns pbc (periodic boundary conditions), as passed to constructor. func (m *Mesh) PBC() [3]int { return m.pbc } func (m *Mesh) SetPBC(nx, ny, nz int) { m.pbc = [3]int{nx, ny, nz} } // Total number of cells, not taking into account PBCs. // // N0 * N1 * N2 func (m *Mesh) NCell() int { return m.gridSize[0] * m.gridSize[1] * m.gridSize[2] } // WorldSize equals (grid)Size x CellSize. 
func (m *Mesh) WorldSize() [3]float64 { return [3]float64{float64(m.gridSize[0]) * m.cellSize[0], float64(m.gridSize[1]) * m.cellSize[1], float64(m.gridSize[2]) * m.cellSize[2]} } // 3 bools, packed in one byte, indicating whether there are periodic boundary conditions in // X (LSB), Y(LSB<<1), Z(LSB<<2) func (m *Mesh) PBC_code() byte { var code byte if m.pbc[X] != 0 { code = 1 } if m.pbc[Y] != 0 { code |= 2 } if m.pbc[Z] != 0 { code |= 4 } return code } func (m *Mesh) String() string { s := m.gridSize c := m.cellSize pbc := "" if m.pbc != [3]int{0, 0, 0} { pbc = fmt.Sprintf(", PBC: [%v x %v x %v],", m.pbc[0], m.pbc[1], m.pbc[2]) } return fmt.Sprintf("[%v x %v x %v] x [%vm x %vm x %vm]%v", s[0], s[1], s[2], float32(c[0]), float32(c[1]), float32(c[2]), pbc) } // product of elements. func prod(size [3]int) int { return size[0] * size[1] * size[2] } 3-3.11.1/data/meta.go000066400000000000000000000003461503346766200141440ustar00rootroot00000000000000package data // Holds meta data to be saved together with a slice. // Typically winds up in OVF or DUMP header type Meta struct { Name, Unit string Time, TimeStep float64 CellSize [3]float64 MeshUnit string } 3-3.11.1/data/resample.go000066400000000000000000000036631503346766200150330ustar00rootroot00000000000000package data import ( "github.com/mumax/3/util" ) // Resample returns a slice of new size N, // using nearest neighbor interpolation over the input slice. func Resample(in *Slice, N [3]int) *Slice { if in.Size() == N { return in // nothing to do } In := in.Tensors() out := NewSlice(in.NComp(), N) Out := out.Tensors() size1 := SizeOf(In[0]) size2 := SizeOf(Out[0]) for c := range Out { for i := range Out[c] { i1 := (i * size1[Z]) / size2[Z] for j := range Out[c][i] { j1 := (j * size1[Y]) / size2[Y] for k := range Out[c][i][j] { k1 := (k * size1[X]) / size2[X] Out[c][i][j][k] = In[c][i1][j1][k1] } } } } return out } // Downsample returns a slice of new size N, smaller than in.Size(). 
// Averaging interpolation over the input slice. // In is returned untouched if the sizes are equal. func Downsample(In [][][][]float32, N [3]int) [][][][]float32 { if SizeOf(In[0]) == N { return In // nothing to do } nComp := len(In) out := NewSlice(nComp, N) Out := out.Tensors() srcsize := SizeOf(In[0]) dstsize := SizeOf(Out[0]) Dx := dstsize[X] Dy := dstsize[Y] Dz := dstsize[Z] Sx := srcsize[X] Sy := srcsize[Y] Sz := srcsize[Z] scalex := Sx / Dx scaley := Sy / Dy scalez := Sz / Dz util.Assert(scalex > 0 && scaley > 0) for c := range Out { for iz := 0; iz < Dz; iz++ { for iy := 0; iy < Dy; iy++ { for ix := 0; ix < Dx; ix++ { sum, n := 0.0, 0.0 for I := 0; I < scalez; I++ { i2 := iz*scalez + I for J := 0; J < scaley; J++ { j2 := iy*scaley + J for K := 0; K < scalex; K++ { k2 := ix*scalex + K if i2 < Sz && j2 < Sy && k2 < Sx { sum += float64(In[c][i2][j2][k2]) n++ } } } } Out[c][iz][iy][ix] = float32(sum / n) } } } } return Out } // Returns the 3D size of block func SizeOf(block [][][]float32) [3]int { return [3]int{len(block[0][0]), len(block[0]), len(block)} } 3-3.11.1/data/reshape.go000066400000000000000000000011441503346766200146420ustar00rootroot00000000000000package data // Array reshaping. import "fmt" // Re-interpret a contiguous array as a multi-dimensional array of given size. // Underlying storage is shared. func reshape(array []float32, size [3]int) [][][]float32 { Nx, Ny, Nz := size[0], size[1], size[2] if Nx*Ny*Nz != len(array) { panic(fmt.Errorf("reshape: size mismatch: %v*%v*%v != %v", Nx, Ny, Nz, len(array))) } sliced := make([][][]float32, Nz) for i := range sliced { sliced[i] = make([][]float32, Ny) } for i := range sliced { for j := range sliced[i] { sliced[i][j] = array[(i*Ny+j)*Nx+0 : (i*Ny+j)*Nx+Nx] } } return sliced } 3-3.11.1/data/slice.go000066400000000000000000000175051503346766200143220ustar00rootroot00000000000000package data // Slice stores N-component GPU or host data. 
import ( "bytes" "fmt" "log" "reflect" "unsafe" "github.com/mumax/3/util" ) // Slice is like a [][]float32, but may be stored in GPU or host memory. type Slice struct { ptrs []unsafe.Pointer size [3]int memType int8 } // this package must not depend on CUDA. If CUDA is // loaded, these functions are set to cu.MemFree, ... // NOTE: cpyDtoH and cpyHtoD are only needed to support 32-bit builds, // otherwise, it could be removed in favor of memCpy only. var ( memFree, memFreeHost func(unsafe.Pointer) memCpy, memCpyDtoH, memCpyHtoD func(dst, src unsafe.Pointer, bytes int64) ) // Internal: enables slices on GPU. Called upon cuda init. func EnableGPU(free, freeHost func(unsafe.Pointer), cpy, cpyDtoH, cpyHtoD func(dst, src unsafe.Pointer, bytes int64)) { memFree = free memFreeHost = freeHost memCpy = cpy memCpyDtoH = cpyDtoH memCpyHtoD = cpyHtoD } // Make a CPU Slice with nComp components of size length. func NewSlice(nComp int, size [3]int) *Slice { length := prod(size) ptrs := make([]unsafe.Pointer, nComp) for i := range ptrs { ptrs[i] = unsafe.Pointer(&(make([]float32, length)[0])) } return SliceFromPtrs(size, CPUMemory, ptrs) } func SliceFromArray(data [][]float32, size [3]int) *Slice { nComp := len(data) length := prod(size) ptrs := make([]unsafe.Pointer, nComp) for i := range ptrs { if len(data[i]) != length { panic("size mismatch") } ptrs[i] = unsafe.Pointer(&data[i][0]) } return SliceFromPtrs(size, CPUMemory, ptrs) } // Return a slice without underlying storage. Used to represent a mask containing all 1's. func NilSlice(nComp int, size [3]int) *Slice { return SliceFromPtrs(size, GPUMemory, make([]unsafe.Pointer, nComp)) } // Internal: construct a Slice using bare memory pointers. 
func SliceFromPtrs(size [3]int, memType int8, ptrs []unsafe.Pointer) *Slice { length := prod(size) nComp := len(ptrs) util.Argument(nComp > 0 && length > 0) s := new(Slice) s.ptrs = make([]unsafe.Pointer, nComp) s.size = size for c := range ptrs { s.ptrs[c] = ptrs[c] } s.memType = memType return s } // Frees the underlying storage and zeros the Slice header to avoid accidental use. // Slices sharing storage will be invalid after Free. Double free is OK. func (s *Slice) Free() { if s == nil { return } // free storage switch s.memType { case 0: return // already freed case GPUMemory: for _, ptr := range s.ptrs { memFree(ptr) } //case UnifiedMemory: // for _, ptr := range s.ptrs { // memFreeHost(ptr) // } case CPUMemory: // nothing to do default: panic("invalid memory type") } s.Disable() } // INTERNAL. Overwrite struct fields with zeros to avoid // accidental use after Free. func (s *Slice) Disable() { s.ptrs = s.ptrs[:0] s.size = [3]int{0, 0, 0} s.memType = 0 } // value for Slice.memType const ( CPUMemory = 1 << 0 GPUMemory = 1 << 1 //UnifiedMemory = CPUMemory | GPUMemory ) // MemType returns the memory type of the underlying storage: // CPUMemory, GPUMemory or UnifiedMemory func (s *Slice) MemType() int { return int(s.memType) } // GPUAccess returns whether the Slice is accessible by the GPU. // true means it is either stored on GPU or in unified host memory. func (s *Slice) GPUAccess() bool { return s.memType&GPUMemory != 0 } // CPUAccess returns whether the Slice is accessible by the CPU. // true means it is stored in host memory. func (s *Slice) CPUAccess() bool { return s.memType&CPUMemory != 0 } // NComp returns the number of components. func (s *Slice) NComp() int { return len(s.ptrs) } // Len returns the number of elements per component. func (s *Slice) Len() int { return prod(s.size) } func (s *Slice) Size() [3]int { if s == nil { return [3]int{0, 0, 0} } return s.size } // Comp returns a single component of the Slice. 
func (s *Slice) Comp(i int) *Slice { sl := new(Slice) sl.ptrs = make([]unsafe.Pointer, 1) sl.ptrs[0] = s.ptrs[i] sl.size = s.size sl.memType = s.memType return sl } // DevPtr returns a CUDA device pointer to a component. // Slice must have GPUAccess. // It is safe to call on a nil slice, returns NULL. func (s *Slice) DevPtr(component int) unsafe.Pointer { if s == nil { return nil } if !s.GPUAccess() { panic("slice not accessible by GPU") } return s.ptrs[component] } const SIZEOF_FLOAT32 = 4 // Host returns the Slice as a [][]float32 indexed by component, cell number. // It should have CPUAccess() == true. func (s *Slice) Host() [][]float32 { if !s.CPUAccess() { log.Panic("slice not accessible by CPU") } list := make([][]float32, s.NComp()) for c := range list { hdr := (*reflect.SliceHeader)(unsafe.Pointer(&list[c])) hdr.Data = uintptr(s.ptrs[c]) hdr.Len = s.Len() hdr.Cap = hdr.Len } return list } // Returns a copy of the Slice, allocated on CPU. func (s *Slice) HostCopy() *Slice { cpy := NewSlice(s.NComp(), s.Size()) Copy(cpy, s) return cpy } func Copy(dst, src *Slice) { if dst.NComp() != src.NComp() || dst.Len() != src.Len() { panic(fmt.Sprintf("slice copy: illegal sizes: dst: %vx%v, src: %vx%v", dst.NComp(), dst.Len(), src.NComp(), src.Len())) } d, s := dst.GPUAccess(), src.GPUAccess() bytes := SIZEOF_FLOAT32 * int64(dst.Len()) switch { default: panic("bug") case d && s: for c := 0; c < dst.NComp(); c++ { memCpy(dst.DevPtr(c), src.DevPtr(c), bytes) } case s && !d: for c := 0; c < dst.NComp(); c++ { memCpyDtoH(dst.ptrs[c], src.DevPtr(c), bytes) } case !s && d: for c := 0; c < dst.NComp(); c++ { memCpyHtoD(dst.DevPtr(c), src.ptrs[c], bytes) } case !d && !s: dst, src := dst.Host(), src.Host() for c := range dst { copy(dst[c], src[c]) } } } // Floats returns the data as 3D array, // indexed by cell position. Data should be // scalar (1 component) and have CPUAccess() == true. 
func (f *Slice) Scalars() [][][]float32 { x := f.Tensors() if len(x) != 1 { panic(fmt.Sprintf("expecting 1 component, got %v", f.NComp())) } return x[0] } // Vectors returns the data as 4D array, // indexed by component, cell position. Data should have // 3 components and have CPUAccess() == true. func (f *Slice) Vectors() [3][][][]float32 { x := f.Tensors() if len(x) != 3 { panic(fmt.Sprintf("expecting 3 components, got %v", f.NComp())) } return [3][][][]float32{x[0], x[1], x[2]} } // Tensors returns the data as 4D array, // indexed by component, cell position. // Requires CPUAccess() == true. func (f *Slice) Tensors() [][][][]float32 { tensors := make([][][][]float32, f.NComp()) host := f.Host() for i := range tensors { tensors[i] = reshape(host[i], f.Size()) } return tensors } // IsNil returns true if either s is nil or s.pointer[0] == nil func (s *Slice) IsNil() bool { if s == nil { return true } return s.ptrs[0] == nil } func (s *Slice) String() string { if s == nil { return "nil" } var buf bytes.Buffer util.Fprint(&buf, s.Tensors()) return buf.String() } func (s *Slice) Set(comp, ix, iy, iz int, value float64) { s.checkComp(comp) s.Host()[comp][s.Index(ix, iy, iz)] = float32(value) } func (s *Slice) SetVector(ix, iy, iz int, v Vector) { i := s.Index(ix, iy, iz) for c := range v { s.Host()[c][i] = float32(v[c]) } } func (s *Slice) SetScalar(ix, iy, iz int, v float64) { s.Host()[0][s.Index(ix, iy, iz)] = float32(v) } func (s *Slice) Get(comp, ix, iy, iz int) float64 { s.checkComp(comp) return float64(s.Host()[comp][s.Index(ix, iy, iz)]) } func (s *Slice) checkComp(comp int) { if comp < 0 || comp >= s.NComp() { panic(fmt.Sprintf("slice: invalid component index: %v (number of components=%v)\n", comp, s.NComp())) } } func (s *Slice) Index(ix, iy, iz int) int { return Index(s.Size(), ix, iy, iz) } func Index(size [3]int, ix, iy, iz int) int { if ix < 0 || ix >= size[X] || iy < 0 || iy >= size[Y] || iz < 0 || iz >= size[Z] { panic(fmt.Sprintf("Slice index out of 
// Vector is a 3-component vector of float64, indexed by X, Y and Z.
// All methods use value receivers and return new values; the receiver is
// never modified.
type Vector [3]float64

// X returns the first component.
func (v Vector) X() float64 { return v[X] }

// Y returns the second component.
func (v Vector) Y() float64 { return v[Y] }

// Z returns the third component.
func (v Vector) Z() float64 { return v[Z] }

// Mul returns a*v.
func (v Vector) Mul(a float64) Vector {
	for i := range v {
		v[i] = a * v[i]
	}
	return v
}

// Div returns (1/a)*v. The reciprocal is computed once and multiplied in.
func (v Vector) Div(a float64) Vector {
	inv := 1 / a
	return v.Mul(inv)
}

// Add returns a+b.
func (a Vector) Add(b Vector) Vector {
	for i := range a {
		a[i] = a[i] + b[i]
	}
	return a
}

// MAdd returns a+s*b.
func (a Vector) MAdd(s float64, b Vector) Vector {
	for i := range a {
		a[i] = a[i] + s*b[i]
	}
	return a
}

// Sub returns a-b.
func (a Vector) Sub(b Vector) Vector {
	for i := range a {
		a[i] = a[i] - b[i]
	}
	return a
}

// Len returns the Euclidean norm of v.
func (v Vector) Len() float64 {
	return math.Sqrt(v.Dot(v))
}

// Dot returns the dot (inner) product a.b.
func (a Vector) Dot(b Vector) float64 {
	return a[X]*b[X] + a[Y]*b[Y] + a[Z]*b[Z]
}

// Cross returns the cross (vector) product a x b
// in a right-handed coordinate system.
func (a Vector) Cross(b Vector) Vector {
	return Vector{
		X: a[Y]*b[Z] - a[Z]*b[Y],
		Y: a[Z]*b[X] - a[X]*b[Z],
		Z: a[X]*b[Y] - a[Y]*b[X],
	}
}

// Component indices.
const (
	X = iota
	Y
	Z
)
driver version for each CUDA version at https://docs.nvidia.com/deploy/cuda-compatibility/#minor-version-compatibility # We export CUDA_CC so that cuda/Makefile knows what to include in the fat wrappers if [ -z "$INPUT_CUDA_CC" ]; then case $CUDAVERSION in "10.0") export CUDA_CC="50 52 53 60 61 62 70 72 75";; # Min. Linux driver: >=410.48 "10.1") export CUDA_CC="50 52 53 60 61 62 70 72 75";; # Min. Linux driver: >=418.39 "10.2") export CUDA_CC="50 52 53 60 61 62 70 72 75";; # Min. Linux driver: >=440.33 "11.0") export CUDA_CC="50 52 53 60 61 62 70 72 75 80";; # Min. Linux driver: >=450.80.02 "12.0") export CUDA_CC="50 52 53 60 61 62 70 72 75 80 86 87 89 90";; # Min. Linux driver: >=525.60.13 (same for all 12.x) "12.6") export CUDA_CC="50 52 53 60 61 62 70 72 75 80 86 87 89 90";; # Highest CUDA version supporting CC < 7.5 "12.9") export CUDA_CC=" 75 80 86 87 89 90 100 120";; esac else export CUDA_CC="$INPUT_CUDA_CC" fi # Print important variables (debug info) echo "[INFO] CUDA Versions: ${INPUT_CUDA_VERSIONS[*]}" echo "[INFO] Compute Capabilities: ${INPUT_CUDA_CC}" echo "[INFO] GOPATH: ${GOPATH}" # The path for shared libraries (relative to the build directory) RPATH=lib mkdir -p $BUILDDIR/$RPATH # We overwrite the CGO Flags to make sure that it is compiled against $CUDAVERSION export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH export CGO_LDFLAGS="-lcufft -lcurand -lcuda -L${CUDA_HOME}/lib64 -Wl,-rpath -Wl,\$ORIGIN/$RPATH" export CGO_CFLAGS="-I${CUDA_HOME}/include" # (Re)build everything (cd .. 
# This script compiles mumax3 for windows 10 against multiple cuda versions.
# For every requested CUDA version it recompiles the CUDA kernels to PTX,
# regenerates the Go wrappers, builds the mumax3 executables and packs them,
# together with the cufft/curand DLLs, into build/<name>.zip.

param (
    # Optional arguments. Example usage: ./deploy_windows.ps1 -CUDA_VERSIONS 12.6 -CUDA_CC 86
    [String[]]$CUDA_VERSIONS = ("10.0","10.1","10.2","11.0","12.0","12.6","12.9"), # The cuda versions against which we will compile mumax3
    [Int[]]$CUDA_CC, # The compute capabilities for which PTX will be compiled. Default: all CC supported by the CUDA version.
    [String[]]$CUDA_KERNELS # List of which CUDA kernels in ../cuda should be (re)compiled. Default: all of them.
)

foreach ($CUDA_VERSION_STR in $CUDA_VERSIONS ) {

    # The final location of executables and libraries ready to be shipped to the user.
    $builddir = "build/mumax3.11.1_windows_cuda$CUDA_VERSION_STR"

    # The nvidia toolkit installer for CUDA 12.6 should have set the environment
    # variable CUDA_PATH_V12_6 which points to the root directory of the
    # CUDA toolbox (or similar for other CUDA versions).
    $CUDA_VERSION = [Version]::Parse($CUDA_VERSION_STR) # Convert to Version type for easy handling and comparison
    $CUDA_ENV_VAR_NAME = "CUDA_PATH_V$($CUDA_VERSION.Major)_$($CUDA_VERSION.Minor)"
    $CUDA_HOME = [Environment]::GetEnvironmentVariable($CUDA_ENV_VAR_NAME, "Machine") # "Machine" for system-wide environment variables
    if ( -not $CUDA_HOME -or (-not ( Test-Path $CUDA_HOME )) ) {
        Write-Output "CUDA version $CUDA_VERSION_STR does not seem to be installed"
        Write-Output "(system-wide environment variable $CUDA_ENV_VAR_NAME does not exist or points to nonexistent directory)"
        exit
    }

    #! SUBSTITUTE YOUR OWN PATH TO cl.exe BELOW
    # Not every CUDA version is compatible with any Visual C/C++ version: compiling for CUDA <11.6 requires VS <=2017.
    # See VS/CUDA compatibility matrix at https://quasar.ugent.be/files/doc/cuda-msvc-compatibility.html (with old VS downloads available).
    $VS2022 = "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.44.35207\bin\Hostx64\x64" # Supported by CUDA v11.6-v12.*
    $VS2017 = "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.16.27023\bin\Hostx64\x64" # Supported by CUDA v8.0-v12.4
    switch ( $CUDA_VERSION ) {
        {$_ -lt [Version]::new(11.6)} { $CCBIN = $VS2017 }
        # Use VS2017 if 2022 not installed. The path must be probed with Test-Path:
        # $VS2022 is a non-empty string and therefore always "true" on its own.
        {$_ -ge [Version]::new(11.6)} { $CCBIN = if (Test-Path $VS2022) {$VS2022} else {$VS2017} }
        default { Write-Output "Failed to parse CUDA version $CUDA_VERSION_STR" }
    }
    if ( -not ( Test-Path $CCBIN ) ) {
        Write-Output "CCBIN for nvcc not found at $CCBIN"
        exit
    }

    # We will compile the kernels for all supported architectures
    # See supported CC for each CUDA version at https://stackoverflow.com/a/28933055
    # See min. driver version for each CUDA version at https://docs.nvidia.com/deploy/cuda-compatibility/#minor-version-compatibility
    #
    # The list is kept in a per-iteration variable. Assigning the defaults to
    # $CUDA_CC itself would make the "-not $CUDA_CC" test fail from the second
    # CUDA version onwards, so every later version would silently reuse the
    # list chosen for the first one.
    $CC_LIST = $CUDA_CC
    if ( -not $CC_LIST ) {
        switch ( $CUDA_VERSION_STR ) {
            "10.0" { $CC_LIST = 50,52,53,60,61,62,70,72,75 } # Min. Windows driver: >=411.31
            "10.1" { $CC_LIST = 50,52,53,60,61,62,70,72,75 } # Min. Windows driver: >=418.96
            "10.2" { $CC_LIST = 50,52,53,60,61,62,70,72,75 } # Min. Windows driver: >=441.22
            "11.0" { $CC_LIST = 50,52,53,60,61,62,70,72,75,80 } # Min. Windows driver: >=452.39
            "12.0" { $CC_LIST = 50,52,53,60,61,62,70,72,75,80,86,87,89,90 } # Min. Windows driver: >=527.41 (Same for all 12.x)
            "12.6" { $CC_LIST = 50,52,53,60,61,62,70,72,75,80,86,87,89,90 } # Highest CUDA version supporting CC < 7.5
            "12.9" { $CC_LIST = 75,80,86,87,89,90,100,120 }
            default {
                Write-Output "No default compute capabilities known for CUDA version $CUDA_VERSION_STR (pass -CUDA_CC explicitly)"
                exit
            }
        }
    }

    # The NVIDIA compiler which will be used to compile the cuda kernels
    $NVCC = "${CUDA_HOME}/bin/nvcc.exe"

    # overwrite the CGO flags to make sure that mumax3 is compiled against the
    # specified cuda version.
    $env:CGO_LDFLAGS="-lcufft -lcurand -lcuda -L `"$CUDA_HOME/lib/x64`""
    $env:CGO_CFLAGS="-I `"$CUDA_HOME/include`" -w"

    # Enter the cuda directory to (re)compile the cuda kernels
    Set-Location ../cuda
    go build .\cuda2go.go
    if ($CUDA_KERNELS.Length -eq 0) {
        $cudafiles = Get-ChildItem -filter "*.cu"
    } else {
        $cudafiles = Get-ChildItem -Filter "*.cu" | Where-Object {$CUDA_KERNELS -contains $_.BaseName}
    }
    foreach ($cudafile in $cudafiles) {
        $kernelname = $cudafile.basename
        Remove-Item "${kernelname}_*.ptx"
        Remove-Item "${kernelname}_*wrapper.go"
        foreach ($cc in $CC_LIST) {
            & $NVCC -ccbin "`"${CCBIN}`"" -Xptxas -O3 -ptx `
                -gencode="arch=compute_${cc},code=sm_${cc}" `
                "${cudafile}" -o "${kernelname}_${cc}.ptx"
        }
        & .\cuda2go $cudafile
        gofmt -w "${kernelname}_wrapper.go"
    }
    Set-Location ../deploy

    # Compile all mumax3 packages and executables. Determine the commit hash and pass it along.
    $COMMIT_HASH = git rev-parse --short HEAD 2>$null
    if (-not $COMMIT_HASH) {
        $COMMIT_HASH = "unknown"
        Write-Host "Warning: Could not determine Git commit hash. Using 'unknown'."
    }
    go install -ldflags "-X main.commitHash=$COMMIT_HASH" -v "github.com/mumax/3/..."

    # Copy the mumax3 executables and the used cuda libraries to the build directory
    Remove-Item -ErrorAction Ignore -Recurse ${builddir}
    Remove-Item -ErrorAction Ignore "${builddir}.zip"
    New-Item -ItemType "directory" ${builddir}
    Copy-Item ${env:GOPATH}/bin/mumax3.exe -Destination ${builddir}
    Copy-Item ${env:GOPATH}/bin/mumax3-convert.exe -Destination ${builddir}
    Copy-Item ${env:GOPATH}/bin/mumax3-server.exe -Destination ${builddir}
    Copy-Item ../LICENSE -Destination ${builddir}
    Copy-Item ${CUDA_HOME}/bin/cufft64*.dll -Destination ${builddir}
    Copy-Item ${CUDA_HOME}/bin/curand64*.dll -Destination ${builddir}

    # Finally, put everything in a single archive
    Compress-Archive -Path ${builddir}/* -DestinationPath "${builddir}.zip"
}
package main

import (
	"os"
	"os/exec"
	"path"
	"reflect"
	"sort"
	"strings"
	"text/template"
	"unicode"

	"github.com/mumax/3/cuda"
	"github.com/mumax/3/engine"
)

var (
	// api_entries holds all API entries, sorted by buildAPI.
	api_entries entries
	// api_ident maps a lower-case identifier to (a copy of) its entry.
	api_ident = make(map[string]entry)
)

// entry describes one identifier of the scripting API.
type entry struct {
	name    string       // identifier as spelled in the engine's doc table
	Type    reflect.Type // Go type of the identifier
	Doc     string       // documentation string
	touched bool         // set once a Filter* method has returned this entry
}

// buildAPI collects every documented identifier of engine.World into
// api_entries (sorted) and api_ident (keyed by lower-case name).
func buildAPI() {
	cuda.Init(0) // gpu 0

	ident := engine.World.Identifiers
	doc := engine.World.Doc
	e := make(entries, 0, len(doc))
	for K, v := range doc {
		if v == "" {
			// check if we have a docstring in the documentation of the Math package
			v = getGoDocString("math", K)
		}
		k := strings.ToLower(K)
		// NOTE(review): assumes every documented name has a lower-case key in
		// Identifiers; a missing key would yield a nil value here.
		t := ident[k].Type()
		entr := entry{K, t, v, false}
		e = append(e, &entr)
		api_ident[k] = entr
	}
	sort.Sort(&e)
	api_entries = e
}

// getGoDocString runs "go doc packageName identifier" and returns the first
// documentation paragraph, joined into a single line. It returns "" when the
// command fails, when the output is too short, or when the identifier is not
// a function.
func getGoDocString(packageName, identifier string) string {
	stdout, err := exec.Command("go", "doc", packageName, identifier).Output()
	if err != nil {
		return ""
	}
	outputLines := strings.Split(string(stdout), "\n")

	// We only look for doc strings of functions: the third output line must be
	// the declaration. Check the length first so that short or unexpected
	// output cannot cause an index-out-of-range panic.
	if len(outputLines) < 3 || !strings.HasPrefix(outputLines[2], "func") {
		return ""
	}

	// The doc string of a function is the paragraph starting on the fourth line.
	var docLines []string
	for _, line := range outputLines[3:] {
		if strings.TrimSpace(line) == "" {
			// Stop at the first empty line
			break
		}
		docLines = append(docLines, line)
	}
	return strings.Join(docLines, " ")
}

// Name returns the identifier of the entry.
func (e *entry) Name() string {
	return e.name
}

// Ins returns the input parameters: for function types, the type string with
// the leading "func" removed (cleaned by cleanType); "" for anything else.
func (e *entry) Ins() string {
	t := e.Type.String()
	if strings.HasPrefix(t, "func(") {
		return cleanType(t[len("func"):])
	}
	return ""
}

// cleanType returns a dumbed-down type string, with the "engine.", "*data."
// and "script." qualifiers removed.
func cleanType(typ string) string {
	typ = strings.Replace(typ, "engine.", "", -1)
	typ = strings.Replace(typ, "*data.", "", -1)
	typ = strings.Replace(typ, "script.", "", -1)
	return typ
}

// Methods lists the exported, non-hidden methods of the entry's type,
// formatted as "Name( argType ... )".
func (e *entry) Methods() []string {
	t := e.Type
	// if it's a function, we list the methods on the output type
	if t.Kind() == reflect.Func && t.NumOut() == 1 {
		t = t.Out(0)
	}

	nm := t.NumMethod()
	M := make([]string, 0, nm)
	for i := 0; i < nm; i++ {
		m := t.Method(i)
		n := m.Name
		if unicode.IsUpper(rune(n[0])) && !hidden(n) {
			var args string
			// Start at 1 to skip In(0).
			// NOTE(review): In(0) is the receiver for non-interface types only.
			for j := 1; j < m.Type.NumIn(); j++ {
				args += cleanType(m.Type.In(j).String()) + " "
			}
			M = append(M, n+"( "+args+")")
		}
	}
	return M
}

// Ret returns the cleaned return type for functions with exactly one result,
// "" otherwise.
func (e *entry) Ret() string {
	t := e.Type
	if t.Kind() == reflect.Func && t.NumOut() == 1 {
		return cleanType(t.Out(0).String())
	}
	return ""
}

// hidden reports whether the method name is omitted from the generated
// method lists.
func hidden(name string) bool {
	switch name {
	case "Eval", "InputType", "Type", "Slice", "Name", "Unit", "NComp", "Mesh", "SetValue", "String":
		return true
	}
	return false
}

// Examples returns the list of examples where the entry is used.
func (e *entry) Examples() []int {
	return api_examples[strings.ToLower(e.name)]
}

// api is the data passed to the API template.
type api struct {
	Entries entries
}

// Include returns the contents of the file fname inside templateDir.
func (a *api) Include(fname string) string {
	b, err := os.ReadFile(path.Join(templateDir, fname))
	check(err)
	return string(b)
}

// remaining returns the entries that no Filter* method has marked as touched.
func (a *api) remaining() []*entry {
	var E []*entry
	for _, e := range a.Entries {
		if !e.touched {
			E = append(E, e)
		}
	}
	return E
}

// All returns a list of all entries (touched and not touched).
func (a *api) All() []*entry {
	return append([]*entry(nil), a.Entries...)
}

// FilterType returns all entries whose type string matches one of typ
// (case-insensitively) and whose name does not start with "ext_", and marks
// them as touched. Entries already touched are returned as well; an entry is
// appended once for every element of typ it matches.
func (a *api) FilterType(typ ...string) []*entry {
	var E []*entry
	for _, e := range a.Entries {
		for _, t := range typ {
			if match(t, e.Type.String()) && !strings.HasPrefix(e.name, "ext_") {
				e.touched = true
				E = append(E, e)
			}
		}
	}
	return E
}

// FilterReturn returns all entries whose return type (see Ret) matches one of
// typ (case-insensitively) and whose name does not start with "ext_", and
// marks them as touched. Entries already touched are returned as well; an
// entry is appended once for every element of typ it matches.
func (a *api) FilterReturn(typ ...string) []*entry {
	var E []*entry
	for _, e := range a.Entries {
		for _, t := range typ {
			if match(t, e.Ret()) && !strings.HasPrefix(e.name, "ext_") {
				e.touched = true
				E = append(E, e)
			}
		}
	}
	return E
}

// return all entries, unused so far, which have given name.
func (a *api) FilterName(typ ...string) []*entry { var E []*entry for _, e := range a.Entries { for _, t := range typ { if match(t, e.name) { e.touched = true E = append(E, e) } } } return E } // return all entries, unused so far, whose name starts with prefix. func (a *api) FilterPrefix(pre string) []*entry { var E []*entry for _, e := range a.Entries { if strings.HasPrefix(e.name, pre) { e.touched = true E = append(E, e) } } return E } // return all entries not yet used. func (a *api) FilterLeftovers() []*entry { return a.remaining() } // case insensitive match. func match(a, b string) bool { a = strings.ToLower(a) b = strings.ToLower(b) match := a == b return match } func renderAPI() { e := api_entries t := template.Must(template.New("api").Parse(templ)) f, err2 := os.OpenFile(path.Join(buildDir, "api.html"), os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666) check(err2) check(t.Execute(f, &api{e})) } var templ = read(path.Join(templateDir, "api-template.html")) func read(fname string) string { b, err := os.ReadFile(fname) check(err) return string(b) } type entries []*entry func (e *entries) Len() int { return len(*e) } func (e *entries) Less(i, j int) bool { return strings.ToLower((*e)[i].name) < strings.ToLower((*e)[j].name) } func (e *entries) Swap(i, j int) { (*e)[i], (*e)[j] = (*e)[j], (*e)[i] } 3-3.11.1/doc/gpus.go000066400000000000000000000041161503346766200140270ustar00rootroot00000000000000package main import ( "bytes" "encoding/xml" "fmt" "os" ) // Injects a ` // Read the SVG file content, err := os.ReadFile(SVGpath) if err != nil { return fmt.Errorf("error reading SVG file: %w", err) } // Create a buffer to write the modified content var buffer bytes.Buffer decoder := xml.NewDecoder(bytes.NewReader(content)) styleInjected := false for { // Read each token token, err := decoder.Token() if err != nil { if err.Error() == "EOF" { break } return fmt.Errorf("error reading SVG tokens: %w", err) } // Process tokens and write directly to the buffer switch t := 
token.(type) { case xml.ProcInst: // Write the XML declaration buffer.WriteString(fmt.Sprintf("\n", t.Target, string(t.Inst))) case xml.StartElement: // Write the tag buffer.WriteString("<" + t.Name.Local) for _, attr := range t.Attr { buffer.WriteString(fmt.Sprintf(` %s="%s"`, attr.Name.Local, attr.Value)) } buffer.WriteString(">") // Inject the style immediately after the tag if t.Name.Local == "svg" && !styleInjected { buffer.WriteString(styleTag) styleInjected = true } case xml.EndElement: // Write end elements buffer.WriteString(fmt.Sprintf("", t.Name.Local)) case xml.CharData: // Write character data buffer.WriteString(string(t)) default: return fmt.Errorf("unexpected token type: %T", token) } } // Write the modified content back to the file if err := os.WriteFile(SVGpath, buffer.Bytes(), 0644); err != nil { return fmt.Errorf("error writing modified SVG file: %w", err) } return nil } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������3-3.11.1/doc/make.go��������������������������������������������������������������������������������0000664�0000000�0000000�00000013760�15033467662�0013773�0����������������������������������������������������������������������������������������������������ustar�00root����������������������������root����������������������������0000000�0000000������������������������������������������������������������������������������������������������������������������������������������������������������������������������package main import ( "flag" "fmt" "log" "os" "os/exec" "path" "regexp" "sort" "strings" "text/template" ) var flag_vet = flag.Bool("vet", false, 
"only vet source files, don't run them") var flag_examples = flag.Bool("examples", false, "run mumax3 examples") var flag_forced = flag.Bool("forced", false, "force to re-run mumax3 examples") var flag_builddir = flag.String("builddir", "build", "build directory") var buildDir string const templateDir = "templates" func main() { flag.Parse() buildDir = *flag_builddir + "/" buildAPI() // read template b, err := os.ReadFile(path.Join(templateDir, "examples-template.html")) check(err) replaceInRaw(b, '\n', '@') // hack to allow raw strings spanning multi lines templ := template.Must(template.New("guide").Parse(string(b))) // output file f, err2 := os.OpenFile(path.Join(buildDir, "examples.html"), os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666) check(err2) // execute! if *flag_examples { state := &State{} check(templ.Execute(f, state)) } renderAPI() postProcessGPUsSVG() createIndexPage() createDownloadPage() createHeaderPage() } func createIndexPage() { b, err := os.ReadFile(path.Join(templateDir, "index-template.html")) replaceInRaw(b, '\n', '@') // hack to allow raw strings spanning multi lines check(err) templ := template.Must(template.New("guid").Parse(string(b))) f, err2 := os.OpenFile(path.Join(buildDir, "index.html"), os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666) check(err2) state := &State{} check(templ.Execute(f, state)) } func createDownloadPage() { b, err := os.ReadFile(path.Join(templateDir, "download-template.html")) replaceInRaw(b, '\n', '@') // hack to allow raw strings spanning multi lines check(err) templ := template.Must(template.New("download").Parse(string(b))) f, err2 := os.OpenFile(path.Join(buildDir, "download.html"), os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666) check(err2) state := &State{} check(templ.Execute(f, state)) } func createHeaderPage() { b, err := os.ReadFile(path.Join(templateDir, "headerpage-template.html")) replaceInRaw(b, '\n', '@') // hack to allow raw strings spanning multi lines check(err) templ := 
template.Must(template.New("headerpage").Parse(string(b))) f, err2 := os.OpenFile(path.Join(buildDir, "header.html"), os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666) check(err2) state := &State{} check(templ.Execute(f, state)) } type State struct { count int } func (s *State) Example(in string) string { s.count++ // extract example source in = strings.Replace(in, "@", "\n", -1) // undo raw string hack in = strings.Trim(in, "\n") // exec input file check(os.WriteFile(s.infile(), []byte(in), 0666)) arg := "-v" if *flag_vet { arg = "-vet" } if _, err := os.Stat(s.outfile()); os.IsNotExist(err) || *flag_forced { cmd("mumax3", "-cache", "/tmp", arg, s.infile()) } recordExamples(in, s.count) return `
` + template.HTMLEscapeString(in) + `
` } var api_examples = make(map[string][]int) func recordExamples(input string, num int) { in := strings.ToLower(input) for k := range api_ident { if ok, _ := regexp.MatchString(k, in); ok { api_examples[k] = append(api_examples[k], num) } } } func (s *State) Img(fname string) string { cmd("mumax3-convert", "-png", "-arrows", "16", path.Join(s.outfile(), fname+".ovf")) pngfile := path.Join(s.relativeOutfile(), fname+".png") return fmt.Sprintf(`
%v
`, pngfile, fname) } func (s *State) Include(fname string) string { b, err := os.ReadFile(path.Join(templateDir, fname)) check(err) return string(b) } func (s *State) Output() string { out := `

output

` dir, err := os.Open(s.outfile()) check(err) files, err2 := dir.Readdirnames(-1) check(err2) sort.Strings(files) for _, f := range files { if path.Ext(f) == ".ovf" { out += s.Img(f[:len(f)-len(".ovf")]) } } out += `
` for _, f := range files { if f == "table.txt" { cmd("mumax3-plot", path.Join(s.outfile(), f)) } } dir, err = os.Open(s.outfile()) check(err) files, err2 = dir.Readdirnames(-1) check(err2) sort.Strings(files) for _, f := range files { if path.Ext(f) == ".svg" { src := path.Join(s.relativeOutfile(), f) out += fmt.Sprintf(`
%v
`, src, f) } } return out } // State.output gives a nice output for all examples except for the // hysteresis example. State.OutputHysteresis is the custom output function // for the hysteresis example. func (s *State) OutputHysteresis() string { tableName := path.Join(s.outfile(), "table.txt") figureName := path.Join(s.outfile(), "hysteresis.svg") relFigureName := path.Join(s.relativeOutfile(), "hysteresis.svg") gnuplotCmd := `set term svg noenhanced size 400 300 font 'Arial,10';` gnuplotCmd += fmt.Sprintf(`set output "%s";`, figureName) gnuplotCmd += `set xlabel "B_ext(T)";` gnuplotCmd += `set ylabel "m_x";` gnuplotCmd += fmt.Sprintf(`plot "%s" u 5:2 w lp notitle;`, tableName) gnuplotCmd += "set output;" gnuplotOut, err := exec.Command("gnuplot", "-e", gnuplotCmd).CombinedOutput() os.Stderr.Write(gnuplotOut) check(err) out := fmt.Sprintf(`

output

`, relFigureName) return out } func (s *State) infile() string { return path.Join(buildDir, fmt.Sprintf("example%v.mx3", s.count)) } func (s *State) outfile() string { return path.Join(buildDir, fmt.Sprintf("example%v.out", s.count)) } // Relative output directory path from the build directory func (s *State) relativeOutfile() string { return fmt.Sprintf("example%v.out", s.count) } func cmd(cmd string, args ...string) { out, err := exec.Command(cmd, args...).CombinedOutput() os.Stdout.Write(out) check(err) } func replaceInRaw(bytes []byte, old, new byte) { inraw := false for i, b := range bytes { if b == '`' { inraw = !inraw } if inraw && b == old { bytes[i] = new } } } func check(err error) { if err != nil { log.Panic(err) } } 3-3.11.1/doc/mask.png000066400000000000000000000047701503346766200141710ustar00rootroot00000000000000PNG  IHDRdÆ bKGD pHYs  tIME %xtEXtCommentCreated with GIMPW `IDATxiHTǟqfR)ۗRC(2i3IdX-PP&"RWEHM"J[lQsǜ{fs~ss<ϽsȘ 0(>HA @ A @ A HI\t>L:~AFMf͢KҎ;(::f}2 Id2ۿ?X3Ӆ xM2Lj_\\!DD4u?P(wZ퀿i4 ߋlv*--<;;[i^0"5BA^~M('JEBUUU۷oSrr2i4R*V)""vE#yJ:;;ÇTRRBGu… iDJr9jh4xbڵkSGGWx) H03[Ve___ᱛ7ofG---+#"޴iz[uWHʓIIIaiiiz m3??_XGzzSO.ٳgmɕDZ,NLL;K,auuubwlll,y ޖ'{Kpp0Ka4yJ%::{{{h4Wǎ;˗/O}9L~=O0؜H 䦦&vZZZ8((H2~ܹz̙#Vݻwꉊr \ ,p*ߟu: myr ۺy0~Ϟ=bv!yL&[ns@] ³q8_LUWy8q"gdd˗`0baN/^^hG ;t0W =vؠm$%%ׯ_%PQQ1qƹWLEEE,:3_3g۷ow*iٲe\VVVծX,޺ud{'Oƛf^tdqӧO;lʕlX$A@@:tkjjXqOO777s^^3fЉ ԩSfffJ̛7ϩa ͒ی#?^XGLL'Oϟ?]\Pg'Ȃ ?I]>}*7a7lxbjJjW3كl;e_V%W,„iii###%t:S}w7'HTUUE_oߒV%HFwrss)''ǡ>}/_.<oRWWuuuj@6mEEEQBB%%%JrBd0vJT*y欬,I{'Ay͚5v׽qF.\D DvE&婵UlqNh46 ݸ4=z.HxV[!S^w˓d6iÆ ͛aBVy\LL _Gݹ2=|xbyJŋ1| L!_Eiiwk׮ycE ߾}^pa ?8=$ &!U E(Ւq&#""F SLyYLg?zm>xBk?>.Sn?>Cq? >,t?ϙ>qF~Sv?ak>S $x?Y{>y?azb>ZԆ z?J>T{?`'5>V;{|?w!>x$}?R>sg ~?=ܬ[~~? =ȀRZ~?XP=UD?k=7hU?9=S )?=W!h?>(]=tH?s=>=Ș ?"=BN?V =q?<@γ?ҵ}'1?t݀2z?O1u?q^L2p?χ1k?ao2d?51^?ꏻCG2W?Kܒ[l1Q?啻F2J?71C?#LQ1;?Eh12? YU1,?O"1%?}N1?U%?ì/?uKc?Ց7?nձ?tޱ?孻֢5?22q9'?}æ|6?tJ? 
\b?5~?ڲ?xa/粽?!\:% ?R ?=*?at}5?)y:O?M;[}?^;j1y?$;𹅳?<}{?P)<*꛳X?8AW<ꬳO?Ll(r&}?TW>2c|?[]+>Lq{?W@>qEz?pdW>+&x?ԕp>q´\w?>wдt?>+r?b>bo?x>jfl?j>$Fj?g>4h>? *?ctsBE?+#?EmL? ?:#|S?i-?a3Y?v?~x_?(>mJWpd?>>u@h?f>kXl?ٌ>PF o?ֲ>3%r?Y>Ϻu?͓>Ɖ"w?Y>2x?_mp>LCz?܉W>Oo{?x@>Bxye|?5+>Mo,}?>Ie}?>YUL~??=[G߱~?Ȋ=4D\?\= 1W?? =3,Oo?2=F*?j=֑J?I=?,=u? U==F۳?&J1?=QJ]f?;`m/?1M]?0?P*?-?wr ? X:?1؀3?kP?_UͲp?qk|?mQ?s CR? ??*C?:n=?&>,;ɀja?p#;ۃh?;R?+G^z[}?L>{â|?w%>(^{?)':>󘌴ڜz?CP>I89y?3 j>訴nw?>ϰlu?>KӴr?K><ߴo?K>-zl?>ph?\>[x*e?3>_38?\E1?5??)?gzj;G? ?hV/N??:]UzU?J ?Fj_ [??r_a?>w&}e?<>xCj?>b3m?>  p?>ݍs?>{u?6 >:w?>z녴~y?'je>ioywz?L>!{?L6>lI|?Ό!>WlD}?W>3Ty~?=b>Wi~?=M8~? =@ &?A=RY x\?=+D?y= !?$pV= =?r7=,VY?+= ?;=5ó?<ͳ?H<^qg?!<2?Q|<{?M<?:#<{??;j?\;c?^;:<?[A;jԲ?H:?L:?V?KA??J? ?Z?i.`?ͷA#G?Q 1?_x?:j1?s?N{1?Yy?,Ճٛ2s?[`!/o?o_P%2k?ŊV1f?Ќ52b?Վhc1\?XᐻC2X?,ޕ1S?e4E2N?֊1H?҂K2@?Gѭ1:?szD22?g؞1+?%-62"?1?;.2?Rp1?r2?lǟ0?ѹ r1? ?\1?»8"?t*ŻJ1?Uƻڱ?0Ȼ/?BȻ_(?uǻ a?jŻLn??_k#?JtM?ؖ?ɲ:?!_?$?}xȲ?I ?-ww?$<?6z?%?MϹ:c?ϯH;(N1?Ǣ;Q?;5?<㜳?gN<?dJm|?]!>sw\{?']6>ևz?3M>ooy?df>YNw?G >Ѩ6u?z2>vcs?n>po?>m>#:l?K>鴮g?>"Qc?R>Q^?>5)1/2?7?O| 9?/?A?'?*I??fQ?A?%"W?s ?*YY ]?N? mib?>Tg?>Ѫk?4>LK-o?+>8ꌴ1r?$ߥ>%t?>'&v?0>JMx?r>o4z?YX>dsl{?)@>>Aj|?*>+f06}?c>˫j}?>DY~?&=xX~?=50S ?=>0?(J?Du=Ex?Hw=u(c?9a=:B?@= i?$=ճg?ͺ =fu?s<0?n<ȳa?<?#<@ϩ?xhTٺDZ??"+q?lCރ?W71?FhFG?+ut81?u?B2n?9*f?-u(2`?FH@[?:2U?(@P?CQ2K?>0H?P [2B? Lq41=?$ b27?YA|11?ec2,?Q4^ٕ1$?W᧻Fi|?d>7'u|?&Q2>ᄴ{?I>y?/Lb>Sw?M~>TVu?ZJ>~/Ys?g>릿3p?>ٴbl?ӌ>͛ٴfg?0>-#b?5>a¶\?? F\W?f ?5 +-?O#]7e?S>M`i?:>Y3~m?>}Ñ p?6ŭ>s?E>~v?>gf@w?8~>ﻃ~y?c>az?I>Rqt|?2>Q|?> ^ʣ}? >mg1~?=mB{~?= O~?=k'9?Y6=:l?,= ?Tk=J! ?;H=P ?*=x?3 =³~?1Fe!}?>UnN|?*G->zB} F{?lC>zy?\>uYx?xx> .Sv?m>s?w>*p?9:>ʹ$l?>>Ѵ_h?I>lb?>\\??U? ?dO?%m?,-(?|@?v 0?e9? |8?W1?⾏~%A?(?m:I???|ֳP? 
A?e W?B0 ?o]?*?mc?z>*Hg?E>Ek?>2o?>ԛr?z> I-u?J>ÎTw?>!*y?k>Iڑz?Q>牴{?9>zƳ|?<#>Ytu}?>Z\~?=@h~?|=~9~?==-K+?=b?3Ǎ=s3?:r=J!?I*O=F?U/=ڳ,?=|?2?Y=1?2?ypG1?»2?Ż1?ɻ|2?̻.?T9лAb2?ӻi^?׻[2?ۻ@?y޻682s?Ự ʱi?LK 2a?y^!]?1[?t軐U[?X1b?Yn?⻰w?ݻ?NֻWⱻ?w?̻?b(pJ?^,6H?陻 ?(9?`D?>s_?"?:ᄳ?:;D1?CU;G.?Y;v?(< ϵ?^LeDX}?>Efd|?U'>pCY{?N=>Yz?U>0x?!q>Wv?d>tt?>{q?->,ȴ m?u>fǴci?c>ᴓd?>@]?ʡ>{V?l ?Fu#O?{n?MiH?H?!$?'D?{+?i=?F4?~U5?s=?,?85E?a"?憎~M?QY?QӎdT?-?MEX[??3a?> ΫY1f?>Ւj?>ssn?J>qq?>59yt?a>ײ[v?h5>2v:x?1-s>픴;z?T"X>ĆU~{?J?>6u|?(>SJyO}?>|_}?n>Sq~?r=d~?=0 ?=KZ?>=q)?jx=<0g?S=$6?@3= ?_=dz?svt? ^1_?ΪK?Q62;?"k-?^MW2#?vD]H?ny2?⮻D ?6 2?.??괻P2?ĂЮ ? s2?󛹻zܯ?=42?U@Ν0?o龻ӟ2?@0?8Sû+2?Ż0 1?Ȼ 2?˻0?ٸλ7.2?;һ`m?0?Ӊջq2?ٻ`C? ܻ4r2w?>[ l?*\2_?1𑱱S?6껛32J?n컸7B?o 2??O=?y1??BH?s`0V?l? 2h?Pۻ겮?7л ??JKI?M[6?ʴ{hŲ?g9GZ?/Ժ ?:,@?z:3?Վd;&?Ͻ;g? < Ѵ?&=<*?+w^j|?>) {?5>JEz?M>7*Uy?.h>{w?Â>F=u?'>lr?>nIǴTn?ҩ>ǻj?J>ܴդe?E>7ִ_?U>'^X?}?0P?A? -tH?:?8A?'?y$ ?/G?Hc(?@?_y&T1?8?7y5_:?/?C?%?Q0K??p:oR?t?ۜ]nY? ?e|_?B>d?>gi?>Vim?־>CHp?L>`s?0>b.PTv?e>񭚴Tx?x>y?\>K{?uC>]Z|?H-,>{Z 3}? >)kQ}?)>cgb~?ܹ=cG~?=b?ͫ=&U?=:I?fU|=.t ?=V=>Y-?#`5=߳?I=?<㧶?s<:?1f,}?>y݀ZU|?,>GD{?D>zy?]>:x?_z>u7v?sߌ>\s? >_Dp?>~_Pl?>0شg?u>ɴ\a?:>MH[?m?j*ZaS?i?stJ?$?\B?&?Ps:?j/?J?I?ulf%?dC?Ё.?)N;?77?;2?-@?(?൴I?i?X bP? ?穴W? ?l0^?N>?嬴Zc?>})h?>%1l?@>dsp?>zs?(>٧v?'>?x?ʫ|>BSy?yO`>v7'{?]HF>w@|?ދ.>$ }?>h}?>ƀ[X~?[=z,w~?2=]8?0t=֫[R?Vה=G?F)~=@?W=A2?6=&?=@ϳ?_< ?<젳?F<?>܇?;)?;?sdN;]?":(e?B9t?8?J?Ű7w!?cն&y?k1Y?͒!<?Ԟج*2"?Ͳ?=hEf2?ZFg?>$ 2?/޾vo?ȿ2?ûH.?ŻJ42?u#ǻ`?RȻP 2?Oeɻ ?oʻi72?ԅ˻/?"̻"D2?λ@(?*ϻL2?Vѻk /?Rӻ2?ջ0?ػ2?ڻW/?ݻ2u?ເhmj?[2_?wR?2E?6F~V|?`#>{?y9>Бɉz?dR>5Ex?m>l}w?!>ܬئt?z>iq?ܨ>Xn?u>\Դi?y>ǴEd?>)T"]?>5]ߴV? ?;]N?}?RSE?#? |? *?ƯyG?t ?sO?m?߶V?g ?:]??b?X>֮ h?G>Pbl?>xέp?%>ఴ4s?ԟ>n u?/>(㥴w?~>]*y?3b>S{?EG>p5x3|?/>Е}?w>Qx}?S>w̓T~? i=VU~?=$%mM?= -0R?=O?5~=nA9?W=Е6?5=/?=5?Ca?滚Z2W?BJ?o2??,)rp1?bZ2&? {%۱?FU\g|2?vt' ?|"[2?V pa?Um`o#2?*?a1?@˲4?"0R?Ӊz?N߻Х?ѻ#?ľH$?V맻^Fg?ًۥ?S\o?ا^?5:9?v:q,?A[;oU? ;ǐc? Vg#}?>yD|?0.>pZ*{?XF>Ēy?`>'dx?}>Yu?%ю>a"s?a^4o?6> Ǵk?H>a˴(f?D> a?>ٴ%8Z??ݴbR?7?‰^I??+@?#")?zi6?b3?̉.?F;? 
F?b M?{#!?F?jc*?0>?K%4?75?:}=?_!,?:QF?!?ʶ}N?Q?U? ?ϴ\??M*>b?΀>g?P>4B!l?h>{Ĵo?[>Cs?8>ٸu?>jEw?L>˦y?گb>s{?H>__2|?/>_S}?>X}?.>iV~?}=x~?X=H#?:=aS?N=$ F?K|=H?U=?3=|-?=(ϳ?<?r<'?U.<<.?$?ͻh?9=c>4?ɡ2ჲy?J׃Id?!#?pӲ?{3кұ?@h?c:ǣw?;TN?;D?݁?iZo|?_">ϐ;{?9>Hx?1,n>ǣw?1e>tu~t?#>նq?W>sm?>̴{i?>Ѵ+d?a'> [۴d]?M>(QԴV? ?^N?|? ڴ8E?18#?pv;? V.?Y u1?CD8?̂|)?2???iN?Dô?pG?+])???"3?6?c$$g?"+>|uǴl?>Wo?.>\s?xf>nu?;V>˴cw?>KٟQy?#a>Kڣ{?'G>f=|?.>ʕ }?>z}?:>n]~?=\,~?=_r?=4W?=Zd?Mjy=Vs,?R=??_0=?泰?*=$H?<0m? h<4?CE<Fq?x<?o>D?-2:?|>/?CIa2#??- Λ2?0v?L2?P`I1? y2?9R|_w?0:2?SD ?SY1? D6l<?N10_?eN?ٻʪ?,ɻ޿3?NH? oWX?tث?'遳?P1Z?D:[?K:#;8??u;^&?N;!tF?.j:}?>>࣌l_|?+>iH{?C>qy?^>ޡ~,x?B{>%v?>emTWs?>Qp?ر>l;ʴ*l?9>ROʴ|Fg?5>Aٴ;a?>Z1մZ??)FS?א?δJ?h?8A?'?ﴕ?7?42?3ߴ_[-?^5fʹg?|>!El?d>{Ǵ[p?>ﷴZHs?H^>4繴2u?eD>x?|>Řy?V_>r _6{?E>zաtR|?8,>'F2}?Z">O5}?>Aoh~? = ~I~?=&C!?}̨=Ogn]?#*=>h Ҋ?t=:O?aN=3R?Z,=0?=̳?[<`(?m&<?k<0?moDqh]??x`k81,?8c+D?!( 2?m4»?|̻@p2?qԻl?etڻ~2|?޻EƲr?⻒t2j? \'f?von2b?$6Wb?i@2b?ed"c?86 a2c?廠&̱e?ҾL2e? 8"e?廄2c?廐b?a*2_? 绀g5[?&X2V?u!P? +2K?@MC?[2:?G`۰0?C[a1}? >^|?;>뚄{?4>>z?M>mDDy?Oi>f_w?%σ>e< u?3^>Oir?W>1ʴOn?9> Hj?^>(ִ+e?G/>/"ʹc%_?>,ᴿ(X?l' ?ִ2P??QG? ?}մ=?+?+q3?y6?)???ٴA!?F?:.?N?Ѵ ?H?FRū(?ړ@?N𼴬2?S7?´pĴ(h?>?hƴl?gG>%np?7ѯ>Ǵs?/>I-v?s>88Kx?\y>Ry?j\>5\\{?B>Mq|?*>"⛴1K}?r>or}?x>dx~?=Ve~?P=qh*?6=g.d?=l]\?n=` ذ?PAI=??'(=&?p1 =z&?A?;ճA?趜;?(\+;Y?K:X?UJ.??N1x?ĄZ0@?ۜY ?Mmp*1?%#&4?ژ̻x\2?Lֻ"?ڱݻ\72n?/㻜_?G2U?6P?[md2L?-N$L?[l"2L?x(YNM?컪D2P?Q Q?T*2T?\U?b黲Y2V?PԀW?默'2U?T?42Q?*껀3 N?0b2H?J B?2<?ۯ4?(2,?sm#?#N2?Lip?EF2 ?H?GJ2?@4M?ZH2?eߎ?m0 J2 ?FW?Tl28?XY?0?Upݻ?4Zλ?*IB=0?.Pt?\q,g?"Fwʸ?Y ي?4H ?:٣?p{; ??%9;Г³"?Q<́?Qdi@|?%>Ū+{?P==>ŘHz?l/W>x?s>qv?>"s t?ޜ>p?>@%m?>Դh?/>ȴ5c?S>״.\??^?n ִU?  ?sM??"δG`D?;$?^:?G/?7b0?m9?紗c&?vB?tY?Y_I?)8,?N?>ҴQ? bH?ʴM")?+@?ʹ<3?y6?iQʴu=?E,?(:ȴ[F?5!?ŪônN??ϴzV?z ?'۴:]?A??ϴ(c?Op>9ϴh?>jǴ'6m?>ƴp?~>RPt?>ވv?>-ԫqx?Ϡt>B=z?X>2r{?=> |?a&>Jej}?">(` ~?'=T:cZ~?D=~~?&»=-?5?ՠ=e-m?j=͖?g=JO?{C=6^?"=~0?./=#dz?<?ɯkrm3}?>e&S|?3,>|ǒm6{?E>. y?y_>| x?`}>Pu??ʎ>Z*v"s?D>0o?>δk?B2>GǴ7g?n>TӴoa?>ڴZ??TִqS? Q?5ʴK?v?IA?g1'?zɴr7?1?-?Q;?ܴ#?fD?0ܴz?aK? ?M?@״> ?G? 
B"*?H??IشO4?5?δ2>?LY+?ƀG?k ?㴻 P?)?̴.W? ?/=ݴ^?T>(Դd?>OҴi?aV>ϴgm?>Wrq?>'ϴ݈t?>sͪv?*ņ>j\x?n>Iz?R>&W{?9>h|?\!>ǎ}? >hPo)~?="h~?ۧ=$CS~?=2k4B?6=.+v?H=[?`=y?f;=%>E?}L=Jݳ?=-$?^~28?%,?ų2&?a•$?N2#? 6%?yܔ2*?O.?f22??x7?o2;?Է~/DZ@? .B2D?ɲ7E?2G?`&I? q2G?җBmE? 2D?@?2<?a6?h)2/?,_)?řd2"?:W?x2? l ?j2?cRS?2?")#?zD21?ih²E? 2b?W滺?Zܻ1?Wλ3?@#?~V5Cb?.8Z?6N\.Qn? ? Ig?ro:L?=}>;?;,H?<*Dɳ?7<u?^t<?<"r?XɊ|?>q |?kI3>z?BL> Zy?g>⥴~zw?>ʂ'u?pv>Kr?.F>)ƴn?c>vɼj?>2մe?>}д_?>K>ϴ3.Y??ޛдQ?? :I?p?y`E??)?Ѥu5?[#4?x㴝+?=?дl!?qF?Mܴx?ML?dx?LL?l!?qF?[ߴ+?=?9մt5?\#4?CɴE??)?Y:I?p?|ʴQ??M4.Y??Դ_?>K>+ڴe?>uӴj?>]дn?c>\ִKr?.F>p⬴'u?pv>Ǵ}zw?>@GZy?g>㭴z?BL>F |?jI3>p|?>ZX.}?~>XH~?=[~?U]=w?=>O?=.a?~=p?-W=qL?&+4=B[?u=O(0?^3;Hų?o:Ϸ?I~?`I1?QN\Mc?.#?Vq?R1?WλfF?Rܻ@G2c?WmQ"E?ֻ20?n#"?2?(\Ҳ?2?c?C2? v?2#?:$0*?t2/?6?hs2;?᝱@?T2D?06F? 2H?ԗI? q2H? `ʪF?)2D?ʲ`"@? [2=?Է𻡽/7?ni22??P\.?f/2*?O%?|ۤ2$? ,K*$?Ȏ2&?a$y-?\28?I?]iV$2_?3ڲ}?޻ 1?cӻ ?|Ż6?"0??lIm|?U Y?SAs?Ar?#T?:!?g;.?J;[,w?mx|?]!>ǔW{?9>7z?R>Wx?n>˳v?*ņ>݈t?>׵qq?>ugm?>ϴi?`V>ôd?>δ^?S>ٴ/W? ?9 ޴ P?)?ȀG?k ?B2>?KY+?=ŴO4?5?ϴ"*?H??I> ?G?.0ٴ ?M?z?aK?#?eD?-?Q;?"ݴr7?1?n崴A?g1'?&UѴK?w?qS? Q?wHմZ??˫ݴoa?>d״6g?n>-Ѵk?A2>ȴo?>վv"s?D>C̴u?@ʎ>  x?`}>bI y?z_>ӧm6{?E>/KS|?4,>б3}?>E{}?]>ii~?̡=W'~?V=uK!?=f$^?ُ=#?kt=dS]?"M=Q?l+=Y9?(=ӳ?ul< ?+<T?4ғM?&'커22L?UF컀J?T컻22I?@E?mO2C?tVL>?_)2<?U8?$27? 6?=827?qM=?!𻜓|2E?hҒQ?r께\52d?8 8z²|?;߻1?(ֻ|S?`ʻ{0?>ce"?˚Pⰱ[?a D?ihc? %m?x^Ʋ?-G9?|M;o?;R?;`P?<#̳?VT<]?Vq<?ɯJ!|?a&>r{?=>`B=z?X>px?Ϡt>ވv?>Et?>Op?~>饻%6m?>´h?>д(c?Np>Ƚ̴:]?A?5*ִ{V?z ?C ˴nN??[F?5!?~v=?E,?״<3?z6?N")?+@?ӴQ?bH?BѴ,?N?oҴ?Y_I?c&?uB?b0?m9?e ڴ:?G/?B(I`D?;$?לsM??]ӴU?  ?ƚ.\??^?ڴ5c?R>[ٴh?/>kӴ%m?>ɴp?>´s t?ߜ>v?>Jx?s>cHz?l/W> *{?P==>%%@|?%>^4Mn}?>n~?=X~?^=T7~?ï=6n^5?!=/l?5=:xYZ?w}h=. 
L?C= B?#=92泫?o}=5&?<*?8H?ݱ4|x?yY?"F-$s?jq͹0?+?/Ij?2Zλ11?]pݻBX?8{G27?񻽊?T02 ?d?m͘2?eDz?Z2?@Rp?BJ2 ?v?C2?RiH,$?#6:2,?V5?2<?֯pB?2H?J 2M?뻚^2Q?(@S?4g2V?@-V?:2V?`zU?b>2T?\ $.R?2O?R`M?컺2K?xȷL?b삝2M?/N8`)O?ZmBq2U?dp_?B,T2m?/z5?۱ݻD2?Lֻxݲ?ܘ̻\1?% ?Mm@@?ۜ?b.x?Ą1$?NU?䴜?J2?tK:?K\+;!A?ᶜ;6??;O?](<:u?a<5ܳ?đ<=}?kifx~?=:X}?x>BP0K}?p>3q|?*>\{?B>qy?j\>EKx?\y>-v?s> ls?/>bnp?7ѯ>l?fG>9ʴ(h?>ftb?3>Ѵ\?p?״VU? ?DʹF/N?}?E?¯"?8QpHⴍ+e?H/>״Hj?^>3ʹPn?9>Vôr?W>yd u?2^>ZZ_w?%σ>uDDy?Oi>姴z?M>){?4>|?;>p1}? >&n=~? =_4~?']= t?=c;I?}=h``{?9=?9U\=H&?8=p?t=K|.?<ʳg?`o=?+2o?0Ժg?fβ?1:?O;(?*(;?S(?i&<5?_< ?=I䳎?^,=]%?`N=! ъ?t=Cn]?"*=eX)!?̨=LZJ~?=KHh~? =5}?>hG2}?Z"> tuR|?7,>N`6{?E>0噴y?V_>~x?|>Y1u?eD>(m[Hs?H^>H\p?>T´El?e>*ƴ|b?>ȴl\?=0?yU?/ ?zԴM?l,?MҴjE?"?󫴏JRYߴ}Fg?6>Ѵ+l?:>ҴPp?ر>\ȴTWs?>ȴv?>w,x?B{>ùy?^>6ȚiH{?C>|@l_|?+>-:}?>>F}?j>(dg"k~?K=%jz~?&r=F ?=bdv\?ܗ=Y!?u= .N?O=g?-.=5<?ht=.8ֳ?b<([?q<+?Kŗ< I?j<$F?.<Ө&?N;D??^;ѳ?V:#;2?:?B1 ?'0?t$H?P??,ɻu]?ٻ"1_?5:?G32 ?D) ?S(2?UD5?0͚2?I"29?|ȱD?(12M?y>&|V?黦2]?`9d? 廬*2i?bhmn?e(2q?@pt?K޽2w?Aw?i໠2y?.6a#y?|d2x?߻y?߻פ2{?u߻{̱|?޻xB2?ݻ?ۻ ;2?dٻ`?ջG2?mл"l?4bʻ,12? Ѳ?.1 ?LvqK?~0hz?'?6Tu.?I||L?ms?F9|:}?:B?yjr;?;,;?? <({q?x<}س?AE<آ?h<{G?}<ϳ?*=Z?\0=h= }? >x|=|?.>5搴{?'G>Qy?$a>Gcw?>)mu?;V>pLs?xf>Uo?.>l?>/Ŵg?"+>찼-\b?*$>ⷴY\?~P?ϴPU?^/ ?MݺQM? ?E?"?t6ߴ$+d?a'>V۴{i?>aٴm?>nôq?W>Ŵ~t?#>smw?0e>2x?2,n>z?SR>ࡴ<{?9>Tt|?_"> w}? >n ~?1=ٗ~?i=9M~?,=^h9?%='Co? =P?V!g= ?B=ȿ:?#=5p?=#>?o<3ﺳ?OqW?Ý默25?r?"?U= F?O|=bH=S?N=7($?:=)T~?X= LV~?}=c|}?.>hdT}?>팴^2|?/>{?H>间y?دb>[Cw?J>0kٸu?>As?7>Ꮄo?[>)l!l?h>yg?O>v?b?π>\??0U? ?}N?Q?3ѴQF?!?oÕ}=?_!,?(%4?85?˴b*?0>?P #!?F?KF?a M?' ̉.?F;?;;h6?b3?K+@?#")? ^I?? bR?8?>9'8Z?? a?>۴(f? D> մk?I>sǴ3o?6>˴"s?u?&ю>7x?}> vy?`>Z*{?YF>D|?0.>v璴g#}?>x}?H_>zY~?/&=N~?x;=Fk?ѩ=8/R?!=GW/?~=?vX=:N?5=8H?=&N?<ͳ?=`?5=9?W=*N+?5~=A/R?=MGL?=i8~?=fT~? 
i=2 T}?S>ު}?w>]Vmx3|?/>CS{?DG>ڙy?3b>Fhw?~> u?/>_4s?ԟ>p?$>VEbl?> h?G>IUb?W>:]??SݶV?g ?sO?n?-yG?t ?hȴ>?à*?%5?X4?=,?d=?e"?iE?'4?K?lJ44?5?<{@۴Ed?>Ki?y>Ҵn?u>Jϴq?ܨ>7٦t?{>w?!>aFx?m>ʉz?dR>[{?y9>8&V|?b#>z}?$>~?1V=66V}~?(=x#n~?=42?=Vh?=?(5m=< ?H=6?(=S(P?͹ =ѳ?<?X < ?;ҕ<`a?ch̐b }?>>@|?ߋ.>Wy'{?]HF>Sy?xO`>x?ɫ|>uv?&>֛zs?(>ݥsp?>`x1l? @>*(h?>|[c?>atk0^?N>보W? ? aP? ?豴I?i?K @?(?'Õ7?;2?LЁ.?*N;?f%?dC?J?I?8s:?j/? \B?&?(J?$?tZaS?i? [?m?޴\a?:>3g?u>7Ѵ]Pl?>v״^Dp?>TǶs? >.4´v?sߌ>':x?_z>zy?]>0pD{?D>xZU|?,>s,}?>R]a}?>^Z~?=֟l~?=̖=?-=êRO?Te= }?=ST<?5[=a f?J9='c?*=XK ?=?V=N#?bU|=U?=>?ͫ=,~?=[gb~?ܹ=@QQ}?*>r 3}? >XhZ|?I-,>꧊K{?uC>cFy?\>}rTx?x>PTv?e>U`s?0>6Ip?L> im?־>i?>~d?>op|_?C>۠\nY? ?aեmR?t?0K??çC?%?}5_:?/? U%T1?8?mc(?@?ӆu ?/G?8A?'?>!.tH?:?]0P?A? X?}?_?U>OԤe?E>oҴj?J>b۴Un?ҩ>lr?>ô=u?&>pw?Â>4*Uy?.h>vz?M>t {?5>쐄|?>ą}}?Z >_&~?=jg~?U=oF^~?E=iV4?4=J'3i?$ۊ= -:Y?m= ѯ?ZJ=a$?>5+=eR?u=l\? <|?G<?#Ϝ[<?ܻ2?ٻhjx?͉ջ2?8һ??׸λeס2?˻}/?Ȼg2?Ż0?3Sû=02? P1?m龻9l2?R0?=.2?0? ~2?B/?@괻Y2?.ܰ ?6^_2?⮻8Ө?'<2"?qH-?\M2;?/?L?Q1_? u?0?{>s|Ŷ?BUz?0V?4&&?6I?Tцꚲ?Hg:<?z;L㲆?z;Ųf?;#?9= ?=QE~?=[9q~?r=^`}?n>WO}?>t~u|?(>jnT~{?J?>G;z?S"X>'9x?0-s>~œ[v?h5>ĭ:yt?`>q?>Jsn?J>kj?>JZ1f?>a?>ĘX[??ʒeT?-?S}M?QY?'猴E?a"?=?,?F4?~U5?ʁ+?i=?2$?'D?f0NiH?H?G$t#O?{n?U|V?l ?]?ʡ>>nd?>㴇ci?c> m?u>M´{q?,>jnʹtt?>XVv?d>ax?!q>LYz?U>՗X{?N=>y|?U'>gCX}?>b}? >Rjo~?=PI~?l=?R?M;=l ,[T?/!=0=Ԁ? =SѢ?XZ=G%?59=o? =Dg?%=X̳?6o2?U$?bC=yJ~? |=n@~?=oxau}?>XW_Ƴ|?<#>{?9>E}ّz?Q>oy?k>|GTw?>1r?z>o?>Vk?>򤴗g?E>lc?y>B§]?*?9W?B0 ?ŏճP? A?t:I???.~%A?(?"8?W1?Q 0?e9? (?|@? cO?$m?,U? ?Lj\??mb?>:_h?I>>$l?>Tشp?9:>>δs?w>@0Sv?m>tұvYx?xx> Vzy?\> F{?mC>|6N|?,G->|!}?>f]}?>$oaJ~?G=I:~?`=MP?D="G)0@?ۓ=ba;p? h=c ܕ?;i=7%T?j]G=*?l)= "?=ճ2?əNG2??z0@E?~Ӛ[.2L? %MS?#锻<2\?pg?t@t1s?LՆF??H S1?MoR?[@C?YB諑?6$L?x `Ȳ? 魺 I???t9^d?PY:g#?23;j?;GJ?;<?(9<~h?+ٺ^'?膺#@Ӳ?'蓹`?.:?6:?a<;>)d?΍;ժ?AE;TU?"K06}?d>jAj|? *>kl{?)@>x4z?YX>%k~Mx?r>&v?0>gt?> 1r?$ߥ>q-o?+>s2Ъk?4>g?>q1jb?>@x ]?N?R2"W?s ?_Q?B?)I??I8A?'?_Eh 9?/?e/2?7?gT^?>0Pc?R>g?>:l?K>tIo?>m>;gbs?m>ʴ5u?y2>POw?H >$oy?df>Nz?3M>]{?(]6>[|?^!>Tp(}}?>fhk ~?=fOL|~?T=+N~? 
(=:?ª=W:R?b=L}?H=/%_?L_=\ַ?+@=( M?;$=|?Z =&?{?r7=kc ?#pV=zC?y= x\?= &?A=0>)~? =6k~? ="@y~?=VD}?W>OJ|?ό!>kc"{?L6>mwz?L>с~y?'je>8jyw?>≴u?6 >…s?>9 p?>5B3m?>uDj?>焴e?<>ř_a?>u [??D'UzU?J ?Hn/N??υk;G? ?P5??)?#{8?\E1?pH4oh?\> "zl?> o?J>].r?K>شlu?>oϴow?>(#89y?3 j>3Wۜz?DP>y{?*':>â|?w%>dN{[}?N>w}?>uTd~?H=qOW~?=x@( ?9=yw:C?=(q?8=z(6?j=G'?fJ=?x.=? =T?<9ٳ?<᳽?7̰<?*<óK?2|j< ?;Q9,; fs?:4?*M??Ta,?u C䲓?mP p?qO? Dݲ4?g?7? ? XBm?Z?-.ؤ?76?爰?1 r??0|0?@Q*?Wy1f? 002l?+: 1p?'2t?4ש1x?󃻐y2|? 1?^ 1?{9Q1?tw 1?rc1?Wl1?eٯ?-^:G*1?~$UK?4`J0 /?{=Wͱ?L.HY? S*?vz?Fֺz?"R?3߬?^8p?i:A ?VI:wղ?w@;j?;PB?H;*<(? ;,=?ė> Y,}?>Qbye|?5+>mo{?x@>qCz?܉W>#x?`mp>4}"w?Y>Vu?͓>$r?X>do?ֲ>l?،>@h?f>fpd?>Alx_?&>ćy4Y?v?/}S?h-?Ω\L? ?]tBE?+#?(MUi>? *?~Fj?g>;el?j>w$do?x>r?a> t?>J\w?>9Oдx?ӕp>1⹴Ez?rdW>)q{?W@>7c|?\]+>&}?UW> }?A">)my?~?Y2=Nxc~?g/=őPF~?S=Df/?VF=7L`?=>/?K/y=Z!?`X=r8?;=Ҡ ? =a-?J =e?=p g?=(]=?=!fU?8=,?k=8Z~?XP=3F~~? =%P ~?=ȿZ$}?S>_f|?w!>0rS{?a'5>RO} z?J>sy?azb> $x?Z{>}Sv?ak>IU,t?ϙ>*Cq? >n?>Bk?>[Lg?{m>Pb?>/q]?2?ၴW? ?st:MQ?0h?lngJ?"?܇C?y$?w# End: Data binary 4 # End: Segment 3-3.11.1/doc/static/000077500000000000000000000000001503346766200140075ustar00rootroot000000000000003-3.11.1/doc/static/api310.html000066400000000000000000011626551503346766200157120ustar00rootroot00000000000000 mumax3

Warning! This is the API for mumax3.10, which is no longer supported. If you would like to use mumax3, we strongly recommend using mumax3.11.

mumax3
GPU-accelerated micromagnetism

Home Download Examples API Forum


Syntax

The mumax3 input syntax is a subset of Go's syntax, somewhat similar to C. It is case-independent however, so msat is the same as Msat or MSAT.

Defining variables

New variables are declared using :=. Variables have a fixed type, inferred from the declaration's right-hand-side. Assigning to existing variables is done using =. E.g.:
i := 7         // defines a new variable i, type automatically detected to be int
print(i)       // now we can use i
i = 5          // assign new value, don't use ':=' (attempt to re-declare)

str := "hello" // defines str, type automatically is string
//str = 1      // would fail, cannot assign int to string

Arithmetic

Most common arithmetic operations are possible. Also Go's math library and some common constants are available. For raise-to-the-power, pow(x,y) should be used.
x := pi*(3+4)/5
x = pow(x, 3)
x++
y := abs(cbrt(cosh(erf(erfc(gamma(J0(Y0(2))))))))

Control structures

Loops are possible as well:
for i:=0; i<10; i++{
	 print(i)
}

Implicit functions

Some of the API features accept a function as argument (e.g.: RunWhile(func()bool), or all input parameters). In that case, and only in this case, the argument is implicitly converted to a function, which is re-evaluated each time it's needed. E.g.:
value := sin(pi*t)  // value is a float64, RHS evaluated only once
Msat = value        // time-independent Msat
versus:
Msat = sin(pi*t)    // RHS converted to function, re-evaluated every time

Methods

Some of the API instances have methods defined on them. You can call methods on an instance by using '.' as in most object oriented programming languages. E.g.: a material parameter such as Msat has the method SetRegion(int, float) to set the value of the material parameter in a certain region:
Msat.SetRegion(1, 800e3) // Set Msat=800e3 in region 1

Mesh size and geometry

The simulation mesh defines the size of the box around your magnet. It should be set at the beginning of the script. The number of cells should preferably be powers of two, or at least have small prime factors (2,3,5,7). E.g.:
Nx := 128
Ny := 64
Nz := 2
sizeX := 500e-9
sizeY := 250e-9
sizeZ := 10e-9
SetGridSize(Nx, Ny, Nz)
SetCellSize(sizeX/Nx, sizeY/Ny, sizeZ/Nz)

Periodic boundary conditions

Optionally, periodic boundary conditions can be enabled:
SetPBC(5, 0, 0)        // 5 extra images on left and right sides.
SetGridSize(128, 64, 1)
SetCellSize(5e-9, 5e-9, 5e-9)
Setting a nonzero PBC value in a direction enables wrap-around in that direction. The precise value passed determines how many repetitions are seen by the demag field. E.g., in the above example the demag field behaves as if 5 repetitions are present to the left and to the right side. Choosing a large number may cause long initialization time.

Resizing the mesh

The mesh can be changed at any later time in the simulation. This will cause the magnetization to be stretched onto the new mesh if needed, and the geometry and regions to be re-calculated. After a resize, some cells which had zero magnetization may now fall inside the magnet geometry; they will be initialized to random magnetization.

Setting the geometry

Optionally a magnet Shape other than the full simulation box can be specified. In order to set the geometry, you first need to define a shape.
 geometryShape := cylinder(400e-9, 20e-9).RotX(45*pi/180).Transl(1e-6,0,0)
SetGeom(geometryShape)

SetCellSize(float64, float64, float64)

Sets the X,Y,Z cell size in meters

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

SetGeom(Shape)

Sets the geometry to a given shape

examples: [4] [6] [7] [8] [9] [11] [12] [14]

SetGridSize(int, int, int)

Sets the number of cells for X,Y,Z

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

SetMesh(int, int, int, float64, float64, float64, int, int, int)

Sets GridSize, CellSize and PBC at the same time

SetPBC(int, int, int)

Sets the number of repetitions in X,Y,Z to create periodic boundary conditions. The number of repetitions determines the cutoff range for the demagnetization.

EdgeSmooth

Geometry edge smoothing with edgeSmooth^3 samples per cell, 0=staircase, ~8=very smooth

examples: [4]

Shapes

A shape is an abstract object which outlines an area in a 3D universe. Shapes are useful for different tasks, e.g.: to define the geometry of a magnet, to define material regions, or to set locally a specific initial magnetization configuration. One can specify primitive shapes, constructed at the origin (box center), and translate/rotate them if needed. All positions are specified in meters and the origin lies in the center of the simulation box. E.g.:
myShape := cylinder(400e-9, 20e-9).RotX(45*pi/180).Transl(1e-6,0,0)
anotherShape := Circle(400e-9).sub(Circle(200e-9))

Cell(int, int, int) Shape

Single cell with given integer index (i, j, k)

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

Circle(float64) Shape

2D Circle with diameter in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [6] [7] [8] [12]

Cone(float64, float64) Shape

3D Cone with diameter and height in meter. The top of the cone points in the +z direction.

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

Cuboid(float64, float64, float64) Shape

Cuboid with sides in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

Cylinder(float64, float64) Shape

3D Cylinder with diameter and height in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [5] [6]

Ellipse(float64, float64) Shape

2D Ellipse with axes in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [14]

Ellipsoid(float64, float64, float64) Shape

3D Ellipsoid with axes in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

GrainRoughness(float64, float64, float64, int) Shape

Grainy surface with different heights per grain with a typical grain size (first argument), minimal height (second argument), and maximal height (third argument). The last argument is a seed for the random number generator.

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

ImageShape(string) Shape

Use black/white image as shape

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

Layer(int) Shape

Single layer (along z), by integer index starting from 0

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [13] [14]

Layers(int, int) Shape

Part of space between cell layer1 (inclusive) and layer2 (exclusive), in integer indices

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

Rect(float64, float64) Shape

2D rectangle with size in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [6] [9] [11] [12] [15]

Square(float64) Shape

2D square with size in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [6]

Universe() Shape

Entire space

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

XRange(float64, float64) Shape

Part of space between x1 (inclusive) and x2 (exclusive), in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [7]

YRange(float64, float64) Shape

Part of space between y1 (inclusive) and y2 (exclusive), in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

ZRange(float64, float64) Shape

Part of space between z1 (inclusive) and z2 (exclusive), in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

Material regions

Optionally, up to 256 material regions can be defined. Since each cell is made from one material, it is associated with exactly one region. So regions can not overlap. Each cell is assigned material region 0 by default. It's a good idea to output regions to verify whether each cell is assigned to the intended region. Each region can have its own material parameters, and we can output averages over each region. E.g.:
DefRegion(1, circle(1e-6))
DefRegion(0, circle(1e-6).Inverse()) // redundant
save(regions)
Msat.SetRegion(1, 800e6)
tableAdd(m.Region(1))    // add average m over region 1 to table

DefRegion(int, Shape)

Define a material region with given index (0-255) and shape

examples: [7] [12] [13]

DefRegionCell(int, int, int, int)

Set a material region (first argument) in one cell by the index of the cell (last three arguments)

regions

Outputs the region index for each cell

methods: Average( )   EvalTo( Slice )   GetCell( int int int )   Gpu( )   HostArray( )   HostList( )   LoadFile( string )   SetCell( int int int int )  

examples: [7] [12]

Initial magnetization

The initial magnetization is set by assigning a Config to m, setting it in separate regions, or by loading a file directly.
m = uniform(1, 0, 0)
m.SetRegion(1, vortex(1, 1))
m.LoadFile("config.ovf")
m.SetInShape(circle(50e-9), uniform(0,0,1))

m

Reduced magnetization (unit length)

methods: Average( )   Buffer( )   Comp( int )   EvalTo( Slice )   GetCell( int int int )   LoadFile( string )   Quantity( )   Region( int )   Set( Config )   SetArray( Slice )   SetCell( int int int data.Vector )   SetInShape( Shape Config )   SetRegion( int Config )  

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

Antivortex(int, int) Config

Antivortex magnetization with given circulation and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

BlochSkyrmion(int, int) Config

Bloch skyrmion magnetization with given chirality and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5]

Conical(data.Vector, data.Vector, float64) Config

Conical state for given wave vector, cone direction, and cone angle

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

Helical(data.Vector) Config

Helical state for given wave vector

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

NeelSkyrmion(int, int) Config

Néel skyrmion magnetization with given charge and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5]

RandomMag() Config

Random magnetization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [3] [5]

RandomMagSeed(int) Config

Random magnetization with given seed

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

TwoDomain(float64, float64, float64, float64, float64, float64, float64, float64, float64) Config

Two-domain magnetization with given magnetization in left domain, wall, and right domain

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5] [10] [11]

Uniform(float64, float64, float64) Config

Uniform magnetization in given direction

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [1] [2] [5] [6] [7] [13] [14] [15]

Vortex(int, int) Config

Vortex magnetization with given circulation and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5] [8] [9] [12]

VortexWall(float64, float64, int, int) Config

Vortex wall magnetization with given mx in left and right domain and core circulation and polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5]

Material parameters

Assigning to a material parameter sets a value in all regions. E.g.:
Msat  = 800e3
AnisU = vector(1, 0, 0)
When regions are defined, they can also be set region-wise:
Msat.SetRegion(0, 800e3)
Msat.SetRegion(1, 540e3)
Material parameters can be functions of time as well. E.g.:
f := 500e6
Ku1 = 500 * sin(2*pi*f*t)

Aex

Exchange stiffness (J/m)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [1] [2] [3] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

alpha

Landau-Lifshitz damping constant

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [1] [6] [7] [8] [10] [11] [12] [14] [15]

anisC1

Cubic anisotropy direction #1

methods: Average( )   Comp( int )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [12]

anisC2

Cubic anisotropy direction #2

methods: Average( )   Comp( int )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [12]

anisU

Uniaxial anisotropy direction

methods: Average( )   Comp( int )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [7] [10] [15]

B1

First magneto-elastic coupling constant (J/m3)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

B2

Second magneto-elastic coupling constant (J/m3)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Dbulk

Bulk Dzyaloshinskii-Moriya strength (J/m2)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Dind

Interfacial Dzyaloshinskii-Moriya strength (J/m2)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

EpsilonPrime

Slonczewski secondary STT term ε'

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [14]

FreeLayerThickness

Slonczewski free layer thickness (if set to zero (default), then the thickness will be deduced from the mesh size) (m)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

frozenspins

Defines spins that should be fixed

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Kc1

1st order cubic anisotropy constant (J/m3)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [12]

Kc2

2nd order cubic anisotropy constant (J/m3)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Kc3

3rd order cubic anisotropy constant (J/m3)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Ku1

1st order uniaxial anisotropy constant (J/m3)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [7] [10] [15]

Ku2

2nd order uniaxial anisotropy constant (J/m3)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Lambda

Slonczewski Λ parameter

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [14]

Msat

Saturation magnetization (A/m)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [1] [2] [3] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

NoDemagSpins

Disable magnetostatic interaction per region (default=0, set to 1 to disable). E.g.: NoDemagSpins.SetRegion(5, 1) disables the magnetostatic interaction in region 5.

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Pol

Electrical current polarization

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [5] [10] [11] [14]

Temp

Temperature (K)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

xi

Non-adiabaticity of spin-transfer-torque

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [10] [11] [12]

Excitation

Field or current excitations can be set in the same way as material parameters:
B_ext = vector(0.01, 1e-6*sin(2*pi*f*t), 0)
B_ext.SetRegion(1, vector(0, 0, 0.1))
Additionally, an arbitrary number of time- and space-dependent vector fields of the form g(x,y,z) * f(t) may be added. (E.g., to simulate the field of an antenna or an arbitrary current running through the magnet)
B_ext.Add(LoadFile("antenna.ovf"), sin(2*pi*f*t))
J.Add(LoadFile("current.ovf"), 1)

B_ext

Externally applied field (T)

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( data.Vector )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [1] [3] [15]

FixedLayer

Slonczewski fixed layer polarization

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( data.Vector )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [14]

J

Electrical current density (A/m2)

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( data.Vector )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [10] [11] [12] [13] [14] [15]

Spin currents

The effect of spin-polarized currents on the magnetization dynamics can be modelled in different ways. In Mumax3 you can use the Zhang-Li model or the Slonczewski model. For both models, a spin-polarized current field needs to be defined. This is done by setting the current density field J and the polarization Pol.

Zhang-Li model

When using the Zhang-Li model, it is possible to set the non-adiabaticity through the material parameter xi:
J = vector(1e12, 0, 0)
Pol = 1
xi = 0.1

Slonczewski model

To use the Slonczewski model, you need to define the magnetization configuration of the fixed layer. This fixed layer can be placed above or below the sample. The Slonczewski parameter and the prefactor of the secondary spin transfer torque term of the Slonczewski model can be set through the material parameters Lambda and EpsilonPrime respectively:
DisableZhangLiTorque = true
J = vector(1e12, 0, 0)
Pol = 0.6
FixedLayer = vector(1,0,0)
FixedLayerPosition = FIXEDLAYER_TOP
EpsilonPrime = 0.02
Lambda = 1

DisableSlonczewskiTorque

Disables Slonczewski torque (default=false)

DisableZhangLiTorque

Disables Zhang-Li torque (default=false)

EpsilonPrime

Slonczewski secondary STT term ε'

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [14]

FixedLayer

Slonczewski fixed layer polarization

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( data.Vector )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [14]

FIXEDLAYER_BOTTOM

FixedLayerPosition = FIXEDLAYER_BOTTOM instructs mumax3 that the fixed layer is underneath the free layer

FIXEDLAYER_TOP

FixedLayerPosition = FIXEDLAYER_TOP instructs mumax3 that the fixed layer is on top of the free layer

FixedLayerPosition

Position of the fixed layer: FIXEDLAYER_TOP, FIXEDLAYER_BOTTOM (default=FIXEDLAYER_TOP)

FreeLayerThickness

Slonczewski free layer thickness (if set to zero (default), then the thickness will be deduced from the mesh size) (m)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

J

Electrical current density (A/m2)

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( data.Vector )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [10] [11] [12] [13] [14] [15]

Lambda

Slonczewski Λ parameter

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [14]

Pol

Electrical current polarization

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [5] [10] [11] [14]

xi

Non-adiabaticity of spin-transfer-torque

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [10] [11] [12]

Magnetic Force Microscopy

Mumax3 has built-in generation of MFM images from a 2D magnetization. The MFM tip lift can be freely chosen. By default the tip magnetization is modeled as a point monopole at the apex. This is sufficient for most situations. Nevertheless, it is also possible to model partially magnetized tips by setting MFMDipole to the magnetized portion of the tip, in meters. E.g., if only the first 20nm of the tip is (vertically) magnetized, set MFMDipole=20e-9.

MFM

MFM image (arb.)

methods: Average( )   EvalTo( Slice )   Region( int )  

examples: [9]

MFMDipole

Height of vertically magnetized part of MFM tip

MFMLift

MFM lift height

examples: [9]

Output quantities

The quantities listed below can be output. Also, derived quantities can be produced: the quantity restricted to a certain region or a single component. E.g.:
m           // magnetization quantity
m.Comp(0)   // x-component
m.Region(1) // magnetization in region 1 (0 elsewhere)

B_anis

Anisotropy field (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

B_custom

User-defined field (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

B_demag

Magnetostatic field (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

B_eff

Effective field (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

B_exch

Exchange field (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

B_mel

Magneto-elastic field (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

B_therm

Thermal field (T)

methods: AddTo( Slice )   EvalTo( Slice )  

DindCoupling

Average DMI coupling with neighbors (arb.)

methods: Average( )   EvalTo( Slice )   Region( int )  

dt

Time Step (s)

methods: Average( )   EvalTo( Slice )   Get( )  

E_anis

total anisotropy energy (J)

methods: Average( )   EvalTo( Slice )   Get( )  

E_custom

total energy of user-defined field (J)

methods: Average( )   EvalTo( Slice )   Get( )  

E_demag

Magnetostatic energy (J)

methods: Average( )   EvalTo( Slice )   Get( )  

E_exch

Total exchange energy (including the DMI energy) (J)

methods: Average( )   EvalTo( Slice )   Get( )  

E_mel

Magneto-elastic energy (J)

methods: Average( )   EvalTo( Slice )   Get( )  

E_therm

Thermal energy (J)

methods: Average( )   EvalTo( Slice )   Get( )  

E_total

total energy (J)

methods: Average( )   EvalTo( Slice )   Get( )  

examples: [13]

E_Zeeman

Zeeman energy (J)

methods: Average( )   EvalTo( Slice )   Get( )  

Edens_anis

Anisotropy energy density (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

Edens_custom

Energy density of user-defined field. (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

Edens_demag

Magnetostatic energy density (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

Edens_exch

Total exchange energy density (including the DMI energy density) (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

Edens_mel

Magneto-elastic energy density (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

Edens_therm

Thermal energy density (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

Edens_total

Total energy density (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

Edens_Zeeman

Zeeman energy density (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

ExchCoupling

Average exchange coupling with neighbors (arb.)

methods: Average( )   EvalTo( Slice )   Region( int )  

examples: [12]

F_mel

Magneto-elastic force density (N/m3)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

geom

Cell fill fraction (0..1)

methods: Average( )   EvalTo( Slice )   Gpu( )  

examples: [4] [6] [7] [8] [9] [11] [12] [14]

LastErr

Error of last step

methods: Average( )   EvalTo( Slice )   Get( )  

LLtorque

Landau-Lifshitz torque/γ0 (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

m_full

Unnormalized magnetization (A/m)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

MaxAngle

maximum angle between neighboring spins (rad)

methods: Average( )   EvalTo( Slice )   Get( )  

maxTorque

Maximum torque/γ0, over all cells (T)

methods: Average( )   EvalTo( Slice )   Get( )  

MFM

MFM image (arb.)

methods: Average( )   EvalTo( Slice )   Region( int )  

examples: [9]

NEval

Total number of torque evaluations

methods: Average( )   EvalTo( Slice )   Get( )  

PeakErr

Overall maximum error per step

methods: Average( )   EvalTo( Slice )   Get( )  

spinAngle

Angle between neighboring spins (rad)

methods: Average( )   EvalTo( Slice )   Region( int )  

STTorque

Spin-transfer torque/γ0 (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

torque

Total torque/γ0 (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

Slicing and dicing output

To save storage space, it's possible to save only the part of the output we're interested in. This works on all output quantities (not only m)
save(m)                         // save full magnetization
save(m.Comp(0))                 // save only x-component
save(CropLayer(m, 13))          // save only layer 13
save(CropLayer(m.Comp(0), 13))  // save only x-component of layer 13
Or even:
mx   := m.Comp(0)
mx13 := CropLayer(mx, 13) 
save(mx13)
tableAdd(mx13)

Crop(Quantity, int, int, int, int, int, int) *cropped

Crops a quantity to cell ranges [x1,x2[, [y1,y2[, [z1,z2[

methods: Average( )   EvalTo( Slice )  

examples: [8]

CropLayer(Quantity, int) *cropped

Crops a quantity to a single layer

methods: Average( )   EvalTo( Slice )  

CropRegion(Quantity, int) *cropped

Crops a quantity to a region

methods: Average( )   EvalTo( Slice )  

CropX(Quantity, int, int) *cropped

Crops a quantity to cell ranges [x1,x2[

methods: Average( )   EvalTo( Slice )  

CropY(Quantity, int, int) *cropped

Crops a quantity to cell ranges [y1,y2[

methods: Average( )   EvalTo( Slice )  

examples: [8]

CropZ(Quantity, int, int) *cropped

Crops a quantity to cell ranges [z1,z2[

methods: Average( )   EvalTo( Slice )  

Scheduling output

All input and output quantities (as described above) can be saved in a space-dependent way (".ovf" file), or as spatial averages (table output). The data table ("table.txt") contains by default the time and average magnetization. More columns can be added with TableAdd().
save(B_ext)

tableadd(B_ext)
tablesave()
Optionally, the output/averaging can be done over a single region:
save(m.Region(1))
TableAdd(m.Region(1)) 
User-defined variables can be added to the table with TableAddVar().
myField := 0.42
TableAddVar(myField, "B_extra", "T")
myField = ...

AutoSave(Quantity, float64)

Auto save space-dependent quantity every period (s).

examples: [1] [10] [11] [14] [15]

AutoSnapshot(Quantity, float64)

Auto save image of quantity every period (s).

DUMP

OutputFormat = DUMP sets text DUMP output

FilenameFormat

printf formatting string for output filenames.

Flush()

Flush all pending output to disk.

Fprintln(string, ...interface {})

Print to file

OutputFormat

Format for data files: OVF1_TEXT, OVF1_BINARY, OVF2_TEXT or OVF2_BINARY

OVF1_BINARY

OutputFormat = OVF1_BINARY sets binary OVF1 output

OVF1_TEXT

OutputFormat = OVF1_TEXT sets text OVF1 output

OVF2_BINARY

OutputFormat = OVF2_BINARY sets binary OVF2 output

OVF2_TEXT

OutputFormat = OVF2_TEXT sets text OVF2 output

Print(...interface {})

Print to standard output

examples: [2]

Save(Quantity)

Save space-dependent quantity once, with auto filename

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

SaveAs(Quantity, string)

Save space-dependent quantity with custom filename

examples: [4] [5] [7] [9]

Snapshot(Quantity)

Save image of quantity

SnapshotAs(Quantity, string)

Save image of quantity with custom filename

SnapshotFormat

Image format for snapshots: jpg, png or gif.

sprint(...interface {}) string

Print all arguments to string with automatic formatting

sprintf(string, ...interface {}) string

Print to string with C-style formatting.

TableAdd(Quantity)

Add quantity as a column to the data table.

examples: [3] [11] [13]

TableAddVar(ScalarFunction, string, string)

Add user-defined variable + name + unit to data table.

TableAutoSave(float64)

Auto-save the data table every period (s). Zero disables save.

examples: [1] [11] [14]

TablePrint(...interface {})

Print anything in the data table

TableSave()

Save the data table right now (appends one line).

examples: [3] [13]

Running

Run(time) runs the simulation for a given time in seconds, using sensible error settings.
Run(1e-9)
More fine-grained control is provided by RunWhile(condition), which runs as long as an arbitrary condition is met. E.g.:
mx := m.comp(0)
RunWhile(mx.average() < 0)   // search for switching field during reversal
Optionally, the solver accuracy may be fine-tuned. E.g.:
MaxDt = 1e-12
MinDt = 1e-15
MaxErr = 1e-6
Optionally, a different solver may be chosen (at any point) with SetSolver(int). Currently available solver types:
  • 6: RK56 (Fehlberg) solver. This is the highest order solver available, but it is typically not faster than the RK45 solver.
  • 5: RK45 (Dormand-Prince) solver (the default). An accurate solver, very fast for magnetization dynamics at the cost of some memory usage.
  • 4: Classical 4th-order Runge-Kutta method. Intended for simulations where a fixed, relatively large time step is desired.
  • 3: RK23 (Bogacki-Shampine) solver. A robust and reasonably fast solver with low memory requirements. Typically outperforms RK45 when relaxing the magnetization with little dynamics, so it is used internally by Relax().
  • 2: Adaptive Heun solver. Robust and uses very little memory but takes smaller time steps than the higher-order solvers. Also suited when a fixed, relatively small time step is desired.
  • 1: Euler solver (requires FixDt = ..., ignores other settings). Only useful in exceptional situations or for debugging.
E.g.:
SetSolver(2) // Heun
FixDt = 1e-15

Relax

Relax() tries to evolve the magnetization as closely as possible to the minimum energy state. This function assumes all excitations have been turned off (temperature, electrical current, time-dependent magnetic fields). During relax precession is disabled and the time t does not increase. There is no need to set high damping.

In general it is difficult to be sure the minimum energy state has been truly reached. Hence, relax may occasionally return after the energy has reached a local minimum, a saddle point, or a rather flat valley in the energy landscape.

Minimize

Minimize() is like Relax, but uses the conjugate gradient method to find the energy minimum. It is usually much faster than Relax, but is a bit less robust against divergence. E.g., a random starting configuration can be Relaxed, but may fail with Minimize. Minimize is very well suited for hysteresis calculations, where we are never far away from the ground state.


Minimize()

Use steepest conjugate gradient method to minimize the total energy

examples: [3] [6]

Relax()

Try to minimize the total energy

examples: [1] [2] [3] [9] [10] [11]

Run(float64)

Run the simulation for a time in seconds

examples: [1] [7] [10] [11] [12] [14] [15]

RunWhile(func() bool)

Run while condition function is true

Steps(int)

Run the simulation for a number of time steps

dt

Time Step (s)

methods: Average( )   EvalTo( Slice )   Get( )  

FixDt

Set a fixed time step, 0 disables fixed step (which is the default)

Headroom

Solver headroom (default = 0.8)

LastErr

Error of last step

methods: Average( )   EvalTo( Slice )   Get( )  

MaxDt

Maximum time step the solver can take (s)

MaxErr

Maximum error per step the solver can tolerate (default = 1e-5)

MinDt

Minimum time step the solver can take (s)

MinimizerSamples

Number of max dM to collect for Minimize convergence check.

MinimizerStop

Stopping max dM for Minimize

examples: [3]

NEval

Total number of torque evaluations

methods: Average( )   EvalTo( Slice )   Get( )  

PeakErr

Overall maximum error per step

methods: Average( )   EvalTo( Slice )   Get( )  

RelaxTorqueThreshold

MaxTorque threshold for relax(). If set to -1 (default), relax() will stop when the average torque is steady or increasing.

step

Total number of time steps taken

examples: [3] [10]

t

Total simulated time (s)

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

SetSolver(int)

Set solver type. 1: Euler, 2: Heun, 3: Bogacki-Shampine (RK23), 4: Runge-Kutta (RK4), 5: Dormand-Prince (RK45), 6: Fehlberg (RK56), -1: Backward Euler

Moving simulation window

Mumax3 can automatically shift the magnetization so that the simulation "window" stays centered on a region of interest. Shifting is done to keep a freely chosen magnetization component nearly zero. E.g.
ext_centerwall(0)
ext_rmSurfaceCharge(0, -1, 1)
TableAdd(TotalShift)
will try to keep mx (component 0, counting from 0) close to zero. If desired, one can override which "new" magnetization is inserted from the sides by setting ShiftMagL and ShiftMagR, though the default behaviour is usually OK.

Shift(int)

Shifts the simulation by +1/-1 cells along X

examples: [15]

ShiftGeom

Whether Shift() acts on geometry

ShiftM

Whether Shift() acts on magnetization

examples: [15]

ShiftMagD

Upon shift, insert this magnetization from the bottom

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

ShiftMagL

Upon shift, insert this magnetization from the left

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

ShiftMagR

Upon shift, insert this magnetization from the right

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

examples: [15]

ShiftMagU

Upon shift, insert this magnetization from the top

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

ShiftRegions

Whether Shift() acts on regions

TotalShift

Amount by which the simulation has been shifted (m).

Extensions

Extensions are extra functionalities that are not officially supported. They are aimed at rather specific problems and may not work as expected for your particular situation. Their API and functionality may change in future releases.

ext_bubbledist

Bubble traveled distance (m)

methods: Average( )   EvalTo( Slice )   Get( )  

ext_BubbleMz

Center magnetization 1.0 or -1.0 (default = 1.0)

ext_bubblepos

Bubble core position (m)

methods: Average( )   EvalTo( Slice )   Get( )  

ext_bubblespeed

Bubble velocity (m/s)

methods: Average( )   EvalTo( Slice )   Get( )  

ext_centerBubble()

centerBubble shifts m after each step to keep the bubble position close to the center of the window

ext_centerWall(int)

centerWall(c) shifts m after each step to keep m_c close to zero

examples: [10] [11]

ext_corepos

Vortex core position (x,y) + polarization (z) (m)

methods: Average( )   EvalTo( Slice )   Get( )  

ext_dwpos

Position of the simulation window while following a domain wall (m)

methods: Average( )   EvalTo( Slice )   Get( )  

examples: [11]

ext_dwspeed

Speed of the simulation window while following a domain wall (m/s)

methods: Average( )   EvalTo( Slice )   Get( )  

ext_dwtilt

PMA domain wall tilt (rad)

methods: Average( )   EvalTo( Slice )   Get( )  

ext_dwxpos

Position of the simulation window while following a domain wall (m)

methods: Average( )   EvalTo( Slice )   Get( )  

ext_EnableUnsafe()

Deprecated. Only here to ensure maximal backwards compatibility with mumax3.9c.

ext_InterDind(int, int, float64)

Sets Dind coupling between two regions.

ext_InterExchange(int, int, float64)

Sets exchange coupling between two regions.

ext_make3dgrains(float64, int, int, Shape, int)

3D Voronoi tessellation over shape (grain size, starting region number, num regions, shape, seed)

ext_makegrains(float64, int, int)

Voronoi tessellation (grain size, num regions, seed)

examples: [12] [15]

ext_phi

Azimuthal angle (rad)

methods: Average( )   EvalTo( Slice )   Region( int )  

ext_rmSurfaceCharge(int, float64, float64)

Compensate magnetic charges on the left and right sides of an in-plane magnetized wire. Arguments: region, mx on left and right side, resp.

examples: [11]

ext_ScaleDind(int, int, float64)

Re-scales Dind coupling between two regions.

ext_ScaleExchange(int, int, float64)

Re-scales exchange coupling between two regions.

examples: [12] [13] [15]

ext_theta

Polar angle (rad)

methods: Average( )   EvalTo( Slice )   Region( int )  

ext_topologicalcharge

2D topological charge

methods: Average( )   EvalTo( Slice )   Get( )  

ext_topologicalchargedensity

2D topological charge density m·(∂m/∂x ✕ ∂m/∂y) (1/m2)

methods: Average( )   EvalTo( Slice )   Region( int )  

ext_topologicalchargedensitylattice

2D topological charge density according to Berg and Lüscher (1/m2)

methods: Average( )   EvalTo( Slice )   Region( int )  

ext_topologicalchargelattice

2D topological charge according to Berg and Lüscher

methods: Average( )   EvalTo( Slice )   Get( )  

Custom quantities

Using existing quantities, it is possible to define new custom quantities. E.g.: instead of using the pre-defined ext_topologicalchargedensity quantity, it is possible to define this quantity yourself inside an input script:
cs := 1e-9
setcellsize(cs,cs,cs)
setgridsize(64,64,1)

// Use central finite differences to approximate the spatial derivatives of m
mL := Shifted(m,-1,0,0) // shift left
mR := Shifted(m,1,0,0)  // shift right
mD := Shifted(m,0,-1,0) // shift up
mU := Shifted(m,0,1,0)  // shift down
dmdx := Mul( Const(1/(2*cs)), Madd(mR,mL,1,-1) )
dmdy := Mul( Const(1/(2*cs)), Madd(mU,mD,1,-1) ) 

// Define the topological charge density
chargeDensity := Mul( Const(1/(4*pi)), Dot(m, Cross(dmdx,dmdy)))

// Save the topological charge density of a skyrmion
m = neelskyrmion(1,-1)
saveas(chargeDensity, "chargeDensity.ovf")

Add(Quantity, Quantity) Quantity

Add two quantities

methods: EvalTo( )  

examples: [3] [4] [5] [11] [13] [15]

Const(float64) Quantity

Constant, uniform number

methods: EvalTo( )  

ConstVector(float64, float64, float64) Quantity

Constant, uniform vector

methods: EvalTo( )  

Cross(Quantity, Quantity) Quantity

Cross product of two vector quantities

methods: EvalTo( )  

examples: [12]

Div(Quantity, Quantity) Quantity

Point-wise division of two quantities

methods: EvalTo( )  

Dot(Quantity, Quantity) Quantity

Dot product of two vector quantities

methods: EvalTo( )  

Madd(Quantity, Quantity, float64, float64) *mAddition

Weighted addition: Madd(Q1,Q2,c1,c2) = c1*Q1 + c2*Q2

methods: EvalTo( Slice )  

Masked(Quantity, Shape) Quantity

Mask quantity with shape

methods: EvalTo( )  

Mul(Quantity, Quantity) Quantity

Point-wise product of two quantities

methods: EvalTo( )  

examples: [10] [11]

MulMV(Quantity, Quantity, Quantity, Quantity) Quantity

Matrix-Vector product: MulMV(AX, AY, AZ, m) = (AX·m, AY·m, AZ·m). The arguments AX, AY, AZ and m are quantities with 3 components.

methods: EvalTo( )  

Shifted(Quantity, int, int, int) Quantity

Shifted quantity

methods: EvalTo( )  

Custom effective field terms

It is possible to define additional effective field terms by promoting a custom quantity to an effective field term. The corresponding energy density term can also be added by promoting a custom quantity. E.g.: instead of using the existing anisotropy field in mumax3, you could define the uniaxial anisotropy field (and the corresponding energy density) yourself:

Ms := 1100e3
K  := 0.5e6
u  := ConstVector(1, 0, 0)
anisField := Mul( Const(2*K/Ms)  , Mul( Dot(u, m), u))
anisEdens := Mul( Const(-0.5*Ms) , Dot( anisField, m))

AddFieldTerm(anisField) // promote anisField to an effective field term
AddEdensTerm(anisEdens) // promote anisEdens to an energy density term

tableAdd(E_custom)  // Add a column with the energy related to the custom field

AddEdensTerm(Quantity)

Add an expression to Edens.

AddFieldTerm(Quantity)

Add an expression to B_eff.

B_custom

User-defined field (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

E_custom

total energy of user-defined field (J)

methods: Average( )   EvalTo( Slice )   Get( )  

Edens_custom

Energy density of user-defined field. (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

RemoveCustomFields()

Removes all custom fields again

Misc

Other available functions.

abs(float64) float64

Abs returns the absolute value of x.

acos(float64) float64

Acos returns the arccosine, in radians, of x.

acosh(float64) float64

Acosh returns the inverse hyperbolic cosine of x.

asin(float64) float64

Asin returns the arcsine, in radians, of x.

asinh(float64) float64

Asinh returns the inverse hyperbolic sine of x.

atan(float64) float64

Atan returns the arctangent, in radians, of x.

atan2(float64, float64) float64

Atan2 returns the arc tangent of y/x, using the signs of the two to determine the quadrant of the return value.

atanh(float64) float64

Atanh returns the inverse hyperbolic tangent of x.

cbrt(float64) float64

Cbrt returns the cube root of x.

ceil(float64) float64

Ceil returns the least integer value greater than or equal to x.

cos(float64) float64

Cos returns the cosine of the radian argument x.

examples: [6] [13] [14]

cosh(float64) float64

Cosh returns the hyperbolic cosine of x.

DemagAccuracy

Controls accuracy of demag kernel

DoPrecess

Enables LL precession (default=true)

EnableDemag

Enables/disables demag (default=true)

erf(float64) float64

Erf returns the error function of x.

erfc(float64) float64

Erfc returns the complementary error function of x.

Exit()

Exit from the program

exp(float64) float64

Exp returns e**x, the base-e exponential of x.

examples: [15]

exp2(float64) float64

Exp2 returns 2**x, the base-2 exponential of x.

Expect(string, float64, float64, float64)

Used for automated tests: checks if a value is close enough to the expected value

ExpectV(string, data.Vector, data.Vector, float64)

Used for automated tests: checks if a vector is close enough to the expected value

expm1(float64) float64

Expm1 returns e**x - 1, the base-e exponential of x minus 1. It is more accurate than Exp(x) - 1 when x is near zero.

exx

exx component of the strain tensor

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFn( int func() [3]float64 )  

exy

exy component of the strain tensor

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFn( int func() [3]float64 )  

exz

exz component of the strain tensor

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFn( int func() [3]float64 )  

eyy

eyy component of the strain tensor

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFn( int func() [3]float64 )  

eyz

eyz component of the strain tensor

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFn( int func() [3]float64 )  

ezz

ezz component of the strain tensor

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFn( int func() [3]float64 )  

false

floor(float64) float64

Floor returns the greatest integer value less than or equal to x.

gamma(float64) float64

Gamma returns the Gamma function of x.

GammaLL

Gyromagnetic ratio in rad/Ts

heaviside(float64) float64

Returns 1 if x>0, 0 if x<0, and 0.5 if x==0

hypot(float64, float64) float64

Hypot returns Sqrt(p*p + q*q), taking care to avoid unnecessary overflow and underflow.

ilogb(float64) int

Ilogb returns the binary exponent of x as an integer.

examples: [2]

Index2Coord(int, int, int) data.Vector

Convert cell index to x,y,z coordinate in meter

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

examples: [15]

inf

Inf returns positive infinity if sign >= 0, negative infinity if sign < 0.

examples: [4] [7] [11]

isInf(float64, int) bool

IsInf reports whether f is an infinity, according to sign. If sign > 0, IsInf reports whether f is positive infinity. If sign < 0, IsInf reports whether f is negative infinity. If sign == 0, IsInf reports whether f is either infinity.

isNaN(float64) bool

IsNaN reports whether f is an IEEE 754 “not-a-number” value.

j0(float64) float64

J0 returns the order-zero Bessel function of the first kind.

j1(float64) float64

J1 returns the order-one Bessel function of the first kind.

jn(int, float64) float64

Jn returns the order-n Bessel function of the first kind.

ldexp(float64, int) float64

Ldexp is the inverse of Frexp. It returns frac × 2**exp.

LoadFile(string) Slice

Load a data file (ovf or dump)

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  

examples: [5]

log(float64) float64

Log returns the natural logarithm of x.

examples: [2] [4]

log10(float64) float64

Log10 returns the decimal logarithm of x. The special cases are the same as for Log.

log1p(float64) float64

Log1p returns the natural logarithm of 1 plus its argument x. It is more accurate than Log(1 + x) when x is near zero.

log2(float64) float64

Log2 returns the binary logarithm of x. The special cases are the same as for Log.

logb(float64) float64

Logb returns the binary exponent of x.

examples: [2]

max(float64, float64) float64

Max returns the larger of x or y.

examples: [3] [12]

min(float64, float64) float64

Min returns the smaller of x or y.

examples: [3] [6]

mod(float64, float64) float64

Mod returns the floating-point remainder of x/y. The magnitude of the result is less than y and its sign agrees with that of x.

Mu0

Vacuum permeability (Tm/A)

examples: [2]

NewScalarMask(int, int, int) Slice

Makes a 3D array of scalars

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  

NewSlice(int, int, int, int) Slice

Makes a 4D array with a specified number of components (first argument) and a specified size nx,ny,nz (remaining arguments)

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  

NewVectorMask(int, int, int) Slice

Makes a 3D array of vectors

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  

examples: [15]

norm(float64) float64

Standard normal distribution

examples: [5] [12]

Normalized(Quantity) Quantity

Normalize quantity

methods: EvalTo( )  

examples: [12]

now() time.Time

Returns the current time

methods: Add( time.Duration )   AddDate( int int int )   After( time.Time )   AppendFormat( []uint8 string )   Before( time.Time )   Clock( )   Date( )   Day( )   Equal( time.Time )   Format( string )   GobEncode( )   Hour( )   ISOWeek( )   In( *time.Location )   IsZero( )   Local( )   Location( )   MarshalBinary( )   MarshalJSON( )   MarshalText( )   Minute( )   Month( )   Nanosecond( )   Round( time.Duration )   Second( )   Sub( time.Time )   Truncate( time.Duration )   UTC( )   Unix( )   UnixNano( )   Weekday( )   Year( )   YearDay( )   Zone( )  

OpenBC

Use open boundary conditions (default=false)

pi

examples: [4] [5] [6] [11] [13] [14] [15]

pow(float64, float64) float64

Pow returns x**y, the base-x exponential of y.

examples: [2] [15]

pow10(int) float64

Pow10 returns 10**n, the base-10 exponential of n.

rand() float64

Random number between 0 and 1

examples: [3] [5] [12] [15]

randExp() float64

Exponentially distributed random number between 0 and +inf, mean=1

randInt(int) int

Random non-negative integer

randNorm() float64

Standard normal random number

examples: [12]

randSeed(int)

Sets the random number seed

remainder(float64, float64) float64

Remainder returns the IEEE 754 floating-point remainder of x/y.

Sign(float64) float64

Signum function

sin(float64) float64

Sin returns the sine of the radian argument x.

examples: [5] [6] [13] [14] [15]

sinc(float64) float64

Sinc returns sin(x)/x. If x=0, then Sinc(x) returns 1.

since(time.Time) time.Duration

Returns the time elapsed since argument

methods: Hours( )   Microseconds( )   Milliseconds( )   Minutes( )   Nanoseconds( )   Round( time.Duration )   Seconds( )   Truncate( time.Duration )  

sinh(float64) float64

Sinh returns the hyperbolic sine of x.

sqrt(float64) float64

Sqrt returns the square root of x.

examples: [2]

tan(float64) float64

Tan returns the tangent of the radian argument x.

tanh(float64) float64

Tanh returns the hyperbolic tangent of x.

ThermSeed(int)

Set a random seed for thermal noise

true

trunc(float64) float64

Trunc returns the integer value of x.

Vector(float64, float64, float64) data.Vector

Constructs a vector with given components

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

examples: [1] [3] [5] [7] [10] [11] [12] [14] [15]

y0(float64) float64

Y0 returns the order-zero Bessel function of the second kind.

y1(float64) float64

Y1 returns the order-one Bessel function of the second kind.

yn(int, float64) float64

Yn returns the order-n Bessel function of the second kind.

abs(float64) float64

Abs returns the absolute value of x.

acos(float64) float64

Acos returns the arccosine, in radians, of x.

acosh(float64) float64

Acosh returns the inverse hyperbolic cosine of x.

Add(Quantity, Quantity) Quantity

Add two quantities

methods: EvalTo( )  

examples: [3] [4] [5] [11] [13] [15]

AddEdensTerm(Quantity)

Add an expression to Edens.

AddFieldTerm(Quantity)

Add an expression to B_eff.

Aex

Exchange stiffness (J/m)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [1] [2] [3] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

alpha

Landau-Lifshitz damping constant

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [1] [6] [7] [8] [10] [11] [12] [14] [15]

anisC1

Cubic anisotropy direction #1

methods: Average( )   Comp( int )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [12]

anisC2

Cubic anisotropy direction #2

methods: Average( )   Comp( int )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [12]

anisU

Uniaxial anisotropy direction

methods: Average( )   Comp( int )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [7] [10] [15]

Antivortex(int, int) Config

Antivortex magnetization with given circulation and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

asin(float64) float64

Asin returns the arcsine, in radians, of x.

asinh(float64) float64

Asinh returns the inverse hyperbolic sine of x.

atan(float64) float64

Atan returns the arctangent, in radians, of x.

atan2(float64, float64) float64

Atan2 returns the arc tangent of y/x, using the signs of the two to determine the quadrant of the return value.

atanh(float64) float64

Atanh returns the inverse hyperbolic tangent of x.

AutoSave(Quantity, float64)

Auto save space-dependent quantity every period (s).

examples: [1] [10] [11] [14] [15]

AutoSnapshot(Quantity, float64)

Auto save image of quantity every period (s).

B1

First magneto-elastic coupling constant (J/m3)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

B2

Second magneto-elastic coupling constant (J/m3)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

B_anis

Anisotropy field (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

B_custom

User-defined field (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

B_demag

Magnetostatic field (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

B_eff

Effective field (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

B_exch

Exchange field (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

B_ext

Externally applied field (T)

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( data.Vector )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [1] [3] [15]

B_mel

Magneto-elastic field (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

B_therm

Thermal field (T)

methods: AddTo( Slice )   EvalTo( Slice )  

BlochSkyrmion(int, int) Config

Bloch skyrmion magnetization with given chirality and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5]

cbrt(float64) float64

Cbrt returns the cube root of x.

ceil(float64) float64

Ceil returns the least integer value greater than or equal to x.

Cell(int, int, int) Shape

Single cell with given integer index (i, j, k)

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

Circle(float64) Shape

2D Circle with diameter in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [6] [7] [8] [12]

Cone(float64, float64) Shape

3D Cone with diameter and height in meter. The top of the cone points in the +z direction.

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

Conical(data.Vector, data.Vector, float64) Config

Conical state for given wave vector, cone direction, and cone angle

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

Const(float64) Quantity

Constant, uniform number

methods: EvalTo( )  

ConstVector(float64, float64, float64) Quantity

Constant, uniform vector

methods: EvalTo( )  

cos(float64) float64

Cos returns the cosine of the radian argument x.

examples: [6] [13] [14]

cosh(float64) float64

Cosh returns the hyperbolic cosine of x.

Crop(Quantity, int, int, int, int, int, int) *cropped

Crops a quantity to cell ranges [x1,x2[, [y1,y2[, [z1,z2[

methods: Average( )   EvalTo( Slice )  

examples: [8]

CropLayer(Quantity, int) *cropped

Crops a quantity to a single layer

methods: Average( )   EvalTo( Slice )  

CropRegion(Quantity, int) *cropped

Crops a quantity to a region

methods: Average( )   EvalTo( Slice )  

CropX(Quantity, int, int) *cropped

Crops a quantity to cell ranges [x1,x2[

methods: Average( )   EvalTo( Slice )  

CropY(Quantity, int, int) *cropped

Crops a quantity to cell ranges [y1,y2[

methods: Average( )   EvalTo( Slice )  

examples: [8]

CropZ(Quantity, int, int) *cropped

Crops a quantity to cell ranges [z1,z2[

methods: Average( )   EvalTo( Slice )  

Cross(Quantity, Quantity) Quantity

Cross product of two vector quantities

methods: EvalTo( )  

examples: [12]

Cuboid(float64, float64, float64) Shape

Cuboid with sides in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

Cylinder(float64, float64) Shape

3D Cylinder with diameter and height in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [5] [6]

Dbulk

Bulk Dzyaloshinskii-Moriya strength (J/m2)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

DefRegion(int, Shape)

Define a material region with given index (0-255) and shape

examples: [7] [12] [13]

DefRegionCell(int, int, int, int)

Set a material region (first argument) in one cell by the index of the cell (last three arguments)

DemagAccuracy

Controls accuracy of demag kernel

Dind

Interfacial Dzyaloshinskii-Moriya strength (J/m2)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

DindCoupling

Average DMI coupling with neighbors (arb.)

methods: Average( )   EvalTo( Slice )   Region( int )  

DisableSlonczewskiTorque

Disables Slonczewski torque (default=false)

DisableZhangLiTorque

Disables Zhang-Li torque (default=false)

Div(Quantity, Quantity) Quantity

Point-wise division of two quantities

methods: EvalTo( )  

DoPrecess

Enables LL precession (default=true)

Dot(Quantity, Quantity) Quantity

Dot product of two vector quantities

methods: EvalTo( )  

dt

Time Step (s)

methods: Average( )   EvalTo( Slice )   Get( )  

DUMP

OutputFormat = DUMP sets text DUMP output

E_anis

total anisotropy energy (J)

methods: Average( )   EvalTo( Slice )   Get( )  

E_custom

total energy of user-defined field (J)

methods: Average( )   EvalTo( Slice )   Get( )  

E_demag

Magnetostatic energy (J)

methods: Average( )   EvalTo( Slice )   Get( )  

E_exch

Total exchange energy (including the DMI energy) (J)

methods: Average( )   EvalTo( Slice )   Get( )  

E_mel

Magneto-elastic energy (J)

methods: Average( )   EvalTo( Slice )   Get( )  

E_therm

Thermal energy (J)

methods: Average( )   EvalTo( Slice )   Get( )  

E_total

total energy (J)

methods: Average( )   EvalTo( Slice )   Get( )  

examples: [13]

E_Zeeman

Zeeman energy (J)

methods: Average( )   EvalTo( Slice )   Get( )  

Edens_anis

Anisotropy energy density (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

Edens_custom

Energy density of user-defined field. (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

Edens_demag

Magnetostatic energy density (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

Edens_exch

Total exchange energy density (including the DMI energy density) (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

Edens_mel

Magneto-elastic energy density (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

Edens_therm

Thermal energy density (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

Edens_total

Total energy density (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

Edens_Zeeman

Zeeman energy density (J/m3)

methods: Average( )   EvalTo( Slice )   Region( int )  

EdgeSmooth

Geometry edge smoothing with edgeSmooth^3 samples per cell, 0=staircase, ~8=very smooth

examples: [4]

Ellipse(float64, float64) Shape

2D Ellipse with axes in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [14]

Ellipsoid(float64, float64, float64) Shape

3D Ellipsoid with axes in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

EnableDemag

Enables/disables demag (default=true)

EpsilonPrime

Slonczewski secondary STT term ε'

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [14]

erf(float64) float64

Erf returns the error function of x.

erfc(float64) float64

Erfc returns the complementary error function of x.

ExchCoupling

Average exchange coupling with neighbors (arb.)

methods: Average( )   EvalTo( Slice )   Region( int )  

examples: [12]

Exit()

Exit from the program

exp(float64) float64

Exp returns e**x, the base-e exponential of x.

examples: [15]

exp2(float64) float64

Exp2 returns 2**x, the base-2 exponential of x.

Expect(string, float64, float64, float64)

Used for automated tests: checks if a value is close enough to the expected value

ExpectV(string, data.Vector, data.Vector, float64)

Used for automated tests: checks if a vector is close enough to the expected value

expm1(float64) float64

Expm1 returns e**x - 1, the base-e exponential of x minus 1. It is more accurate than Exp(x) - 1 when x is near zero.

ext_bubbledist

Bubble traveled distance (m)

methods: Average( )   EvalTo( Slice )   Get( )  

ext_BubbleMz

Center magnetization 1.0 or -1.0 (default = 1.0)

ext_bubblepos

Bubble core position (m)

methods: Average( )   EvalTo( Slice )   Get( )  

ext_bubblespeed

Bubble velocity (m/s)

methods: Average( )   EvalTo( Slice )   Get( )  

ext_centerBubble()

centerBubble shifts m after each step to keep the bubble position close to the center of the window

ext_centerWall(int)

centerWall(c) shifts m after each step to keep m_c close to zero

examples: [10] [11]

ext_corepos

Vortex core position (x,y) + polarization (z) (m)

methods: Average( )   EvalTo( Slice )   Get( )  

ext_dwpos

Position of the simulation window while following a domain wall (m)

methods: Average( )   EvalTo( Slice )   Get( )  

examples: [11]

ext_dwspeed

Speed of the simulation window while following a domain wall (m/s)

methods: Average( )   EvalTo( Slice )   Get( )  

ext_dwtilt

PMA domain wall tilt (rad)

methods: Average( )   EvalTo( Slice )   Get( )  

ext_dwxpos

Position of the simulation window while following a domain wall (m)

methods: Average( )   EvalTo( Slice )   Get( )  

ext_EnableUnsafe()

Deprecated. Only here to ensure maximal backwards compatibility with mumax3.9c.

ext_InterDind(int, int, float64)

Sets Dind coupling between two regions.

ext_InterExchange(int, int, float64)

Sets exchange coupling between two regions.

ext_make3dgrains(float64, int, int, Shape, int)

3D Voronoi tessellation over shape (grain size, starting region number, num regions, shape, seed)

ext_makegrains(float64, int, int)

Voronoi tessellation (grain size, num regions, seed)

examples: [12] [15]

ext_phi

Azimuthal angle (rad)

methods: Average( )   EvalTo( Slice )   Region( int )  

ext_rmSurfaceCharge(int, float64, float64)

Compensate magnetic charges on the left and right sides of an in-plane magnetized wire. Arguments: region, mx on left and right side, resp.

examples: [11]

ext_ScaleDind(int, int, float64)

Re-scales Dind coupling between two regions.

ext_ScaleExchange(int, int, float64)

Re-scales exchange coupling between two regions.

examples: [12] [13] [15]

ext_theta

Polar angle (rad)

methods: Average( )   EvalTo( Slice )   Region( int )  

ext_topologicalcharge

2D topological charge

methods: Average( )   EvalTo( Slice )   Get( )  

ext_topologicalchargedensity

2D topological charge density m·(∂m/∂x ✕ ∂m/∂y) (1/m2)

methods: Average( )   EvalTo( Slice )   Region( int )  

ext_topologicalchargedensitylattice

2D topological charge density according to Berg and Lüscher (1/m2)

methods: Average( )   EvalTo( Slice )   Region( int )  

ext_topologicalchargelattice

2D topological charge according to Berg and Lüscher

methods: Average( )   EvalTo( Slice )   Get( )  

exx

exx component of the strain tensor

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFn( int func() [3]float64 )  

exy

exy component of the strain tensor

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFn( int func() [3]float64 )  

exz

exz component of the strain tensor

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFn( int func() [3]float64 )  

eyy

eyy component of the strain tensor

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFn( int func() [3]float64 )  

eyz

eyz component of the strain tensor

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFn( int func() [3]float64 )  

ezz

ezz component of the strain tensor

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFn( int func() [3]float64 )  

F_mel

Magneto-elastic force density (N/m3)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

false

FilenameFormat

printf formatting string for output filenames.

FixDt

Set a fixed time step, 0 disables fixed step (which is the default)

FixedLayer

Slonczewski fixed layer polarization

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( data.Vector )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [14]

FIXEDLAYER_BOTTOM

FixedLayerPosition = FIXEDLAYER_BOTTOM instructs mumax3 that the fixed layer is underneath the free layer

FIXEDLAYER_TOP

FixedLayerPosition = FIXEDLAYER_TOP instructs mumax3 that fixed layer is on top of the free layer

FixedLayerPosition

Position of the fixed layer: FIXEDLAYER_TOP, FIXEDLAYER_BOTTOM (default=FIXEDLAYER_TOP)

floor(float64) float64

Floor returns the greatest integer value less than or equal to x.

Flush()

Flush all pending output to disk.

Fprintln(string, ...interface {})

Print to file

FreeLayerThickness

Slonczewski free layer thickness (if set to zero (default), then the thickness will be deduced from the mesh size) (m)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

frozenspins

Defines spins that should be fixed

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

gamma(float64) float64

Gamma returns the Gamma function of x.

GammaLL

Gyromagnetic ratio in rad/Ts

geom

Cell fill fraction (0..1)

methods: Average( )   EvalTo( Slice )   Gpu( )  

examples: [4] [6] [7] [8] [9] [11] [12] [14]

GrainRoughness(float64, float64, float64, int) Shape

Grainy surface with different heights per grain with a typical grain size (first argument), minimal height (second argument), and maximal height (third argument). The last argument is a seed for the random number generator.

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

Headroom

Solver headroom (default = 0.8)

heaviside(float64) float64

Returns 1 if x>0, 0 if x<0, and 0.5 if x==0

Helical(data.Vector) Config

Helical state for given wave vector

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

hypot(float64, float64) float64

Hypot returns Sqrt(p*p + q*q), taking care to avoid unnecessary overflow and underflow.

ilogb(float64) int

Ilogb returns the binary exponent of x as an integer.

examples: [2]

ImageShape(string) Shape

Use black/white image as shape

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

Index2Coord(int, int, int) data.Vector

Convert cell index to x,y,z coordinate in meter

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

examples: [15]

inf

Inf returns positive infinity if sign >= 0, negative infinity if sign < 0.

examples: [4] [7] [11]

isInf(float64, int) bool

IsInf reports whether f is an infinity, according to sign. If sign > 0, IsInf reports whether f is positive infinity. If sign < 0, IsInf reports whether f is negative infinity. If sign == 0, IsInf reports whether f is either infinity.

isNaN(float64) bool

IsNaN reports whether f is an IEEE 754 “not-a-number” value.

J

Electrical current density (A/m2)

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   EvalTo( Slice )   IsUniform( )   MSlice( )   Region( int )   RemoveExtraTerms( )   Set( data.Vector )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [10] [11] [12] [13] [14] [15]

j0(float64) float64

J0 returns the order-zero Bessel function of the first kind.

j1(float64) float64

J1 returns the order-one Bessel function of the first kind.

jn(int, float64) float64

Jn returns the order-n Bessel function of the first kind.

Kc1

1st order cubic anisotropy constant (J/m3)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [12]

Kc2

2nd order cubic anisotropy constant (J/m3)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Kc3

3rd order cubic anisotropy constant (J/m3)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Ku1

1st order uniaxial anisotropy constant (J/m3)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [7] [10] [15]

Ku2

2nd order uniaxial anisotropy constant (J/m3)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Lambda

Slonczewski Λ parameter

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [14]

LastErr

Error of last step

methods: Average( )   EvalTo( Slice )   Get( )  

Layer(int) Shape

Single layer (along z), by integer index starting from 0

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [13] [14]

Layers(int, int) Shape

Part of space between cell layer1 (inclusive) and layer2 (exclusive), in integer indices

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

ldexp(float64, int) float64

Ldexp is the inverse of Frexp. It returns frac × 2**exp.

LLtorque

Landau-Lifshitz torque/γ0 (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

LoadFile(string) Slice

Load a data file (ovf or dump)

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  

examples: [5]

log(float64) float64

Log returns the natural logarithm of x.

examples: [2] [4]

log10(float64) float64

Log10 returns the decimal logarithm of x. The special cases are the same as for Log.

log1p(float64) float64

Log1p returns the natural logarithm of 1 plus its argument x. It is more accurate than Log(1 + x) when x is near zero.

log2(float64) float64

Log2 returns the binary logarithm of x. The special cases are the same as for Log.

logb(float64) float64

Logb returns the binary exponent of x.

examples: [2]

m

Reduced magnetization (unit length)

methods: Average( )   Buffer( )   Comp( int )   EvalTo( Slice )   GetCell( int int int )   LoadFile( string )   Quantity( )   Region( int )   Set( Config )   SetArray( Slice )   SetCell( int int int data.Vector )   SetInShape( Shape Config )   SetRegion( int Config )  

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

m_full

Unnormalized magnetization (A/m)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

Madd(Quantity, Quantity, float64, float64) *mAddition

Weighted addition: Madd(Q1,Q2,c1,c2) = c1*Q1 + c2*Q2

methods: EvalTo( Slice )  

Masked(Quantity, Shape) Quantity

Mask quantity with shape

methods: EvalTo( )  

max(float64, float64) float64

Max returns the larger of x or y.

examples: [3] [12]

MaxAngle

maximum angle between neighboring spins (rad)

methods: Average( )   EvalTo( Slice )   Get( )  

MaxDt

Maximum time step the solver can take (s)

MaxErr

Maximum error per step the solver can tolerate (default = 1e-5)

maxTorque

Maximum torque/γ0, over all cells (T)

methods: Average( )   EvalTo( Slice )   Get( )  

MFM

MFM image (arb.)

methods: Average( )   EvalTo( Slice )   Region( int )  

examples: [9]

MFMDipole

Height of vertically magnetized part of MFM tip

MFMLift

MFM lift height

examples: [9]

min(float64, float64) float64

Min returns the smaller of x or y.

examples: [3] [6]

MinDt

Minimum time step the solver can take (s)

Minimize()

Use steepest conjugate gradient method to minimize the total energy

examples: [3] [6]

MinimizerSamples

Number of max dM to collect for Minimize convergence check.

MinimizerStop

Stopping max dM for Minimize

examples: [3]

mod(float64, float64) float64

Mod returns the floating-point remainder of x/y. The magnitude of the result is less than y and its sign agrees with that of x.

Msat

Saturation magnetization (A/m)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [1] [2] [3] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

Mu0

Vacuum permeability (Tm/A)

examples: [2]

Mul(Quantity, Quantity) Quantity

Point-wise product of two quantities

methods: EvalTo( )  

examples: [10] [11]

MulMV(Quantity, Quantity, Quantity, Quantity) Quantity

Matrix-Vector product: MulMV(AX, AY, AZ, m) = (AX·m, AY·m, AZ·m). The arguments AX, AY, AZ and m are quantities with 3 components.

methods: EvalTo( )  

NeelSkyrmion(int, int) Config

Néel skyrmion magnetization with given charge and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5]

NEval

Total number of torque evaluations

methods: Average( )   EvalTo( Slice )   Get( )  

NewScalarMask(int, int, int) Slice

Makes a 3D array of scalars

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  

NewSlice(int, int, int, int) Slice

Makes a 4D array with a specified number of components (first argument) and a specified size nx,ny,nz (remaining arguments)

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  

NewVectorMask(int, int, int) Slice

Makes a 3D array of vectors

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  

examples: [15]

NoDemagSpins

Disable magnetostatic interaction per region (default=0, set to 1 to disable). E.g.: NoDemagSpins.SetRegion(5, 1) disables the magnetostatic interaction in region 5.

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

norm(float64) float64

Standard normal distribution

examples: [5] [12]

Normalized(Quantity) Quantity

Normalize quantity

methods: EvalTo( )  

examples: [12]

now() time.Time

Returns the current time

methods: Add( time.Duration )   AddDate( int int int )   After( time.Time )   AppendFormat( []uint8 string )   Before( time.Time )   Clock( )   Date( )   Day( )   Equal( time.Time )   Format( string )   GobEncode( )   Hour( )   ISOWeek( )   In( *time.Location )   IsZero( )   Local( )   Location( )   MarshalBinary( )   MarshalJSON( )   MarshalText( )   Minute( )   Month( )   Nanosecond( )   Round( time.Duration )   Second( )   Sub( time.Time )   Truncate( time.Duration )   UTC( )   Unix( )   UnixNano( )   Weekday( )   Year( )   YearDay( )   Zone( )  

OpenBC

Use open boundary conditions (default=false)

OutputFormat

Format for data files: OVF1_TEXT, OVF1_BINARY, OVF2_TEXT or OVF2_BINARY

OVF1_BINARY

OutputFormat = OVF1_BINARY sets binary OVF1 output

OVF1_TEXT

OutputFormat = OVF1_TEXT sets text OVF1 output

OVF2_BINARY

OutputFormat = OVF2_BINARY sets binary OVF2 output

OVF2_TEXT

OutputFormat = OVF2_TEXT sets text OVF2 output

PeakErr

Overall maximum error per step

methods: Average( )   EvalTo( Slice )   Get( )  

pi

examples: [4] [5] [6] [11] [13] [14] [15]

Pol

Electrical current polarization

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [5] [10] [11] [14]

pow(float64, float64) float64

Pow returns x**y, the base-x exponential of y.

examples: [2] [15]

pow10(int) float64

Pow10 returns 10**n, the base-10 exponential of n.

Print(...interface {})

Print to standard output

examples: [2]

rand() float64

Random number between 0 and 1

examples: [3] [5] [12] [15]

randExp() float64

Exponentially distributed random number between 0 and +inf, mean=1

randInt(int) int

Random non-negative integer

randNorm() float64

Standard normal random number

examples: [12]

RandomMag() Config

Random magnetization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [3] [5]

RandomMagSeed(int) Config

Random magnetization with given seed

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

randSeed(int)

Sets the random number seed

Rect(float64, float64) Shape

2D rectangle with size in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [6] [9] [11] [12] [15]

regions

Outputs the region index for each cell

methods: Average( )   EvalTo( Slice )   GetCell( int int int )   Gpu( )   HostArray( )   HostList( )   LoadFile( string )   SetCell( int int int int )  

examples: [7] [12]

Relax()

Try to minimize the total energy

examples: [1] [2] [3] [9] [10] [11]

RelaxTorqueThreshold

MaxTorque threshold for relax(). If set to -1 (default), relax() will stop when the average torque is steady or increasing.

remainder(float64, float64) float64

Remainder returns the IEEE 754 floating-point remainder of x/y.

RemoveCustomFields()

Removes all custom fields again

Run(float64)

Run the simulation for a time in seconds

examples: [1] [7] [10] [11] [12] [14] [15]

RunWhile(func() bool)

Run while condition function is true

Save(Quantity)

Save space-dependent quantity once, with auto filename

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

SaveAs(Quantity, string)

Save space-dependent quantity with custom filename

examples: [4] [5] [7] [9]

SetCellSize(float64, float64, float64)

Sets the X,Y,Z cell size in meters

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

SetGeom(Shape)

Sets the geometry to a given shape

examples: [4] [6] [7] [8] [9] [11] [12] [14]

SetGridSize(int, int, int)

Sets the number of cells for X,Y,Z

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

SetMesh(int, int, int, float64, float64, float64, int, int, int)

Sets GridSize, CellSize and PBC at the same time

SetPBC(int, int, int)

Sets the number of repetitions in X,Y,Z to create periodic boundary conditions. The number of repetitions determines the cutoff range for the demagnetization.

SetSolver(int)

Set solver type. 1: Euler, 2: Heun, 3: Bogacki-Shampine, 4: Runge-Kutta (RK45), 5: Dormand-Prince, 6: Fehlberg, -1: Backward Euler

Shift(int)

Shifts the simulation by +1/-1 cells along X

examples: [15]

Shifted(Quantity, int, int, int) Quantity

Shifted quantity

methods: EvalTo( )  

ShiftGeom

Whether Shift() acts on geometry

ShiftM

Whether Shift() acts on magnetization

examples: [15]

ShiftMagD

Upon shift, insert this magnetization from the bottom

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

ShiftMagL

Upon shift, insert this magnetization from the left

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

ShiftMagR

Upon shift, insert this magnetization from the right

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

examples: [15]

ShiftMagU

Upon shift, insert this magnetization from the top

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

ShiftRegions

Whether Shift() acts on regions

Sign(float64) float64

Signum function

sin(float64) float64

Sin returns the sine of the radian argument x.

examples: [5] [6] [13] [14] [15]

sinc(float64) float64

Sinc returns sin(x)/x. If x=0, then Sinc(x) returns 1.

since(time.Time) time.Duration

Returns the time elapsed since argument

methods: Hours( )   Microseconds( )   Milliseconds( )   Minutes( )   Nanoseconds( )   Round( time.Duration )   Seconds( )   Truncate( time.Duration )  

sinh(float64) float64

Sinh returns the hyperbolic sine of x.

Snapshot(Quantity)

Save image of quantity

SnapshotAs(Quantity, string)

Save image of quantity with custom filename

SnapshotFormat

Image format for snapshots: jpg, png or gif.

spinAngle

Angle between neighboring spins (rad)

methods: Average( )   EvalTo( Slice )   Region( int )  

sprint(...interface {}) string

Print all arguments to string with automatic formatting

sprintf(string, ...interface {}) string

Print to string with C-style formatting.

sqrt(float64) float64

Sqrt returns the square root of x.

examples: [2]

Square(float64) Shape

2D square with size in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [6]

step

Total number of time steps taken

examples: [3] [10]

Steps(int)

Run the simulation for a number of time steps

STTorque

Spin-transfer torque/γ0 (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

t

Total simulated time (s)

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

TableAdd(Quantity)

Add quantity as a column to the data table.

examples: [3] [11] [13]

TableAddVar(ScalarFunction, string, string)

Add user-defined variable + name + unit to data table.

TableAutoSave(float64)

Auto-save the data table every period (s). Zero disables save.

examples: [1] [11] [14]

TablePrint(...interface {})

Print anything in the data table

TableSave()

Save the data table right now (appends one line).

examples: [3] [13]

tan(float64) float64

Tan returns the tangent of the radian argument x.

tanh(float64) float64

Tanh returns the hyperbolic tangent of x.

Temp

Temperature (K)

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

ThermSeed(int)

Set a random seed for thermal noise

torque

Total torque/γ0 (T)

methods: Average( )   Comp( int )   EvalTo( Slice )   HostCopy( )   Region( int )  

TotalShift

Amount by which the simulation has been shifted (m).

true

trunc(float64) float64

Trunc returns the integer value of x.

TwoDomain(float64, float64, float64, float64, float64, float64, float64, float64, float64) Config

Two-domain magnetization with given magnetization in left domain, wall, and right domain

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5] [10] [11]

Uniform(float64, float64, float64) Config

Uniform magnetization in given direction

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [1] [2] [5] [6] [7] [13] [14] [15]

Universe() Shape

Entire space

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

Vector(float64, float64, float64) data.Vector

Constructs a vector with given components

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

examples: [1] [3] [5] [7] [10] [11] [12] [14] [15]

Vortex(int, int) Config

Vortex magnetization with given circulation and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5] [8] [9] [12]

VortexWall(float64, float64, int, int) Config

Vortex wall magnetization with given mx in left and right domain and core circulation and polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5]

xi

Non-adiabaticity of spin-transfer-torque

methods: Average( )   EvalTo( Slice )   GetRegion( int )   IsUniform( )   MSlice( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [10] [11] [12]

XRange(float64, float64) Shape

Part of space between x1 (inclusive) and x2 (exclusive), in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [7]

y0(float64) float64

Y0 returns the order-zero Bessel function of the second kind.

y1(float64) float64

Y1 returns the order-one Bessel function of the second kind.

yn(int, float64) float64

Yn returns the order-n Bessel function of the second kind.

YRange(float64, float64) Shape

Part of space between y1 (inclusive) and y2 (exclusive), in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

ZRange(float64, float64) Shape

Part of space between z1 (inclusive) and z2 (exclusive), in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

3-3.11.1/doc/static/api39c.html000066400000000000000000003023071503346766200157720ustar00rootroot00000000000000 mumax3

Warning! This is the API for mumax3.9c, which is no longer supported. If you would like to use mumax3, we strongly recommend using mumax3.11.

mumax 3.9c API

This is a complete overview of all available functions for writing an input script.

Syntax

The mumax3 input syntax is a subset of Go's syntax, somewhat similar to C. It is case-independent however, so msat is the same as Msat or MSAT.

Defining variables

New variables are declared using :=. Variables have a fixed type, inferred from the declaration's right-hand-side. Assigning to existing variables is done using =. E.g.:
i := 7         // defines a new variable i, type automatically detected to be int
print(i)       // now we can use i
i = 5          // assign new value, don't use ':=' (attempt to re-declare)

str := "hello" // defines str, type automatically is string
//str = 1      // would fail, cannot assign int to string

Arithmetic

Most common arithmetic operations are possible. Also Go's math library and some common constants are available. For raise-to-the-power, pow(x,y) should be used.
x := pi*(3+4)/5
x = pow(x, 3)
x++
y := abs(cbrt(cosh(erf(erfc(gamma(J0(Y0(2))))))))

Control structures

Loops are possible as well:
for i:=0; i<10; i++{
	print(i)
}

Implicit functions

Some of the API features accept a function as argument (e.g.: RunWhile(func()bool), or all input parameters). In that case, and only in that case, the argument is implicitly converted to a function, which is re-evaluated each time it's needed. E.g.:
value := sin(pi*t)  // value is a float64, RHS evaluated only once
Msat = value        // time-independent Msat
versus:
Msat = sin(pi*t)    // RHS converted to function, re-evaluated every time

Setting the mesh size

The simulation mesh defines the size of the box around your magnet. It should be set at the beginning of the script. The number of cells should preferably be powers of two, or at least have small prime factors (2,3,5,7). E.g.:
Nx := 128
Ny := 64
Nz := 2
sizeX := 500e-9
sizeY := 250e-9
sizeZ := 10e-9
SetGridSize(Nx, Ny, Nz)
SetCellSize(sizeX/Nx, sizeY/Ny, sizeZ/Nz)

Periodic boundary conditions

Optionally, periodic boundary conditions can be enabled:
SetPBC(5, 0, 0)        // 5 extra images on left and right sides.
SetGridSize(128, 64, 1)
SetCellSize(5e-9, 5e-9, 5e-9)
Setting a nonzero PBC value in a direction enables wrap-around in that direction. The precise value passed determines how many repetitions are seen by the demag field. E.g., in the above example the demag field behaves as if 5 repetitions are present to the left and to the right side. Choosing a large number may cause long initialization time.

Resizing the mesh

The mesh can be changed at any later time in the simulation. This will cause the magnetization to be stretched onto the new mesh if needed, and the geometry and regions to be re-calculated. After resizing, some cells which had zero magnetization may now fall inside the magnet geometry; they will be initialized to random magnetization.

SetCellSize(float64, float64, float64)

Sets the X,Y,Z cell size in meters

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

SetGridSize(int, int, int)

Sets the number of cells for X,Y,Z

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

SetMesh(int, int, int, float64, float64, float64, int, int, int)

Sets GridSize, CellSize and PBC at once

SetPBC(int, int, int)

Sets number of repetitions in X,Y,Z


Setting a geometry

Optionally a magnet Shape other than the full simulation box can be specified. One can specify primitive shapes, constructed at the origin (box center), and translate/rotate them if needed. All positions are specified in meters and the origin lies in the center of the simulation box. E.g.:
 SetGeom(cylinder(400e-9, 20e-9).RotX(45*pi/180).Transl(1e-6,0,0))

SetGeom(Shape)

Sets the geometry to a given shape

examples: [4] [6] [7] [8] [9] [11] [12] [14]

EdgeSmooth

Geometry edge smoothing with edgeSmooth^3 samples per cell, 0=staircase, ~8=very smooth

examples: [4]

Cell(int, int, int) Shape

Single cell with given integer index (i, j, k)

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

Circle(float64) Shape

2D Circle with diameter in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [6] [7] [8] [12]

Cuboid(float64, float64, float64) Shape

Cuboid with sides in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

Cylinder(float64, float64) Shape

3D Cylinder with diameter and height in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [5] [6]

Ellipse(float64, float64) Shape

2D Ellipse with axes in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [14]

Ellipsoid(float64, float64, float64) Shape

3D Ellipsoid with axes in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

GrainRoughness(float64, float64, float64, int) Shape

Grainy surface with different heights per grain

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

ImageShape(string) Shape

Use black/white image as shape

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

Layer(int) Shape

Single layer (along z), by integer index starting from 0

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [13] [14]

Layers(int, int) Shape

Part of space between cell layer1 (inclusive) and layer2 (exclusive), in integer indices

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

Rect(float64, float64) Shape

2D rectangle with size in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [6] [9] [11] [12] [15]

Square(float64) Shape

2D square with size in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [6]

Universe() Shape

Entire space

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

XRange(float64, float64) Shape

Part of space between x1 and x2, in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [7]

YRange(float64, float64) Shape

Part of space between y1 and y2, in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

ZRange(float64, float64) Shape

Part of space between z1 and z2, in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  


Defining material regions

Optionally, up to 256 material regions can be defined. Since each cell is made from one material, it is associated with exactly one region. So regions can not overlap. Each cell is assigned material region 0 by default. It's a good idea to output regions to verify whether each cell is assigned to the intended region. Each region can have its own material parameters, and we can output averages over each region. E.g.:
DefRegion(1, circle(1e-6))
DefRegion(0, circle(1e-6).Inverse()) // redundant
save(regions)
Msat.SetRegion(1, 800e6)
tableAdd(m.Region(1))    // add average m over region 1 to table

DefRegion(int, Shape)

Define a material region with given index (0-255) and shape

examples: [7] [12] [13]

DefRegionCell(int, int, int, int)

Set a material region in one cell by index

regions

Outputs the region index for each cell

methods: Average( )   GetCell( int int int )   Gpu( )   HostArray( )   HostList( )   LoadFile( string )   SetCell( int int int int )  

examples: [7] [12]


Initial magnetization

The initial magnetization is set by assigning a Config to m, setting it in separate regions, or by loading a file directly.
m = uniform(1, 0, 0)
m.SetRegion(1, vortex(1, 1))
m.LoadFile("config.ovf")

m

Reduced magnetization (unit length)

methods: Average( )   Buffer( )   Comp( int )   GetCell( int int int )   LoadFile( string )   Region( int )   Set( Config )   SetArray( Slice )   SetCell( int int int data.Vector )   SetInShape( Shape Config )   SetRegion( int Config )   TableData( )  

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

Antivortex(int, int) Config

Antivortex magnetization with given circulation and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

BlochSkyrmion(int, int) Config

Bloch skyrmion magnetization with given chirality and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5]

NeelSkyrmion(int, int) Config

Néel skyrmion magnetization with given charge and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5]

RandomMag() Config

Random magnetization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [3] [5]

RandomMagSeed(int) Config

Random magnetization with given seed

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

TwoDomain(float64, float64, float64, float64, float64, float64, float64, float64, float64) Config

Twodomain magnetization with given magnetization in left domain, wall, and right domain

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5] [10] [11]

Uniform(float64, float64, float64) Config

Uniform magnetization in given direction

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [1] [2] [5] [6] [7] [13] [14] [15]

Vortex(int, int) Config

Vortex magnetization with given circulation and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5] [8] [9] [12]

VortexWall(float64, float64, int, int) Config

Vortex wall magnetization with given mx in left and right domain and core circulation and polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5]


Material parameters

Assigning to a material parameter sets a value in all regions. E.g.:
Msat  = 800e3
AnisU = vector(1, 0, 0)
When regions are defined, they can also be set region-wise:
Msat.SetRegion(0, 800e3)
Msat.SetRegion(1, 540e3)
Material parameters can be functions of time as well. E.g.:
f := 500e6
Ku1 = 500 * sin(2*pi*f*t)

Aex

Exchange stiffness (J/m)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [1] [2] [3] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

alpha

Landau-Lifshitz damping constant

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [1] [6] [7] [8] [10] [11] [12] [14] [15]

anisC1

Cubic anisotropy direction #1

methods: Average( )   Comp( int )   GetRegion( int )   IsUniform( )   Region( int )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [12]

anisC2

Cubic anisotropy direction #2

methods: Average( )   Comp( int )   GetRegion( int )   IsUniform( )   Region( int )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [12]

anisU

Uniaxial anisotropy direction

methods: Average( )   Comp( int )   GetRegion( int )   IsUniform( )   Region( int )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [7] [10] [15]

Dbulk

Bulk Dzyaloshinskii-Moriya strength (J/m2)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Dind

Interfacial Dzyaloshinskii-Moriya strength (J/m2)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

EpsilonPrime

Slonczewski secondary STT term ε'

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [14]

FixedLayer

Slonczewski fixed layer polarization

methods: Average( )   Comp( int )   GetRegion( int )   IsUniform( )   Region( int )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [14]

frozenspins

Defines spins that should be fixed

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Kc1

1st order cubic anisotropy constant (J/m3)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [12]

Kc2

2nd order cubic anisotropy constant (J/m3)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Kc3

3rd order cubic anisotropy constant (J/m3)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Ku1

1st order uniaxial anisotropy constant (J/m3)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [7] [10] [15]

Ku2

2nd order uniaxial anisotropy constant (J/m3)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Lambda

Slonczewski Λ parameter

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [14]

Msat

Saturation magnetization (A/m)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [1] [2] [3] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

NoDemagSpins

Disable magnetostatic interaction per-spin (set to 1 to disable)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Pol

Electrical current polarization

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [5] [10] [11] [14]

Temp

Temperature (K)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

xi

Non-adiabaticity of spin-transfer-torque

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [10] [11] [12]


Excitation

Field or current excitations can be set in the same way as material parameters:
B_ext = vector(0.01, 1e-6*sin(2*pi*f*t), 0)
B_ext.SetRegion(1, vector(0, 0, 0.1))
Additionally, an arbitrary number of time- and space-dependent vector fields of the form g(x,y,z) * f(t) may be added. (E.g., to simulate the field of an antenna or an arbitrary current running through the magnet)
B_ext.Add(LoadFile("antenna.ovf"), sin(2*pi*f*t))
JPol.Add(LoadFile("current.ovf"), 1)

B_ext

Externally applied field (T)

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   IsUniform( )   Region( int )   RemoveExtraTerms( )   Set( data.Vector )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [1] [3] [15]

J

Electrical current density (A/m2)

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   IsUniform( )   Region( int )   RemoveExtraTerms( )   Set( data.Vector )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [10] [11] [12] [13] [14] [15]

Index2Coord(int, int, int) data.Vector

Convert cell index to x,y,z coordinate in meter

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

examples: [15]

LoadFile(string) Slice

Load a data file (ovf or dump)

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  

examples: [5]

NewSlice(int, int, int, int) Slice

Makes a 4D array of scalars with given ncomp,x,y,z size

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  


Magnetic Force Microscopy

Mumax3 has built-in generation of MFM images from a 2D magnetization. The MFM tip lift can be freely chosen. By default the tip magnetization is modeled as a point monopole at the apex. This is sufficient for most situations. Nevertheless, it is also possible to model partially magnetized tips by setting MFMDipole to the magnetized portion of the tip, in meters. E.g., if only the first 20nm of the tip is (vertically) magnetized, set MFMDipole=20e-9.

MFM

MFM image

methods: Average( )   Region( int )   Set( Slice )  

examples: [9]

MFMDipole

Height of vertically magnetized part of MFM tip

MFMLift

MFM lift height

examples: [9]


Output quantities

The quantities listed below can be output. Also, derived quantities can be produced: the quantity restricted to a certain region or a single component. E.g.:
m           // magnetization quantity
m.Comp(0)   // x-component
m.Region(1) // magnetization in region 1 (0 elsewhere)

B_anis

Anisotropy field (T)

methods: AddTo( Slice )   Average( )   Comp( int )   Region( int )   Set( Slice )  

B_demag

Magnetostatic field (T)

methods: Average( )   Comp( int )   Region( int )   Set( Slice )  

B_eff

Effective field (T)

methods: Average( )   Comp( int )   Region( int )   Set( Slice )  

B_exch

Exchange field (T)

methods: AddTo( Slice )   Average( )   Comp( int )   Region( int )   Set( Slice )  

B_therm

Thermal field (T)

methods: AddTo( Slice )  

dt

Time Step (s)

methods: Average( )   Get( )  

E_anis

Anisotropy energy (uni+cubic) (J)

methods: Average( )   Get( )  

E_demag

Magnetostatic energy (J)

methods: Average( )   Get( )  

E_exch

Exchange energy (normal+DM) (J)

methods: Average( )   Get( )  

E_therm

Thermal energy (J)

methods: Average( )   Get( )  

E_total

Total energy (J)

methods: Average( )   Get( )  

examples: [13]

E_Zeeman

Zeeman energy (J)

methods: Average( )   Get( )  

Edens_anis

Anisotropy energy density (uni+cubic) (J/m3)

methods: AddTo( Slice )   Average( )   Region( int )   Set( Slice )  

Edens_demag

Magnetostatic energy density (J/m3)

methods: AddTo( Slice )   Average( )   Region( int )   Set( Slice )  

Edens_exch

Exchange energy density (normal+DM) (J/m3)

methods: AddTo( Slice )   Average( )   Region( int )   Set( Slice )  

Edens_therm

Thermal energy density (J/m3)

methods: AddTo( Slice )   Average( )   Region( int )   Set( Slice )  

Edens_total

Total energy density (J/m3)

methods: Average( )   Region( int )   Set( Slice )  

Edens_Zeeman

Zeeman energy density (J/m3)

methods: AddTo( Slice )   Average( )   Region( int )   Set( Slice )  

ExchCoupling

Average exchange coupling with neighbors (arb.)

methods: Average( )   Region( int )   Set( Slice )  

examples: [12]

geom

Cell fill fraction (0..1)

methods: Average( )   Gpu( )  

examples: [4] [6] [7] [8] [9] [11] [12] [14]

LastErr

Error of last step

methods: Average( )   Get( )  

LLtorque

Landau-Lifshitz torque/γ0 (T)

methods: Average( )   Comp( int )   Region( int )   Set( Slice )  

m_full

Unnormalized magnetization (A/m)

methods: Average( )   Comp( int )   Region( int )   Set( Slice )  

MaxAngle

Maximum angle between neighboring spins (rad)

methods: Average( )   Get( )  

maxTorque

Maximum torque/γ0, over all cells (T)

methods: Average( )   Get( )  

NEval

Total number of torque evaluations

methods: Average( )   Get( )  

PeakErr

Overall maximum error per step

methods: Average( )   Get( )  

spinAngle

Angle between neighboring spins (rad)

methods: Average( )   Region( int )   Set( Slice )  

STtorque

Spin-transfer torque/γ0 (T)

methods: AddTo( Slice )   Average( )   Comp( int )   Region( int )   Set( Slice )  

torque

Total torque/γ0 (T)

methods: Average( )   Comp( int )   Region( int )   Set( Slice )  


Slicing and dicing output

To save storage space, it's possible to save only the part of the output we're interested in. This works on all output quantities (not only m)
save(m)                         // save full magnetization
save(m.Comp(0))                 // save only x-component
save(CropLayer(m, 13))          // save only layer 13
save(CropLayer(m.Comp(0), 13))  // save only x-component of layer 13
Or even:
mx   := m.Comp(0)
mx13 := CropLayer(mx, 13) 
save(mx13)
tableAdd(mx13)

Crop(Quantity, int, int, int, int, int, int) *cropped

Crops a quantity to cell ranges [x1,x2[, [y1,y2[, [z1,z2[

methods: Average( )  

examples: [8]

CropLayer(Quantity, int) *cropped

Crops a quantity to a single layer

methods: Average( )  

CropX(Quantity, int, int) *cropped

Crops a quantity to cell ranges [x1,x2[

methods: Average( )  

CropY(Quantity, int, int) *cropped

Crops a quantity to cell ranges [y1,y2[

methods: Average( )  

examples: [8]

CropZ(Quantity, int, int) *cropped

Crops a quantity to cell ranges [z1,z2[

methods: Average( )  


Scheduling output

All input and output quantities (as described above) can be saved in a space-dependent way (".ovf" file), or as spatial averages (table output). The data table ("table.txt") contains by default the time and average magnetization. More columns can be added with TableAdd().
save(B_ext)

tableadd(B_ext)
tablesave()
Optionally, the output/averaging can be done over a single region:
save(m.Region(1))
TableAdd(m.Region(1)) 
User-defined variables can be added to the table with TableAddVar().
myField := 0.42
TableAddVar(myField, "B_extra", "T")
myField = ...

AutoSave(Quantity, float64)

Auto save space-dependent quantity every period (s).

examples: [1] [10] [11] [14] [15]

AutoSnapshot(Quantity, float64)

Auto save image of quantity every period (s).

FilenameFormat

printf formatting string for output filenames.

Fprintln(string, ...interface {})

Print to file

OutputFormat

Format for data files: OVF1_TEXT, OVF1_BINARY, OVF2_TEXT or OVF2_BINARY

OVF1_BINARY

OutputFormat = OVF1_BINARY sets binary OVF1 output

OVF1_TEXT

OutputFormat = OVF1_TEXT sets text OVF1 output

OVF2_BINARY

OutputFormat = OVF2_BINARY sets binary OVF2 output

OVF2_TEXT

OutputFormat = OVF2_TEXT sets text OVF2 output

Print(...interface {})

Print to standard output

examples: [2]

Save(Quantity)

Save space-dependent quantity once, with auto filename

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

SaveAs(Quantity, string)

Save space-dependent quantity with custom filename

examples: [4] [5] [7] [9]

Snapshot(Quantity)

Save image of quantity

SnapshotFormat

Image format for snapshots: jpg, png or gif.

sprint(...interface {}) string

Print all arguments to string with automatic formatting

sprintf(string, ...interface {}) string

Print to string with C-style formatting.

TableAdd(TableData)

Add quantity as a column to the data table.

examples: [3] [11] [13]

TableAddVar(ScalarFunction, string, string)

Add user-defined variable + name + unit to data table.

TableAutoSave(float64)

Auto-save the data table every period (s). Zero disables save.

examples: [1] [11] [14]

TablePrint(...interface {})

Print anything in the data table

TableSave()

Save the data table right now (appends one line).

examples: [3] [13]


Running

Run(time) runs the simulation for a given time in seconds, using sensible error settings.
Run(1e-9)
More fine-grained control is provided by RunWhile(condition), which runs as long as an arbitrary condition is met. E.g.:
mx := m.comp(0)
RunWhile(mx.average() < 0)   // search for switching field during reversal
Optionally, the solver accuracy may be fine-tuned. E.g.:
MaxDt = 1e-12
MinDt = 1e-15
MaxErr = 1e-6
Optionally, a different solver may be chosen (at any point) with SetSolver(int). Currently available solver types:
  • 5: RK45 (Dormand-Prince) solver (the default). An accurate solver, very fast for magnetization dynamics at the cost of some memory usage.
  • 4: Classical 4th-order Runge-Kutta method. Intended for simulations where a fixed, relatively large time step is desired.
  • 3: RK23 (Bogacki-Shampine) solver. A robust and reasonably fast solver with low memory requirements. Typically outperforms RK45 when relaxing the magnetization with little dynamics, so it is used internally by Relax().
  • 2: Adaptive Heun solver. Robust and uses very little memory but takes smaller time steps than the higher-order solvers. Also suited when a fixed, relatively small time step is desired.
  • 1: Euler solver (requires FixDt = ..., ignores other settings). Only useful in exceptional situations or for debugging.
E.g.:
SetSolver(2) // Heun
FixDt = 1e-15

Relax

Relax() tries to evolve the magnetization as closely as possible to the minimum energy state. This function assumes all excitations have been turned off (temperature, electrical current, time-dependent magnetic fields). During relax precession is disabled and the time t does not increase. There is no need to set high damping.

In general it is difficult to be sure the minimum energy state has been truly reached. Hence, relax may occasionally return after the energy has reached a local minimum, a saddle point, or a rather flat valley in the energy landscape.

Minimize

Minimize() is like Relax, but uses the conjugate gradient method to find the energy minimum. It is usually much faster than Relax, but is a bit less robust against divergence. E.g., a random starting configuration can be Relaxed, but may fail with Minimize. Minimize is very well suited for hysteresis calculations, where we are never far away from the ground state.

Minimize()

Use steepest conjugate gradient method to minimize the total energy

examples: [3] [6]

Relax()

Try to minimize the total energy

examples: [1] [2] [3] [9] [10] [11]

Run(float64)

Run the simulation for a time in seconds

examples: [1] [7] [10] [11] [12] [14] [15]

RunWhile(func() bool)

Run while condition function is true

Steps(int)

Run the simulation for a number of time steps

FixDt

Set a fixed time step, 0 disables fixed step

Headroom

Solver headroom

MaxDt

Maximum time step the solver can take (s)

MaxErr

Maximum error per step the solver can tolerate

MinDt

Minimum time step the solver can take (s)

MinimizerSamples

Number of max dM to collect for Minimize convergence check.

MinimizerStop

Stopping max dM for Minimize

examples: [3]

step

Total number of time steps taken

examples: [3] [10]

t

Total simulated time (s)

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

SetSolver(int)

Set solver type. 1:Euler, 2:Heun, 3:RK23 (Bogacki-Shampine), 4:RK4 (classical Runge-Kutta), 5:RK45 (Dormand-Prince, the default)


Moving simulation window

Mumax3 can automatically shift the magnetization so that the simulation "window" stays centered on a region of interest. Shifting is done to keep a freely chosen magnetization component nearly zero. E.g.
ext_centerWall(0)
ext_rmSurfaceCharge(0, -1, 1)
TableAdd(TotalShift)
will try to keep mx (component 0, counting from 0) close to zero. If desired, one can override which "new" magnetization is inserted from the sides by setting ShiftMagL and ShiftMagR, though the default behaviour is usually OK.

Shift(int)

Shifts the simulation by +1/-1 cells along X

examples: [15]

ShiftGeom

Whether Shift() acts on geometry

ShiftM

Whether Shift() acts on magnetization

examples: [15]

ShiftMagL

Upon shift, insert this magnetization from the left

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

ShiftMagR

Upon shift, insert this magnetization from the right

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

examples: [15]

ShiftRegions

Whether Shift() acts on regions

TotalShift

Amount by which the simulation has been shifted (m).


Extensions

Extensions are extra functionalities that are not officially supported. They are aimed at rather specific problems and may not work as expected for your particular situation. Their API and functionality may change in future releases.

ext_bubbledist

Bubble traveled distance (m)

methods: Average( )   Get( )  

ext_bubblepos

Bubble core position (m)

methods: Average( )   Get( )  

ext_bubblespeed

Bubble velocity (m/s)

methods: Average( )   Get( )  

ext_centerWall(int)

centerWall(c) shifts m after each step to keep m_c close to zero

examples: [10] [11]

ext_corepos

Vortex core position (x,y) + polarization (z) (m)

methods: Average( )   Get( )  

ext_dwpos

Position of the simulation window while following a domain wall (m)

methods: Average( )   Get( )  

examples: [11]

ext_dwspeed

Speed of the simulation window while following a domain wall (m/s)

methods: Average( )   Get( )  

ext_dwtilt

PMA domain wall tilt (rad)

methods: Average( )   Get( )  

ext_EnableUnsafe()

Allow potentially unsafe features, at your own risk.

ext_makegrains(float64, int, int)

Voronoi tessellation (grain size, num regions, random seed)

examples: [12] [15]

ext_rmSurfaceCharge(int, float64, float64)

Compensate magnetic charges on the left and right sides of an in-plane magnetized wire. Arguments: region, mx on left and right side, resp.

examples: [11]

ext_ScaleExchange(int, int, float64)

Re-scales exchange coupling between two regions.

examples: [12] [13] [15]

ext_topologicalcharge

2D topological charge

methods: Average( )   Get( )  

ext_topologicalchargedensity

2D topological charge density m·(∂m/∂x × ∂m/∂y) (1/m2)

methods: Average( )   Region( int )   Set( Slice )  

EnableDemag

Enables/disables demag (default=true)

Expect(string, float64, float64, float64)

Used for automated tests: checks if a value is close enough to the expected value

ExpectV(string, data.Vector, data.Vector, float64)

Used for automated tests: checks if a vector is close enough to the expected value


Misc

Other available functions.

abs(float64) float64

acos(float64) float64

acosh(float64) float64

asin(float64) float64

asinh(float64) float64

atan(float64) float64

atan2(float64, float64) float64

atanh(float64) float64

cbrt(float64) float64

ceil(float64) float64

cos(float64) float64

examples: [6] [13] [14]

cosh(float64) float64

DemagAccuracy

Controls accuracy of demag kernel

DisableSlonczewskiTorque

Disables Slonczewski torque (default=false)

DisableZhangLiTorque

Disables Zhang-Li torque (default=false)

DoPrecess

Enables LL precession (default=true)

DotProduct(Quantity, Quantity) *dotProduct

Dot product of two vector quantities

methods: Average( )  

DUMP

OutputFormat = DUMP sets binary DUMP output

erf(float64) float64

erfc(float64) float64

Exit()

Exit from the program

exp(float64) float64

examples: [15]

exp2(float64) float64

expm1(float64) float64

false

floor(float64) float64

Flush()

Flush all pending output to disk.

gamma(float64) float64

GammaLL

Gyromagnetic ratio in rad/Ts

heaviside(float64) float64

hypot(float64, float64) float64

ilogb(float64) int

examples: [2]

inf

examples: [4] [7] [11]

isInf(float64, int) bool

isNaN(float64) bool

j0(float64) float64

j1(float64) float64

jn(int, float64) float64

ldexp(float64, int) float64

log(float64) float64

examples: [2] [4]

log10(float64) float64

log1p(float64) float64

log2(float64) float64

logb(float64) float64

examples: [2]

max(float64, float64) float64

examples: [3] [12]

min(float64, float64) float64

examples: [3] [6]

mod(float64, float64) float64

Mu0

Permeability of vacuum (Tm/A)

examples: [2]

NewScalarMask(int, int, int) Slice

Makes a 3D array of scalars

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  

NewVectorMask(int, int, int) Slice

Makes a 3D array of vectors

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  

examples: [15]

norm(float64) float64

Standard normal distribution

examples: [5] [12]

now() time.Time

Returns the current time

methods: Add( time.Duration )   AddDate( int int int )   After( time.Time )   Before( time.Time )   Clock( )   Date( )   Day( )   Equal( time.Time )   Format( string )   GobEncode( )   Hour( )   ISOWeek( )   In( *time.Location )   IsZero( )   Local( )   Location( )   MarshalBinary( )   MarshalJSON( )   MarshalText( )   Minute( )   Month( )   Nanosecond( )   Round( time.Duration )   Second( )   Sub( time.Time )   Truncate( time.Duration )   UTC( )   Unix( )   UnixNano( )   Weekday( )   Year( )   YearDay( )   Zone( )  

pi

examples: [4] [5] [6] [11] [13] [14] [15]

pow(float64, float64) float64

examples: [2] [15]

pow10(int) float64

rand() float64

Random number between 0 and 1

examples: [3] [5] [12] [15]

randExp() float64

Exponentially distributed random number between 0 and +inf, mean=1

randInt(int) int

Random non-negative integer

randNorm() float64

Standard normal random number

examples: [12]

randSeed(int)

Sets the random number seed

remainder(float64, float64) float64

Sign(float64) float64

Signum function

sin(float64) float64

examples: [5] [6] [13] [14] [15]

sinc(float64) float64

since(time.Time) time.Duration

Returns the time elapsed since argument

methods: Hours( )   Minutes( )   Nanoseconds( )   Seconds( )  

sinh(float64) float64

sqrt(float64) float64

examples: [2]

tan(float64) float64

tanh(float64) float64

ThermSeed(int)

Set a random seed for thermal noise

true

trunc(float64) float64

Vector(float64, float64, float64) data.Vector

Constructs a vector with given components

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

examples: [1] [3] [5] [7] [10] [11] [12] [14] [15]

y0(float64) float64

y1(float64) float64

yn(int, float64) float64

3-3.11.1/doc/static/download310.html000066400000000000000000000143441503346766200167360ustar00rootroot00000000000000 mumax3

Warning! These are downloads for mumax3.10, which is no longer supported. If you would like to use mumax3, we strongly recommend using mumax3.11.



Prerequisites

To run mumax3.10 you need
  • An NVIDIA GPU with a compute capability of at least 3.0
  • An up-to-date NVIDIA driver (compatible versions given below)
  • Optional: gnuplot for plots in the web GUI

Download and installation

Select the platform and the NVIDIA driver for which you want to download mumax3.

After downloading and unpacking the archive, you will have a mumax3 executable which is ready to be used. Note that mumax3 is a command line application, so it is a good idea to add the directory containing the mumax3 executable to the PATH environment variable.

Building mumax3.10 from the source

The source code of mumax3.10 as well as the build instructions for linux can be found on github. 3-3.11.1/doc/static/gpus.svg000066400000000000000000001125701503346766200155140ustar00rootroot00000000000000 Gnuplot Produced by GNUPLOT 5.4 patchlevel 8 0 200 400 600 800 1000 1200 1400 1600 1800 2000 OOMMF (CPU) 940MX MX250 MX150 GTX 860M GTX 1050 (mobile) GTX 1050Ti (mobile) RTX 2050 (mobile) GTX 1650 (mobile) GTX 970 GTX 1060 GTX 1060 (mobile) GTX 980 Quadro M5000 GTX 1660 GTX 1070 GTX 1080 Tesla T4 Tesla M40 GTX 980 Ti RTX 2000 Ada RTX 3050 Ti Quadro P5000 Tesla P40 GTX 1660 Ti GTX TITAN X (Pascal) GTX 1080 Ti RTX 2060 RTX 2070 RTX 4060 Ti RTX A4000 (ECC On) GTX TITAN Xp Tesla P100 RTX 2060 SUPER RTX 2080 Tesla P100 SXM2 RTX 3080 (mobile) RTX 2080 SUPER RTX 2080 Ti RTX 3060 Ti TITAN V RTX 4070 RTX 2080 Ti OC TITAN RTX RTX 3070 Ti Lite Hash Rate RTX A6000 (ECC On) A40 V100-SXM2-32GB RTX 3080 RTX A6000 (ECC Off) RTX 3080 (12Gb) RTX 3090 A100 RTX 4090 RTX 5090 H100 throughput (M cells/s) "oommf4M.txt" u (0):(4*$1**2 * $2 /$3/1e6):xtic("OOMMF (CPU)") 3-3.11.1/doc/static/header.js000066400000000000000000000016331503346766200156000ustar00rootroot00000000000000function mod(n, m) { return ((n % m) + m) % m } function xmas() { // Show xmas icon from Christmas eve until the first weekend starts after Jan 4th var now = new Date(); var year = now.getFullYear(); var xmasBegin = new Date(year, 11, 24); // December 24th 0:00:00, so day of Christmas eve var xmasEnd = new Date(year, 0, 5); // January 5th 0:00:00, earliest moment to end xmas var xmasEnd = new Date(year, 0, xmasEnd.getDate() + mod(6 - xmasEnd.getDay(), 7)); // Weekend after xmasEnd (Fri/sat midnight) var its_xmas = now >= xmasBegin || now <= xmasEnd; var xmas_img_file = its_xmas ? 
"nimble-cubes128-xmas.png" : "nimble-cubes128.png"; document.getElementById("header_img").setAttribute("src", xmas_img_file); document.querySelector("link[rel~='icon']").href = xmas_img_file; } xmas() setInterval(xmas, 5 * 60 * 1000) // Repeat every 5 minutes to update icon if page is left open3-3.11.1/doc/static/mfm.svg000066400000000000000000000222061503346766200153110ustar00rootroot00000000000000 image/svg+xml MFMLift MFMDipole - + 3-3.11.1/doc/static/nimble-cubes128-xmas.png000066400000000000000000000331421503346766200202660ustar00rootroot00000000000000PNG  IHDR>asBIT|d pHYs  tEXtSoftwarewww.inkscape.org< IDATxyeeuY>SCWU2HD P71A3cnGsMxss?M4W%"C4S7=5W:]?}ihνyy=]ֶ~`9^˼̦\n â$aov–`\w>ȿ x\:%x~{~y('\ _ w9Ws*Ci;|>xxvW9T>(fiuBak߅o:[מr)o.<^G[rR)῟ O9!ز}N={""%Jޝ`1t.Ym =Qk8rR(o"蜫5AEpg`\e6e+a-.(Z ^,Cek>ǯ6;^Y ׷ #NR0oϻ 7ky)ipkͰs4S p3|^(Z^IXKOJUw p#}PDڡp>\^v5MG\o~ tZ^`r +#sWyosn\ z-*жG_1kz) p,|X5k-g/6m+*hs%vr ^dR=t]ާg +hJtaWspB28s='YRşnnR˟")wq$ 2D|ŧ~MىL+WKe`5XoWqje; nf$unNdL=)^1|B>ޖcQxoJa?oBP8@#>.59 K Rũ  L) `sp8rLr< ғmр=n׾Z pܻ' z#w2}p#.^+>8V+>8\ xᜃkkO===g}v+](@_X='R^ V^^~%-<,pm7dUŋ.=yubOOOme;. pu|cfʛ;:}–?===.իW_988Ў;}6>82>li\3evbc/ٖpg/a=s ? p#[پ f70}_3:K@?+U~哟|IO/^~7o ټ\HMx i"1z(b'?_p\E^YU7§VşHoBmh±+;EtfM~n|c7W2ox1SG L5egp_Ŵ_烖J  7W]=5|o&ҏZ ?.̦d3W\74A W\qťt?n}V,59qc"UǻQ|*0y݌s$AD?Y^.[NRW,¦1 _uۙ*IP.iii9]WuJQ01ӦZ#\ QO)?Ml~ z뛚bll Ueūs~G1ap\c.QVM PFaAضv'^&,ViVh[špb(Ԁ]tt,*Jc3\vہ/1*NԸN_Ӕy6c[×]oHV\uջ^ ]|;pV[s&1~gm ?7nl~>MҨg`3_w݂3>9Gz8imm=~ p%\{YϢ kt$^]B2222211Aq8L/: X+Og])z衟 d{5?NeO?'R}28/n힞wk4%MS2q|8-nÆ ?<4bF ׬9cc-gS޽ktrٔfي_ Q;\0~Jߐ)A`R!qS鉞z=v_} >k˖]>˷pF8gS;g"`֎÷_~5;wʻ6lիOYhQI8[l}kwq?\};:Om1:JЋ 7IԳۍ'յq'_u'^׿{S?0>>s=?[.>׿Z[nʫ;g>ï};la~ƎfŒZ 20J%Jm3oREx%ie㌗n 0~obwE6l+_?MӲ_?g ??G…Tj5rbV{ sK |+^+|ncᣏKO ^2@o2- kisW3168qjo+V-9]dxz|INtJ: i)@kk۫R0Y|?á^of/|J w}t<yR0'j~% uQ*rW\q>5A9~K^W`7۬#6aQkӴ% Gg?H{v_>㓟9SO?9.~{?Q޹JЬ$Ho8fbp`zHyE_(J ';I|37n$ħOvtP a[ʇ#a݌B.Ӣ:j7|i6?9i:]!9嬳w4 ܿ}7#~ .xخ;}n J[y͖޳N?"s<'Q <^|)k9y>?«"j7xGv?quױ"hyq[XCg.ںun0ʴSeC\'*8IƶRgYr%Wp?d]\|y/9?qW@nJO~R7>;;`! 
ΡF#u4t6Ѐ/= osu>_Ȫ׾U=HWS߰I^5YW!VQ**jj0bA=zEP,k_`tt=M,^_d媓z^$}}\jHu3+SUj;9<Q#$Dή7+*1o+_R.|[i{RݓUa/g[[O%Eq48ԡT2`1j1b!4vJm/nhjcǎ \O0Tx M=D+S3ں;oԫS\ Q4jEpPTSX,qmgbbnN_C7Z6xk#VsuqG]RVIc AxAԬK\I.{׭c!"k Gu 3_[1 60ApMZDP!01" "OPs<l@)ݝwBH$AuuFFF^Ib#ҽٔFI Ruy3[ j` 0 狌M011xB. ¼Ao XETQF'5VD%ő 1!FQ!a1q29V@d*vR,sV{?VQשVzrLR9dQ^J:;;e˖]U޸q :oW-VUC[tm ;clkPODINA8%%@8 J a>OPA1  :] U1Sh%Qbw(}|;n%_h W$ cccG6)B!Lz/bP1P˦05YBY"*1b $R OE@p4iҘ0!3J(jDR(G]I})Rظ! K.?};n"ֈe+~۲G -mZZf|s7mڴord CoӇVT244aV Bbq"ܜ'F-I3'g78z8ڙĒb0 LE#${/87zZ!Z\WbOM掏yb_Jk$? .P1"VX0> gT@$FD1XbP@(0 ǣOo}^.& 4wkXRTT H0?S\TC y+$ơjz7A75ˌLNAa'J HcGQZr_e`hp;{'ol{Q锌 [:~q<ϝgLWT,^ދ'&]|q= !Hq >g)f"15uf j5%dŨ0f%&EELH-iS;{5O$`1#.r*ݝmtwpۗ>߮F_7l9i[uy7UHw!c0|2V-wZ,"Y/PAbl hppqbQ\F:%Jj|({` +> ̡BAXH)#fF&*r$.RK!!b 0DTPc,jegL[\e>6HA-x-9-ne)Jm?Dςn GbR橱޴kz<:#VHԂ2wл<氚\SC@`rjdZJ(햾ݔ"GKU= :;Cz:iio!!A64bq~Jr{%t/\3uJ` R78WBK0ĩ1<<оA:z)rX@`"bQ_iFw(bo+"S&p`xOmgM qLSL5h U BT$MƇdxtgY# ebdRŊ6= 3"M[k Y\t Ijpz ! axY7F1Ҵv!kť$ClT=ZK$25`b|ɱQgi],Јck0p3D"0OH%" CZ;vSjXKPc1NQfBJ`>KKruR{ڮVQW0Rp{!qJH :,60FNb1GJHJ ШיԨTU Zz5(tBF&<HRbޮNZJEJ-y-yr }RMP tY zXمޢ vԇYo & ,}YHÆͶL*s<efũP ;q5(#w\4;^-o+=/d s{hv(WWqƠ /ѽO2<88,RKBB0QH.̓CrEKT )55G VrēPKk@*1Qc,U Ҩg>ŧV#xA4`dbhЦ摟9+A1*C*!YR&5,Qf $}R,ӔDR-a`1b0Ɛ! 
>s&FLHOg %/ދ .UU{P+@Am5ԓ[}>yh& {BlbM}L[[o0IDATXȱ7XMZ겝 3.`OgXA{ٞ]VZLk{ Q8_7 2]K*|dmUţ \fĐhs>1R[ mS, 6}iRvz[O 0bPT&=m4je=ĈJ+]hi\ej8tv7~sH2Ρ&\GjY#>e4P_bqix:LI q\Śqx륂=-+ck * ފc2mΉB\C{s" T9*}HpyT1a`Ig]EKK LHɘ)8 DhF<ؔžT!+ g0dk:e):.+nqvt@OD1Ri!% sIG`д *uF(!ͱpA'QJȮTj=4މ< 4ТZ y(FCڌE &C:C 3hb3eibU>XCxW5;QH?gk)I GgW :45c`"L&dFV5QO LlXELF)GpSTI%UxߤP+Yx["_Dꌏu___,qi[=7WʓcOy4E1\HWw'V}{=W|6pY՟5 =K9VX8ُs~hEhl$q|Zk jb1΂hwXKb\y0b myܔi/"$g _0 v04D\>7j7k`S?@ר{%c$YDQM,Xt3sɎIQ)ł6Xu\@K1XI@`9|:QSuP)Z.n3\0O(G>Nly9?V}I7k, :E DcV?g$8 P4sN $$ޕHD%iEj1Yns0l;VF8cJb (qx_|v ZMc.3ZVm%,\ޮnvJ1@L|m?20%)Kg`+a# gg2'(Faqֻ-[h^ dZgrx0&pr9n܂'`cv0^jQe!@6FLC1%ήܶttp.T@JP-bb$U |zj6xB -h$ 4Yx%ٜ Y{UߧmAmi̾ajqbZ WUٕm ON)rw ;s=ztllL[֯']ֶ{K sCBbPȕ0d=gqi58]El*3;,W D< 7λBcth'γL%BVRk cUzJi"|HM#;頭-AnpC[]~+v M/xjc֯_V`Fӧv2}86XT\S޽E2FCM3Iuyu&7ӹldD}{X0Љ m+346FKL+߹\%ZZ{[ػɳbY.[@xqytAjdqW֭Cn_l:k֯?AM{~99- RcI\M}mdG@[%1b%ಮ.=9CYC("m$Lx1zSyf>y yL$DC!Zi+x/(hkP,I(w5+(coPmdp$ Q Q"c%J Š?|cg*(:#D͊4G 0I1( F(r]4H @2lOnq1>1ʁ=9,t#z{)E ;Xn{僚:RTX;:|<@59Ndаd0(`k%x5 97޻:Z/87 F12U"J#N888Dur ) J4/_ORLP4rщq*Xɑ/XZ]IZ"ADkki+ZЉU'.E"O ħZZ%#$j<DBk[.ebr4_wu}i羳f9a*«zξ]t6U{INӄxl I/ F̾{/Sk4HAL k۷bG21Fh^Y߃C(vvRŨ(pKiok8CCxb4DZ8)BHa$&I RUk$!1W-hb<b~cQzKdZ<%$η[#5P۸F`Ͼ=4*~Kaي,衷.:z(͑Mnxct ŕmx(F]v URB0ȆUdemg^0g|;{=vqscdIփ>ئ *h6&(А #ПZdFԀ r! NkOQ7\`A dz6b !2I&Ts.f 2KqU* BQ4͂R WLP/15 $BhJ$i#:' Vx<;LN??+LHPX9`o GOw'vˇY3uLVW|1'}g $74YE@ .C+Q?ΩLM5-QFߘ_NebU1(FT,e.7LXxùʰm>?lTpumriERl/X1`bM5,mE+(:m՗]H7J} ƗS/d}d~SRIZ B$!ZC O2ʹT88Rgw#NIc#i6C8_=V;=eʓ?uLj$s)Fe1 "!HQTLF6OK%ʠf#pr6|Lvڝ"iVS_&68a4q'Z8iLQ%$d+[opGgע'Fv 6L22[Xe6ª=ha˺:[; $DAcCҴMŁų%~Ơ=4#pdB>y\j tQ"8X16^u6WdSť#Z 'Y`qVA u:K2}ORA,;w6AWTlMv Qm2pD8Mټ {cYkjA$F}lONrCFւ}4]ض|Y_Y[9*׿}=5I!!!q1F 4+cu8B*Vb,cIQ$JKWl WdMhp;3[4R@9:;8)ߑ ѵnM?rpSL>yW=xqa6z; {;ִI!M-wsMaD0|:_B +xdêQRR#4ƉҖӖժ[wl^z9XO?t ::om8jի2Ov5/ RC@$4 Cqm0&+3 4PB*:SLLM&!ZtRgǎ?'֭G:^2;iO?8 Nk_Z+YqZ'TI]>Ҽ4 <feYF7r.HDVcRFF}ks;O?mMM/ /֯Tnt$eG9a .&\ ;S*Im$s{ & e#xm Vw䢀0-ֲabl߭N?mpdx>-+PuxabKGD pHYs  tIME UXC IDATxkmUc̹{s}=Z%$xHXGa1E(JJ*@9ATS. 
L('VGƑ@H"Pd![Rw~yk9΅ vUg/:{ݵ֜s1ߘ_o'GO_yWLϽ/ y۟}~C$]w DpղzL155r_q9~$W[W~ۿ:N~pgWr 7Ș'%}Go~x7py2[ނE"J1=wI{_;D=PK}뫱x~7ǟQsO1 $Dm!ӊ]jr:7&>Gwͅo$eBN= b&@]4{{Ls R_z_/ott?!/_㭯&/Or"dI)4$Z Ji֘fal4Δ4w\V|tG$N%BH5vHJA#IoXTkjZp57rww݅mO&D"be̗o?C!jg[8_y+¯o3oYϞ O柿7?|;7<-Т,M$O4d M ќ&^ xK@+V2RڄDuD0dLׄQhm›܉ր@ȮV+N쥑O|sg㟊m}-MxI_o}bgS}?}tc|{9?bm=y-N#23{ʼY8q|ql== 9^3 TaJ +i1 +6blw<- FuA tˉ͊~wzo`-xsԳl˳LU9h%(+]!:-0_r!i8qW|%'䎯:y杏F<N zO|k~ ݽ 섏0H(iIhmZ&D+:n&B+N9ا6]V9ͩ!*(֐&i*DE[Y%:AQ"W6ѱ-] Abf E?PJ]7Ϗd>u7*ePMP`m=eJ&!Kuw: 7V>?GY8b9|r͜>'[KbOй',6LD0T-4k>Җ8~|M6/ 'RDQ0p § #oTO@cyT9VPi(K5y$N0J2j ;Kx2wL$͔)LЂF&pUg͸Iwz V"O|O\>1WŢ-Q˖;TLz "#B2>#UUʅKrz{;'H)hse*h ^r@slśeiVctVR"%A K4H\8T^vV B!ioTqj+Q kka_cl}b0/&46cA%C#H0 E&q1c6j8~~.VQ@^^UVi Agx,͖"8m\2갑ybbVE!5TfyXfZedc0HhV+^IO|)NOǏwN @ BӠ`U*NeH#$bDjX3 Ͷhiu/ l3ֆԎ-YѺ/WtB4 CquT;4AV$4P`B}/oMqgnMB2׎=g;]>} 41:JsȤȸ4lP, t7S#A⢈@@q&gS3Np8yeQ 7cs #-hPmX q 3Hмtf*a63\O~,V':"Fk e(72ivxk|??\!MHUaN/p؃ 2W} 7|߹3\LMOMADA2J"(%HBq2FJY' ]`kyF6$R4먱Jd1&A" hjYɲYcÆ]]D̅iδp4% 㒋%6ftTj)23D^S#~s r$7\ 7?g67QOFwpu=MB<!47@9{[gstm/gė sh2cRaV<+y!ilHgd=Z81g(BHO=}nӣEDnp3QM neT@ZµAkԔHl{҄fsT㥡I&+ݠ\i;x^d/U%Ig2Z:0 XCJ/G0ӉQ3E](1'E ^]~@] 2.AԄ7Cx=t$WX311D4#i5YI*` :c2H˞)ZEҌ{Z8A HEr,26AQ+ 4*iI$HH2^#Y;ssVIYgR %-:z1! sr:43B:WʼDCiցIқ_~oJwŞMDgd}lV mzS/r!%8i=-/O<~2obU"  FW s4e%HEp 0rcCef'[wj. Au!b\=H\HkL`ɨEL6htCǮRn4g^v:07|ӷL9( ωsyYa C&uLM hCŹheW iFg$ + tcN/ & $I ẸSs椞味\u T/R1! j+N#Q$&G(a8FSLD(eqa|}ǟ@} |̠5~v7̸RF2"-?+X$Au Koc +L譑XӒᲡK3=WEEs#\abM`5sɋO%9S#X!~HĀDk=hY=@$B)$$Z-J4spC¶@)3UGI q`'0Z*hR0K&ԞF!T)MpˆAHQQ-:v:={><AiSuZ bФ@ ׊R"ZIO}jynܻz'_zK] w8W$ [ ,j&G)c\4xMV鳢lɧ ԑZHD@}-ZjDdQ\gth ϊQ+x3w-Ag\b7kjQ8xJan5(>hN!ЂKBUH{bEv6+xMW׼}N$xwG/a}QTm9~$'~I6]vV #Kvl֑kO#+P#X50*T1UHR+0i&%ԫU3vvv p QiHxbc EۢDj=n#.*9&Z jjI1.IsI毻YI^{/?+<|79s).@Vl~:*~ADAҥ.C ǿps 2@W[B{N4 3Sam> ӄ/l-\kxϓ`s8:aH:zHTt,Ꜷ$^:$f2@ @[OnB[ƵP! ԉVm9fña R0n@gj@uHtEBB CuDZeD"0 EZ}s$/~h?v|;C!sfCBii#T !DPmbdMM\>U-{1~!iV'tuIiC >QJCbl@Sg$)12p@CW+ulxɋ^W}w/~<(>s\wjN$Ncڊ/.3-Q3;)!+Ҫ/w6YK9Թm3 a,IX5EgBX3doERd˙qrxO3ES! uijAdA% kX%0/ [ ! 
M o3R6!g ЮzwM[ucyyaT%S mM=PUi6҉C;$5"*{AK@ƽ, c@}T*sQDь0#%(R[C4@ Dǰ*n3ppiGf I|-ȣ_=́ eTJmsyf鉡r478p0gO<''F1. uX1>G?koUV끼6K@( 7!TC} L7 w !CJu.1M 8I^-AC&*4f^ qM0]0uKk…b(kg-vO$(+B p[fkH T%9+:i7)C0t1Xd|a.>rQo?\/_&-8GRˈm,w2,f"w BSBlA5?z4ؓl3'_KoWwX(b"E|@'&JSȝsGKEi ݴb`o'ql}݊ř"%:U𠕀Bt X=pyf G{DQ7S.2Hm( Ngf R8P ( E,B !,\qrxҽ|}";c+K]ĝ: 5} BkR9^AE֎Fg.zczu\![ IDATO}^WwЌŊ ,C@dVD!#P2+DٗN[G&FnJ 9ZeBXΘRj%Pvo;Ÿ ^U>Gu?dvԔy\I-P+PuDcx Ȇ#m)%7F⸝ >z>S"e1R,QmqRfq0 v)P 'njYf @ѥABC(ށ!f-TD:ֺ0a= 4ș T&RY1i6VhdЌr5sȌ8ۉL>qLxˋ! HfFh`(kZ \^QE]2tL[rtTp㡢Ӗ:-\ c{kv4wh:-Xp.m=I㈂DVtĕW&mVX Y&a~PJ€ɸCM1̍/|u?tM BW˄ #L!fL|3(:ct/^=V{`ӌKڊ""q|M87'?|ɞh iaߛ,RXܗN@Lc3bm4IA[`$H$6}DZ3{Vc}rok`Zu.a\Ehy9`A ⫷" VpmDhlPvrgo gNѯ+xeEXYC_gKmrV udE'rIa"`fmٓ;k%w2yYYQgJ!E%k&XP,MR}MCQAmY\I(1!y&dxar7\PB%S'$Q+9+"_fne/ٰM^NR$sW8BxqBHD-5rTM͈/<IvZpINؠ-Q.u06#:𞄒BjxTTfAU17#HQ߂/{FvT>X?oHIfVw!Q \I [5fD.!h۠Kk,o 8ʌEŒ"H,L&eFH ;ti2K?f1erS lez #)7ty=>O::G.T$VTo0W(J@Gĩm"K!<(uFb7^)}*KlۥYFձHx1JkiatUd:n J=YgVC!Ɍ ,X0kAihR.ޅ)A!PmD_u.9=i M04+#B%C̸N"q2ƻNZ{iax =J(ݏǞ'Zhޔ-)^8BX*-m$R!tVF r@ nEFq㷀0]xEp+U.NHs3Zn~H̑V pox*4Brݥw-.n)O%U[XDn+E-MG C0! 
T/XNU!B[T$WTjm,W{~R HP\{^t).x=B%jY-ݰڈV2BP~@Ivx ISj"K#Rb߻N[CqLV40EpE13E|^Hj@cEffu;#b4jdfsG5 ztXeo i&($}e￟ѹ׾)h+`X`pG#s d/~,$Q 4լzrS٢tQeI=q*5L.NkKTd'$E-0\>'%U %:n1 %Gxl)9F6fKTI$ \Et+>Ѻo(@< p,#QGHJ6wHx+%̵zc^_l[l˫/Yy.C{^0%wo: nK%M *-!2'L.@O:<=qeΟU|Ҽ#5>;-oh3*cdcB6$#J2T$¶hSX?S S‵ø-:-uNc M̍%cVm #Y+U61K+ejQGκI8sՊ4hnȥíŰĶԂ$PsF$P},0ao9m_{FNS'm(=:`-[9'ӳ}mNKM.qDEIިD/Cr]{G Tf.q0P$zk([PP-#V4sw1xZ }rzXI""[H',$ C÷`1 WhCB^7!c|\mE5ַߚJ#-c+Hs7}IkdK,\h1n 2§7|ѿ O`j*b.-mI^8ʌ=E+jl<^*͸K7y޳Od؄GK:&fJ3uu'65Nyim)OEנ2bGY Ƹuk9:ieVRxxҊMpq }ۂ4Ʀ:sܐ@9d13Rjc/]|XAUoNBb5os_΂e''ݎ|{oGS?aDBԨZY""ų4lEmE{g2~s/}w 7 n\bk`Z=&M* %B+mF)n+E 䩁hv6T4%4!$D[MϦLM9zww'Y<#b*2/ۣ&0c>Q ֭m5Hr4+_xWp!r"Yknqk68 +[ t{G.SY{/~GƐXrCfKϴdTQBeJFոnnS{<_x-:>_/A14mw ֑0wj=PP>^Pihh[|5t1#j TvCa~zzFP\ِH)pI(ǨѴ-3ꊎhj9kД9Ϗ$݆pMKsv9u*sӔRX}nxw/ǞGEU# | E$\ m# ĆaV ᨤ=1pB%]>W2g=FlH+0/Hm$4.f TJ=o0oAjC؄Y&M20PINJ bqsczu ^eAF"ϣNhXSZqdí%,+GwF[?FMN} {̶Ht K%KhKh#6ⵋFGT لem:a,td ꡬY^g_O^Mosl::3Sֶٌ6"y3lj sh81_lQ`q/3Xc}<?<@VRJb& D%6JLB 9' >Cs̫A b!Bb)+ Zqzwq'sCq[~*;Y *&$.ɸd*Xـ&.)&d}\6qtݸyí\;+60dy,%n8=|9ƍymԉT01.Yav69{( w#) ӲG<`'9[6$Y}hqGq\1Ϩ=MWvʊZV ݬ{ב?(%3.)??7^J3$r!yPK$·RrY^փ rڰ[$ex9Gϸ~&mօ  谶D@`Q(dٴ5KhÊz-V'Gܹd}{ŭCcBv9{Hʏ̕or*VǷF…K_%ϕ- l:MlNMਐ"x]+ ̔ au:$޵+Y#h"Ez4Էм u'v 0Ӥ+-4d)ˆiȞ3'A!j-t6#SChB{}M9fyMffSQ w,Y_㪲>Zs%GkK+cTIEbخI#DkaDžoIׯ=7:l_9wx>{w|ے KbT j,[h2ҜCQ9!KE K%:2K#\(NdmZRA#" -&d]Gz8Az}$oyǮ\̣.ѭo-W\TuMi<[kfmƐ=<.H5FI2Vʔ(ҵFr蓼q֚6;g{7WduF,ze'PN/Xt[vYM3-%UM]0_B.[s|g8 ;y9$ dsk@KI $TЉ" D`=x=mXx`xBtx`l iK[j JcqIIyO_Gwg>{3xr.xx+ 3L6 ;b*Q IDATRHa+i,l6NnCOr9YvYpo ..:Ϝb6=ߡ$E#Bc% p'Ts0V3&/ؐ%3ѧBV2AkÒ@k$HhT5\BmIkZ6I%3̣e;TBS!$!mb<ŲL ZסHO;:$ sׇ77s EQ'J,A]ih\qFT48'XAuQ\HaS _X&7ۈMܮ '+X C&oϐ6GK#<ȿbu1%I~%*=3uX^h >4Nq>|eڈS-['#}֌y_F4z1Їm;<r<6lOݞ}!u|~_fvgt;l.m8_[<Ç!ƦbMɩL UF u`:-uկ Ї˧Uz>n_ۼǩyD0)ZSlZs4Xx069nr:Fkx gJQJS7x|$% q>f:jYX "M\W{=q PČ&W ~tpYXs12T%ԍ<}^+oxYGjZ5le9)LyFj%nPPu M"")mxhЇ;32gE#I Uc$aڋ~+"M -+:`tHH+m$OH2CTPa aR&)ց 'TbP9k$vl+zF\3mЀ]Aڏ :sv|o?AYlZ@h% %i8e-V$- oyMc3QG=Z@҇Q@N3,ԎeUJҰn@J "qﲢɈѨS[#u4RVaHP0H1iQXI&IVI RGH %)9TǐV;6LAM`=_$8 0|BqCmj546 l9GKtWlQ Xk1PIIQY޸ߝ 
^ob=Z}D5ف\xl;7^6ҔL#o oC{q6%4&HYi5jLss2ځ(9u ;SQ?{} y擦!i$-9YS 6bɨI>CL"Y|QSoPO06(:&Nذ9[bӚB|J n(*5-꧴ lsLnN?VDOsO]J9GI+#bRx4q~pv1_y6T"(`2eњiXu v&+I%Bg 0&! rqA򆬎mdK@7HQ0LS]`h1L.߃ʀ5lx"qbf/x NEPƼDYrx`}pAFו< 2ZBl#StM3Ȋ:vȂ/mXH8MJF5)9I(dF8HJ7qa~grI#Ԃ[CnH-hqP`sZTҼuḥgw7dTJ(c3e(DP)/9 5 UZ)ܦH/FdžyE_Q o%aa :͂W)} UL" [ #I%S_/·Pwgӎ5 ^8fy%ŋl[|W51>ja \FTFhdGB1 dew& eVBN,!u.V|9j< ( ҈Mo-H'94('Cfؠ$t:G94Id"/Bx'?dcT^99!:y2M֓|Cg"q~UBl*Ԕ:&d z:UQA3O4`)hdm"'YGxjXHW*{3 1MT謣TAl<> 2CC*E+.LAc"Ț %j$nQDǿO=%ݔǽ&L--pZ|aIҨnlqߙ(_VpC4^&2lv'ܒatpGc49TDJ*-{ j3 w U9dDI"p8hSۉc}ZTq˹FoC@݊"Ҝk6,,fB^ѡ#|YCV;t7\0]֑_K.I_QH}PTY?x6=a)'Hj3< naϾn ~s}͟z/?3_rw~{P~$pQq\S,1>H_̕$!]z \>V56~![DĊ`BPt\:NI2 "ܷYfa&GYIOB&ZP@Fã(慚6(ju뗖,ҒH+YFdm<&b@VABF*7q6ݸqsr3>=zqJ 8}nΐ䉆0ZPs`fzd{J .8RۇO̿]D;H0*Yýӗ̌Dn@'*6D|=/W^=$r kS0bfJays\|A5ȚBbq=jhԕLs抷ns~/>>:6n$ShA6\y6\u!;݂{v;zΞ;{>O=o/\pb'㻼rڣ% B// !c06X2zȭM]?{:m]Da>Y@-r7 >u$}*5kB{̹3%^7x3ѯ3LTn-968w\~[Xs7e{{gѺtrVU;´Z5DG~IC7,'mL1<T&!Y)C GYc}yI8&w~ſ8w(i˷D&>yMCI{xnQ)e;Gy+tn<'%o%EgX3IE tc̀ Ҵ@Ż 1IhHMMkqߥlge בv) ڑ^6֭@;s|W9Q[б~_jOux qɤ*4Q,hGe @UJtlNNo_ KǽeП+~O+]ayVJ73"LAGJrL-lencMU^˼Ru e7Wuh3D7 3VWA,E,~'\{ȭ$!=< T&97oεKkn_a[O@2o+uKS/VNl߳+lmoMj&P"pG1QBY8Zp)zkH8ՎVGF+:B R3M@*r_Dl"K?w; ;3OI+yNd$U AY/qЗ?r:}J6mip˟|/y[NTPKzul@N![$X1R[wO=Y˜Nu=w"J` hQI%$HJhNq$1Ud:q0M!X]#Z /5>%kef)^უÉC8qe"] hJ껞 >;zÿ?sͿG5?=L;凧'(:i\4&p-Y ʀ[ϽRg .ZxF5g<yYUN`g8X0;L*%एvOg|ChMԃ/q51JX +zFF+)jWjbݛ^^2*tbyvԘ%_#Fȥ!z%9,^cp뢊 ":RBkM/1A*ٙ%lf>7Ry:$ZUܙT-WV-XZZ# >{}NF5X;:iҮhg]TQ,WjF8IvqRlyJk~4"CFd@rD$4'gq+⤀|x6khE&9|<fL4|6M"\!t̊tvKjm=xr"qD &J1Q !Qz2o(nTZ4g -7xdJrS X 8)mŏ9s7{>u/)Odm;]޳ k_|POveŠ˞J M_ #9(!>oɤJpݥj~9R 04~pYܷ$<U?\S"#|kWj,XG$F zziېSbn]/+׋U3H)e0e+n!uJU(V3%W!Grط ̀q#NY V!xr5e9Tu >dg$%2pAFCp<᧖|x7$c _TUZ#+ÇVxAV<q6Zҽ_ !:+_)IY'ٿA+Lӕ+,8_Q=1g䖜=IV! 
gy\'9PWHyj:\"d*\/S)K0VU 07Bd"e IBbbT/A)bIB RK{88ă>J7<8I,OXZ*^2Ɉ_cnNsrb㎏_j'㮽NW~I;+~ d3goeJъ (!@EtSbZBG)7HzԖ]2 $lHK р릨Xq*KR&S3ˋP}&⥐C܂ؾg)/jF 8ROBɚpSS0?(?1?sW_Ė-'>] Ե7gGgxx`}Vc*Ê ?[c2FNI}OPמdMSR3s k48íHOTHPITb)r3˫-?W\&ԒHN̆- DzϾ?Z-BHڎ7oJB~`8崗v 7q&oRSݓkBSOD߳'ׁ9;2Sh\G.&y@: 0)dHEe;,:z%^vPAuZ;a‚~Xčf2bQo2mPTlldxhgi8'-Ppl8e3vkt=|5rUr%!+<9JCgu8 .:8 SD`+ JɰgrHT>rן5g#5ɕWuHLt}n,t$s 'Uݹ>Ϲ/5迖 M\~yp|O_Z's#ϺmfiqԦю,Yl,C 8QCjQb/ך_,bW.͠`_E6 yӥdS).@ έ@Jk,v^qt )9vKHά}fJXҩ D첑65hĐ0kdPu*ħoe5beWG"J jF–р}=}\GWQDed8c6C:mXv YIyFA2JVA d\)5NaF .P0&)=r|/5W*02>lh.^9k> 4hm:Y'p^[xKvd&B.K$J1wV˦R/AW;Jh*+,4,=dd#7eU-O'k%s[_ w͛iKbrtҦ)48`@W .5h2Oy85^a<GՀsv`a;>UI{9 '?Lw뷜yg^C8yIì]&kGPouQUIǓO.xhP0zÀ'3/C2mG>W?zdwkgV>Vߐ~AM! kk픓/6pY[W󤴊"2d[dHÁţs7hFy`&__wgv9KKGQx1N_{6XL;w?Yzzvk@i?`}~ti@3'ԅv׾'~M׾B+u ~Keo;TO|k+O͒_߽6ߟ[\u1^oC>]_84?nn&f؟_G8^_a}Z|U3!P_ߜOIENDB`3-3.11.1/doc/static/style.css000066400000000000000000000043601503346766200156640ustar00rootroot00000000000000body { margin-left: 10%; margin-right: 10%; margin-top: 2em; font-family: sans-serif; } h1 { color: #000088 } h2 { font-size: 18px; color: #000088 } h3 { font-size: 15px; font-weight: normal; color: #000088 } table { border: "10"; } hr { border-style: none; border-top: 1px solid #CCCCCC; margin-top: 10px; margin-bottom: 10px } a { color: #375EAB; text-decoration: none; } div { margin-left: 20px; margin-top: 10px; margin-bottom: 20px; } div#footer { color: #555555 } pre { margin-left: 50px; margin-top: 10px; margin-bottom: 20px; color: #000044; font-size: 12px; background-color: #EEEEFF; padding: 10px; } figure { margin-left: 2px; margin-top: 2px; margin-bottom: 2px; padding: 2px; } figcaption { color: #555555 } #header_img:hover { transform:scale(1.05); } select { font-family: monospace; white-space: pre; font-size: 1.2em; } .oldlink { font-size: 8px; } #deprecated-page { font-size: 16px; color: red; font-weight: bold; } #download-old { display: block; position: fixed; left: 0px; bottom: 0px; width: 10%; box-sizing: border-box; padding: 5px; margin: 0; 
text-align: center; } .api-section-link { padding-top: 0.2ex; padding-bottom: 0.2ex; padding-left: 10px } .api-container { display: flex; margin: 0px; padding: 0px; } .api-menu { float: left; display: block; width: 250px; background-color: #E0E0E0; padding: 20px; margin: 0px; } #api-to-top { margin-bottom: auto; position: sticky; top: 0; /* Sticks to the top of the viewport when scrolling */ background-color: #fff; padding: 10px; text-align: center; cursor: pointer; opacity: 0; border-radius: 0 0 10px 10px; transition: opacity 0.3s ease-in-out; } #api-to-top.visible { opacity: 1; } .api-to-top-arrow { font-size: 20px; /* Make the arrow slightly larger than text */ line-height: 1; vertical-align: middle; margin: 0 5px; /* Add spacing around the arrow */ } .api-content { float: left; flex: 1; min-width: 400px" } .api-entry { display:inline; margin: 0px; }3-3.11.1/doc/static/web1.png000066400000000000000000004240231503346766200153600ustar00rootroot00000000000000PNG  IHDRL9sBIT|d pHYs tEXtSoftwarewww.inkscape.org< IDATx}x\}?fF_lٖeacqMݔ7:.ǥ4-7<4ɡ M P\JS$W)u) c!me[F̞׺ޣї-ٲe[y={{hRZ4J)5<A΀fgA4LT{) Lknjԅ}&<|-ݻwA9K.=-mݺND̓)Ӗx;^NX;&A8ϙ={Io<žOWȧ% d=pǒUV yFcc}QןL'x|:">% |Gol v \MHZE^.<ϔ'$SěHw4ٞLl}ѢE'** \@ݻ\UUuB&DΉ+ɖ%|"W.fj4;듨XAm:;;K˾)T*5d[:%9p!'d ¹x$<000Dq ֹk (-k)'BN"D4uO݉niiQH|Uee2ƨNdTWWJZ1Z[ZN^ CkJ=ϳAXuȀuǸk1*moovƌf4'ͬׯgjx؟'r)=<=Vz<)N]]]]J=gY:sNR}*Vk(Rpu]u]M!r) MȖo+0Z[ :cDZ}M&m:6Q͸$)^[WWgƛ>YZ:II+}֮];y\ 8O濼.bvKv>j[XXW_3 ;ޫP[7a39_;tU3ל_o_Cx{~0ق $ʆ& Xkx q(Һ8NIku>מ &D__ãι>JfD:h4Flii)Nzx͝;+ iN0 ɔG?QeXlِu^~oaȍ YQ~~E?cpa-[O KCo8yfxu|yvE :!oG7, ”ڀKm€8N)Ͱ1Fő}Wzi ϝ;'@KKKi.#ΆNI3f0d]ss33fhg;rJ U0 &T--=0+Q t\֬\/4tw kdf7nլ^e+>to9zBѬysH gc* 8n.IwIZ)@{彤d֭z̙ttt(uIZ%sGaxF:f[P__ϲe(d|Q(28_ ;_|cWs͚KЕTi@=}yʳ&M7ަn\f=aȜ%X7GÖ r8% 8kF)Qv]kmL[6htE:83UQQyAf͚EEC9fJz\P|^͘1r'PǙt4'Lq p>@!C8rW\;}[ǺujŻ8zGŽt`^ W[?KŬ:^: hYW||z[03BAq];$̰tyOPT\ '. 
jƌy<% tR8PZ٩0TIwm:SmP f `գb&{/AM-/K;Vk~iK~ `{7xa-l1Zs ͘üM<s/jyUmV}3WO4s93rJAqv1NGg'SVlQ)k-I::AI9C| 0w\l[[[Kؽ{Ztq̢קQ P((k<$EX|H7u똵(%M}e1TcUnb¹sz=<.]v|Jx'ͼ̨pmΦض%\ ӸA*t|~hk841({J8H<҈4t!G ܨO~$*oiA3(Wm.@k|Jk]Z9}%cAfΜ9cͦ4fuOT"Cvt  Lxj|I'@3_B2Li8'a͞Ք[<)Lt pF_K-u|8u+wzu:-#ZYYICBShAsR/'}GzC1 (<}_?`}}}Opd2xy.  IoY$X|IP9ۢ(R qC\U3rk:F)т p.GVIq.\J)Uz]>(4 p\M{G<1U٧  %T3ZEťTty;Z䛴6\7D N{lA3I[N&䛌9'hRpѐт ¹FYǫěl=׆1+Ot! gR{X-OSd8pRBֲ &cw p6P SS:&5oT켾wr!3V '"XA>r&ʜ9shjj:yAA%J彞ODٽ'ox{qq,Y28/yG]q֓~^+ JÇ(M&B¦(Om{Y|Ûv\@㭴砱(GWoӍ,ؾRDQ4)nuq~~^=<݂ Gu9Yg/1}rh+<ykrs|Xa\Z_yޗV8Tcv:\Sm'oˮ?ăr۸<p.^Jp˃wMkg|! +n}|Cz%_}C4+pמV̦,G7 7_0#W*ddEM7qYHK}ț=KiVXȊ[sƘ:Oе(g ){?nF)®j.]:X_s nqKa"\B*CֹEO0Ŵ>MiJs9NE˲&O}!9qN;ݼ.mv7Wаb- GFI6_̖?FOY[n=sl GQ4nLVTKKl޼eGc9޼y37nivA .t‹y~׏ Rx7އ($/A+_yh=ˁ7fƍ+ܟ:ADhRŠ6hcm1杉avn޼dzvynjj;䪫U"aA& ^›?q'n\AU K*`C^#p ix[ՓeѨwQd:冬;Hׯ{#|5{O!;OvmtyۥED`(~O9>ھ{tԧ>Zt%n;|%=0??u{[|??4m*4Ҳ%)4}er,*f~3-\x=\a .n nw׬h]+uk Ŗ|o0e' &uV,.Kd'A .x 5|?bF?LAH+n߱͗]Szzªu^|5\@5ܽu ]гn!֭s]F`0&y/e3ˤmʎW“ANi![?3<͒E }_3U-'sXTNj>QٱRƣuܚ-=A ime]uwZ3HMd, o1iDzֲpI۟ ӗ%sŋx⩮SAe.q‚ p"A)B$, SHXA  L"aAA"D‚ 0EAa 7ɜ IDAT!A)B$, SHXA +/_cn?uk# KG]wOY p.!hAA"D!h`sLuA戄C"᠂U03-_Ai>K=Ʊ#mKctPS7kboK:9CcS_] <M\T\iH,1gns6X>~W9o*kO_[DyKf|퉯T]i4pH|~0l}%8oj&?EY=rxX$|G6~7sl4ߦj|5r]7 O$I0,`IL}=u[nH[ tXanՙi#,sQW[7d@M]90M,l_۵,X~3 o7w6~o`ŭ[ jzf#݄߳}+hA-_׳\x{릺 \2t:DzaBZMƿOol/8!֮X;A ^`C .llGWeٮ '>w3IYAN[YbڞGڳzܽٱ󼰧k' GLΝ{6 {G6PTUigmwcTqT,XCVQKap klPQq{F> p~0m%+b pJVvwMt=-kkʻujhlQ #t  L"aAA"D‚ 0EL6aarik=1:\jY HX8meQ޹,FAт 0EAat0}0=|K_G}ŗrYk9#<|Znp)v1]kOmAp ;صSXQ0y/\ה7MUUp- |>^7,sgUS]sA.PDgCޘ*z:X|KLd9ԗ96rI-Wy  FbU3_Ų 3RwA 94(Ls>rT^u{rVl˸ᓿöE=/]5յf&dwfl3Z[u7~~ܪ9-5usT{KKJ{)3vRPmmf}G3ֳ瑛ࡻ?^AN%a÷4|d]]x8oz5ӬמLJZ67i=<u-MF JcRLStt+oY|)pm[ë`Р5{ phZLڇV?ƾ 2$X&x}-+W|tv/mgX4WV3|oyW G~2xx-4sϊ-[h8/x=ȿa>uӢ!;eӆETn.ÎU~?Ku>e Sqnֻv:ʦͬl6в*Vḽ_c+nY8յfI]'>9>eH?R~bӰv2һ^"[>u+.usC]nnNn8aZ2íkϣ+AQ%U3CU4-giC?{x ;F=7^n%vv9EHA8^p|j+7Qհ+~mm/ӵλ~fzGcZɆ^onO=kc1]g& K@y#O~~[q!yf^jc[yMӰ7wuҫy^q52] )0т p!A)B$, SĴk_Zqu;XI  [w L9s:]";~]ANAa w? 
]S]A"a6vQzLנV<\&,{^U-vrt&LrA#mgz:ۧ5=d*39sOeꪐC;l|β]{x\AvW?ky#MVAD" gox#aFszo//ҜviX. Ms;uRT[]mESK!QUK.)={)S+/x~ZO!08}?3wzfGXnG5~m,k!<>>uf/kU;^b"nF;SvȡkfQS7l c;c-/=l i\yA0,6ds9tU4 ˮ {i˶gg"O~{Ȟm|,?.b/pԣ2̒ff!̑'\0frٮ,  p!3"a]/#<έ[y>oSao8tt-{Uo=Os>C|\wxu_-l )ӃY̪Mzٝ{~uxU\u&Z4͗/_?27m\5Ajpv/t/gB4M 1{WTG A@OZw9l_Znk+7Yˮk6YBa2^n]~{~q6p.moݟ{o<43ܴ,ٲ]-]f]?^zgaNA3bt7,k}ߛM{+nk"]~V8 7`I#m y+\v}7۟##h_v{9oS*v1 _L/ pߌ a誧~]6~'^o63Ǖt \PLt  CAa !A)B$, SHXA  L"aAA"D‚ 0EAa !A)B$, SHXA  L"aAA"D‚ 0EAapS7X3`|;`(>,`5X[\F+TlEUWUgW#wSm r*Vh@al/x@ʂSP 8/z7J H),ME 8.sB`|B[<!!(k}l"K-HEȢ\*6ߧw"ՇU9U=Ĩey~_myU6?TCv/y`N*_XToieQw룸\w%GuoTZQ(+[(6.',s&^wΆACE* Ah)}OkU('EYXea:U AmR\eӏU!k nTcӨRAi%Y4qC7f[af4,V7~)Q`E,UԲ 7N8Vɇb!uA55(jR ^ xM.uwEۑ-PXewT=z tkyC2T,m4.=?K(\mQ:\~.Zp8ҳE+pB1k:6j!0CI JT)( &PFc6æ>fЯ-^r9G c;(iĄY$%Eh#kq>zVE>CHBޢ`bhDwc#^1R* p/F(S\3^&̾U`ZU(`[8'[ h(d 68c7Q!t9Ư%rұ `@tS a2";Y׊K\P#XjWIA*i~X*LI噎5֦x9=U!bPX q qTj(`vKF,yRoE\oO+bFSW(hE)@:b9Ghr*Ci|bN%-U (llrSS&aCGpq#Uri[82:Γ:8K*-ڪ HٟЃj '˴@\FQyTYQ/şӞi-ዻb_M3.R8}sJ SRˠ㝃!ʥҪ%@`;#2qJ6x$RTuP*,E^µ)sZVE*kqexjN;UhkX0:T)x3:"" gzl,_ϽGVzPXTkBB.826 p^)!%%2X҇`,9KDqI٢xQ`$C\( u1Q~--+!6: 8y,9t%TVҕ:c0?G.Ħxt/#?&ƩYD: !2JK%&h5԰飇#hwCY tHZ0XPdդWhs35I)(E]6ž%RY˨Ea IsCL>TmhNW8RPb 9m>LMD1"MRI6Zކ-}NH(  3ۣOcBmR@KORcI~= ^" 1A'̬-F.^?',Cw7G<o¬Y}Xc@{\_/OI(K5Ř-c,x?;Z[<*\U\vBSk HK-(~c?*j-XE,EYZtkKiÛG!7cf!m-Z޷rtgҸ͚Qܮ UT=i3)y(gl}, (39л+O LEb?SGGn&Eɲ;u5}E#a4C39R0r8}ᣩA1qM|~ :GcKYuDYT6¡v_97,ö,aR qІIK`R 3/㶤^ hPuU IDAT2rz>uJYZΞVr4q֜jf8{aN[(V\AO^> X1obE­#~}ugݷ,8{;*`N$,CMsէ!c[t[GčThtI`Di&舀Yz:I+#y 5=\yJS᫗`Kmpwe$Ӯ##\WϑlE7Aki^;C#,oMxɄl]-Qf9V:0nVMGRQFڞ'9ҭ= \\Cm6UU  7geWuȂW-&%Si p5o9NIh{ZTX5f/Zl7/^?JE4PuP=됅|$#/`$`Fa4N z5.Gq<\ yL؍1hT@wXK0gH'0AT8 20!Jbmx#&v]s/qWؒ̏)lH`vs?ahj(`CSasvb#᜾By=,|V K1d nCf}kmw\gÀނBYt$ .wXԁ5䰆,)&, ]2186PpzEoS~7Ns8?頋`Y:kiЪ|eQJ {ePNOf`VܟgY"#,VkFcx255҅FvD-l[ "HYÔ|E;Tx\9$H$V P{P_)1#6-s5:nfr $*" |bvn!XBђC-L~Zu[@g6JK]=Wft}]^ࡪ|ω+<񢂗Axief^xr 쬂}ڿ'sQ"@Xiu$#kUT| gn]G6:B&\A^權n&x`yQxtdp`AM7[010pPYnE1Rg~:Uzݷ\W,?! 
{<2R ^j|%cHV-\ r2"S7F=l* dȘd]N u cƥT!#s w[-`.tJ>R(R77*F_5sOj:#/2RZ#neNaB0W*BΕ ж\nI=Լi&칷y}JePތomE5h6u 8{^- H89W\Y8KIЫנpb~ExO M@"!뤥^n`Pw"@na L' as B0:jN"Gv$7h 8&hڗhlcMaeKײp5Y@P%0,iE "=s-YXpC$bvf"b!jA‹)oN#XYCkrצ$2tFQgQwW+iCXeXP΁+T'84'hi:EC@5W\uWaUjhu*T+a)љ \TJ1Ɠ\k `bf- `"@zbcXy a"c!hЈxpn4@v%?LCh_`c[v}o0^k#l[v8xcsГ7]EYo,clY1T|"]͂'9eMFXaG/2VˆiG. I'#<,5a!}`p7?Ř VdJ`z*/2Fz`nrB Uptd1$^j-X GA4\"x[FxCƺm7C5kt2`m]LBp>H*W҈k= LKl 'i ҍP#h޹$w =+J#`8R=>f +d TE}גVdTXtQ ڽ iZ(&)X?b:X1(24?Bg\'!]7ZMrQnTgdy.. {ZQledE?qRݮٜy;9&:۳1YJd`Q[.tY}{wd9/\!NH$YMz`H#>ÉpdZ9Ͳb>k[I8  &Qŀr9* @]|îDݾVx(S! mVi]l4R˙fW\ZB215kaH-7Uۑ(pxD{Ԋq!JR$#ݫ$ -eQ<,sI`8`/g ⥯"KTB眩PɈ6^^5t5Ç`n^u t֢~jriMwJ4-^yHXĪIp h"!Cš&^TI1>LcQCdp|'IY$[ӳ\``us +`H u7b߶Xe7/3ҿB޳d֡Ft&V$i v-x&Ŭ|*kħu0c ,:Tb:H.Q Cq֎Pg%LE[Kpg,'mtu^>]4 p/?PWwWhw]vP!Vk Bh$zs %`}$)kKNlM0t4݊Y$޸ߺ"za0T$h<5eMa+"5ùx,bJX), a9"rThCFA7 њ 0qө$FR0 t ,ixEHu>x N \Srr di?O+z zwQvK#F=$x&R}z\(Z+ōֵI&[ͪ%80$3 R{TzPm=j^(n?zG$BSKx~Rԡ/<%H`ToVzJGAA6\;_^4AN7Z;qVq ^lg3mд8#oN]4T+ zFv9>(5W3}rr+(9h +o+Z^2W*r0Am˜S_ptN3Nc/5W 9@Z)#^Gܳ##ۄ5iPz[Uuk'K׺6'3m,OS h\3{Knbk2(I2.+syX1t߳PB0,. bs)W8'c,Cʒd:@@nwR$P$I}0{Q Y7 )b""t‹ڷ0p9b*[7E{LKN <3G~ ^sEe+ڗ _~ gR8t/~Va 'W$W4WPa<#|τǿB@&QN2OAS$,+ IY Б;<ˈBۿm.FՌ=YtZf6k%$ j0džj &n[;/F0{\ [:j]o] F l|\wsfk Bvh>dXF.o)c;Qڏ.) ):e& b_*.}e {"7}\^dTxj._pX]2013rrLl/+xЌ}Lj2j,!_IR3kL RӚ_/{ߴ8)lqa0^k#,Ps4l9ѫ&?_3lЄVpy6WXͅ;yl4"8L\Bl'(9YG?_ ]FC}_1;+?_AŇLD%-#>:"Mpj =qG ,*2d~{ B%h4e0#Hmն٫(% :@k܂,#r&hV Q:9GHǎAj,=;Zz234 %d;H9^Wf.设3dd =ts/?Y.Oh/ -1X*T.Xf'7.heNWa={=ĨY7!d8rfSA%pĎE&ؘM5GmP㤯kGGs$ ) txSĆ+b %z6%!?~ptT2eыdpјw 8"ܣR̀W]OdѾvA1(~3NNw(iz%!2_߆UшnϽ c[M\#gK ]9b_ (*>|nB+%;8ts?DZ1܁PVBexv;?Ac<~Aρ)v>j$l>;8q,Kˏ$`M?$T'`2(vp;71?%TclN>Cܼ۟嚠]1chOԽ= aQ|G0b _FO 1Z, 9k#:B"Ur= gWәwp% ~)6qm݌ YXn.=?Pg5]*QB0ͩ ~Cj^4 'HgY!O6) _;FG:_o<ٴ PbG|OtzT`@q8ɜ?{`Ek{HxbO`>Ԧ"}m7_T9Cj %e}>;$)!8ukfMLvص̺'丈A1'`去d9!8* :~߶_Rrޝr*=N.~IM`ƊϚG ˛L үknxϗ. 
O2boc} ]HG RprZ^uq+"t,OPXv7ZCuIkG<~xhls!υ,9?t.>p&޾{+_21'm4Əw& HW{> xZ2`dteF]borMZOlhO!& 7I85)Ǹ;sJZĜU2Sѽf|m&09vNs$?DJ"!+7o%=C&t/;;)b4Oax\RF8#>B+@y`5Zd;SN\EGfjs1yoph?o 5A#]/Ⱐ%<tUw dTk:nqLs=+{%a7Zc9xOV ]~!ȏxN/GWo Zs7/ѿ٫dOS */*~`KcUVp?1LE@-/^qspcaUA}7/8ݩ?1=Pre&X:9⌊z vE?0i IDATbq d1L:ce>Sbݵz z1# X$C< m;o?a^12;8qmQv¶n]ا/РŔC#ڗ&ã8 uZn(uۻ &xW360lh"XcIT8 Ȇ+ҡل0@0(8 i)4S7sns1Ա%a3| (nC8ˎsosڬMeb ђ? s|bcN枈g5ZnZ=eWmu[W-4P/X2Q Q w;Ar 9]X(i,~Xݖ*q5AXж1ϝVS0P\ ݃%]XaxM)7e?0]p1=Lbr?<]~^C#f t ސFMx<40\;k,6QdMݯ(G,Y̻!I96鏻Pƴ (q)[K Aqf /g%+`m_h= ve3V>uu_\57En v{!mUw*8kxﭒW+.]$=[2&5YHmoRSjm"ZNFKHox!tL"A 9}[SHj) Cnxz`EPȄ\tF f1$6[L# 97A/޲& QM5tJ[eAYaOcQI+O1 ,3o4QܮE&^)f(FԤȶoI*˱;_xsBkHW *3}8GF%ŏn *3Čo^j)oq:L)Ĩ]'$}{G[ݴ31^V"gq`S\w$iNUdХY ,(& !,#C F8 =QuHH݂o+o9L8&: wިƘY6;D;vg߂Lk3Y-ScfW僚7nLW[gZw?xԒ&3P=6+cRI음/8C,jp`pCA,n@@nM=*H` [վh"!A6Ca4 b HTv Mhc,'lyY-ZHX`"=!WF̀B)0Qd{)_!KMx(7vP-=pdP *V"̰\r3Wf]ؒkS8/RPQ:x]+mU2q]{w"CLhT_kw-.p#MKLe,7RS;$WB4[E;嘩L,uLX227r2[b:}es 8W K=Aڴ|j L02}Ў;3Dic7¿ ̟(#"u9 \hXTe/Q1C$.J}+0rWM2h/Z w&֝YG[װ~Op>OilG=*$xrWsgGO9`6?슽iFY:\+Lice$?0-p[Rv;hem=mudK$hJ`:/1ђ6OeZjP6#Hyl^yi^&{` C G%/uòjUaZF@c ҷ$ߺrFfQ4n:y^o[szl>/%꒑_[oG1(Ӏݭ13|t7-+7r 6Psc\:\sN+񜋲{˲ +ҠsӼih OU)*iR r=f+lM>ƄlE)!+nLyKML'1/t43$[ĭ؆+o12оsPZLC+"*kя-mS76R.o>* *V2q 8Vb $* k7B/PEcGļ9 04|pt[ ZE_xf1]j .P1N.~UW4D׆Y8ggƒ'#]7TBxDz+"L<V  D7%Eʂn[KфEH$˒gzFއP\p/ H*oI<+${E>µʹk>o^o2"_WMY|AcE {`E1nO-7w,Mi?6\&2bȊ+$BW382;{ 2Ը!E ' 4dZ2][k6^k# dñd}pHPLH^}T3N-_?Z? 
mI7"75ӒpIPV*8T|9ՀEN UNTH<OiABs[xC9 Qt#nsA&;JzKi?Ѫ(ˀ#ty4!1fv3D HW/ (4Jƣ  w8FER׬(Y9ՋaG/af;'n.C #A c%]c Oh ?nzi?09/kdPFeY\Uw.Q R7ٿ,Zcc'K8}rJZ,M"KO󴣼Le%.av$I3飯1g#g%n4EA̾þ\ʌV h4,gdpwcg/#}*Mq s s3U=?gʱf9VfOG-pNcW=Y:m׌#oW+( B3 sړߨQx0an%$+UR=2.󼤕6 Q,vLEehz 8E!3%Aq|:uiwhL :f΀iɓR-5#B`)'Zm"kwL]2`C680Svd(qj^ǔQ<՟q3> Oc[4 K-bc9xw*'KuV,g_>O'-?5o|`s['45-goθeL,=n:$Ŵh9kv0 -5agw݉< Y3>=c,p(ΰ4mQs+Æ4ȮE<w1l(떳"{@ĊLM% @.fc&9ʏIT%]$NYKHs;)󝒾pbP0Yrky =V6Sc xۧ` h>m"_licI1 3`L;ʵG̞0c  LC-1sR V[$$\r7Цѡ]FDcnskjr bIꌒ%%yM{]oPK#4WJ# |͍8j~,g[GiuKxD V2,YL]I0 "/H$7$ olMmAvV^.>(>̈ {KPt'5]B0+c!ڞqH b5"ZFcjB @6aA0-O_䜐Cvpڿa|ۻr7;/KEGWsH2#+_;NN/Z8>7@lHZ NOw:.yP!#.;MxѼFo<@R80TEj84/U~GD97uK ٶnz_@a=0l:Zu٤Fle˲2Kl[>'Q +(^q45"RnP/#!s1j lɀTۄNkaуx]9w*BJu$ՔKBBRaM7C W-k+>k0CmX. |%e+9zUCe$5*\JvP9!n [nj7uZM×MC=mvM_}gƖ {>А02~*%?2THY&kjzY϶$g*7C>_gfԜB T-|e_cgd i!m3X?(&lq#?\{ڈrK4܎Ѱ^W+VYOna|u쏾+m(1{ >a7'~\ϓs;dL?9ߝ eԲ>W5zrSP_1no&|k` i #<:9;7x6G5 xmKV$}~G=`L`_/T98Nf 1rN{"{7']O"`ijww̋=Pl䌮8O1꓆d3VP][ 2_ÌrXF{ XOR9xɖ'K~]|Ґ)rRHֺJF9ͽr@˾Zd0hnOh¦wBdWh0FsLyG|Ie;a-uoI-Lb:TFì:]]-jng ]sF7!9v=/]"N.׹c.ar#x^M:zu|Ȥx -_quϱ=R,udX:/O|df!xi %|9G-(L&15J/XZ-霧k4`k]dOhڜ{<YGRa hl9xK'OI3FFyսv-pIEhb] IDAT {%w\ U_$5GpL=% <^[6GT[]?H̫}4;86%W8Q%jP!7\ : CaM^lZ&bG`bf߬s2١mwDk4H>huHVSoP3mt]m>\ $$|f9mfir-,)0r-`-wi_g#kѬj\˿9J$0)5~c+VŽB;Wr/* UG.<;GO*r2!j<,}Nc$<*46ۊkEÍ]ǝ;9orfUr0<3%I2G#/ـ 06!}r8mLS؋jz|Z〾ٌRt%?K霁 mZYÀLs 5FJJHɩoG^<[f%GE |2Y`ZtNޘ=Cꜳ}WQe$&Mr .8X~)$"}ҙ p+}q:+p 9m$l~̺b}ҭ] @nKl!گ9Õ+#' dKS[T>>̰D‹6Yk&! MT*n>@0o3Nʵ=ǸV;J͔u EF:J؃>8>lN[Uw}(XcNX9= l'l[$T,&8<;;9 (EM-ͩXU! 
FUw\hQPJ !Fw5bYɸ{~sB0%owB5 ٿpD(CB'֡*#A]$oAY$90&I|i;9?J[a= О^L ꣳe:V:6% -{$Fkg6\`qӠ- $ȑsL~^G0+yCG> _nacȷ8*$L3Ca CX"א-s٣izPAO:\mN)CSxұk-{.^з>XZ6|/ 3a{bbOj$iZ s!.U*ͩ hfئmX||ϖ4l$, js /D噫ȡ`;cf J$t75vGý*/I ` ~ cXcV|6uX211 oRL_}eu)v',fECkvvxGcH .eEE,D~wюnU1{wh}!eQ=+lI/@PiQAO7{BAA}-y,8ZI5DAuo ;I?}l;z; %HTSC&R[|h޶a(Ydu-a(6 ^#4mdbB0asc껲$rRkv!a|xCQ>M]Kh[okdў7bൡs`v4POLdtπ,ʸ!%Z1B9EVCȐRǶ'9SaՔ]l6؅;R"*V O;lOjH9-@ Q $5Đ^l^@kg )-KPK`+Bi̜`H,ۅ|=CJm]6FE(67y+>Ξwݱ[)P a2Ȑ93{Nk.U@FUȮ%ݾܖxs}k15rcyxW4#z*RB(~i}$ĉb=QZjŷ$$ w Vۡ UY ]s{b*MhyF_50AJ~n%=KKݾ7`{K2RXZ YaĿc.cKD:VlSbۊ5°fat}c-\UbD G.\PpC؇2);Yvz %*nρsJZ#Ki@c}|KlmK!Mٴ7Xء'dj^f j\ځ4m"IwV7a֍,v͖?ť&5!e]ԫlЄH{"]ArJi2Žz4=?g/Nq.s6ڑ-!'%grwXWA-g_0 GHݱ:V<2<%w j->īhM153NJBĄa1}3 @f _]DU7 ) bR L =.,rdKhK]yu 40JZ6C80l4zYPۨ\3n+t14k2*CEp2QpF `GxBx3w"o yBٌ[c*t˂dcYd)G\W_S{7Ŝ=msó%+=T"Sr[Lo<}. jXh]uwe1+(1wEz0\Cx#AʕKɸNE9aMDm2\OZ)Ax0|E+L[הD+k@ӞT.?D`ZïTd$T20UN1S4LQRKhx3pyPwy*$ &<7` }n+tDž4i5c"aR-"G_֟CO#~4c$Zw-H@ڄ]jImW 3˃37-6]K{G!tM !!!J 8tє(Y[8]~C"NFUJY1B9pNpL&Eٝ]!bg?F%u1` Hr kڣ#VGcրݖNEz)=EgbƏ]m :nE.C8ڐq4i{vviDs$xCP]oF#JT}S(BcF#(-w  9>珧~'|,0(73\M'$(Nu!:_GrмlH[JIWc>-d;H!8:xA`P@Huqiae< 8#  85B4PL8#U}8b;4NVyw]=XW+ҶxMc07W 9!&8's26nu\4 Rtk#ݡ S)2(L!+q(myYW{c^egf~+!_HxZ$EP׊`+iAӪqt{jEC^7wogv Wq j#a{CTP,g/_z> \pb/kfq;+c5NG]MXԑ!"dGrAaۜBwB}ۭz%rXPUógst7v\@dCxqx[SW8Cȭp8AtX8LׁgQwT,)Jf'00ѠMSXhKCS M%̦0ԟ_\ hyֵNThN2< ]C@$CovU).İg E l.%օUqR$7la<*>n E,^ز*X '1|7E Nfz4|l*v21٭lvKD[T 9zxLr뚲;. 
EhCg_<}Qz4֝MEm:hJ-ݯy_wS֠%'Tc ʦ*]}jMd-G 0-y+j^[ո2R0>M pC׵ aΣzJO[)[ZhߠI4n`֯q >D| IWcFc\ \;(#,dxRax3H?NA+BH8Ó-LJ^HS*\N=38yg u#u i_iE0]ٶ` d)X'լ̶Qs!&Ɛ -4pVILτCe*P`-w8c0V=:iIV{Rz'3%Y"4Gy`|E{9mDR3{2ޱ4., d?z47-ehMtrv!FI2x%- C] M:p`=ĥD#(~!Pw#`R+,9&iIx>2wu8"lB \aCvcpBХ~`I}5{-'z4L R1n$23>5qnv=jG-iQΩyXà~)-_sf {'[[[J6l1.BHx~bgdH8fRm],U[/9<~'l8E#|%<~^qpԒ:O6X/fHKF'ϳW%gOh jUۆ&t0;׀w_7=L!1{8Z"fvBtFI7=#haƆ A"׷8Ų#!{ Wi5hR Rg`3L>$7 #ѱ_E}gHCׯY/ AuY;=d^bW͔N؎%5# cr SL -l{tzIʫ}Ã(Ǵ-<5rtk}lؚ*(ELx5Uı3h;¨l)3kp |LsQ#x zjLKIfX|k\Z,C2VV 8F$lzqD/ N&+ @C$Aaj:d 3FWvZ8 ~dq wcޜs+B!%2o֝^l8;rpP M2>75KE[#_cɏgQNMQi+G'%a2+R0$2`$$\h(9 fnvg>ScL.~}qi JB$]LJQ'ߴ!!olI23jY (w*^s?󆟆 6ZNyg/Fǯh@ ׮%dr/Q^Ӵ_GG3>ȚNč5'蓆 ')y]af٦aph8ePئB6p?Yqd'}!>S]Cz)ܶ\77UY)O!+9=\IXɒ1 Юb+)vL}+K(!d>BAY %AFO THu@bbsH1]}7:-}vy cF]\/K.]US2 c+RwW.BxqE;RO1 I/h&s.lFDHMΦy'#-oBj.ol@Ք$̗/+xv쥼3,X+}|˵"[Sט+n}1.6X2h Ė+B^tCEfϹ;YjiWEg((.PG-{!]xl:yĖJ+5::6m7}tc 1W=h"*c0!Rr:fYƖ}i%k& mE 0Yl,xܜdRL[p˦S8ׂ&d2ƪzza#fV] iYb;V!窹;X`aP;p ^_k#T"9*_!^\uRJ(ejA}خ`Lx%0;𥅈-~C<,y8_tez)/^T\:%miNuLw@e2$"\o3#[؄rqDȎRJӊ;cqє^=o_ IHFPY 8-@2&sc򈃳dY xݬ>m7'g|4(%ų3=mtVVI_0g~m8vaز`ꮍB.{~[O@.@2i:cl_+Hlމ wl[]ﻴzs8 ?ohv^'P[&!R>>Ovq528{ bJ+& nKyTѠEօTnBQas[ qBۖdqvp{)NB[4!aNh<}0*Jvo}4xil؄FmI1vbc2aw'2R1bTls `o;`CŽ\EBz(׉sĆ.&rm[o!Aê$K5:w*& mb|0nzr#hF$6Ȯ#?3"5# Ɨ yc +eܘvEG56ogjZb?joc f JOn=~@0QήsZ5( uJ %B^a]Ʊp*Bh ~ "G!E%`E(!9*tQjA"d$WRnox<ra(<<;7 :qOa?޴IH{B713\.\pDRV#3 4gUWuݕwf~)?yDT5&f@n" WgK=_ eHΥH%K)" PDC{dI$z1]x*%&K[8|aU߬߬q{GLm @/r2ZSGRsu$p ңp IJePEi55( ty؆4oX#\jQ+ Զ= ,-trעFTaٰٽ +vntv&#ƟF&"d+L*zC,_Ac9\p#Qa\Yg̐3ʦLU 2"LIx~23`۬KT a4ySJ<5ǜG%gL<i4DT#\ǘ]B觞>ww[/ӯvP˲44 ec?Q~8zl 2eE94(g̅49BN:%(X_d͇6\܅=SKFX@ίMGx5_bb΍kouIMpB}15 qEa %BR\cCLȷiz3)#17c`+0ڲ&)U%gViEdɷZ\#9K:in-R*#PKec#/Ġj8lhT$Ib[B=6<{ &[͛0ds;;8ol%D(h9".2vLo^e|yOIũq,W'_ *ECJ]ڰfUz 1t/}TWcE A#\=c* ҒA(粕QݪHFO֋,,vT$[!A5ɱ1ҬQ$-v2C`0hIǑ^A#Sg\3DIg\gкE{+Kz_`ۿ@~ˏ)7$A7ƒSr4Y| <5r+on65aSˬ]^map_rѱ[ MZɹ%EVɓ1$2>)МeLmI0HRBz)=QV1MV{S}P65v=C)pcR>822\ V? 
E3j5Fҙʙe̓{!FmH07vTk\sLը@0SG,߁LE wJx(/ \(F6$ X#W Hh]=.Q9=K}M8/Z& ON f .cULUz`-" RK4T8/~qljzy3*L֨4cwy}1cOx^}mՑ$&cXΠ> (O=g\T%G/28~ D:8yur9>T,H[\LuJ EbqzyM}:ヌ80]8kVM옸si VG%kM#W"xrjJ^6x>_8X=x ,u ԗ`Fo[8 oȃR,6U^8]JƆ+Y'c`2Vἐafs+^bWgH0 oq(xj[):2d>ZVA(<ׇ[v.f#4 ({5/ޑڅdDs5\<( ^N"M^|UFZ'8 2~VVb R.y鬡+GWlRPf4&vY7p)rt!)6T_Z7IcaGG9WO=W=&`eecNLTXn8њLt>gq)v7ؤ;p@ 3f"z$} dTLd {-7GaTW?Q+-`bQrӚ"ϰH[喠[xPf2?,0,."m"|u͋W5+ɇE[ՠjڜq '5W']a,pf#]l1"\"JX2i*׾WIpdoPtF,#5 !Hbǰ4Et؞.:Jׂ >2'CAU.ٷ #R-Wǁ%8c,ŋk _ܯ N_qM/Z>qycg!M NxLa2Qf3O5**p"^I(Ғ{ B ݠqLdTNۚ 1\P2XK-&y.6L )̈GYfy6\(0dm=_+yoY5\ԓ5p,!:fӨXWmCr (Sk\PVtYڤoJAuпf1g0;%U=9P3?f!ꊉ4iè0f+{T;9v|dhO {-&4t A.,= N4?V%#2&S"xte*⛔s8<;z<$EV,W [8ޢh?Q[jf4vZDֆy .rG^U\~U~ mP<',Bwӊ/+n(օ:Rʤ\sAd'|rӄ9ds2~EVV~T2* ϊG CSzF;dwשcO_s g!#M6!d$PLWUnw4!E.|TPy-FgledŅ?w?ԝYO4TQ/v9SWF'jMKZfZB$ 2D|EMi!fwS0q3&g!%j$rO >>ET*lpuBfU[|d+W>nىICӣp K̹Qߖ|[m#??rkOt~OfX#"7{uw3 ?~;B^]BwHi24/9Ȅl` &(ϷVŴDᖊHbղ-\6jıZK,[J,j힂%6YQܷ&FfMo {T0@.<5LZDѷ'aU8!솤\Lx"TĜ~2CGD"V =Iҙ3OK[$$?VMI>tfNh7]ѿLX-rɷVicz _ل/ ~H˻y򾽁cKNK7m$ JMm0ZrfUGeC\>kP6%756LÄ.B4~ѓ LIKU2C񖳣 1}dIsyROi=߻cT %7ubvE4-YCaƎccA*co xl?QXamXVs8Xy\"M*+0>FuZr 6#bKZ\{ 1![jI_k% 1!Kd, rVzY)K_` ;-DB2BǭE]fX\OQr5(ĞjZsy֦&0ͫ~n.q#( ڈ%ZHECZ+q{/I 7݌k%u޻ojri^1rQ!8r}%Y`)j+]%o'3jvaJ&qއѩk%E0]&\n$e l*VQI#w]mL@TpzP̕ "le<͛sT`Z.^40,(]i-ٮlYM{3'[c IDATҪKQe LHQ2lSPa$}^5ڒkb9 &g]I;܍:v xɂ_zoUh # Xt-n(ֶl/3l4 6xFGWah;<  Cw?pXl]EZ8Wn<بa(j]>|ChED*QCޮEԬ+Y(Vˀ,ON=dQJIi]ļI!R&#a/4X; 늻=~زf<\ ˙BuUDVW_rocˊAS5ѩ4J~oY2>eأT](%)1n>ڥz>dtD@O b$Ag!ch4XNXh {/k$,j4sq} evF+#Fs~ ox0X·\3.N&4ܜ48*fF,Hꚛ¼ې&ճAhBYăі|UjJMSshO{\,VRm46viCo 9!ZތQ!CB9gHw0KGL ؆aBKf"c.e:Tӭ!:CDP]c6TX R^?b?eq`&z贡o o`dԸ6(psl_q;aլ%Ҷh<ʔ hٔn1EB<0.OR4%, "xi$}܌tq?Vq H2o5;G 0W<"bQyjgAMh"{f^yp弓 QLӒʔv)$i#\"xWrˆu"kO6$0qyIηbXM$y%N3$@$qG)!5*%y৖g=^sj[PKCc<J+ /L8=|H\Eה[JrX?Gk\-Yzzj[@0n/y,^tEix4sF"\vvƛp7 ຕFl,WAQ ty1mKeYE_@82j6*!_%pBPRɔ|^X`  L!( Ύl% N*teJW<}{-ua!oP(PN+*Nn2(>v)E&2<S5c/w-JblL6_ev"I!e5 9NjzN9B @8S(<0{{鴎_QM˨#^!NJX]Ȝx%{!wȩf?֏8(؎6!?В_1ي%؊3&ּ( <{6fc_ґ(Z./Ck/1)"HNӳ5]Z.^fN[*ؕ$+&L^Q*9qB2E4@RNAx)56ܒ\Iic0_s.c?h2`,WI±9ix^[E:yq3woCM]2qxYXa?PbuO>"" 04 
x<ѓ7eHчcPdzϿݲHQwB^uB34 d@㩵fB6dxTZ%7}lbέSb^p(6$a5I}!_2%deJ0Z,9}hAQIaayJ&PY?#{_+O4-Cj,F>9y "E42g%HOyy cdaA`78'׻4@ C%3u- ĖuK`!Fq5OsW* &KpbKHS_7/yf*Rmy^f<1|:\1r)hdl!?¯N2J=946(g.B2.(͉#SI4%,Mt\1~01(5F)6rDZ7Q6snUK1\>mBFgBRPzC3a .>dĚ &%փj1ZZ;/joơ{ oy`jƴ"`i44x Y=32qohii8,yռi5cfܫSv)1 Ҹ#9e=v9-; oy$<ɄF2fyqJ*RS%X FR'bVr|?V+TP\$OY>frPe180CC2 1* ԛl>hubGj6Q:o0WќbCNШm"]d`PЧ&ǐcQ(uQ=ٕif." VDmB jx9dl`V@J0}E{cϸCm)7H-]O:KJ$'?l&Y u4㓚'Lj˝$6#KYT)ͅbkF)T>VJ18/x5sAK\sk>mUHra=eFMw[MحRk 5QĢF[JK߻G{fdxZv~1-.VU`vͺcxG#]s踅0g!tfYh[d3ߣ%.5hpK Ͼɋ1Z^Gpe˦)2' O Bx|2!@6Qy%vU1Q+Vi)+\"LQrsGoۛPs/eIA .ps*g\7 ,nң7տ0e˜YKjN@ c4arA.;zb1[8j#|f\Ȅ˪Kev@}p D4vi151XIjl#eH<O{ Ŭz4xy\]/c5-{u<=l8v kJqח_?ï 坜5inIGqyN۷|SmĠRo>FH55oFqG3&w26FExǁɤe:餥|ceZLfG\2mVo`} lRbR->S1(LC,*:C(Q򉿤0 V OOmPy. ްv[Qj :g%_^r.Yэ*RKV{}[Cs뜢9oA cNUMl\T+ˢ61!K0/~7mtYV̒h+ λf_T joB]c)}E<1UQ0NzSXc`W,iJaTqkù3q/m=Ǿ 'q쑥`(Ȩ13**c.~qnF4ܫ\5vol+ecJ; OrN9B?FeT3l w<9q|䲥nlوQ̵ r|aݽ+oo({'ޠOWBo *٨RRiL{GiFCP Sq$#Ÿ36DtDsŌ*ffqd aOcR.N VGewq~WKL=%1!+.Mdsx;i3n,8-l4ȭ5wc.|yӱQ6nJʼnS+>CUvw\&ӧ5m+!ek0ZKӊq6T;>&`/}:TK\seEV֙ZcFǻ@ 5ߺ pdjΛg~Jc6֮ca>m֨ +%knv@͉ 0y0% ϋK}.b}0}>#}4ɼxL$̥@Y?#'Ϛh10c(fpy{~H]n:/yW_UKivWٴ 7(W0MJ\q" ֳs0~OO8*7*Q s1 G'/5gD9̝U cZ@PdPMc;k9ckC= kh1KelqɄ[sa&^ )|{8)p(( 1iU,^1 c@!kĒoXd'xbLT b#ͭFVH)vy9%jy6kcsQu|>ͽA)ckh% =ݯyB}شO(iha,5(e Z{o0g;3&5&feX30pbVpj)Bê-(59 V M|; 6x55TRhD`b3֭12+&Z1kǜ)f#%5P1jG\C_1PIe0S\L+X+~_OOR}T,q~>?;&}@#A3,/jG$ѽ7.6X}%ˣ sNW .5U3SOk -:\U.'4* Z 6y-8lpȈm4S/y!$VB2WMQs͂e%V:fɆ I٘99Z+i;~zn;^ē 4Bs\^9|Gw3pI WW5zZ!}>D2=kUz{1 Kr BOAԱ.{L\I{\h^:5\MxF c׮,\k.ӗ]%REĚK_v1Zj6D]S `vhb#DU]vl[|)懎ݨm>;p2Բޛ_@ʍ[=lv,w>W5Oga?Oޜ/`{٪c2nS5%bFnA!k؋:avǂAXLJWXҔȢc[VQ 9q|WK7_B1Xg@eR \6lɊvmEN˾){4KoJ6QD%OC;$-< %C;Rפ1 gÛ$4' y.E&C70T*C83'P[B49hqQ .?1Mڼ&߀v:/ 769ĝosrj,Hr)sͿ`>]TDCsx:b$ 9 U7" W?MuK/kMIpjqMu`;qRq1y)q6£`V {JdsNΆa& '3|t J;]:[LZF~>?(+4:\ 1-i 24,xb^b +KH2 :f) !.E2 ~X%`pf 3B*S }V|4DK~# 0UՃNɭ}xC9L1wJKNj>aIM})'&fadh spg\;,0YP'yA+WouW8ZRIRuCee@HjbQMneS%bb`lK}}H&!0\sL Y@P|fɦ&c}e ֜{Q͒^ @FqR4lJ(PWIH]ou/\ȴ!տ&]?oXpBzzGMKUҽJɿl3 X!:|Mxs˘ MED\i0zJ5bJ_|1/s]>vZ7~s>22޲F/] ܒ%PhoPmRp9DahLr~Κ 
WǍ?֪BJL1hAeZ~(kGɑFAi;pi`u w] mpJT$ɭ>}0P1HzD?d0e%hMEa=2 $*se5G_}XH9۷2F&g/YbHÈٍTk:>1G5!n!BtfͷX 4Jlimd}Wƃlo[ )kPؾw~0\I".H[2Z IDAT"`w{ ~ N=d[4 Y]nr9'GcnKˑr;P˖Api3!dҖ5 ͐}sۥ|{lLder=o; Ij ~ Vc\m J ^*_"BĢ&>t/#hKzW(cԳnzx?s݊Гv)o ݱPM]M#<x/jSSGO4%⯌ r]JQpaRҡCM@BAB2X&Γͥ%6A{Z:&ADc1q N-BȶL!{^9g(+3RtF.{5oբt=Eez~RKu"7v!kORp;n<)sbqSz|WV}:|HH R.)uB*`6 ,d )yw&RDL'޾p״?T?+ټi9iHyKueatя! [v֡ζx drP2 TS/,Һ@iSmǏJb#ghEMǽ9)с:ē+܁͵00,N"*% o৷=.KN(qJmkxQh섒 g:jvsnMlJ8.o=^D6 Ѓ]&Veɐ,iz0Mr^u.vs\Pr]z覝?ѕޜ#APV)!$Px-"'sys:";l2%R4\r^poɟPz}~ ,ݗxjk-&{:t8 BaH.kL1Km8( cXӜKBXK*Ca_!maF*eݼD`Z:RNKNl%קNS DmN vYdubJr$1&.ϲ,H {@_ 8^2(SPW9[e3)Ɍ"}nOV^Hԫ yFR8..64~F0eF阉ҩlڎ."eL˵WVT¼,\{ޑ7F겆N2,t]Kra6I|j fX-Jfr[2+˪sRJ1:5yNg_qCkޒuZQDG9D0[Ud4+8"f/Gyй!GUOhLgK6-+2WAv~|ؘF>(f.9{z \[esLJr|b=8<7>-h%݉!ր;kx-Hwqs"xל9gtIi:A]˜Լs5,g#NhyA!N""+*|[$b9ќ!/JDjc߱ HNjo!|##4#j+L$3yUA,UxR9>: ^2(U]['B^(G-˱LH!˒Ppfu_9#~_oA µ$*Y zg+ǑQ~xsZgRduC#֙i[HOh#J >T*T*`sŗ|>b^)P#/mܕ(n"7i42` pR r=c% ,$倡!K ՕqdT Hg! b4.EXpOhh8 G HY!O 洼.Ttttk 21֑qro50ԴbP\oT_df`N퐣3ϣ cGˀFC$=əeuJ''£{o ;.ʲ[[x֘S+Gl 2]:#A/JF8Fd.SH+#n1kȔ;X,?d U]ŖⶽAxΜKN`PL`qT`]2gNG :w%7zc3vN6 X[71[꒸.[˃Â\ԑa\rڰ⚕(*lF?pkh ,/O _i({b. 
GPQbpN6Lb+ Cc+cxapw0JLGnc/yϸ]lM퀞3tpWuy^MgC&`c&m3c9b&sΨm)1}y Τe^[^|]StPDhq{M3lavPtQV 6G >ҧa]՘ץA4pSMKgC}⛞KY8\|ݰgeZ^ ܢaEK,r@4ToPo0Oh;:xMYe 8TvB4p/Hާ-7Fnm_ =mt 33wjJ'1$a=Lʐ&ǜ]5#gxm02Y>Ax,iv0TiWt.*Wf\hl"F=^ETmZĕ[7pvy4p$0q]pkaR9$!2qiY+,3h]yQZZeU5eKf~W:3j Sx+p}!Tv VY\R15LAGD* l?O"3#`o i= Ӎ!^f@p!5O?XQkH]llY#4rIЏ Nz]J"_' _Bњ]Cc ~ևv>A vK]d ?v6 oߘ]w6$b`Ńɝ{7E:"%nA# kSsטB8&)&G-u5QXv< VP}Q|ɺ~__pڝ3CJ&D-C*n-yJ/n#7`LBA77nI1갚s_<?7{1 $f:#>| [/aowɼI#ub%c >"bᑡ<w>d-ukR԰-]1N X5L65ckIX%Flb{qAq ҽ~ãY92L=1̩.f3O&tA]ZE7A"8r3A2W4 &@Prؠ1){w<6t"-a9L S R )3轼D=uG]6~b]ԈYxJ$ V%_.eRatvR2J82mQՂYs <[-XMnkz{v4rjLu=︈ 0bC*uŹ}E!,R1Z8z,..+^'ۧ`4vAW+bR\nJm?xvOksz',pE2P5dR_b*0>/6 BuVј9`3_ww8mˋ@ 7r}Ԅh >ҞKEʷ f͂c 3Ij)q镲1mv9U|I{Ȧ"WG'!CW %MlBVfU0fyŶ%Qv+Xr}or kM3ⳎB HIs_\; 4sg, k`22\^*ɎnH^t}h\:m<_0Y3X"!'皿Ϻ?mcJЎ{ Ts0@dRAwQ}X,1LJ0.(ޚ2̹}“|\/`b#:ZlL`,tFkHՕ W=$vJ2qOĴS:mžA02O=$7<[9X/#=W\xv&I 0t-hHtUJRh+'Qi6VzbZ!Fm`%nL?)ZF`iЋJjgGXca_ b|ԏX %fD"l$e6(_pհmxItI~&#^ѧRqF3\"MF" " }M&a.Y9sPS)m;rdJij9[V#:/A-7M6B_zBi pW8%NX(\@ێf6@B64-iY0oZ:Tl-4I޶,@ &3eh.95#+^YWѮYX;iwlJ^@ aEuS0JtBsKRϥ$J3Q1HfzʱAGc * xs:21d1Xpx!co:\s ya^4Jop&R 4P=@j!TYĆb2 $3C姫<%g}+mC51M\&$AY"Bhp6*V ? ty=dV&>k 2(GC0/ 3ε' 1,m+Vh\E{ܷk~G)Ij^|Q}wwO{:emK]4<ϸgz+&R|0b60Klbaن]qyM`ʰ]b13,SRh{]{MC=] .UɬCsM:ԷN.3mf5a*!!7x4&r"r״tl9;c[Lg39x--p]pS&+(&Q93ǧmYXs*ɫta\?rP*֖lӂ\ى,CDSK ϟultYٮ k2cLWqse+ޗeoebK^YփX-褤տ/0d2'J>#_rp(/9S2o˃Bgs88iłp9-݆.u c3 %-eH:D*S1meo+CC-l./d‬rL5R3)`u:~BަtI sbq>K\'[P&ch,G3INedxNKNNYC:ےtkpxy 2|CR(X ,8OhxSԮ$JoBk½9 Z 5Ylr;:+-#,UA1K~D :RX+1TDZ\JRsCQ6 93^S5S@ aa?΁Z)0\. nɄǺFq$ښM:VDˤ_x>Uz핟aaq &ؕz[-}=XpXdaUf]z.".rRQMK=N\ǡԼSsg<9eI7HCdBTc,r%ğq_SYq&\ W4oF#( fdБ%gmWx\IQC^yp֚O@䈛󽏟Ω@kLGq9Gтk%{azb̐Ly!2uq]`rEp|(>%tdH9Sn]B` %j[SgFL(ĨYNXbE_ML1. 4LJL<5CN̥Rp@q꡻7OhG4Q*|N['ԽD().O?"9VT+KЬ nha`2B9XOiP ?2ڀ QSA)DLœwv+$ԄF8|yeWoMv̐m`"LnR_v_0(jޗ)}N' ysȜ=cSd+„?eFrDHiޭ]|~ΜKv(&*&]1թRaQoPrҨVBDFv@2\bn_ C~P_e&ޤЂ1-X1BF̨8F=.}E A4?Q-s;:k(n no! 8ogyg<#޶[5W!X`yLYpoa]L"|㳇5Eiٿ]`4uXT|;> <  i͢+xu?ㆯ Rp℻nuYG1s`xl(I%t!Ev^r=z_N iÀR\M:t!V]eMb2P2A.G٫#1)K)a!tCDNG"iH Za[! 
AGAU{2" ZNWz@(!}nیYcN˱^WauS/i-OԁV F|b+Ԛ>: &\GWe4 N݅vBWQ3D4怓~H.]EA{YP\Cͫb6:so/Zf*JC"\G'̭1 sKR8N5C3`YU,HC0[D '`amÞcp} G_łA$֔/lLδWO<M|Zb_9yr|iCr>'^QPU:-#N;Bg&?5oy24v)#&z)X;去k&6UCpC*;(e剤,x(iێu^ds?gu}.>%?'fqxr-:I(%9MxTP ;0*,47Ct1㳿?:bnhl_ MQtg5.tH;˯:xu`4MaeO7&dRۯ/>Fр݉ǜu)kӊuq&N9 ?!Y5$cI3O[씇|~A`QV Bgtp`=/).R>rwa||kE*V&Zf'8w'5 F a aSG#_]koDaS\s#ųQUL}B}Q0{Ycuki,Y'G5u|n6zTwUŷťYk!:.jn*k]KDF?[k $XWdxvgM6OƼ cMU(`d~dI3qPJު+[NwO 0Ђ Hvf|>$5 Lcz}u!=25G\uv浊8;wy ΁ÕOS"Tq9קTPDC~>Otx[}JZ8u;a1BW 885=6)'~ņW.,9v&{ Ru*N N-%hH _WܬĊ Nvey8cb!]!Dˁ=b*eyu7sK TX=<'Ș&dhj'N3 m<Ӝ54fNDw% j0Ldf`ڵ%pfAN*2Vb,sj Bi$ Rp} 4$#{u+f,W]K QJNYK$8MâaƾX+>~U~}c_7 +o.8؋bZ$ϥTsǫuwW|nQ0ݯY 1k}ф)(&#Uӏ&N fBftk5VA8;a65?nˀ"-N[pGѻK ~8 ᄞR$Gvck0g!LI8zKILrHgoT/`"G8LX%hTEro`tb.9{ةcx hX|0F(ALR 70{L s@K9]`MlQ~h'gƁ;h憂ǠW=:*{5Հ|WLh ԽbBoer2Q lkxTMKw͌zj`3^!'\ S)nݽ &|2t` ldU[ *ޯxE|L j75c ˝#nY:_+ׯdUΔӳO@ qa)T|<, J|Ž-=;˯?iɂ`=B&_lXgygK3%Ar)U/`u%$>"l57 ygK1/ӝq''p=ڤmup}9`8f$esI=I}5TrٮyTI4 5E(gJj/ۿR@nvjVc^Ú|fD")ck<sonz,gdOE13w=suC qDh)4e7XuGʚN+8'sL:Iy~2H'­ޗ\(86m96*42g }5-oؓcXaudv樖a&r{k211xq:{~ݔiDҎ"X~~ O$Bx4K8RL*O_\Zڔ\ ux1Б$HMr:B >34ZT)"IS(D֜uXrM`;[* x,Ĕ  !S 3;@M4-U ҜPĂ5#Ůz7sJJZϖMp}j2`T JӚ}SR)0IPsڥa LXNO& K0yMrf-pZX^f\s~U]C`}B찒N8{FѴ h@b|a3cSA vcXvФtDzl }hG  (.] 
{Pmص_T÷wjoJn{}Ad-P`|CufA˔2xh2X5W"@" E(zē`9)I~B !{O h'?oN|Xwjq: Ҙ2TѠ8pȢQ qmQ,liߞ{" :Y:IP@)G$6jvHa90'؂`0 $G$ ibb,Cw5:uTfc:" 4:;A%Ahp:5+l7D-+lP#R,yvnJ"h} z Q^]a $2*;{%2;K@,`x@F筟̒\`0Úݍ's͊KJ䳴5< 7(;fVP i"z&l~&-S M's a= yثAB#h"x'r _aHcg[ђԋYQz4({0i4gg#Dv@mzL)bZRT v\o?lC'>E#N=S5dcC f[id % ܹ'9"4Ҭ 'Ŭ7L>w9p(.g0=cȜH2k(#oYtt 42DEPzZm (Gdz8CZg6u+d܆Id24D<)nﰳY{u >r1xLTʬ:UPceC ,9r:,"Lbڞ9[yÝTD6 G½;|XBX(q ISr="@P3A X7imGԓ!R+4b[vu9L925&[[WV//(QC\t$=: v\3^"'g#_7K5 G ;';pA fcP8|t]O\1P\nF5Md e*~B΄<]7ɴ@Q>rP#RooV[;*z>-C0qCs9nj+H@>YO>"h=ڿg{ iwkL`Z7w/P}'X\T0̛B2!v6b\aDF47rD7ٴ vg`˚$lM{EQ$ƬKÌR*pNWz,G%g=-x2UrӒVe̽%so6HLAGI!9d<`y!_-a5Txa2hTf3E?4 f a@`SbcC05W7/+3eI̭E1 %== {xSnASfiTXs^Yr cjGàbh8IԬLorւQO_0ƲHĀ(]ى %ʄf< ~" kы# 4Q8ؼh;0leM9`&/OM$BhMN'됹5˜Qfh?zߚEAh—vKE&DJ"_k T{<\pAm$dkhYsBL sބ47"\pAGCl$9+&^+y Li%$xm tCsDAHjaI+ҧi0.{i|I$#+p\3mӽ'(9|[r-9{M-ʨZWc+Aذ]g(hԲ~L T90Sm[``- 46h@V}bX3 O!QoA G`cT2l^#%hCTHMnȦ _L#\ ID/7簺152{u˳XAop1eyB9a ۂ^h/#zj1ttMu7yqlM㛿EB, ,! (!NRhpIKbPXkbn||%JO#8TǺOYO&kn䷿Gu{ÝC_ps'']!EGkh*0QM@Mc+W/")YZPkչ_nx$Ub֚[7$4hgjk^/=_}a~90:=GWs<s2}DY`bʽ.Hp[}@*~i 9N|A8p_lM8ûKbs@4l3>K*fa)hLHz¬nk JAfDKlG48{gX xc)Hro Cɏ 13@T5[sj *cF<:R >wkTVgzdS􀶼&(1U8n3epc='psS 0=jydTK,33ٶ$>:6% =.ي%x>#D*4D~uP<⮻%zC'J 'Ldt&,|㊻w :n(*WT Oud^0jKdKV#BpuKC"HA _^Y&l*W_Fu=cC-q5#GCJr*J, lު)Ng9CmԺJz=,?9JX>C1; Pt@Etq)u?Gbo~M_4dZӈ(tK@ktfoW?27 4`f`+!Rpb=yJ"\wxߟ@ LUa94 UMNXz峁{%|xZ`/<1&7abT-bBˊz]! 1Oa±((Cܡ4e6:.%QnJFDrRن@O!NZagM%M>ꮄ>FII`DudڕoSz &`A#4cHS4P=tv6Dgܵo}%Z:< ]C6nn犞ZB{JfiROY`c4r-s1C^_ SyPa'`z)QP3HFYR|pQaJdTpmf%PAh@= {Yr_>;֓b(8~iU`Mw?ҞJJ~!IfvT7w>߱ gHs\z QvLq݅b.QPxДW~4 HƸtTQ;_(n*Uh33A|\U0,ӕ2Rab;5UxC|~ٔ3}X-rƝzLfh vY%Caha(}NFN֨L>ó ~S}Tw3B;= K=wm K_]77a N ,9n8f_u,>8e#pw n^6_};8gmVVgae*6a1{,cUɡʨHa58{D+hY(mT-sx{cXDFn9;M<*D(0 vG%KEF5**X-8{2$-fyۚ<1>k-\w8f~ 9mnsVy{@60VHC`|CV8_v0k- i\5_0? 
0_?`0K9oK.<~w:o+ks()O!:c.Ǡ`ry}O){Û}}1- 2P^_{d`:"IM٨l 憂͘'аzyB)0T107B<{o~zP( p}FPQPx ?.ppƅq(@sfb#:@-c2([ɱ8#T{2yQ0;.TX1 )wCVKnx{KSI1 1 3·&ރn&rk禯`]b 6It4洿\cou,+|L#3c3&7-bWpY`^M?IӴ;T Mb7pD>lHBLwFkxM~P*Y".C@T:wl>(>x EC|y]!E)Z3e')HW_Ξ}x,8 D/Dq9lD5*56~;k?ݖ%?犤q\-.݈l sMd>Sws$Ȗ >tE.E'16f4>&W ^dzOm͌!0`w Q`J>3@ n`xGeA;4t&trM2zVRig`tǂ(4\N`Cvº\FЗ`CM)G(j yPښ6$` 1yޔqc*xX%D%V[Mt&1 |%k@{w+y۳ihn5* g1ad$׭scXPr4bx6fN#7idd C@-U(TaNYGy_DA9`DacO)f@MLd ұ,wLo5 2+Kx PrZtY5ؑ-7AESU cclosiEYW7z*7T2l*zK`A3Ew R NA,eQc xJj(/\ mć'!S#XRt"p&uPePD\\"!qW؎^$:xS؋\T^ e'?p@eH(*#?4{C[VXD|GG%A[f Qdp7{|pt2#a ostxx{K>zrcUlY0vt[|4-4DQ?H٢/kVb؅$yٸ!FC}󹏲Ms-b8foeÊ wc^*B,%Ot`맴QM.Cf[9P\Fqcwd}b x'X8("{%܏Ad͆%4\Rnj1-KV|dܓ(V@AL/:|IT)Elpj(y|_bۜw)t@ _- j=dG59n'v u$H?*"%FR'k>9܉'X2z~Prf[^AxUO3ٻrX&7wRN("g" zӷ E0U .b뱨\ _n6x%."VOAז[}թЩzmxqPre5>k.]QQ#1{z:IۀU 33 &*A F1L씉IiVR-HnkT0d TZ站8 C%1*l?מ1ISkTJz ?X_1پ Q\`CEE TJdz7F T Ri&e6)*6`bMD*3 S(I\>F LBIKvL(\+:z:4(·vRզ`#k *)<4TP)x6FJvBR-黂O0$CȭT5\G^ XcqM7Ish[ ` fv:vy}D<2,qorG??չbV)T@SpftAܸ|%;^MQj.e H _cb˭xV h/5/ ~"k3,`G^=!|K? 
ݷgog T=>Vwwn6έ `A/qkMmG4}KN+P77{o$Ir=μʬ`%$ge>Q4`g0t7Ȭ<#pw3S>yd0%QyYDx~G_/Xm9Mؒ<8BW.)$yg'5=ZϚp`DͰm˟ߧ3aa.3\.B"MjsKHSW3 l!.1Rʍ;cvKJ?kA'L]^_I\Vx ѣ\zD 1g'39`4z_[()KYK%M3 Ԯ4x3 >^c^ 'AK[wofUI']r̡b22C{7dV(Q,H.s?NjCcܠŒGx3=ϐ%2>yBxhw)c'yԲBTh"Ԫ$"qœ2M%>2nXzzWh~̶p)Ml?N:ccKZjjMz3,?Jz!,)L%k;vg${"=wm稪,pq1}JW`V(MVU8[2d$DllЋ'4 U%S35\<== (SdZ&d+|r RAX3+ _&r[5\Dw}·x#aQEqc p-.e[0rd97L:tldXtgaӼ HaWD@<s p]~ײlH@˂W%/+m!n ƾ=Df=\EiTb]6)(~&9L͑Ed20˿Fࢣ'C*,VܵȅLɭPHI>^wצG[JeL[h; q7 f_[ʺFSt)+%<Cnt)ռKc-`bҦR SlzBx0l"^!1r8^a"J53h~L.֠S(#t@{)&-Z"4:E5(M\9Zbew8)]yk'#H"-9CjA߬\ꕰHd @T*N-_2!$.?jusbXڹg.xb˒YM[:5+dMTuɵ N ؘE2 =C_៷4_Mi>?%'|^s-\_?PyƸ.RӕNA{`T[k!o?ڂHA~ ^Z#66b9%[ @"=&1fY,֔yqUm/.#;%[rRQ#7 vzℛ.Ir@Ő3X"klg&̗݃)t7}]1Cr4Pjɮo)M7Ę6[nQXwԾfhLJ}c# >CBU[=>[+-ᅬ_Xe%xtToS%v 05fҟ^=Tc9f1&_TR`6CW wh迿 ơ'JLٌڷ/SyE˗\p45P a@kHZ(xڌӊئɫBҍ.d Ze)L6̬DSqHiV ZX @ŽY7#2]zdk C wKϥJ]6+O٥ѽ^ԕAG]9rTyh.jVVLԶ,qj+˼2 Z~Z&p<3!gϩ_ 8#]hX3h Vu6]lG1KH9#y?{ 5d]={/pal z}I[DbuvĐ@EzRNu~&I$HH_,2MOE9MJ8 )#O5v: \Z^ y8z{e)9睟|=G~_~Y4j#WhDQGo1j;-zQpY@ l]?~i>X}h Dhbsi|#MU &DRkfG@6NC.i9bgX |ݜ/3nû t0t8-y(iJS8T,Y 9L^W 7GAevqbB.Ịy-Ρ(f݂ ;+c/I2鿝b%sōҎ*14шFAGښK$V۟`;Bec4hiQn#dt֨Ӫm'hfG,Wk-1, čq;n»;[æ6Bo(~gop,mvEIcjP ߧ8mhOx_pD隸kodvZF_][k\zp&f{O Nl/~ĦlpXX<3!68R^h [~10Y zIe[h]Kaܒ.CB01[v6xrcu lӓZAdWA hJLDfLՌX c , _0T/^BT;f5yOV_7pjt{&gwb&9L @<6~\DS0)z %L-KhG4=&eI]Fw TncnݦVF;DS Yu&5䲥 ǐ0M$ƈQhqG`[I &Vw9MN\sZ" X{4VxJ画—g|^Xݻ;-Bf˹٨!*|s,لҥ9inm,8{8}1F+/3;o8LOz,-RXzYv0 /L8XbMsy6߹= eQS1b?f}O$[m m>>i#ɻ;t]V0՘FVP}rʐ !#ZSBaОS!q!F^ovaRNGsKkMz ;XP0;o`]aAlt*Iyž~Vq,>X$M8V,D^Kl v>i>9F {f%S#fAxyz([5mJaU lγ岭A:j0bl`.^)jΒe2p,kW}ᖯm阁cXՂ ׎ bތ}Lh/pۣztBV)o÷O!Zjb}7?}hYQ $qD2%vy&ޟ }wlj'7" zſ@6D(E6M9Ub捀eKK!I\K Vթ`uvyQkBtm] b֨ HYjf?Uoo-P q{ jL%R/~r eɐh"^jҹ*KHH1 cSp)Ic}}t'%xgHǵRKz[]݇T~%v9욯~xW+㍣-r$in6`f,oPk'hI/Ǖ9\/ ̰ɤ$QfJ0i8 Dlw{ ' Kk5%/)fU-kl;(}Hk\%3zqȾ{&W;l͜) /X&4̙넉Sǚ G)# h4Nuz4,xB--^ JnrUO^#O!vN4C> >) O*zׄ!K|B*74KE(Pk:y]NȰ5\~~˂NA]r'fD(6K=Ui@ EŖIYeSౠ,(H<cئ'#"#b΃FxYp ;f *2!!fHJY%`F3Es;@jHpdMP0]|6䆚kҵ]!CǂFiirڥ5aL6Ih b>эլz%̥D$iNUNh~9=$E]KD vί۟s1--xI1 H7gԡϘ^]+M2xpEg`+Vd+S|:8:0rP {Hp9NM7#*6u±9*f/# c/V 
|HXP`mSQ%صv$GDg8VF;5̈́ ΙQ2d}#Lc.kf5mxFpK~ʞ٥ 6~3ȟg=7&ک*}Vig&36T˰-E <66Q1XMѧ "rJqi)Bh'ky-JŁ:$8x96\PP"z#uz.Xgj) >0K ;bLV=qfi_kcd.LZEG%p23M5֤BDbd])xgN`F,(d-^bcV4(87EFĺ %9|)Mz}5_ъ 09 Q@a`$.''fT@quLNfė,47Nb hQ*h݇"R4 EYB( gyǘ0Czԯ<:{i{ W(Kg#,[ٞҞ><cG<Ķzۏ:1_G\`nW AvbčGP [>F뎳阶$ڸxzUiИH 4pv鈭rVi?q81@eìqc1%^~9drMV|?0:WeDQmؖ}5|(~iN5E]CNxFKK)]EsF rIP YbĀ^VnJ{&l%Q=Nѣ&PT)zԿ yS7JYb70ئ¬ԔR)J9ikk+Pqw"bIBbF:CuZe X'Ɣ/4k4ϟ_0!kuE:ŸHQLix%7H5BM*gNiCp3bmT\[iq8OUnCEwIxBS*l| By2?my@U]6Cȑ<`7|.i;xþ%IItbv3܍ʙ1LB8y dSG,EiD*M)(CU`_oQ?9 b z*$!\<}{)P?^WRsP5KgeIi_(L'Ii. ".0yE8V𘇎;S~ˏ#DXRa__^RUw.,тZ]K?Pk=^7DpRyQ;m2iDė_XnBS〝_p8s1GoZF5Qu;x3<-0a2`GP0,;clQѣb ]Ɓ"$%  O<.퐊1kR_E-]u2mZg@Ì p hɍQDr/hҧgp-XDԿ|L}q=*nFRmp4}GG33\Mܠ) *IQ؇ç,.05qc #Ѹ6 1F|T5C{xl Gij h394H;Ob+xI\pT7)olRSS:;;{xv2 f줢-D Cw xHdJ#-ncY!̆SXA"E<8  qH2>4(} JU*S3fqP|MJk0kO1clDs:g6[w+JAfXBSPCtKSr&vEL,x 5.$.^X~1ޕl`XL=˕Xgp5=\ 4&(kIxN2@SP.Z̈ CGlsB8y)um<oKXψgYn顦(`Yud)Q,s'B3 N{oVA|z@w y /eȤHV9cezDaD%fLk^Pl)pקψVPKF xXm 6r0a |cF=aٚM)| 6Y j >>axMÂ) *$h7 e(y%Y5NfRY oUJ:!=F[nchPHi\bG{л%v}XQriC[([nSQ*<}ztzKYh\D`Hu__¼pȧUv(%"V] gnofKFei2%< ۀ݊[o @}XCu ulMUvqIOdG눌jيۨ|"5f!:( 7Gf)N/ A¶e&8IsE&̠avDQGb=K&uhWHdgȥrxwcYE{eQ({梛h"*0].^u KeRI{WRmSl0@hTwmr_?znbK"m)ƕDvl)3&/ǯ*a+4Mm@ۿ+·dV$Z Լax cv 0u]҄DBG8]+Bhg}Ph ((lĤbUuCSo(\3(lG|,_,EiI,8zb89KW1inwTrq g_]$y NAlvW \#A'Rދ(#d=}Ln2Ng݂Ė0ƛjFǹ墹>Fb V)iXIR]!rbU 4v ]#%EJ4F46on Bk)';_>fQ(@^„2FcPp7KjZ(vC(6iHl\E pv԰8$PEnT+3fh>Eʆ8&C~~iH҂:&)H[nٮTf ]9F˖eNUj#J=4~MSNX3&VlmTsץN+Չ * 8.Uʊ 8hx.(("M*]AA<2?}E۴#,Jπ8f=_4oӌAUrnfBϭYL%mG,lɽ5 8·}5;CbvYQWJf8d\1:.BaaQI-)5NLxEhISEA?m,X\3qΓFQiq>CрʪƍӓؿQ0:-WGԋ#q9U՛2>]\Rɰ\p_ܠEJY^R e`>mS]= ur(%BHYOt:jn6z LlxEASe,k]MN:]I, ňhFBZ]GX"kʛ;FT1o3W%Rz5؍D ؈ݰp7PP;fQ`fPy6~XriVR({b Vq} xo9sTrTge!VfS;IBWb$w%Qhd%1`)إAVʄ+Ѻl.džlO!)hZOߧ9Oj:e($xiVKʿ@WYnI8Xa:|*^ 7.;Ѹ7"tHPʔXl䞳1ĤnL,Lg4J@4,Š-RAMP6չɼ%GIGOnZj8XZOdS߳_t" \io8/fˈaLO֜GZg\dt^œ*pxjeVHU]$} ֆIv'5a>I,,OqI_"2qPyW8VM[zpևܜ%n.^YºhRwW-%CtoM5ҵd["kW@\-?o~cXfQzdGc2#mq?'2s TBA(tҙn7 #0r^{";Yn~X0X ͕]uaJ+-Wp)a dRК^h1Z4&GW!HD.GɁ&oJŌuMҼ Wj`zjm}7„na_PGK O4ӳCRel9*D3iEՑ2C @gݜ7pO~팔l@DT41MBflrԛ rSr:DMS[ 
,k"8XhISh.ObfAson~K,̖qe'#S1lQQ8R@-STKO#cSih2Ғ{{5 ׈-oc3G P6ض[qX%ZKO 8p3NK+t4FhْOGqJMrK]A$q賝.;:W[f~N32bC%A97(' ?֧szM03Z]b~i֎F^Z7_XU1i7IA/Q$A/j7 xjrVUV=6x\Q؅cjc}E8?=ěLjƶ a _>F1?FP2ԧ+rU}lh6 ܔ{K2aMETh,MJBSUjTV{՗O:HtHU F GEQ;k@Y!G/ $Y&K"\#1tc^AT !f`.sAh8Ff eVr%՛7ح tD8:-،l}|KЦ" S:G0 |/֮5MVb䷠tHC^&W^db:'sI))ܐx{k} P0N0'H\L\m5L_{惰 ;RWr2<$w3 μFR0q̙ڝr @0qP ay⭣K., dP;+ ԺrmUd&(=XѪR0SpybUHTmiգ05sMp S}a t19BH`dBJX˯?M.`3WٸɣpB6x0l6WH9[ZsP TK(N3 E69~@p`&PwZ,k/ՊhUX,0j> o|I[b6 dvl\K-Вy\9-+y.[su`p׫Wb_| F-,Э0z"l3fǸAm5 0d`H3}Mu3 !ZoYUDQqK#.b+S2ZC#psO>Q L`z됫s3ψ-=P}y IDAT?ǖ_[`˧-FF8bG2dk+PK>2)j빉eYG.Z;T^P|x5";C'Jf~Ělpϵ*/F87[o[Iak-w^4!,=\m6]^~sZHgX_ZP_B f>:+^ vۘv`,1#`aR`h2L"66b%D ;yEq1'#2$WJGu5'Ϡ( @.NZyrT'_߼wNt&A $r)Y-5XǨ#x }.+K+dt ){ -%#AK+k"j?<}QC< T*kD^)>lՙ{[/mdDbMD^фTƹ!=]QC(ΞC7KX5ҡ/[w%6z6G̞Z8p.LI%ū0:vIWV t?B0hj媹߀aN<"^ jE7mz׈Sܛ?L -ʈ텇(b+Kt|?}LK.`wN-EFI6WI;pnf^Je|]7kU+k9: TŐtwl=:YtGb'SXh\ ŅRdx2WB==W@.`|N VYnO8F:LЖT|@-lRڊp3qQX..`ٳ{fnv=2Pڡ˖\P|ae4qFņDqռ D!Y%M b!ahRZ}zSBo&uHhLnA=dE|KBqiwhhip9 &NV]_Gn!Lf)KܨYYH;&(6tSpfNY" Ԍ#ܔyb>ѐ#2"qী$[nMWm)9$MҒ@΀Yů)69!TQ 7fu 'OKB[kH2[ޜ:ZJe2&CXߐX $DZ T^:Wy/Tp5VŶ  M !Y1PΈVaxDM@6 C ѤiaVʢ#~Ɛ0= ҈Z)1jHuQ7kUX`+=͡Ajzdk g\d+-J U I,nWzhkWfvLa H|;\g7ƯW 3vkDG䥼L8p#rm['s 1E7^3:w*&RbY 9y0]|uEp[ovu.Ѻ{@ M&}riFijL7' 4_Kb5nOxuhLpvM|V!ꋊtNN@`P9| bXFX8E`9?.1>Y\ *b*9>e,Cmi02ߡ;hƊ*fp@y t Cn(a<(&H'I\C? EnimnҥrMcl#]3Ma!  3f V $XMfƴ?gowH<\>P +Eh&&L?9'ia g(xLΐ$`mk5eǞweݻ!m;BP՜cSra+O1'w{>G5'zu"u6&HlQAGC' %b,65.Rq/{3 b6mBr+(tz^ :Rnfw!AVb8LHdS.ؐzaŶY=(hxsauݟb~;|Fut)tRG(U|'R>9bVߡ}JR VR|!z* N% !MHo ߻-|NT%5T1I<Rp1)/h!ţ7oH v ~ RBTqB@~ZW-vK:2>ft6Pu'|Q|Τ%%vڷ:Iv8D Sc1SRԄb|nB?iQnЯz2Z!Ru MpR,&s.^=Qr sC-6oOn}|c0ݛto39uۨ@fT&C~[B5(%V zFSB6}}lTf˵J/gdۏE@ M,4McYpXaA_Mb@)G^el>-l>h% ?1?yaYFy6y(BEN\:dY% -;D)}ZeGQPh\Djvm):Ԑ]1}RE1^M tP% "m 6Eg`2d? 
IkQEEꟽ{}lo Z/5849P4 Q/h)MO izѥXUE)<2Bؕ8׾mWm&`1lefx̸֥{cVƐzh*d(kSNcf/"HmR~En:cAWeX0q)R,B8;.)je0i}(, ɸZ ѫ7hU.L2d3Cmp0u-s1(/_ C!h* V,Ez" ț_6Z1l?H]O^ͼ@yըx̿[.KS!NIolGkT%)5ŠCnE 2ېZw-\Lq9TVY ^O=a2%sxu_~uJpg =Ӯ٭k_m%fLK&usܙ!h.ven԰5hK]Ǫrn&%9;`˱^m$1ҦbDkFmCRįUMH1 1W )#8Y$؈G8S6DQPjN,A &c4/% ލQ-kBFX|u_=GD,jۑe{3M{uRmRa.'u [VYc֒t׎069B4s(Ts˟?-*+4'H`{@$PZwʖ$ ml;l췗# mqʮuYi[QF i:%Sݒ͟T鷟+?#&4X_T@*Q *R2R" %5N.qF6W3SL+\. o F^XBT{ ܺCtF֮ۊtCGUY2iEH^Xr&?롚P:Kym1F`q!K0sSX>k}}$~7m830!&,qU,9XMF,dPIP08ԤbK=E&,2˩^kc7o]7š!]ZK9ZI)1+.jP|7j˥Ӫl+PȔD"Ks_JT*eSʼnRcFgj*BLԨΑLpǘ)0clѕ^qC?`̘mё(X VDDa8`U6Jg4T˘p@WW?^*1Q(= dmc|R?^9rpB`7CcüU9gosʮK~mFGkOLK4?_UjX:t)y,\W-,ŜRjxC-~\o*H1tqDCALzDZ([n'cvs v鮏܀91dAeQ?HI8gk@>qH1nDwoxZB5+>(%n Զ򒍯#P39;I5Id,ŝ*P RBa3B7ۘy76kK7m =&xFH]z@hN50}\jYLOvwیď0Qi+2פD_pam':T T <|UĔBhJ&`:#4XFoJN!?Cg+lln`/q#6f!&N0r5xLwHRҽ5Bӏ;QzO < y.-V Ts>%絺wY*4")%le'.>3`O Yɿ/[0;ue9[ 3PTD2my͌k'%d7%S/ꄂ cF:$X5QKq8*f52J ozxvBJJZw.R3BR-J? :3*uu`Ș񽑬GrTvWK[zrԑ1=<]2Kp[d@/,_5, b6IP2iU2`S$siP8"=;TE7r=ǭ ~tk5Ԉ'xEKX pɘkM>s}l* u¨8:e벱 )!M+lϊ?W/,J}\r<x8? ոJ"j)ʫiƺfX-b{,*qeapajZjHABu)"; 0?ﭺ _,X(d2"̎K_>ip~Xc+FĥTA4T1WS|5IU0o% d/NN6i/cxR5hg{[覍Q%e =eA$ĒS0 "CqZpp}Ik.E`,o^|gB)X#SoW n$Xԧ"GPŲQQ񚊂7YgJwE` AK[1dX鄼J8 nhs7Kgaʈ2. 
eQNru(uJU1j>Nrpc7sagGlH4\Où)KJVV2D8%a\?pԧpMFBzc sPaՑ̏N`2*FMVIUE|NHz}dHl`+OZcKoøud>UH#-fT'OPf+>ZBeq4R®t mE"WA{Xih 制#rߢ`WpXH-srpTϑuzi$Ľx1Wm.꿛w 85@7_T|w>diZ?,ʪ A3}3c|kmS3P aHhsz8M eѰ WMT1_e^ђ IDAT8pLJiбI9IS#"AK[[;wK%b`>F$z)a/^cUA= iCׄ Y;gfC)vßS舩|\~Y:Q2i N7qscnd7'4ɂ5ZuiۛLdc}[$ ˤjv]vy1ofIq }]y'A= 1_!dQPjo4`{e:JUhjR|9żGA*]  w0 ӋOmP-fp@V;j%b+gMFba{yNt 6wq{|I9=a k_)g(,;[3*՜,6d<[3R|]-ܼgjM3>dNYvuK5y.Im-`4BxWܗf%0$fsDWkqTUme_pCq|4/9wsBB|TceT{l}#h-m!!mGNy\BH09?Mlവ`LG(`?zVc+7IwE:[8۪C2sS/~!@fR\+)_`x0[(ߡ#R꜋#}⍒g%8f<'R>J[c+wuBJQaر0xD X^@poe}Sys*0 |0NHMFE L^#(\a`iR,1ke'LbKH#H Ӕww)0K!~i9f^z D e1!0F g0 0#1d S~ݤ*[uޔ[j~9<RV辳O{WVfx3Dži$SUe1XBm BMQ]~~Z՛ zo%eV纺V:}ϠůC4FkOM<}R"HQkEr<yҴIf_g &|,0+`:_N:#Y]d\a+he)=~xḺGtF{ nb;wplƆ0bw lBR:q+|XĻ lz&CQ oo!\xto288ID *{CkT2?@P~;dB/{kkx#k`<~5۸?Ň CNK$]_BWLqw!M{y ߃zbجV:P;~.?PN;ppUm|]R4@"ҪbU9L0 yt$rbio2hu@H-YI{TU_%8'hUy]󑗝]B!u(?ߵϹ7_lJݜ+&dJF\ uQ#U9<*]|e{)Ly9/^xjmkF!sXTf㈭M k+UJ27kdtC d) T9;*1aґBeI,WjbW%sl;z_i@({i]D+kbȬ*$'[}K6(mOn=4GZۄ ( E`V8k Wo ᦘ&OnQ~*x̒'у {y]j le ZX1v$`JKLuB1Q^~5R.L@ȼхamCc(#uՇY7vI|+(HpVP%Q[?12cO[c%i֫u~h~nRyÖhͤ6X&WǨ[|ˆ.b~ž9 \3IeO1=V$3r.-]^e~$I4gyYYUY]==c1c 'IĂ/$bNOOWuOWgVVޙq{*|P3s2ۀǡn'|x [5\pqfFP@=UfC Nf:]*'ޠ^oP_1"YHZEZQG.SJPi?4]# S[ Tk*쀫߃/?%]`Mf$5 >|Ou˩rRz@nXd<ȲAb(>&A0|5X_1MZ<*9:WԂ &A}҉t{ҕn>i55jИ/0;Q0&$8h ƻoFr˿ ak,̈́hG2,7w͊Fa-3M{hFHo9b5JZ1) 2?ńt`3kYecHl|팅T$EDpa*8OO 5-c̓(@> 5!A1Ze#w(q94CMnݡsnF,fJk.J}ΠmI.=XB̫Oٕsx. 
,ljB/,:SS5uH9JѸh/ a0~`Hc{u-ZM cmpAjuҕt蔥K#w7Kly]qgEP%R9.@J%.ʧXw d+H'jS8"F{3gW/%sV`f\Ba{=~:a0h+a70HRtLf`(81gMc(*e^+Uix բnZm,q0PXD61134-iyg nmVY\4]NLjEXk]v`O&8xlXM0!%'ÈA&0 f@2Fugs2z|NF%k\3iN{H_ ۱x ̙k6f5RGa$>c[yGijj5Ym!;s|1{^t*xH,QIY{&v8mzvD&q n A\c;@ *!*m\2 Lsı`ܳ:5O}K~s}XXt#`g@ -eM!qCW~7dCĶA MKni9'kq$p*#1=| ij3$hB][ WAl_6wG8rj [1%0!Fߥq}/ LņKH$MKkityam^S0IעPNA i#OJF1B%eb8"]Dж?o~mUm”wtD2HcڬӘ5 H@3YMI4i *aIK-b@g1NCk\@KK-/ət20xb:F˘Lg {Ln1"[zݹd3^\1Ko<ڗIgϹbЧx?z3=%sƄ>qL8Exή\`~D2İCh@ON0IV F^zE_4 fᮼW#h-1Y)R(PNp ϒ&я MکauCItqHEF\vYډ cAEBcӀe5ĺZSʊZ*~(Xp!Qj]/qUⵏ0it+A\@o|抄Hz ȷ./SxğP݆Q@3@e ,,URD:;@#Č ^3 &(և7xÍpF)A]JxmBnZh :ث{SURoGJ2HHb\UjU$GE |[8S9~f - M.HQ 8;z 3xt5 C48'c5S]p=-{]4tyo1׿eS!Քw]LXxrl ~)<c4mXJr5csW<5]M.bW$Ff9W6 d*YJ NSM`l3 uO:(y~} AGk = 2qŧAo/Q76 _x%-jjAG[8gx˭8Orz5BSւT<'Ggg3w#/MɂOV}1ٵvCJF% rARAR Ц jg+h!ZTS?wׄ:]P<"K$@3)g]HIPƷ׎\5ԍ, ?b\Fֿݪ<RlaHzMD. ScFw Ԗ{jdw 2aϽQuftiHc$ o:ÜdU \`L wF0V(;i7j|jg[g8,C$! \R(DA~WXΎ6ym̍e9*lA05Hr#N}sMmUuFEEET'8V"YFJ f,ZkY en G<9D:6fMQ37tAWe1xj<sB#8!#!'Aq(Z%.3bZΛ[x.ܢϵC>KE<9NA)%Oo9=O8|b:-ƨ;ECS<Wa^w^#WC>34;Y##1n !Q*$l"$kb4ժK$rNLSNN I*SM,?SqpQg?>'Ig؎ǵYKBc>ORLQ):}^(VS ԝ'hK@xꦠE[LxM BISt=5a (3[y3@*{;H\yxÍ d6z/?%? 
ʔYlhZcT }ž-?@0td~^x>@ۦEF;PԽ7Q|x1Ÿ oa 56 O @MHU"V9jUSg *t,¦Y粽Dp)yg,u& c| æs%̀>c|*Q|]Lj>LO)E3_j{{lA`f Rk !㦞)I'}1r?DX>E><݇SRW䋚t>'?~ T>5ZI*Gr{9RS=COZONV!'Qdz WYKnٿcHF\\(1gfZw%٠E]m U$emkMP$BAS=SS6zy - @)bkcɇ  9䠎FVͪgz!M^ ԕrL2TcsZab dbGP0 tYP;F7D9tDߎ*؎=ۋ;)Ykta) UcGkr7?_K8< : 4^@[)*VbRt=G0&UE) {i ~#ť5*DƦN~W0"c4ao*~cq"4piC Ei~ϰ?dο"#wduyƟfؿ?f^W\vWHCr/`2˔ 3Agiww1_`Α2TS:nc|>apȚ:Ky'}dgS ƌ mbCƎ՟PQ0Glr,iVy?oQjΪC6d5< IDATC凃<#,M=xv?.X b73{(z 17Auyߡ)T5'S .|@R˧?|;;%'O'ջ*ӽ(iY%X Ìsvp̸!߳7_Da'9SMEACA%|> ܹrJE&89: Ro!k"}N'_j pnG]jJ, 9o:%ˢDv;YK)r?)/M~SKqpvym~,hsSM`L5< a pToJp*>WWkjGHv\(!Jr iLf# ΝG%YzL8/69Ji“&cj;>BÛ>txņf~B;aqtTg/э+loFai6`?J\~$V.ظC,lazB30 ncYI]Mq+8|x7]7充%3p<)Meq(TyEZËǬ]6`|'i)씰( Y?xN(JB4#񖫲nC?_Ϳ|1ma˭3LԾ Ox02Ih24I$QaíOsʺt+oWa^c]cnW#mz=ޢ'HGS3\ƭ !/O^zVzFԖ+effƒp[$~7KxW#9Tz$e{;YSvp*;ڐ Nq/7!Yfi6/[ʂWҲQ>)e[Y"ޥ"%hcXX_$F' 53vwX~[䔽Å$FFy0y9B ;1bf?~>^E-pRcM4O㧖Ͽ0_k9I)HWY%\:bZն| SeM0ol.6;>neΓ&mk!)wq$'+P8^ _{u"ZF-gO !;!K౦B_}FxY !\;Gϓ%|iF8[mUܠzRx8d_@˜7Z7ágy1G1%BBQ9zRq_cBWF.>Jh:-PY qpBiZ@ p€`G4kFʝ>kO?>'`%e\N~?{~碽 UzCOS>KR /g&VS<ı!h{,tNbOHNm\U(tk3j.n~W;.%G|(qY{WWCV$bB_q)w,ܜ౒`t5#lmmap^ Xu黯9nSnHX7Qm+j_R쓤kW1Y$9I6@R{WC(-RT/_#;c㜥(2dBR_#: ^7Rn6꿡N [5*%6R٣# >&Nդ<90|!z^`yv@h$x f4^ԭhۏxe+| #,[ev5HV!jG/ax.m*4e N~;2 lbz]!0-RB NZ0.mEv_h4 $ V젆 C|Y ELD%m:?J:*0[ص+8FÊxVn>Ad@u ##[  FA7]n% W_i >/O J|:ܟ< 6%59l*f ǫθ J~942*vcZ4P/ SAke~^KN{L 1/αCE1/_AKs\0Aةnj!&1SQcXyg6vn8uC« 'p1]3 C8DA%gLA$" !fΌ!}lKݲ#`X/F;gX蔞I!k?+M԰VۻçS/MJmDOd)`B=" M5;.c(z؎8rYVk;H~q4JX"kEKQgZ#֜SW3yFgU~3LOVȎTFP!˗sK2 OH"hlŀZwE`޹qBSvpy$^IoX* ?KG5Ũ`q:Wz\]K=Qlj:[7?L^V]6Ya[.we L,;X(H8pqrR럠6T&>Onviwu@ҫ-fA2ĄLuB(A]Ni4g(H7,|0i"Ő lƶOՐ%?eQ/k>?\/fhDVݢèZ&XNS96)}PATR5/k\e=T|MO a+):2 `#r[K^P'hI$,~F,=$/S.>oW\ a/I6Hvk .G@ IZ֕/%/nN*3hLj Ř?pپc} ΧhYƧ\x*Jn\0*/fʓݿa؈ &|餙Q6i j{#]3W}G$|({O:MXh+wz'tAMWv&*l!IC:9T7%ioܔa.M.l>woE]tif~J,!ؘD;)#zej447K ι+[:!4K+z#[oc1>ؑe1_]<eh8 w`\ A@݈]%]CRQ40g e˱]R8M7fa\FƊ qz:#Grյ5E)G! 
!#hZ/ոIEYn0є\Df = =a tsXj(#qZBeht`vZuZVS<+nOd$#p|dF:@*$I 8vJYBv4F܆ +M_mjb(ժ8{̨%׈4h4)vpZ CfՂ1HdE$BW3ӊ=*cMFUV U|:r#5W&k @R[ |6'(SFqffAٖ0i_mOER03 \Ɔ.^AER03uvJ+n%7I]_Y6Pdxs+Eҟ{WϪNy^p cUq) fk Ѩ#K9 \f[-X$o{[S }ْ! kDuG9OS\A8^CiQ :;$#hbrWl/_&3կILj>a%ˏ_m"`Y;ϱ٨}/P݂kKiJ"o,~/! +AETs)DJLI0%JEkL#d#4BFQ˜BJ|ee?N%kW}l"VtF8~RScq|:@4YGm/P]GdǾ+T{  "Flq^ku5l0󏵣0kU >p# gĄFrnb]$I1bs<OmD %ϱ1R`i,?yyi-mĬc'0;y1^m üDRĔ1G;' n='83fb+~U=c/?gGw׍o ԦLNIH$\[c%6CsobS@xKn6^D לr{sX#9ϡ=LtBi; wl^ox8R,2nw ,{#t_R_X1M=U8ZQw8QΙH%h$0 _OysJm6l 1Txi=߱79tk,$0DqBz9}{rA_5ӊ{3gǤs}#um/\ SHmrI)h Q.2&ǸI&3Jf1ѮDɈ+%<X0^' Y̱k0RvLkcZ!\ECP<*Ϡ jN945/Sꬠ -=)0[gC!.:pi9 zTnc Rt[F%pt.y%6?>Eh]@F i|*~5:f:rYk&XWP|um:7xAHz H8[C]P(a!{hH6,.d6Ʋn-پ'L wDS}Q թ\L05y D rvZβȉ@C/h&l)I/1L${lyCZAN)( .˅hXiP ~L(P` $NVXR))Q4aGde5 QQPtԥ,uSLϨ8 gO_ B`gT8gaOjD3X,@WZq80#XG:׮ywOBNq ԧϢ&ڰ5$i5j"=f!e̱ 07R K1sH<ج>#s+$yM4^`)=Zy|vd#Ej"Dyxf( ce2#ŅK66M]9nAL&d>tϺybmkek(B8= \chU_71d`_Z&-, $B$81Gt0951\iT3&t"buhN1RtPb,դ5AϚoȺmPaJT*䥷ɽ(4NWFLSş#mCǔdb M aQ=zE}\-Ez8"]Q#4LYf!M@bVP:70T{%'C _"\ s8;gSLލFC4d?3PMnx?G'Sp qJ8yisbHa;ݠp8*J^Wݠ_h1.rmQ9bA~}H/>fQ;&[-*%sA4|ンlSoScA;b1(b>;n36 o[OY8ĠsF(?7i7k09G,zsOVH\U:?}?bʔpL_0 VmXF%#=b^MOԣv &bECȘXqivQ0Ry< teI ק'>I@T00<٥lkFK\~)CuZz5 N݌} E{ƍ.mULd"߹ 32wMKnq&|3%%ٺPϠ6&vo[5T=k)ﺢ_`]>L̟5Qr딶5IpV+D@5,8zXqy {psnHGrQz%{~Lk^&Ԉt1??{)ǜJ&IHh3&X借= 5cM zEO{lu1C9`(W#&zu{l _sƔrL0+5g2lM+UIW+Θq e]9SoٕsKi9 􀑌M͵EV oߠ#6g$臔^)mRns% zz)loxA o8Oy3ʼaVR[q.\R"wl[̘kRn_O2t!k2 ?2[?s)&4A=N-SK;Qi¯~e+w,bYR9^{L2,D(ΰ !l6}yM(,gG|BDF3~CpL,ɟ']2ꩁC`J'PE7lc-gX,E$`)RREUh}B 99BH!d+rql:C8EHA!z_@S>?E.pMFN IDAT+1`^ECQ㫚?@x7wY%<CGdb멮N @innG xW\.b[ ]q2˨rpRTCǺEez>Ík( Bt?sTdL9Υ$m:(u1mΉ,U ӆ,6JVvb)5Ҁ|Z6oĿ0|2M~e PNjso I&t#ðM߼~i̱Zc DÈP$41 b`M;BEc-Bup|6M 4٢7ig`hceCqS =MzR }5mբ5ʋ7nbꐇ; 'S9"%-h>~[ѶOXJ|䘕?Z-) CeQ:%]]5$KSbr'T?`v|hLa0)15D  5\.uL#i?hT8z ʤA+5k!եDW04&YZ2k>sLz wm>4LVޓpOek=Q\loDUh4% ١žrS^9f}AiQ^_ߤ5YHsȱdM GZǀA3 8' ! 
Æ?ɛqrg'xk~́-[ؔFZE;>C6Z"yNmT9放༤iRP-ِ7o㧿Sz1=f|ȝT3)'6džM,^7Ƴ!oϨdXkoC^r8nt<+>3 l#pq†"E\yuͼכw(W 89-t"[:G< oY]DV/{[P~Mhm+ Z4I̵k|tOIhDЂ_"V*7a}3cژ.jO;ʬ(uo=#*  mr7_F6`OT`9F@_"49H7vbm$lƅ5Y5Xju .|Pb4cnD9(LWoeos]T!rȻ{:-3k ';jƟQ|9#_L{}i%,: ?4*j $Yg.u+Ƀ㺹XzK-- A?%ۃ9mK0ɕ_,wX;d! ğ}ܺpvڙl 3S~v? JԋvK8|XsqM^͗m]짭#sFk4twoQo7=A1'G u=&4J e񜺌V7tO% *m. [ݤLhC#d"Q88…u$EV%AYnl L:YzvB6E0X.a34m&GP ?ǃKm;giWX'A9 H4ȥ c@6T( B!0[˥[9 AE^qS+žv0XwX:] h2A>aGR.XQ6/oxfLycch! =!Fg:(׍`}uB)8oX3ckgQxXu#z˄;.,Wb0AȰe " f8 FcMb3G,ÐuDٲÐ`)D]S|pޱf()1Q?XcD.&~%8v 3ї 9])2bv!5F$e;7-W. Tb ]+R'dPs=\04yGO%Hye5Nm {}}Zs R:%T^xjCGI&8 /'%,=hwd#&I?e—hњB5x{05r dP5`z#D]"v l|'*c#紏Y:0Z,ڒ0 `Ӛ$ם;I|x]ms:-Ҥsl-{NqI˗ x5t^E1RSJa=U_d"D$Mb*ڊ` ^\3YRpP,Lbi㵗7j;J.7n[^͕Qh Cԟ<… sH#AU桤'CrN=Kjxy 2, xlMkPDTK*WMMc=5 qXI>]IUT"Aț>Gyn@P TTnN0UGdxC@mCJ0d`iYbUQlNm=4(f-MFYi5lT0&$ Gx>" ]0bр NMzӿ30S2[ U 6$Q "`k•cOM-پCihN.˗-Tސ=@I^jY•s?`^ȋ &BѤ&_2qyזy]RIXӣxJ7_L\F֙BxE&sA 'XJ b)*z jFzP w9Rek,B)G {BNЂ@ֵt PQ>*P59&6Y4 h>VI r1^1'aIO)Xjc]}X/3*}`be;ugJca|]]?4 OtӨD'5I_Lw ^5 IUcq3Ctfѹ'BoSǫ.hdǀi_)7XM?L^$bMޞ-Q> ti 0q)%-zL (29Lu,GznE.W4dl`CK]Zb gZ:xY%5JHMܲwu /5r[UT<>9xj@M7h|ıپanB:BpظlvY%NU(EX E=$W+{hыUp. !#sI*dӼ-v=Œ)E&XK{Um 8 C1A N@$xf8>A5ry&e5B<$Q0yh-^8 0&=op:'@Ot= Gau$!]oEpjY&9/_vRIQPR 9 AуW.Dd2ќ#aj9|pmh V'u71Qsi??`3. 
/9W+aP\gC-*]EjQynxʄ8%q8( BNkzT)s\!I[Ab70̀i8B8(/bǜT9Řި!G4L2 cFa|>Z2!1KIR0&iY҉B2;fk*JrFkb@ c2桤1u' cmyQ@KHBq\)qpN98C)Z7Sk_KlK6!KcM}(Pt.|,k<,Z#>jo|[5֠0O,~djBʠDRn"lQ` 2&Log ͒`_ɽ)w(Wѧt5c9'eǟpE+bJhܷUM]zw`rmڴy1nBIRfÒ{_VR%6~!ry_I!4'au͛)LQ|_{&CMG3U|# gNX0gsB\'h,gű\ƆSKv߂ e댂zy tF'+ B,%aJX<e0kiMEBI]?́-篕=K<;]xloџ>€L!O>ީC &EiDY6m K"MBuPp䔵1Tu@?R#(O.u`5WlYU:O2POF50Ӿ(:sF>R@ZsflpHɽ]yWUÉs3Ƭs?d8o t]dM 68fU:'41ː#T[Sy-IsH rTvؕq``v..o8޽i`,2ky2^l1\a<Sp헋4Kg68M5.ng}t pk/\wdM!jlMw|}~3}č6 Z\av6OyXiEі`d{31Y88VBv&7=Açc{  'n5|ⲇ,@Tu # HXL0M6{R6gdd8K2G_oקQYF2/7x?89s3#cv2 (O0`@z|y \܄7^qR^ )8C~]` \h|9}STx?~XOW31u'wN/ IDAT/8~1^bB|m5O-?لn\lڦ˪_W5l T*fH>^3ܯiTxo BhI'TJu{36*O >L/_S+EEQWl1[ N ꧌&L8-v}q%#;@Ckm VDžc&GrXߤlU|'f0/qX)'k"l`oBq/ot4xO){"$h 3N.TVCPiN}C&Ot^h;~Xbov‘?6g5~ZBm34xU2r:/Ru)rNW9s(}r]g;,ǘPa uufw] Kʕ0 $,~!GoacVl(e @'WJW2fKySXňAe15][epc%j:Kvۨ%viTAJI 󺏗un}-qM?jxi ]-Z{2ѶT-/=i:ÃWx[ H&'*sP>njdLB\9p2-X`0ldeeIrGaJc6.{ Wz| rWIacAq^}`cDVXfĚxFmۙDZ>{P`/|+W39cq (h¬A%MDup㒝ݚ&xU2[Cܻ|N\d4)m=Js pZj$lO0e8d7'b߅E}NPEd V(dQ9R[}!N<3 SۥS_א3MKhO1Xْm$Sz'G< pRZ"嚶Bo#FلBqǃk2EcU( Qbgp7nD]#<Ҁ3 pi;ӊ =Ť0ً >1 L!K+XS!uΧRƎ !U(wA 8됱 fBS +ζ󇈑n/hez2Pc,\5UA8`Jtc 54ƩƜA הZ;j͘)78m <HGj2Dz׌C/" a@cQXh9\d,zB_cZzI6aS-U\q>` CJ7v3^nɊ1 .66 Sh{} 7HF/&9gv~jQL'ؚ7s |asyOO1,")VaY#wl^ cɁ*F{'_L\VyMΡJR}v] yBt^4_|t BLARPHA86lb:N2MI5Vj5PfV =xO2F=j^dYHí{}0R% f(ACM'5]vrZY"~@ `5œ\ 9F97^'Х+Pw" ݑ01 *uDRc #`M,CL"w7nӺ` bzPW0AL!MjhJ )F.^sT cIJ xO }\F .F7)I=7Pztd+B2T48.pVhLT0F,/46l}7鹗_ ׶GF0厯)!N-E͸%S+t֗!g@&}l]K]{.У"ErbV+"ТCWX`M!0X&fI[%acj{/<}ϸlD1bfatjNL1ky!qyCS觭ш/!$< 2GRMk7V?-Q3׮K9f;Ä PU3 We2BY$d-@F 'F\ބmD" Q"4 f9:C UNe;)=V,dgՄ"YϷmN35nN-Ɉr#[ Q{!CgK}' }&SHHT1) ,n*iԓ:܏]nrQϫb&]ѱE+|=&O iB0#]/T)t_^.ܼZ4&T8gH4[*Pp: E$t蠋H UGYBFа?lYy 0ws|'*ƤRM/~v GWpx RCOg ~칭bΨJtώ %<\ i9EoAQ8w|9}N2>|N)UU1|_I"Te}?7D>*:\XDmI}I9|9p9ߺy/j_w f뼈h+76n@{~?OZ{>mwa>7V_^]{~&4$I6M}|Xk5 + +8_=6"H9oۘ +oK5yi5k:IdI8˲}P}VXa^>C_E5h]_ďw"<Ͽ {+^aVXH8v:B^cczᅱ'oE½^Ox sY + /4Ze֣Wq4:ϟ>EQhV_%̚f Un}VXa%hl+. 
ynl6{[}k|- Cd23]E+ +@_^&iesiUU:Lye|H*Jaw}VXaVgE.E!pѭA#9=99$ {=jU:zVXa lwx[Bn#Նϗc,ӣgy|$Q}- cNJG +%ߐzE#ZAVUU,rv8s损ǫv3;Uhkj5ױ + ;q>CˑKM"*Ӫ(twwǯߺEiqZ/X/{p + neS12^C|y?Eŋ[[ny 2_~__۷`<kYP:{|>pض#i]ךyZ!n{oC6mUmo&`>VUi}ؓ +‹3}ܺXϝY5Xnk>7"p{礨["ֲ,5˲F95d2`@jѝ>Xwvv{޽{/w!Wa4) PgY f3V,ScLhdYBw sMd"j9cr rU5KH&"NU8 !X `#")t;VXae_u%+F,UhTc

kegzL&9>W\90D铓\X;9Y*v٤Բ$"ɩ»YNMѯY19# +8x@ H)Rb#"!haO?!E>Z) jgYD$xC]ڦ{p8pp8Խ=mUU|>W\z=i&!Z!s&ml!xҗZZ~-qc1]\#\8}4I?bG+ + hZr4D݂ ˜Bh CocR󉗼vDF!AUCa> ι.m'''UU_vM޽ <a^w&%m9hHLrnߖ|[f\8;)m =YPkVXaϐ0 na?.@"bbԺD`'H8Шj[> !tQpXTt]g>׵'Wp[w5~2[NSZR$lTpzG/""iFsMcۦl\f!jku.͊WXa0rFXz%')YL$>W &q5!F)^nUR\!9f`0Hyww ^u<d2>-j‰b;),42M=TCBKjZ#!1qVYR= If3:`k_3,ы_x%ZaVxa,Zdc+6RTS:HI$)u"6%ݴJ( /"=)˲lu]3z:*`HbG?m]xwwWn߾-ryܔɲL&39z=igujCB>nlo[bkUZk#t(3HUC¬.s" +8篾!ڴ/t96 Z%Hg~E} ϧ\#"~>w4Mh& Pu*lnnrJxnnnݻwyя~KQ7FH+BH,Da!Hm(/|m+TntGlVAIAl,x~:z.}VXa=IG4,]єAl(Y"V m #(1&KE&nor2[tjU@<<9== u]K !s*,{{FBm4#D-D]FƘ.;m. d:Ⱥ7;\Zn›bE0,v%"-CSؓL;0"; 0Cה+ISR*&ᤁ7t``Vò튶![ֺ1hc|4|>l<f3Ԥ+N*m]{Zϟ(EQxJZ8Z(,CrrKܼRJ,u,E ;۳ 8Z2.i/ [<]c†<:Z7ləit:uZh4J) ¯V$/NW||~Ӡ+h0l6繟N\vUU( psTy*H6SN~;sJ)EAq?_z% |1+g!$auRʻ/o6i!GnزDdY2 Yk}Q.4cbS ^8}=r Ӱ6wϢuS3_&<}X7ڍF#oq$s\.ݝRt|⩻^E;E@*Sw\&@[nb~ seqb9I-1FC9ZPy+eYt'`JRe+rqSXYv"dAO*ަiWU?֚vaiey,Q mDMֺ,˼1Ƈf`rJ)MS_e~_7Oiv_i".˒=??i6Q1UUF5MCEQ~WEQq$4@Ip t%|,r+͍l6s'`24C:Ki8Nħ MUUb Hq]qI%z5. XNNCVH8:Ia2,\>C1_X8-`w:b+b༌y82p2ʙh} T}X/ݟe˰tc/|c˗yÓMA 2p2ʙKZA%,òeX!K{.o(az2cL*dS>ǐAE_HEˤI;4 } WH8/ c1n!a M~.Mx/p+%e 2p23$s$nA2$sl.px/C BNIK\+rAߧ/ЗH2$۔sKM,20𲈙̼$k8%qADᒵ//o ΃]IR>[ [A\iBk ^)o*׼~-  Tԛʗ)>z}g'^ 1 )?.x*dIENDB`3-3.11.1/doc/static/web2.png000066400000000000000000002031471503346766200153630ustar00rootroot00000000000000PNG  IHDR.)mDbKGD pHYs  tIME   @F IDATx}y`{]$7$!D E5,"TQ6\ӺT+]p VXFEE-( HȞegsl!q2w#dj,FWy5bakO 0V @ ķPt=MjYƔ SZݺb:L-QJ\D!"_<SCt'Yj x+7+ǔͿVYQ@e,>Ro@mu2Q?G#?v|[V)%ƽh\jLI,}MX!bSoə'}/|RF5DEYa0`@#ŝ0`G#t?d #NJ%lAmuvU 4F$)'6yXd$PL_>*8'H4,Z3u"]?G#SMUQ}iIY Th e Kfv87O$ӓ65kK2]k)ǘzQJfzf,Z¯ Y<ՄXv 2 ![PP |O$BG#ciDN03ܺe[/%ik).>&7;„3ڦʯhyi1s7-F`mUjV +dq_w,b&( l1ۦO<`bĀ$CP&Ll@(0@"WΖ0zrw+MQ*-G# AnVHmE0:Z'ygf =0iDnVZMCv\*@JU4wnvZe؋1J=iG0Dx@A dƃ,҃bb~5 3>D,~^BI1ƨ Θz1T_DsJ0@WB#a09gIIpy#q!( 0j@(0>>g?G#o?0!rh4DLy0(0`5MMt7W4 }5ڦHvZRM~/d uI1.B{*q~1^1G)%,L^)kӥB2A˜{9z ڌ J&L(aQӣ<#?ir `L1pkSKpkrZi+pS3zң ;P[cޭ 5 9/?-u{2󇸓@"JKʎkwAP>J 5UPb$QA҉*żAZСd܅KvԮ #?G>@cOJA#73g05 M8+*ƿn?rJYjVAKݞ=w{w}ǹ%#`_v6VM(JB1f)Ye 
!vAq-uCz';Nx"G#1FP3ƀJAjm8e9/1Pb (s!2w= T~`W0ʑ|e4dI林o? OR)*81%s'ٕDMGaOs ܃IE P"DeZ0FLe#|X&*w*EG-wol //DB֫C#7thIƶ=&`AdǗgR0Jy>(X<1hS\1ǟ2.lbB7Sr% K(xM(P .2*JR`` N'2g Kk1h*8꣚VID),G#{?Z:[̙U6(ialwɟs"FmpۗٗID1ڒُM_GxMI"!?|jQ;_L,HJg͌K'|I3+<1)h-%hmB-Hkiv$ciF1 o;ޝm{aMw;)ݛwR]ISw Dy~lUN|)ǫ΀B|k`@ȴ&h_^x3# '5#w\ %8 7y O5eTW`$9#ooYZp/-9/Ӧ\ńRF FE]d_Ò2ƮEf9R&RX~c(*UPceG" U'8wI#N<&"Z?~ZQ/$ w}_BVq5/($#}Z@AY<9tJ4@Cd"c jC8 7ON>6`,tRLpsPTG  MCȿi֩U_냆C.M+X\'N| Yljr3')dp#s6gZv9?{7]mַ{hhj'ݝGA' ٻAV&= `'jb8g > )f×Ie̲ֆ9 [ 9c҉C\&QU5=(7u:cѢE+gϞZc.ǽ4|)`1\p7xt2R?RV3d^1n8? 7ܫWT]1 #{# D}#; C-u3/ݩ5 ,x.2#s\.4/1c$tyJ<.T_ϊX.т~ ljy،zo#TQ}fN9*LoԦE&EK8Q/LLWA ΢:4go#{cRIE*(V-icxdʵ@_x aQUP0׽k/?a֝Kv{t+~fU7?w- }{S^e|q=.x,fM9wi?;Ż9mϘ3r؀#MǷ~yj0vQMRe{)]F7QU-!螫V^1 1 vao?ӏΛW=w~:4iΚxRފQj3'`OlAX mN4͜@flڌv"5~dQ"ފsJJ7/&2'U{3UQVRFE@>`TfȖ?Dh?G Uz#6V:P QmK  ѸꪫJU<~Ɨo5c;ƛn݌Y߻+?!M"{}9?j;}Mrlj7ƝOK6wu?հc5좛_7̪?{dLS8邟7^ݹauoWkzdȫ'3E;aأ!3I`s;s "K]OT=.K 8hL+,_4 9Kӫv8M8pV͚u1cF4OmVIc d<IjM6p$O7M{[bQ{T(LRЯN_~rͺ6]UX+>Xf3cUWDy@aisx("WFL;=D2ʙZ]Ȝ*wLRPwtW0M{%+W4iҔ)SL2iҤ+WLs HZa:V]pw*+ï#!FfFierɻyCʴL ]G4_7x̼v^Ud( 'T#ot&(ew*f1i ]vDTL ԅ]y t?G-[z=Nc]hhx;8JXI7z%J X^q߇Kv ݓ/W,oM ?NG̴x2đj\6@׮avgeeuY8ӦM+***++k4b楓G,_ԗC>B _s:ߜ'^d νk9p IDAT!g_Y;)"DAHw#SWI%~Jв((a|/ W?Gȿ?XT¶mò,۶#eYeH$xܹM Ԋ5Mc{?sst8GkEP4<OB1@ 3wuinv4o!0 4 )1;id4SO(?!xxDP" yqQx\"bTc9_Gr[J'0\6uDF<d'6MZr2d!MED"sB3ëvw4ۜt࿿ד'f:bzS#nZQ7Oy=]]ۂa q7G#{=%H_6L0B Dͣƒz({:W W:.I"ǻU'Z>SQb8FqEW1;߱iNw9Fq'}i]3 aOZvgCN&}.o]{0;ҍwLj/_Abk~!?ѕbV9|yI>cZqEn'wtX# @3ǩ.L~P^AVX)TuZS5GiIxPI}\.#=5t@N *OJ{WƧ#;B͡CL p?[{5( :At:wXT B;΄Lfs*^SݵLNѴb=*!VmC#߃8 J7I Ǐ-*0^UVr1TUW.9x"GQy;L=qlSA}Wo^}crʘiG4Btl Yڴ>@ %LbvXQj"mc=y"L3J2*ĩ)*DRDE@93_hd* rFUT.xCt\$-M;1 G 藽 [@ #L=5vxRZLo㩧hЀ1;%zh-x (*g^ DUv#QF"K1'4J͡9GCd;yVN  -DO+⮵Fӎi\ WZĶ$2Aׄ4ux=+O.%K,uj|TYI3U44 MCд^lC6c~VumI&.QblŧN9j^3QNI֓MxYM!fN'?P:hdg Я2O\AS-? 
eF!c@44 MC44 Mfe{gDˀÐb5IE|)cPxLj o^G%P!*O S&+Q)4 `LP7$T pihihд1Y QjX1j1ffyYtsTV/bgItJHV*AupbbΕջz0zP_892=n30 CNWE@[+KmE(*ơW6j#V b'ihihZ7L ,#(71E)2j3-K( Q9uׁȘ"Fy`ѹsS&j$pE$T/a"{@:N^fgV{w8QɎ5i4H`Ę&19tXWLcݸjhihEe"@s?Jjq[GcVҺ!Vy8ULN0`.1Q(0ti(UEBmL0b @  PO>8zd6z6jzN" x@ITOP XCRAmR3㙪XT#82}ZP![VVPWC#iWL+) #E(P^j=2Z5*QThh)uCe+*:Fxd*`B.|D9KEx(B{L @ xx<e#>RLCcSC*|BTyx'~8BG ƗI,<Ų%ILW#h0zG]9mpfwX3RihjHt'k-'C4,NSo5!24ON YޝvA>!һLc6@ 0^y-[֬KM]C#R{*#jp UשNĜ`LԺԩD)s*D ~9o+|e\ӯ~}y}i/]:i1m'8 X|,keء:WRMyCE%yI'ڵO?[^|/z{x^'j3(35;bi89NZo>㌱|`PthOQD+,777wXjC@,ܾ}c=_x^BHyyy(:t_?pq<5&N7MH4kڃ>{mڴims \-84>" 1]6[ikk[p᫯6eJj,/rN7Ѷޱ'cw-<5Ƨ-|u #ea?]5-a\v!n!dy}ŎVDRJ~4qTl["I}?^7Cƕ5s43@tEEE@`С[lHYӯjsi^ySLq"'V#x&0gU*[K[oOfffX¶",g N"zcT>N`]ng Ԣj_++ x[n O[o;>^o8ĦX~}aa?0`@ 99Daa洁q#"666~mv][j?۲?#N86qoر'|rNN΁>UV@ `MƢLPxfi=кuHJJZn#{q9#AvDl/lg/f*0eWq^3Jϻx\}ig8?Ȍy Ϙvv/xdgx"AE naϞӧO>}$'(e@ix5ŸYf\?@Cԩ85>3;n e˗?S999IIII>o5;;R=*8N)r.fʖ˜۷ovg222@J `줤+1'[Rẕ 8p;wLIID"a04UX)?,cC=n>}|>0|>_>}֭[q{nh4~$"[1Gy/?/lvʿ?^[W=~ܕw>|ʝ?~s_qw;P|W?3.ֺ{U@2KF_x~ØlW{KӦg{U'͙&@hk= lo f {ޖ=wCs͵]`|z/J]uvP7d.}p/q瞘lO঵{ X@ :?s˲(C cǎ}oD"۶?{,^f?<3F8egΜ귧F,ۖH-SD4g`ǎELRF=88^mU5’d4ݷry|>Jex$!b}Օ_efe=zȑ C?LSi\+i>b[o :b~cFkZMl* =8^pn;9߮bϷg\q[QWӁ}S1OloϏK: 7Ohn;oBd+m~)iGoWm]a^2_+JH~Ͽ7feN|5o}0I.4lXs1֪51UK?x-947lX_g`~њ;v'`#Gw߯vU\\9) F<-"6SYDI6̦uFzW,ʛvbwߝNijj#>BRRC=TRRo ڵKw*B!ine  @ǛG9s(JGd{qB|II>hN]2`cG ߮))vZ6|Sg\#4.JM4<2Op.>M 0ah#iduH ?˾1sV0'vߚ/2󀺳s$m. 
E3G͸G(n|uܫ.پvkm3@۶Z*vؿ%?=ҟLp\Gw݆?lE r[?+o߾ݶ8ydXB5SbDIM'U>ceeeń.1ƚ֮];`sL5Hiniv\ى]v$86MJYKsqڄMLӴ,-&NxFI BJ*D/"38 <׭[%AFsh֭{:Gk4tT ~mgƺL}(b|?=׾)Z>^?'^m<J v3 Xi$ނ SzMs0uܨ<*C'Wxg MkX3l<'2Xp[rzTɾEµ3<7 c[ƪn<'_bw}(ѐ G gΜY^^m۶ɓ'%%%%_zĉ 45"*AjIdco,]lF{?e-_|ʔ)фjc=lq*Q8Syx㐛{i)c@n O455v\<30!aC9JMK-N2P8ÇA)믩;pE R)V2]2ۗ.]r B#F?1bD80\.ҥKovn4/*e"ߞK.J1iA;+, wpn"OY%kU?XWf Oq\C4۬rRڞʞݦ\,x̙STT?HkkmQ- &D^(&W}@ndо@vv6r\rذa)g.Ϯ嚌L5smmmmmj6ֲ;/##3|#n wOJe@޾ UЁ:0}Ri浻Y# w@ „(--C IOH#۲p( ?P( P0 CP8P}WL CĶFms=v[[[·z(7/!P 3E7P+sd{T.EG'''Tx.>YѸR:s%_0m:3򪫫kڶ}1?&:TQ2/ض}TeQ=4x_)0[Ȯx7?K:湹4Oz;"Av=骬=*Go~`Lܾǎ*?<'?8(WC:$`=s Ɩc~Ǐ,z=19ޤa%խ~bފtn_6lz{UmnIQnfz+\Ww6'fi[};~4ݸ_ @Ir{WL=cPo p챎ˮٸndV?}??Dƈ#?裁 1 Ո6 3 .]#4.lLlӞRL[jɝHxĈseddضϼd&hFqapStdyncjdĖur˻+7|чe3)g[T7M ^e={ZƜGkڅ>%oV_2ד=I'^K;.z)t/wț̹N+'˽Ut>|m=Kqț/y_vmNduS@qm_[RǓ{KimX>MpS©}[|e/S|B8aLT:щLD nRqƙ3gg|z~]tEɣFӟ> s̡pd{zRfҾ2QU_½{~_ǎ;̟?0`*g#۶SΘqqG*JS JrYNH0p( ~DU,($`[M o3]r &3hРC6Ԥj4;i]PWWMWu`IˌSZ^zm%+@ o?o{nǎG;v,Itz뭶:+^yCdN d#|#$#ݹR$W%qQ@4lb`Զ)a"gZ:s4ιAfZT :o׋{!++[nNѱ{ 7+KDP#B^JD=}>%ZsgޖL69)q#OSuLM%|@D=W#pa􈹺e.QybaN5Y~[)~*0NK T{ A8(¢=8Hzi.e6~9^!(0FQFБ6K(o$He9S 儶G3u.yw$sZ}bnIKTE@ 6RP~{iegoټ@ :l-eGa&dSE"+3 ĉoVi^@'z|39TMe&$bZRHr3j7D57nhS*"Jѕ479%CL>.AM 6ڠӫihi3pii99`XL>GvobL*eδ+x*JZ<@-haBo%KeG 1oJG!L ,-mnkkSedM'{ ۫tm;UWqqz}9 kihihavٽ!V RBcʀ,w11ZfT4iZU5Su-/1k+oFQwT+FeӋj1c"@ihihkZo9D]˖Da{YTlɜJLܟ*1J$Z EYغz3&A44 MC44 M @  OJ(K3ge@ ]o>ӧ9[a#@DaѢEg>mO7@ o5{~@ DOyC@ DoJ1@ b@ -B)@ qt7P!@tQ!@9 1,@ =RĵcW @ @ J1@ 0V @ " \1b@ `ҤI+[ZZbּ{,D 8j@)@ R @ V_b֝ !p?4 @ +֫uXEol: };־Ģ xn= +zWz1jV-Af/ @ϮUꇏ B wN,{/@ѹ_lZx.<^VXU `1@ P!~=P6n#{jV v/ة\UlͶ}-So] @ (]D>タn];!{œR{j%a\`@ P!:ʹ%+ eӮ(Z`njU=ƪTuwL+F@ >ŷC1|o֚ooaqIIIIqq'Xi:%`~ao'mz韟r̹bY {Zlpe6~pUiv6(>~aUȝ0:5Uj#ւ k:ClZbզ}-67hpٸ }͛{y[f2D5O<|@ƨg-Z>U ~يM*?6{տYU5`mKk= ̹fx*7\fS%((=^R=[W,]6$8$XZR|!~3.8<`ByQ+}נ+nYlvJ5jEkm+nw(^o9@9 OJ9s6Uq7nx@ oW4}lXīOQaܴ? 
}eU֑ݰglmwVmZ<ŵ;jCݪ8MKx~ڐ:O,Sծڪ%/oԳٕU{D}?/qw4"-b>yd7p.X)n)RfVUZGVO sܫOKWE_i'uKVTYм|O=wy&pg>#Ͼ#=z\I%pp6`Bt5WoW Z*+[H).Ihްm-)æp=̽vl^tk0TNrŵ^=q2˖UV.q7_;l!^vPï{Tg}휹s1 UG6lX'sL0nxW2sܛgD:R_.YQl4j7pxZ6s1֭ӈ@ wGkţ'HPiJHPe; y 0`)|cNJ*ŃW2~I\}|ْ;m 3G?ٷ:{PnP'Mq cSKS8ŬM-9,j7U\&ԬY=`Ҁ +8>޶*֣*`yyf^P$Z\W3JyhZKMURS}> ݥjg{G^VV|ΠT3r7Y_Z䕌6㚊nsC B۟AY8\b?m\"b|V+w3KbN/`U1m4l^ɔie(}5vfخ]B'lhBfk y%}޵C՛b!yy&4WUpwK|8Toj'-DH-.ͅnnvb҆2% Tg'ݥ;$.\`&11Wz坥qC ;#ż4CЧLӔ֑ݣw-i+\\sG!ݑb=,Y~Ѽ7vVٹT ;k=++#zkU\4pp69ok-/4vֶTT55+yDSIq2V'GTMTqC Vf `uOV[U+W{VŢj5Im[R٥*RKSޭmʪjaZX-*+}5ଇT*@@o,qoּ_A_}wR7ؕr\lAc}=/)nXLzAVX5bT9V'κvB|!y.5k*C œUkz|=gUӢ8gNJUsD+L.ݥSW9nKEJV{G}E n@)UbmXtCU}s}K6؜*2*6UU8W[Z Ɨ]pv`dáf{b-]*yj,J('|ɪ`s?Gy8d|U5[W,^G(z)&'kׯZ]S]]=vqK7UWWZdp-7 @ ]=9uW<Z6:3r{p9-;럟 gu} Jw_S6VA\6G,f_X_ь{m}zVͿ(**jEM=mڴi_D1D1D1@"4b_^D1)Q Q Q [xsQǦ +r((bb((v*ɓ'q< JD1D1@@D1D1@@D1D1@@D1D1@@@D{UN>q8z=bw*o'./Q]8!`7Sv ߏ}Rь-{jɥϏghy.ŋ,Q g"ƢtW0,6);/׼LFbYeaaÿz?ЌyP\z63 ڮg-9w{v?Lw()QӕN߿3tBI~.N5K9meӟ p1EDupǰs,)ʯuQ W_IRND"ezh/"ߊJ H'X"܋>w!HDT4od9i""r:H="i}?au/a/a۰v |8=9-.YRس7.'ކK+=oHt1!]YbaF].6Y*Zҗ2_\1y)$AgąkrrB!pHbY+HRn->𶥳$1ED4"̦KQ:qA%l2ьE,I-tR'.3*Z`[TLDE6 KDż%?%uōAMs.ADٳ 9GHb)yϩr>s (fk\ʷV^n)KVb~`rҲgPHt1g]0qr\[4̘;{h9LlRnKfĂ$(6xG,2_0 hGjõ7QQZ%YǛ/wK%gvѬ%Kf M.fliI|gs)!*^dw.!(sY2")9K_uWEDTrzY.Z`]i={ K(y`({:oxw "ʜ}ЯrXaɬO+?:mqz(s*"̉Hb@mK }/~?:r%3Flog]?ҕ˙,=5[K.yo}I?\RdDD3L07>/bj.zzଘK]Du2M~f/]wߝ_RDDYU;ۼܢi3˿˙nϭ_155fte2Y2>mp.3X㯌3z2Wr^=F t.҈$p/Xֳl࿪!"bjy ?iYaSXV`-Ң?^4Q]ڸthnCmE7SCbwz6/zֺHUfNkeB ruh-^մD"kʪ} Anаձ[Rd*} IDATqכrg4;l.˶ES*Uh,nߐVxhn:VRr^嘻բ\@G 5qkn ql5*؛n@Z Ȭ\ҲA}h6?atdS#2~u;2k?VۄƝvߺrusgww.e!qns!5QWl1;{;b3Vm^jm85Sh"Uf3ϰ_Q싀X/Nz*x^Ϙ*=-Nc^kV@%;|Q"۽ķm?8sEbSz*|LxeccoˣwFv N=:Vf++ }b ,KpeYfἨxf1Sm8 k*9X!"\Xu;xvjHDYUg`sL=}Wd"U7>Ļ;Ek(zͤ14{қayha}l @mE*XT6M2VT +ʡ`20#f}3-ctnx1R*n]Ĵ;RkbH9/+Zδ8S;Ǽ˗ bEZ͂ܽ{3ǁC)JdڰS1 g۰Ŷɓ1l eY-)R ֗1M͒:VTuYCsW;jJ6i컯PRuUJ4w]vwosPPS βҽ#x2F3J%\ 1'ztJ:3ɎW*\ܫx~l}ӱ2oPCCFLTrva&W&9mH &ՁO0FI'}TW'&8nv_=Pk1Vw M\IQ`+gv8e3٣kUSCŦ*fboI(ZumwWdU[KRz+_nL*D$mi D5-|eCd{MJ5UMj{{tA,³QӶ*"QT"S uu,ZC k۸9TilTZytr]$l8hjY*z4Tu#o[~ܹs$C] q 
WOJj"EeYJUR9wQlw;ѳU{6h&,ֱks̾[^5ݕ沙zfN_]ٵjlܟζE?Пi2MU+֌~H9孍-J3Lg͙ckM>c2o{gͻ=]&9;wufA 75ޘF hbJ TpVc9[3Eg^ sͳFOSZ\.s\./^_ÿKw_B#*,](k#/,^۷oŊ7nȑ#G=Q"xNN>}Z6H6mZip~ӗm*)֝I*r}cl^38)_ Z{݋bw.j@@DIIYޛP_>R# VgPDȡ~G9rTك[7(h,A1mh35ո أj Vh4VpxRŰ60?&1l$|~EQNpXŦFX$Hbb.8"PJm{>.b^6O1Wd{w6sgHeYjkAD?8i&lwQ?vhEu-ރ[Uclx4B$y܈ (vT`#WV:b!\J9dV\qU8cDUGX$&Quy pouV1LD",n{^##~x)ˊqI-,E&닡&peQiN {yL-WTz\F])9θ#" w:ŽvW;;"kwER^3j ݽ1;'u'bQw\"ƹ}:?yѤ39Y56706^%XcR Q%R5$c}y܎*R4LN),,):1?1>3$qvqUc`Bآo{(pKò,|Hő%["X06YSb8%b8'H!@zA[QfX\^XV x͐"+DXh"m!v/h4yO`,hcǻ.xk1a1m}rfȈ&I&bo(c|r9,7BG`8 +9\3^1`wdwHB( D$;ws,Ƃ^$"1a. kFIe)2MMa_js;p@qFbP111qW"v63ऐMxΰĀ+D+CR,"q\,C mõbDt|J|; ȝG}vV 9߳[t' ٭$VU!1qVq] AXcJ!g?J QeOs=1~]oz]8]J Yq g v8S%I!b=ym.Wf7byE?7CFTY*xWEe&q́8Z$uxO88X1߅*"°,ː2݃GR\"1J)Xqu {9ydw, F<u3*LΗq ~Om m˹t,7P2:UtIaHe0q@L%ID%N Q6a,n "./22{=!Q%Rİ"GJ X9;,*Dynofg&B~D[Ȳ,KȒ,5p{B(=wx¢,-u1,%B7"ʲJ#bķ\pBe1S&/,9vE@)rE-<h"c'!{oǮCt]?uwwwuu}嗝.]eŋ$={V~i[[Ǐ=Op۩fNqݟLHgZG{¨]OxLgO_ts}QI}xmϻ ͼ2]tyKњ2 +M%DDgt`7\g({gv kk̆|]uV;F?f.)OvhX ;[c2?IftL?=7|6^*t>7Ygzteow_\ACCo|c[-[vǏ}駿EQ<{$I/^eҥK_~eWWWww\.w5͵~===y\ի\._\z5sトD p稲Dg]&le/^&Bn˲s?D%lL^%.'[bōûrȑmg%^jӧO>}zVͿ(**jEM=m4-8$p7b"Q<)x.6ŚlF5і*8|2kWm$ջ|u>Pys895Y:)&Y.P[=S?*vPez[-e u}Enx%OHZX>tJ!RI6=pl!b5B J#; S+^heX&#g1,ÕyZdiy(xf1Smm*9X!"\XŏqYUg`sL=}WdLZ~pKRRZh &fOz ,-k;gd2@nՖ-N3/m(˞Z+ln TW<ǰ&{}@~xuϱ _oMF6U k@PvOg[Ѣ´YT_L~eku`UH9孍MhF&e7GX=S]NYO*aμ _*ܴŒ6|2gnX"k!Z˜y-ߙ3/ۄ9vH=jd>xaaޜy}?Fy4y]49eaeL kzzzr\OOիWs\.˿zj/}i m3L5,ѐ 3h*Tia{7 jb_.I][+yhݻJA8*j ߹*ѾQ vW<32m{Ⱦ}VXq𮮮9rdۻ%iWZӧOj/ZmQiӦO6M=01?#]M&={τ0  roCa1-sU1*jZ՚6FZ; ׊  ]s]+[`@         }D;Wɓ8rGyQ G@%         ݧ/J"4Qw޳l6J|EP+(p W.;qb&KDE3fR2rg]=DT4ðb V;w4_&ҙ1N\%B<3Ya)+ ܸ%Ñ2#?<5Wcb9)Ň]f2 %ڮ_{~ie ŗ0MDEmXRN>|型\gK 4u=7 '.\mN>i""r:H="i}?au/aX+5/>5B@!SVLh.ǣge%單߰zJ(&R&.X̒OFZn2#N\P+,1eij QN>!ьEK8mN9-vͲ{-~""J'A9 &iZsKmnؓI?aE@Ar7!Q n|lJMUpzI{c=WnCGm 5퓣V빊PR%"R9XF? 
tZ`8侖GÎ,[.<殪DLyHuizWH]1SClUT!RSM}˪#XaSq,Õ9} :v6]Ml}!lZp*|6e7Y;**؛n!.l9ܲ͞iDr77=&C @` e<2|5T)5;ȥN"UI_Z@ĭ>i6rF e3*hZ\[0y1m]9G3d)&*!"M %=Vrc/!Zc"JWp[]$*1xXb« ;R*#DgaHWb'10c;yaYw**c~$de{ソAbByU2vPqhax )Y,˯2}*.S.#)FI #>}zook&V1Ԛbh8*!!#&˩699p"f[lgTm6aYFoyL9lUBt)Ү0,272fw}d7mۨG?p"w"VdozyPST{774[}QGW o2 &`2(bO@&O 0u|1uF#G0MK2fڅuBIHk,} eTHz#"IΖ8w($6lWB[ɍzS8)&ϳ{ԧ_q+,R:)YL ⫵c)Ynnz.QӬe!بjLlΖ"S uu,ZC M Yƣ_xڑKDgvCg7S}zVͿ(**jEM=m4\?-~woo 4_'ϖj1oVL:T`j H3}bgD"qBhitt;%y؝+X}_W ^R-ٟyT삽D[hO%Am05 %ʞ?9ieUkt9O*3_ϿW;u}7yA7O4ѵk 3?o޼9?vzsNfYx3j8:UmZr':Ueh[[nhh؛9kyὴ"mMpFZhLv;f,[\++BԊ],]}r] `JAF>^#YGzu/UNuDkz{{xgq2|\yN 9۫[wy52w9dnJԷR|V`LM:Q֙9eXKh3Fex/:qy1m^9GkzLY9&%-eX+XʣbFK/({^%z`}ڵkJ:q.Z$5\<|;w31{Zo/eOn}f g o#;͕lr*w\MօK6DEU%ޠ@M3M}^ywgow4(Z1@9r9R˴u""5%=0OChH#Q]k CS'?wyu.b 8#kUl=(lr*vص* (:t eUE?SCU(QfQ(2r2!+X,Q!O<*Tkt{*(Çc'˖-N@ صkF嫗zlkQ?{xq|}mK_Go͟v"ȅ,\Ko~??6[wsjiXebt0=.)b|{շ1}b.+_]o<}S}/6oߟ|NmO[z=kݮx,NCYeHm6Z'p<_VXSU-_sG꥝)YhYǮ17So7VP`*@o# |ퟭXbC~>ziZw,!+ŀ7ZD(QV;'1Ge-oFvٷoxQ+6b8pbY4L8]5`4ױӎ:F0UAc4 <сy>Fc J&ߒVsO7 Ql(6Ir1R ISVS5sG$-$u 8OJN+[pQgu xGǹ ;F>nb.8"P2MJ83 gZyfJ75tZ8FaX+DI&K$#G=6N0B)h|{czUHY536Isüe@$~ Vٌ0mո/9Ax+,J'G&Quy pouV1L*E<6h4 8*Q&+qC`5!,o&_wǥ U ͯq=$O=*JG;06DBDիW'O؟'Ϩ۹Cq}٫!?w>ZS[QZܯbb}'?>+3i/ o=VnxJ*1WǤ:Zbwoog̭Hm87YM>ӗkI$̬3yػ;#n_\%%,M"0D h 01D_]!ǡ3,[DSSSQQW&===˗qD>=C1"䵲DdqHL n?ycncYobȣTGv"VNt[)pAyJn_0XX%G3 1r ,B>1c&+B S!;7MfwD<곳b0o˻_$Q&"9!+1,C]7e&'g!Dɍ9"SPJ1MIQQX%RX$PeRX gx!#Y%oBMVEe&q́τS&yy,VĩJ1l>#&'a!7 ]-.}Y}VJ}bY@uzHG:6{ܗ8xHB_gPo1r/g}6J 0!_oѷXvoX4 .Avvv"CYH(6JW~,E Q(9r9 i `a,0E(FD4rbXWWHߞᮩLR" $U7]6024vJ,G3ºՔ=M]WFq̈́u 'umD5 31ևpoAXV7߮r_Y~ɔ/=rݔ?n)W˿uW_Ѡ(Wcn糩.?8*$IJ4;*qAr$=Sɏ;'cI_-wg KtQlXTu~?yDT<,iQ7֧weTN29ދz49}7-gy%{t"ku`|6IHĤp<xiW-Xqň^tuDT,R(ݙŮbPU<¯0h/~5tĸbx~ळ,FDb4nH U=tͤJ8T4ry,9[Ӵk*.v½#fAD&QqŊDgӈHDŮuJgݴz:θW}3KwF(Ƣ6/F?3@Ler)R̡y//K\X]OMTSMc7#- -&7ң*CdfʃRgʒfFE!AYxMzlEX]ٳoj&7_o-===jl@ق07{ fd|"غBkjZRpaiED3jZ0Sj;-OGDYXzߞ/ׯGEbǓנSPy-Z3?w}\YkÏiZ|KeX>x-nʚ' DD o ]=XIq$tw6Rʈ%uMI86%o}_ao{{uz"QL+{n {^~Yj IDAT׺dn}%E:{#GoH\(JBb 
"%w^]OFDDҌGNTo`D$]ҨZ|K"ؠ?p]mu^u),uoo#V[xf|-o`ЊXde~rEm˶򢧒*5k䇞htZZ(8:+VƗ_~YJKeFyЇ0p6T0 ԉtj4$>pi|Eô'ָ؁kn*]B7_Vi̯B36[_A[Y9fyF&/7JXQ&3f$8-+LcMs@DA5TW7 DkZ_7s"ٳ>P@"@JW,#իWr-? ةN:ƍn喍7^@)Sղ%.g}&Iy(pRACVG9[o_c}ooik*xup*1A`F  Ҙo_4B9X3Ԩ%FPlĵgb1')v};(0Gy護vz6"_Oh͟GG>5/KψQͅ]D|޿\zuͽ~YϾwohmnX;vߢ0YaىfI,ms6TFDtg  ڰ\uuх (6i3o߬w'~䥩L~ȟݯ9e7^ϺoR/[o^S3zٝtnꥦks6gP29rUBuɐX2~ЧGbzUV9Ad D(6~[x5mKoW];{ENŞ_nX`jX,q9*bƾPX^M}?NY]:pbb݌1' rLpjñbOW9}쭡Z}򧮮_.ZrO@5()tY_!*|Ց=191Ƀ}P zFJjC!D:Eח>-Tb5_9й罺b _ݳZ3=#TvV I*9;aq/1%2}B9h{c w-Wl1w ¢Y~$9ގTl8iM÷~6U L w,. B$>DYϤ?8B†.2^- (V|O[]i{+N~q AU,rKW/]jym~/b ϿϺvok[U,q׿^C9 1Mfẅ́ɵDXl{64 <5ó"}1ާw|3kbfꦣf3氙i-fK;s)9!/iXAN=pbqo.NM^hZݻK9#pޞ6:qSf Ql)uO\w4EQ._;<1APhyjASٴlF' Bu8#&Hp6ώGd& tF7Lg j;O"r9_(pR"zT-e}uݕ,  )q3U$1QfX`W<(˲ԫr,Nz&0"iMFzR)Od,ar=;&' KÿyM֍ĈD5+yL:2,Ij,f\ηLI)MEƤHqxDD)MoKijOCeJaDwۚ(FMr-N֣sKznoؾ;X2/ jB6lT-ẌpRs.&cћx!ȧ4EbLzfQ"d#ssYuaI&G}w8|!9{f&s onm45 \jOw|0bfӖҎxΌdKy<FV^,li6 tF,rʷ͵2"&kIeӹh~0( MEO jQ :RkCéT#O<2^pX"&3"QSnx"*<ϟrv(hXri1]Tqpo!ta0I)9IԐo#_=ݐFNŎ木>G$"z"32g yg2"G5M  d#nBDD;X'1IPOdlHZpB@{0洛o҂ Yg2:(s$Z@Wilumh}c( ÈffcСj;cnζ-ˌ]< ?2߲arn-g Į!QlEJP^>b/|L?MFdFT=OM؞ಹcHD=`݋;Z1NFogsCLs}"~@8fyG<<hcq(v:p@CbqocݖmSDxoF\tY$*=2-mF|o'܀$X~|&x{!9pJr'29̬HDw|e1p/6ӥb&T!a8wg/8Ɋg:%*ZOR/Q>t <\sfJ{|z;>fFD.CkǩgCf3:OB:UJdƋ9s^~}EEŨιs~,\M,7#p2"twqvtXIf6Su?$Q ,EIrTDtڐHDf:+Ǝz)ocLaes4Q[biXȝnɩɌ'u7J< 1Mfv\eyHܵ-W4U,;|P<%#Zߺ:::{ݳgvDDwyEU{>;TZv?-GC3],;]н8sۼB+wՇ߻>T;Chr믿~ ;N髬iX|w`)7lmaX|rg]_--VZj +Z[̙e˖~{۶m~;u?cv={_߿/z/Y-/Qʬa}y݂^eoocV%w vjGoTm+NK['~7]LOdpmDDɿ?tܻ N?[ܻj+}3eeV4ND89ba5g>Q>6Di uWZ>/<3C[ox|k!/޵mݽSG{I5{޵Vo:*+к}+-g "ڿ7/XO놦]m5_W[u2?v=5H+ٙcϬ'YS]pdst}7[.HМ P.19jܢN㯶RJ=/od_ÂUJ?2KJoc$ԉ<.ϛ|Υh.]yo*sv~"kDc+m[_ɇoE{ſ6B[Vu~``?9;"uowT>'m wN+>Z?ʹޙ۷hXA;DXA^R15rD"^x&B"MpTDDϭh5G~xx;/UuNvBYY9 mQDxĞ%(2DN}~mcO?aUzz;1VU Cf1p7|JiV6lk~{]D:N82',K Qvtt.Ӟs":s_R&3Ʈ#2Ve`4Q+XQ{ɬ4LbC'z|'>oi-3  }~i2#"MYqs"P:e!dNX';vQM<|+RrĩDDm:s ;|gu& 3}?~=ybŊg LUU@8ퟹwp"UJ FVٵG~?O^1WڵG;M< XW)M߷oͿ>>S}k'WDi"5Z_Ms/9O_sEy?mx|Ż"緶ugME$ľ3w*;*N~̈*-jCDmtsu]vN} 
e{𒁱7ߙ;+:>:{eDDm:XѦ/ύߙPwf޺k׮ֶʚs7^$pztvvnK::::;;8QsҝT*%]G/ѺVoC|d|0LV^`_j|D4!e*++Jw*++Kw****+++zL0t;a\`  ?'U$^1D1D1@@D1D1@@D1D1@@D1D1U9s|pjb 58W Q `D 74cΜkjb0+Ţ xk_uXpvi]]]D4.Q؍BoũG*~ߜ]-TOlh4V5C!S.kR# FQ +VRs""ּeۯvlٛ!E(h9X,vuu PS3Ϩ>K"؁DD'"j>8_EZlz,ؚyjML9NG EZ#wϠאڸƘ- լa~iH:k诎ge]o~xS IDAT&#ADmz-?_2VϨm#o>oR޶~SFT3#\GDf\P(o=8SwQjX\C[8Dp;X1鴉bKʪӈY>,YRbsOgg'D1/4oo_~?zF-C{c̛ΖJ=Z봯U$*}+bWWw!^ve]v]zb)+v;F-xODJ Qfg /XHCDm[xnٹ3ZRr>2nǹ+N>ϧG[4tefhK,~04h(6&1 j=K F-~+bs#Wν5Bf}x[״ĒK>ʕ kՍ=i m^ڈXjp/2+b'1%D/,~L}h;/l~]/lLdDbt]^K'̔3^~{}K㶶Vs_iݪ3^g`}Wb]?!yі޿凿uF}̈ZVDT[/2"}bx*gqwdl鵛$QcBM?t8;[Dw _;S fYQ9t}iDѺED\$}?fƼ%e֣.Jct+bnk0beB,}  M2[{ה۶Zt. FD5 6pJ<'"?#4b2 ~PhYTK)cjE)$8dQl)~򽝽{v.ӹjm~ 0lwTl=ɮmڥem+=8wC]<#&'&Dc{J,=bcosLKTKh,qDkxDĝnɩ62Iԓ8yXP ʉb"c.S z$ '&J%"{J$wDrkM40ڮ JoxNĭiWٵzTa t6dNBψ.bqqCWVc T_EՂ 3maǐA&if8O$R8GtٖS%Z7\zѥ>DEZ}1@Y-@7ysV&)ŢTG͇R^uhͽYR*nJ4c15?Z80IzcHb(vҮ45O8rAcM{s%jDuuGNߜ^,ڰƚuڃQ=| 2TFd-l:u&i4Tɚz6IݹϦR2+H 'ryKP!Uל)qJkriJ#Lq|Ї}iw4GW> bmJi,@cD!̯"=VI.^#?4L| a pLvG gpLaD5˽dnTYE%bX.G͇RꈚiKI+};y5q]dE2۶:xfo.'I}oi%ꍧꭏEbG-8căMQ=HdㆣYNqVǦlγD?hZRɧTckOWyi&9Mr]~4h9Qm ͥ~#,ذ]Z߽mi}; x[g91LM9dQ`F^7,X(0"ID\윏{Ϛ\ r5/$be+%Zs@2 Z6YXT ^֗0v~H" Zʉ1*4y8 GTR5b," kHܵM;;A5e4gNb4px̌YZ<&OD~.vXTF͏RX] DLL9XJ)7EQ_>, f}pڅAr$zp Fʅ2/2d;9) 64;M8GT2ʦe3:I1DD"?{[^nki})Bp%ȺeA93WP!ԼW}:l1dAI%qգt[yҝR)>w4$Z`tNڄ$M6͚55GQщ8oXz2/uф˖-*ݩ,ݩ1a„ +0jMܥu\ +54bpشib0 A90p3vPbH;+(0 J8qppuuPs|<Q ֱhQs2@17LNqMDЇ(G sXމ\A`rX8&E39P0}ʑt'QR杻w޽s;fۢ cH?Q{q9fCbj8,$*1A`ʤDz1C˝"eLL 0Qe7CkgQEdo'>?CNPQkg;?nмWI= b'I%Id%%P;pvLATcҚDgKkYD1jg&|Z]Y͢~׿2%n&$2&1f)2AM{.$Ig2siYLl<" h6#L!;(k=/TT{Kkұ"wZ׳A䓔֌ÊTvߊi)nnJ Cz4Iq'(lϤxoC@:^F,)v7%p$(#$OI Nڕk6[WŁ~&vۋŽ9#HFcO$fZ)%95ۼ&4os;F%+41YjX1Vh-1r$">Lۈ=6nrqJl)HDoʥf -mTYR"gDƖ;ik)CeDL֒:˦sQ'`LC*yϻ{z6IJɔ0"1-M`(55gJܲҚ\x5(|yTO͍fL](-/Nj2#U=L. 
"0i$aɩgrP8a6َ?DT^6XGLxDDbL&S#3cLJ/ ImtwQ(K6U ^w@ K݋]?b8G#U3b80" r Éx.SeIa~jWQs"1r#i~Qk#"os֐c& 5T๭LD"0洛oC> )=g1I)y7Vi;϶tg&﵉jdh5=e_U楙4umT3l*"Qkޣ Ԉ6҆d@1"Y؇Pz:b\?=f7 ~]6عsΝ=;&43nJG΄Y\nN~&v2Q( #Ccc,Nah"xvnF gM.lfb2 G9 A8jmnҹĮ 0lwTl=qlGg HR%6TrpUUdntYVPXn=lԶn;rB* DRU.1ϧѴvrҾd}7DQe}kaȱ<~!|z/\oMej>`Q |}=7oTsw7S )7]N*ьGDIꖜa#M=p"b ^ R9VnmEhkX&߭]ej,LT&vb&lj') S&3fZ",wmNd --erM͍lyP3C33gEghL>tڕcQy RX] DLL9X͏]yޔ)SL9oZ0s'aYٵzTa t6dNBψ.bqg7wO@)*Tt ~Yː} e2w.w&f1h F3R"13tX FR-4KNe>SٴlF' Bu8#&HguoMs-/0hKuˊq s"f6>󪲳Og ;,=&QU'Gggg鶤;(IRU}?|hzi x|}rfXNFPy!gnJQ?Ɠ ѦMp\F55^WVUu0S/N>r"`ٲeUUUUUU;;=&LP0az+j:ePCbqg7,wNbíV$Ni"EgÝw7DDžK`ր( ȷKyBsZ׳A䓔֌,ذ>%V|k<*5,LnLmmiTdIbZ8Q%w=R]Cge[0N2؈nX~}N<ƆtbmyZ>Q"syS#7gW.PcObik)CeDL֒:˦sb#-hQмeiw;߾ywyD$iW{ZwkKS% 1(6aL:F$ k: "ySIi5COu(ڱ]7=u&e:]|ՑX#ԫIhH Ob1oJ߱Z;p¸b7W1|Λ"yH$7Zj>>;a4*#o~c6:_VotF}"O0Hb> b#[7'Te1v=VwQl7t/cIk`_?u=aaȚWa+{Lc]#gbBD5m[KÖ-w?}o:|ѡc W˗sXOꥧFOFQz^ȏ>Y?ޛ&r2;k֎i)y' n!Sz&ޯfUW >{/*|f5@l ʷ~Jy+ 5ٝٽg-} k=d{wIsɏ?JEI '?">Y88>=-r%S)$vIRG,ϳ|Ii?:^|6O .܁aFĝ^yFݳ{ܿhAL…\.'?OHL֭A ߏ2n ؎^iZ#k:s'{Z7#H[[[[[[D…H$Z>U `  <6 (ꠃQ `q <6fh;bطnD8sA ( 1dJlA)!kyD-N8iE$IX$jD!glx9)KK§ݢ1T\a^_ϕ}6 Ӳwuo?{ݻ (f IDAT&3Ib\sv9bZʈe'XHcR+eN@$B L 9jk OfCS4#rn,WnxFp[dI5MC9ID)/gM;UXaIK/*RAG{cT,W2Bb$u<[qBQshW4zzaEY,m ݪ(J=X 6Ɗ)YXӛ]v&v2_E5dIP5;vfK߼_ FYuj""[b%M+(``lA5G`?@Ps:(vA)0w2<,AU Q `qnͱPs5@X3\:gj-G{N0Ŷ ΈOnD͑+dh]M^ k+'Q"NFON}"b}rk'6ID+l$~rG?{?qJRgK:'G~@o|zwyg6-R#bMY1jSjoCc.ȥh`=O.3Y.bu-H_1׎QM8@"с۝Ui$޳oQFc'}z_ֻn[>s`,~p񷒝7_VS(K Xс>FMq72cѾ[:7үӾdS'ǜfȱʞ)1E6ַ5Gͷ!wPdh{,V|q{z.OQF$ku?꓉H#o~hrA}+EuQl=1ډrߕ7D[EcQG )rwe|Now|v 7xVW>ӚIڥ͵Ο|9%Zn#76S}Jүλ*d}"s&cD;`]*Ps|Ux$SGU"~"a|e.G;+owF7fa|bGya`:O)%[Ν6\r[rFD/~ Vfkra[rVu5g$IBbK7jr; oh's͠7Q9Kؕdie|`V11t6q3ז=!j{R*ѨΏO]Wk+=] !"p'HؓZׄ;:^GT}n8ڌh.=*mv n<*YӸ_"8v~9jk.\~ޝg. akط-ZS0@U+d5{dn9c3(֓:й1Aؗ-KhovrMHo[{2KO. 
8l_앋e7o؍LDSG2X)w"2gb@QsXCxndGܙ1x|ova[?y}<sГ b/ZS|gcp;Xebo]vgԱ[x2FۣG_]OO9%"AwԱ[;7TCEeC#k\nD?F~^GupJRH쒤XgK8P9j.}O\PQ/5 nLL|3{p?AL…\.'?Ou_z؆sCF/鑡f]t nj1ozi8OƟ"jqD"mmmmmmB$ Z[[#HkEKKK҂+(5Q>.w.Kѣq\ u 55{;ECr;7=ݽoS*&Y45#w#񃙏G2xkhppE@QsD1[s.j9l"@yv\%bb;+@@@GJD1(A)!ky@|I+jVJ+$82vґ6uYp0.I$DI=] 64b $I⪞)(*غd,V9˭ Uډr5#( MfĸV̿P-߽կ{k,[h{~HdXξ{WnEPN ;Q,z+,l0à Ӌn)QGbvZR0;8Y2T a鸢(\r _15S9۵qUSFh MMϤ{']{;(PعDR .WB&,ky,$iTc$1z.1"$+edu?p,W5NĈM,ٴ+jnŋV:c "ɔ0""̼ӕ0T|Ś !5cNrل™ZV>]7cZStDAo ZB%O%d"5M7ݜ*I{8GŚk죲a/jK\ Cp jj(F @ݵl$&+R~jJv0&/lRSowRq D╈QX}/Tb |B]zreUSTMS<\3 hYbsMc\Q]74&M7i/Wƀ 15N&v&̵ ȷ-O5b 7j){) O-i{gI'P6St+d, .-2X)w"2gbnVue9dƲu& rg#"%d[2n=bE Gn˕>wړ(Ys唺{=yvݻaB,W2Bb$u<[ˆ&R3ϦlR>[(VVN?g<~[ʪ(.@H*ffft055.r9'ЈAe^iZ#k:s'{Z7#H[[[[[[D…H$Z>U `  U%c\ o?~Sw1̓ (P ^8buPȪLfѓi =Sl2غrXQ1jz]>Žg\v=.Y[?~R2""1V9}D&ga3ݙGTc1yΔ"\_a=t=}پeD\7>=ƛţi7O?"Xs}害?n,s삃  6ߺ#xַa|唣]vVO`kaWwU_:| D╈詷+opSΕ#g$棗G՛h|sB{s]<ڭE EQO@X64r~aD=9)F &Q5e2ÂWG n1t6q\+x'?lub =Mh@[f׬{/$D\yr'".ݘA,}}jUn-eR6D-d B0YՕK_G :3s:k^ċ(ϝE*+JkN62Q~M۽gϞ×Ι{ϞaB,W2Bb$u<[cDxD_+~tl,yw\JkN{4J}t-ۍ4bff&| MOOLMMMOOOOO SSSB.?_|c _m[q;G788?ʸ5&0`;zi8OƟ"jqD"mmmmmmB$ Z[[#HkEKKK҂V1M((`^1P(l 6JD1guPb :*( J8ɢYp(D1d$ZxT@PD$"IR/׭xyJgld1D'q?6lgx*!W{n? &sNXxƈ}یˌOƻYfzД*謜'˕1ؖ&dԨKbBh$IˉU08ErAXHx?dDCIW\,],1gBXd*.IysZ'9h~_ޭb333chzzzfffjjjzzzzz:\ rw?zQGh n2lqW^459=@D-ΛH-\D"Bkkk$ihii [ZZ*ivJippE@MuA (J́pOxdh@yp%l[0EsߏkQY<:Ql rvd-bOL@@X Mfĸ젱rN׸$IS⩢I@{|7szyG{B/WO_e^hu &Nw'.V^a>gi='KldS`D|v[7'[Iࡖ(gFZww_=qέ[M^ k+'c/߿hoO6eZBϥ4F=d%Sf8SF#@9i'`}9'h]W݀T`}z_ְֻfDԾ(#Ҟ?3.Hnh@Dn [0%W^de\px3D/R#]z/tPϲ'zd3yJNo "v>'E""bc$@4BP$^'tqӣ9Q|"]`AW{ `3JAs a떝O(% =;]aX\;A.h""YӸ_\(Ը %lL bM$I-ըrKD 髢ƽ&;58(?i)y' n!Sz&IVreWJ)ݔ弮u,WKDڭ,NfB' ̝o\.׮=tuzx˩#n˕L%I<ϖqN䗲?ΔVcp1FR 泺WEi;6r{JP `IDATDD>Qő̂qF`uYeDS'YX +$Jlٺ3A񆱪6BkiuSX9eՆUXJ5mdž1Uj2Ғ٩.GEhQe헋7&T~99S ;;ž%[u]<@%~_V1a巓3-Wx`;y饗nܸ˭|'e=PVU[2~ΆA/6["ľ'Ͽmm_nm_Raܰk3hjOr$[Z 7dK/VI{\{]_2q-;lq V1h_'7o~dIϳ*F>-wdUɈ68o]_T頬tb@&m]<T_NOO~ō1Q{bRZ[[k[[[#Hu9Tr˫Ym+(g+vc؄ybE)6-Vf8U&~1aaోb0~ITU dWU_8;-bi,T]0Zt;VaվjXD1xV; O썏-(\.6֎<]+

{{.Name}}{{.Ins}} {{with .Doc}}

{{.}}

{{end}} {{with .Methods}}

methods: {{range .}} {{.}}  {{end}}

{{end}} {{with .Examples}}

examples: {{range .}} [{{.}}] {{end}}

{{end}}

{{end}} {{.Include "head.html"}} {{.Include "header.html"}}

Syntax

The mumax3 input syntax is a subset of Go's syntax, somewhat similar to C. It is case-independent however, so msat is the same as Msat or MSAT.

Defining variables

New variables are declared using :=. Variables have a fixed type, inferred from the declaration's right-hand-side. Assigning to existing variables is done using =. E.g.:
i := 7         // defines a new variable i, type automatically detected to be int
print(i)       // now we can use i
i = 5          // assign new value, don't use ':=' (attempt to re-declare)

str := "hello" // defines str, type automatically is string
//str = 1      // would fail, cannot assign int to string

Arithmetic

Most common arithmetic operations are possible. Also Go's math library and some common constants are available. For raise-to-the-power, pow(x,y) should be used.
x := pi*(3+4)/5
x = pow(x, 3)
x++
y := abs(cbrt(cosh(erf(erfc(gamma(J0(Y0(2))))))))

Control structures

Loops are possible as well:
for i:=0; i<10; i++{
	 print(i)
}

Implicit functions

Some of the API features accept a function as argument (e.g.: RunWhile(func()bool), or all input parameters). In that case, and only in that case, the argument is implicitly converted to a function, which is re-evaluated each time it's needed. E.g.:
value := sin(pi*t)  // value is a float64, RHS evaluated only once
Msat = value        // time-independent Msat
versus:
Msat = sin(pi*t)    // RHS converted to function, re-evaluated every time

Methods

Some of the API instances have methods defined on them. You can call methods on an instance by using '.' as in most object oriented programming languages. E.g.: a material parameter such as Msat has the method SetRegion(int, float) to set the value of the material parameter in a certain region:
Msat.SetRegion(1, 800e3) // Set Msat=800e3 in region 1

Mesh size and geometry

The simulation mesh defines the size of the box around your magnet. It should be set at the beginning of the script. The number of cells should preferably be powers of two, or at least have small prime factors (2,3,5,7). E.g.:
Nx := 128
Ny := 64
Nz := 2
sizeX := 500e-9
sizeY := 250e-9
sizeZ := 10e-9
SetGridSize(Nx, Ny, Nz)
SetCellSize(sizeX/Nx, sizeY/Ny, sizeZ/Nz)

Periodic boundary conditions

Optionally, periodic boundary conditions can be enabled:
SetPBC(5, 0, 0)        // 5 extra images on left and right sides.
SetGridSize(128, 64, 1)
SetCellSize(5e-9, 5e-9, 5e-9)
Setting a nonzero PBC value in a direction enables wrap-around in that direction. The precise value passed determines how many repetitions are seen by the demag field. E.g., in the above example the demag field behaves as if 5 repetitions are present to the left and to the right side. Choosing a large number may cause long initialization time.

Resizing the mesh

The mesh can be changed at any later time in the simulation. This will cause the magnetization to be stretched onto the new mesh if needed, and the geometry and regions to be re-calculated. After resizing, some cells which had zero magnetization may now fall inside the magnet geometry; they will be initialized to random magnetization.

Setting the geometry

Optionally a magnet Shape other than the full simulation box can be specified. In order to set the geometry, you first need to define a shape.
geometryShape := cylinder(400e-9, 20e-9).RotX(45*pi/180).Transl(1e-6,0,0)
SetGeom(geometryShape)

{{range .FilterName "setgeom" "setgridsize" "setcellsize" "setpbc" "setmesh"}} {{template "entry" .}} {{end}} {{range .FilterName "edgesmooth"}} {{template "entry" .}} {{end}}

Shapes

A shape is an abstract object which outlines an area in a 3D universe. Shapes are useful for different tasks, e.g.: to define the geometry of a magnet, to define material regions, or to set locally a specific initial magnetization configuration. One can specify primitive shapes, constructed at the origin (box center), and translate/rotate them if needed. All positions are specified in meters and the origin lies in the center of the simulation box. E.g.:
myShape := cylinder(400e-9, 20e-9).RotX(45*pi/180).Transl(1e-6,0,0)
anotherShape := Circle(400e-9).sub(Circle(200e-9))

{{range .FilterReturn "Shape"}} {{template "entry" .}} {{end}}

Material regions

Optionally, up to 256 material regions can be defined. Since each cell is made from one material, it is associated with exactly one region. So regions can not overlap. Each cell is assigned material region 0 by default. It's a good idea to output regions to verify whether each cell is assigned to the intended region. Each region can have its own material parameters, and we can output averages over each region. E.g.:
DefRegion(1, circle(1e-6))
DefRegion(0, circle(1e-6).Inverse()) // redundant
save(regions)
Msat.SetRegion(1, 800e6)
tableAdd(m.Region(1))    // add average m over region 1 to table

{{range .FilterName "DefRegion" "DefRegionCell" "ReDefRegion" "regions"}} {{template "entry" .}} {{end}}

Initial magnetization

The initial magnetization is set by assigning a Config to m, setting it in separate regions, or by loading a file directly.
m = uniform(1, 0, 0)
m.SetRegion(1, vortex(1, 1))
m.LoadFile("config.ovf")
m.SetInShape(circle(50e-9), uniform(0,0,1))

{{range .FilterName "m"}} {{template "entry" .}} {{end}} {{range .FilterReturn "Config"}} {{template "entry" .}} {{end}}

Material parameters

Assigning to a material parameter sets a value in all regions. E.g.:
Msat  = 800e3
AnisU = vector(1, 0, 0)
When regions are defined, they can also be set region-wise:
Msat.SetRegion(0, 800e3)
Msat.SetRegion(1, 540e3)
Material parameters can be functions of time as well. E.g.:
f := 500e6
Ku1 = 500 * sin(2*pi*f*t)

{{range .FilterType "*engine.RegionwiseScalar" "*engine.RegionwiseVector"}} {{template "entry" .}} {{end}}

Excitation

Field or current excitations can be set in the same way as material parameters:
B_ext = vector(0.01, 1e-6*sin(2*pi*f*t), 0)
B_ext.SetRegion(1, vector(0, 0, 0.1))
Additionally, an arbitrary number of time- and space-dependent vector fields of the form g(x,y,z) * f(t) may be added. (E.g., to simulate the field of an antenna or an arbitrary current running through the magnet)
B_ext.Add(LoadFile("antenna.ovf"), sin(2*pi*f*t))
J.Add(LoadFile("current.ovf"), 1)
Excitations can be defined using standard mathematical functions, or loaded from a .csv file with FunctionFromDatafile.
{{range .FilterType "*engine.Excitation"}} {{template "entry" .}} {{end}}
{{range .FilterName "FunctionFromDatafile"}} {{template "entry" .}} {{end}}

Spin currents

The effect of spin-polarized currents on the magnetization dynamics can be modelled in different ways. In mumax3 you can use the Zhang-Li model or the Slonczewski model. For both models, a spin-polarized current field needs to be defined. This is done by setting the current density field J and the polarization Pol.

Zhang-Li model

When using the Zhang-Li model, it is possible to set the non-adiabaticity through the material parameter xi:
J = vector(1e12, 0, 0)
Pol = 1
xi = 0.1

Slonczewski model

To use the Slonczewski model, you need to define the magnetization configuration of the fixed layer. This fixed layer can be placed above or below the sample. The Slonczewski parameter and the prefactor of the secondary spin transfer torque term of the Slonczewski model can be set through the material parameters Lambda and EpsilonPrime respectively:
DisableZhangLiTorque = true
J = vector(1e12, 0, 0)
Pol = 0.6
FixedLayer = vector(1,0,0)
FixedLayerPosition = FIXEDLAYER_TOP
EpsilonPrime = 0.02
Lambda = 1

{{range .FilterName "epsilonprime" "Lambda" "Pol" "xi" "J" "FreeLayerThickness" "fixedlayer" "fixedlayerposition" "fixedlayer_top" "fixedlayer_bottom" "DisableSlonczewskiTorque" "DisableZhangLiTorque" }} {{template "entry" .}} {{end}}

Magnetic Force Microscopy

mumax3 has built-in generation of MFM images from a 2D magnetization. The MFM tip lift can be freely chosen. By default, the tip magnetization is modeled as a point monopole at the apex. This is sufficient for most situations. Nevertheless, it is also possible to model partially magnetized tips by setting MFMDipole to the magnetized portion of the tip, in meters. E.g., if only the first 20nm of the tip is (vertically) magnetized, set MFMDipole=20e-9.


{{range .FilterPrefix "MFM"}} {{template "entry" .}} {{end}}

Output quantities

The quantities listed below can be output.
Also, derived quantities can be produced: the quantity restricted to a certain region or a single component. E.g.:
m           // magnetization quantity
m.Comp(0)   // x-component
m.Region(1) // magnetization in region 1 (0 elsewhere)

Averaging behavior

.Average() yields the average over the entire simulation grid, except for m which is always averaged over the geometry.
For vector quantities, an average over the magnet geometry can still be obtained with the .Comp() method. E.g.:
B_demag.Average()          // Average vector over entire simulation grid
B_demag.Comp(1).Average()  // Average y-component over geometry
m.Average()                // Average magnetization over geometry

{{range .FilterType "engine.ScalarField" "engine.VectorField" "*engine.geom" "*engine.thermField" "*engine.ScalarValue" "*engine.VectorValue" "*engine.magnetization"}} {{template "entry" .}} {{end}}

Slicing and dicing output

To save storage space, it's possible to save only the part of the output we're interested in. This works on all output quantities (not only m).
save(m)                         // save full magnetization
save(m.Comp(0))                 // save only x-component
save(CropLayer(m, 13))          // save only layer 13
save(CropLayer(m.Comp(0), 13))  // save only x-component of layer 13
Or even:
mx   := m.Comp(0)
mx13 := CropLayer(mx, 13) 
save(mx13)
tableAdd(mx13)

{{range .FilterName "Crop" "CropX" "CropY" "CropZ" "CropLayer" "CropRegion"}} {{template "entry" .}} {{end}}

Scheduling output

All input and output quantities (as described above) can be saved in a space-dependent way (.ovf file), or as spatial averages (table output). The data table (table.txt) contains by default the time and average magnetization. More columns can be added with TableAdd().
save(B_ext)

tableadd(B_ext)
tablesave()
Optionally, the output/averaging can be done over a single region:
save(m.Region(1))
TableAdd(m.Region(1)) 
User-defined variables can be added to the table with TableAddVar().
myField := 0.42
TableAddVar(myField, "B_extra", "T")
myField = ...

{{range .FilterName "dump" "tableadd" "tableaddvar" "tablesave" "tableautosave" "save" "saveas" "autosave" "snapshot" "snapshotas" "snapshotformat" "autosnapshot" "filenameformat" "outputformat" "ovf1_text" "ovf1_binary" "ovf2_text" "ovf2_binary" "TablePrint" "FPrintln" "Sprint" "Sprintf" "Print" "Flush"}} {{template "entry" .}} {{end}}

Running

Run(time) runs the simulation for a given time in seconds, using sensible error settings.
Run(1e-9)
More fine-grained control is provided by RunWhile(condition), which runs as long as an arbitrary condition is met. E.g.:
mx := m.comp(0)
RunWhile(mx.average() < 0)   // search for switching field during reversal
Optionally, the solver accuracy may be fine-tuned. E.g.:
MaxDt = 1e-12
MinDt = 1e-15
MaxErr = 1e-6
Optionally, a different solver may be chosen (at any point) with SetSolver(int). Currently available solver types:
  • 6: RK56 (Fehlberg) solver. This is the highest order solver available, but it is typically not faster than the RK45 solver.
  • 5: RK45 (Dormand-Prince) solver (the default). An accurate solver, very fast for magnetization dynamics at the cost of some memory usage.
  • 4: Classical 4th-order Runge-Kutta method. Intended for simulations where a fixed, relatively large time step is desired.
  • 3: RK23 (Bogacki-Shampine) solver. A robust and reasonably fast solver with low memory requirements. Typically outperforms RK45 when relaxing the magnetization with little dynamics, so it is used internally by Relax().
  • 2: Adaptive Heun solver. Robust and uses very little memory but takes smaller time steps than the higher-order solvers. Also suited when a fixed, relatively small time step is desired.
  • 1: Euler solver (requires FixDt = ..., ignores other settings). Only useful in exceptional situations or for debugging.
E.g.:
SetSolver(2) // Heun
FixDt = 1e-15

Relax

Relax() tries to evolve the magnetization as closely as possible to the minimum energy state. This function assumes all excitations have been turned off (temperature, electrical current, time-dependent magnetic fields). During relax precession is disabled and the time t does not increase. There is no need to set high damping.

In general it is difficult to be sure the minimum energy state has been truly reached. Hence, relax may occasionally return after the energy has reached a local minimum, a saddle point, or a rather flat valley in the energy landscape.

Minimize

Minimize() is like Relax, but uses the conjugate gradient method to find the energy minimum. It is usually much faster than Relax, but is a bit less robust against divergence. E.g., a random starting configuration can be Relaxed, but may fail with Minimize. Minimize is very well suited for hysteresis calculations, where we are never far away from the ground state.


{{range .FilterName "run" "steps" "runwhile" "relax" "minimize"}} {{template "entry" .}} {{end}} {{range .FilterName "t" "dt" "MinDt" "MaxDt" "FixDt" "HeadRoom" "MaxErr" "step" "NEval" "peakErr" "lastErr" "minimizerstop" "minimizersamples" "relaxtorquethreshold"}} {{template "entry" .}} {{end}} {{range .FilterName "SetSolver"}} {{template "entry" . }} {{end}}

Moving simulation window

mumax3 can automatically shift the magnetization so that the simulation "window" stays centered on a region of interest. Shifting is done to keep a freely chosen magnetization component nearly zero. E.g.
ext_centerwall(0)
ext_rmSurfaceCharge(0, -1, 1)
TableAdd(TotalShift)
will try to keep mx (component 0, counting from 0) close to zero. If desired, one can override which "new" magnetization is inserted from the sides by setting ShiftMagL and ShiftMagR, though the default behaviour is usually OK.
{{range .FilterName "shift" "totalshift"}} {{template "entry" .}} {{end}} {{range .FilterName "ext_centerwall" "ext_rmSurfaceCharge" "ext_centerbubble" "ext_centerwallinlayer" "ext_centerwallinregion"}} {{template "entry" .}} {{end}}
{{range .FilterName "shiftgeom" "shiftm" "shiftregions" "shiftmagl" "shiftmagr" "shiftmagd" "shiftmagu" "edgecarryshift"}} {{template "entry" .}} {{end}}

Extensions

Extensions are extra functionalities that are not officially supported. They are aimed at rather specific problems and may not work as expected for your particular situation. Their API and functionality may change in future releases.
{{range .FilterPrefix "ext_"}} {{template "entry" .}} {{end}}

Custom quantities

Using existing quantities, it is possible to define new custom quantities. E.g.: instead of using the pre-defined ext_topologicalchargedensity quantity, it is possible to define this quantity yourself inside an input script:
cs := 1e-9
setcellsize(cs,cs,cs)
setgridsize(64,64,1)

// Use central finite differences to approximate the spatial derivatives of m
mL := Shifted(m,-1,0,0) // shift left
mR := Shifted(m,1,0,0)  // shift right
mD := Shifted(m,0,-1,0) // shift down
mU := Shifted(m,0,1,0)  // shift up
dmdx := Mul( Const(1/(2*cs)), Madd(mR,mL,1,-1) )
dmdy := Mul( Const(1/(2*cs)), Madd(mU,mD,1,-1) ) 

// Define the topological charge density
chargeDensity := Mul( Const(1/(4*pi)), Dot(m, Cross(dmdx,dmdy)))

// Save the topological charge density of a skyrmion
m = neelskyrmion(1,-1)
saveas(chargeDensity, "chargeDensity.ovf")

{{range .FilterName "Add" "Const" "ConstVector" "Cross" "Div" "Dot" "MAdd" "Masked" "Mul" "MulMV" "Normalized" "Shifted" "RunningAverage" "Sum" "SumVector"}} {{template "entry" .}} {{end}}

Custom effective field terms

It is possible to define additional effective field terms by promoting a custom quantity to an effective field term. The corresponding energy density term can also be added by promoting a custom quantity. E.g.: instead of using the existing anisotropy field in mumax3, you could define the uniaxial anisotropy field (and the corresponding energy density) yourself:

Ms := 1100e3
K  := 0.5e6
u  := ConstVector(1, 0, 0)
anisField := Mul( Const(2*K/Ms)  , Mul( Dot(u, m), u))
anisEdens := Mul( Const(-0.5*Ms) , Dot( anisField, m))

AddFieldTerm(anisField) // promote anisField to an effective field term
AddEdensTerm(anisEdens) // promote anisEdens to an energy density term

tableAdd(E_custom)  // Add a column with the energy related to the custom field

{{range .FilterName "AddFieldTerm" "AddEdensTerm" "RemoveCustomFields" "RemoveCustomEnergies" "B_custom" "E_custom" "Edens_custom" }} {{template "entry" .}} {{end}}

Math

Mathematical functions from the Go standard library.
{{range .FilterReturn "float64" "int" "bool"}} {{template "entry" .}} {{end}} {{range .FilterName "pi" "inf"}} {{template "entry" .}} {{end}} {{range .FilterName "true" "false"}} {{template "entry" .}} {{end}}

Misc

Other available functions.
{{range .FilterLeftovers}} {{template "entry" .}} {{end}}
{{range .All }} {{template "entry" .}} {{end}}
3-3.11.1/doc/templates/download-template.html000066400000000000000000000164441503346766200210350ustar00rootroot00000000000000 {{.Include "head.html"}} {{.Include "header.html"}}

Prerequisites

To run mumax3.11, you need:
  • An NVIDIA GPU with a compute capability of at least 5.0.
  • An up-to-date NVIDIA driver (compatible driver versions are listed below).
  • Optional: gnuplot for plots in the web GUI.
To check your installed driver version and GPU compute capability, run the following command in your terminal:
nvidia-smi --query-gpu="driver_version,compute_cap" --format="csv"

Download and installation

Select the platform, NVIDIA driver and GPU compute capability (CC) for which you want to download mumax3.
Note: the CUDA versions shown in parentheses are informative. If you cannot run the mumax3 executable, check whether your GPU is supported by that CUDA version and downgrade mumax3 if necessary.

After downloading and unpacking the archive, you will have a mumax3 executable which is ready to be used. Note that mumax3 is a command line application, so it is a good idea to add the directory containing the mumax3 executable to the PATH environment variable.

Alternative: building mumax3 from source

The source code of mumax3, with build instructions for Linux and Windows, can be found on GitHub.

3-3.11.1/doc/templates/examples-template.html000066400000000000000000000453261503346766200210450ustar00rootroot00000000000000 {{.Include "head.html"}} {{.Include "header.html"}}

mumax 3.11 examples

These are example input scripts, the full API can be found here.
A more in-depth tutorial with video recordings can be found here.

mumax3 input files are run with the command
mumax3 myfile.mx3
Output is automatically stored in the "myfile.out" directory. Additionally, a web interface provides live output. Default is http://localhost:35367.
For more details, run mumax3 -help which will show the available command-line flags (e.g. to select a certain GPU).

    Getting started with Standard Problem #4

    Let's start with the classic mumag standard problem 4, as defined here. {{.Example ` SetGridsize(128, 32, 1) SetCellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 alpha = 0.02 m = uniform(1, .1, 0) relax() save(m) // relaxed state autosave(m, 200e-12) tableautosave(10e-12) B_ext = vector(-24.6E-3, 4.3E-3, 0) run(1e-9) `}}

    This example should be pretty straight-forward to follow. Space-dependent output is stored in OVF format, which is compatible with OOMMF and can be converted with mumax3-convert. Below is the output converted to PNG.

    The data table is stored in a simple text format compatible with gnuplot, like used for the plot below.

    {{.Output }}

    Standard Problem #2

    Using the scripting language explained above, relatively complex input files can be easily defined. E.g. micromagnetic standard problem #2 specifies the simulation size in exchange lengths. The script below calculates the exchange length and chooses cells not larger than 0.75 exchange lengths so that the number of cells is a power of two (for best performance). {{.Example ` Msat = 1000e3 Aex = 10e-12 // define exchange length lex := sqrt(10e-12 / (0.5 * mu0 * pow(1000e3 ,2))) d := 30 * lex // we test for d/lex = 30 Sizex := 5*d // magnet size x Sizey := 1*d Sizez := 0.1*d nx := pow(2, ilogb(Sizex / (0.75*lex))) // power-of-two number of cells ny := pow(2, ilogb(Sizey / (0.75*lex))) // not larger than 0.75 exchange lengths SetGridSize(nx, ny, 1) SetCellSize(Sizex/nx, Sizey/ny, Sizez) m = Uniform(1, 0.1, 0) // initial mag relax() save(m) // remanent magnetization print(" for d/lex=30: ", m.average()) `}} {{.Output}} This example saves and prints the remanent magnetization state so we can verify it against known values.

    Hysteresis

    Below is an example of a hysteresis loop where we step the applied field in small increments and find the magnetization ground state after each step. Minimize() finds the ground state using the conjugate gradient method, which is very fast. However, this method might fail on very high energy initial states like a random magnetization. In that case, Relax() is more robust (albeit much slower). {{.Example ` SetGridsize(128, 32, 1) SetCellsize(4e-9, 4e-9, 30e-9) Msat = 800e3 Aex = 13e-12 m = randomMag() relax() // high-energy states best minimized by relax() Bmax := 100.0e-3 Bstep := 1.0e-3 MinimizerStop = 1e-6 TableAdd(B_ext) for B:=0.0; B<=Bmax; B+=Bstep{ B_ext = vector(B, 0, 0) minimize() // small changes best minimized by minimize() tablesave() } for B:=Bmax; B>=-Bmax; B-=Bstep{ B_ext = vector(B, 0, 0) minimize() // small changes best minimized by minimize() tablesave() } for B:=-Bmax; B<=Bmax; B+=Bstep{ B_ext = vector(B, 0, 0) minimize() // small changes best minimized by minimize() tablesave() } `}} {{.OutputHysteresis}}

    Geometry

    mumax3 has a powerful API to programmatically define geometries. A number of primitive shapes are defined, like ellipses, rectangles, etc. They can be transformed (rotated, translated) and combined using boolean logic (add, sub, inverse). All positions are specified in meters and the origin lies in the center of the simulation box. See the full API. Edges can be smoothed to reduce staircase effects. EdgeSmooth=n means n³ samples per cell are used to determine its volume. EdgeSmooth=0 implies a staircase approximation, while EdgeSmooth=8 results in quite accurately resolved edges. {{.Example ` SetGridsize(100, 100, 50) SetCellsize(1e-6/100, 1e-6/100, 1e-6/50) EdgeSmooth = 8 setgeom( rect(800e-9, 500e-9) ) saveas(geom, "rect") setgeom( cylinder(800e-9, inf) ) saveas(geom, "cylinder") setgeom( circle(200e-9).repeat(300e-9, 400e-9, 0) ) saveas(geom, "circle_repeat") setgeom( cylinder(800e-9, inf).inverse() ) saveas(geom, "cylinder_inverse") setgeom( cylinder(800e-9, 600e-9).transl(200e-9, 100e-9, 0) ) saveas(geom, "cylinder_transl") setgeom( ellipsoid(800e-9, 600e-9, 500e-9) ) saveas(geom, "ellipsoid") setgeom( cuboid(800e-9, 600e-9, 500e-9) ) saveas(geom, "cuboid") setgeom( cuboid(800e-9, 600e-9, 500e-9).rotz(-10*pi/180) ) saveas(geom, "cuboid_rotZ") setgeom( layers(0, 25) ) saveas(geom, "layers") setgeom( cell(50, 20, 0) ) saveas(geom, "cell") setgeom( xrange(0, inf) ) saveas(geom, "xrange") a := cylinder(600e-9, 600e-9).transl(-150e-9, 50e-9, 0 ) b := rect(600e-9, 600e-9).transl(150e-9, -50e-9, 0) setgeom( a.add(b) ) saveas(geom, "logicAdd") setgeom( a.sub(b) ) saveas(geom, "logicSub") setgeom( a.intersect(b) ) saveas(geom, "logicAnd") setgeom( a.xor(b) ) saveas(geom, "logicXor") setgeom( imageShape("mask.png") ) saveas(geom, "imageShape") `}} {{.Output}} Note: these are 3D geometries seen from above. The displayed cell filling is averaged along the thickness (notable in the ellipsoid and layers examples). Black means empty space, white is filled.

    Initial Magnetization

    Some initial magnetization functions are provided, as well as transformations similar to those on Shapes. See the Config API. {{.Example ` setgridsize(256, 128, 1) setcellsize(5e-9, 5e-9, 5e-9) m = Uniform(1, 1, 0) // no need to normalize length saveas(m, "uniform") m = Vortex(1, -1) // circulation, polarization saveas(m, "vortex") m = TwoDomain(1,0,0, 0,1,0, -1,0,0) // Néel wall saveas(m, "twodomain") m = RandomMag() saveas(m, "randommag") m = TwoDomain(1,0,0, 0,1,0, -1,0,0).rotz(-pi/4) saveas(m, "twodomain_rot") m = VortexWall(1, -1, 1, 1) saveas(m, "vortexwall") m = VortexWall(1, -1, 1, 1).scale(1/2, 1, 1) saveas(m, "vortexwall_scale") m = Vortex(1,-1).transl(100e-9, 50e-9, 0) saveas(m, "vortex_transl") m = Vortex(1,-1).Add(0.1, randomMag()) saveas(m, "vortex_add_random") m = BlochSkyrmion(1, -1).scale(3,3,1) saveas(m, "Bloch_skyrmion") m = NeelSkyrmion(1,-1).scale(3,3,1) saveas(m, "Néel_skyrmion") // set m in only a part of space, or a single cell: m = uniform(1, 1, 1) m.setInShape(cylinder(400e-9, 100e-9), vortex(1, -1)) m.setCell(20, 10, 0, vector(0.1, 0.1, -0.9)) // set in cell index [20,10,0] saveas(m, "setInShape_setCell") //Read m from .ovf file. m.loadfile("myfile.ovf") saveas(m, "loadfile") `}} {{.Output}} These initial states are approximate, after setting them it is a good idea to relax the magnetization to the actual ground state. The magnetization can also be set in separate regions, see below.

    Interlude: Rotating Cheese

    In this example we define a geometry that looks like a slice of cheese and have it rotate in time. {{.Example ` setgridsize(128, 128, 1) setcellsize(2e-9, 2e-9, 2e-9) d := 200e-9 sq := rect(d, d) // square with side d h := 50e-9 hole := cylinder(h, h) // circle with diameter h hole1 := hole.transl(100e-9, 0, 0) // translated circle #1 hole2 := hole.transl(0, -50e-9, 0) // translated circle #2 cheese:= sq.sub(hole1).sub(hole2)// subtract the circles from the square (makes holes). setgeom(cheese) msat = 600e3 aex = 12e-13 alpha = 3 // rotate the cheese. for i:=0; i<=90; i=i+30{ angle := i*pi/180 setgeom(cheese.rotz(angle)) m = uniform(cos(angle), sin(angle), 0) minimize() save(m) } `}} {{.Output}}

    Regions: Space-dependent Parameters

    Space-dependent parameters are defined using material regions. Regions are numbered 0-255 and represent different materials. Each cell can belong to only one region. At the start of a simulation all cells have region number 0.

    Regions are defined with defregion(number, shape), where shape is explained in the geometry example.

    When you're not using regions, like in the above examples, you'll probably set parameters with a simple assign:

    Aex = 12e-13
    Behind the scenes, this sets Aex in all regions.

    It's always a good idea to output the regions quantity, as well as all your material parameters.

    {{.Example ` N := 128 setgridsize(N, N, 1) c := 4e-9 setcellsize(c, c, c) // disk with different anisotropy in left and right half setgeom(circle(N*c)) defregion(1, xrange(0, inf)) // right half defregion(2, xrange(-inf, 0)) // left half save(regions) Ku1.setregion(1, .1e6) anisU.setRegion(1, vector(1, 0, 0)) Ku1.setregion(2, .2e6) anisU.setRegion(2, vector(0, 1, 0)) save(Ku1) save(anisU) Msat = 800e3 // sets it everywhere save(Msat) Aex = 12e-13 alpha = 1 m.setRegion(1, uniform(1, 1, 0)) m.setRegion(2, uniform(-1, 1, 0)) saveas(m, "m_inital") run(.1e-9) saveas(m, "m_final") `}} {{.Output}}

    Slicing and dicing output

    The example below illustrates how to save only the part of the output you're interested in. {{.Example ` Nx := 256 Ny := 256 Nz := 1 setgridsize(Ny, Nx, Nz) c := 4e-9 setcellsize(c, c, c) setgeom(circle(Nx*c)) Msat = 800e3 Aex = 12e-13 alpha = 1 m = vortex(1, 1) save(m) save(m.Comp(0)) save(Crop(m, 0, Nx/2, 0, Ny/2, 0, Nz)) mx := m.Comp(0) mx_center := CropY(mx, Ny/4, 3*Ny/4) save(mx_center) `}} {{.Output}}

    Magnetic Force Microscopy

    Mumax3 has built-in generation of MFM images from the magnetization. The MFM tip lift can be freely chosen. By default the tip magnetization is modeled as a point monopole at the apex. This is sufficient for most situations. Nevertheless, it is also possible to model partially magnetized tips by setting MFMDipole to the magnetized portion of the tip, in meters. E.g., if only the first 20nm of the tip is (vertically) magnetized, set MFMDipole=20e-9.

    {{.Example ` setgridsize(256, 256, 1) setcellsize(2e-9, 2e-9, 1e-9) setgeom(rect(400e-9, 400e-9)) msat = 600e3 aex = 10e-12 m = vortex(1, 1) relax() save(m) MFMLift = 10e-9 saveas(MFM, "lift_10nm") MFMLift = 40e-9 saveas(MFM, "lift_40nm") MFMLift = 90e-9 saveas(MFM, "lift_90nm") `}} {{.Output}}

    PMA Racetrack

    In this example we drive a domain wall in a PMA material by spin-transfer torque. We set up a post-step function that makes the simulation box "follow" the domain wall. This way, only a small number of cells is needed to simulate an infinitely long magnetic wire. {{.Example ` setGridSize(128, 128, 1) setCellSize(2e-9, 2e-9, 1e-9) Msat = 600e3 Aex = 10e-12 anisU = vector(0, 0, 1) Ku1 = 0.59e6 alpha = 0.02 Xi = 0.2 m = twoDomain(0, 0, 1, 1, 1, 0, 0, 0, -1) // up-down domains with wall between Bloch and Néel type relax() // Set post-step function that centers simulation window on domain wall. ext_centerWall(2) // keep m[2] (= m_z) close to zero // Schedule output autosave(m, 100e-12) // Run for 0.5ns with current through the sample j = vector(1.5e13, 0, 0) pol = 1 run(.5e-9) `}} {{.Output}} Since we center on the domain wall we cannot see that it is actually moving, but the domain wall breakdown is visible.

    Py Racetrack

    In this example we drive a vortex wall in Permalloy by spin-transfer torque. The simulation box "follows" the domain wall. By removing surface charges at the left and right ends, we mimic an infinitely long wire. {{.Example ` setGridSize(256, 64, 1) setCellSize(3e-9, 3e-9, 10e-9) Msat = 860e3 Aex = 13e-12 Xi = 0.1 alpha = 0.02 m = twodomain(1,0,0, 0,1,0, -1,0,0) notches := rect(15e-9, 15e-9).RotZ(45*pi/180).Repeat(200e-9, 64*3e-9, 0).Transl(0, 32*3e-9, 0) setGeom(notches.inverse()) // Remove surface charges from left (mx=1) and right (mx=-1) sides to mimic infinitely long wire. We have to specify the region (0) at the boundaries. BoundaryRegion := 0 MagLeft := 1 MagRight := -1 ext_rmSurfaceCharge(BoundaryRegion, MagLeft, MagRight) relax() ext_centerWall(0) // keep m[0] (m_x) close to zero // Schedule output autosave(m, 50e-12) tableadd(ext_dwpos) // domain wall position tableautosave(10e-12) // Run the simulation with current through the sample pol = 0.56 J = vector(-10e12, 0, 0) Run(0.5e-9) `}} {{.Output}} Since we center on the domain wall we cannot really see the motion, despite the vortex wall moving pretty fast. Note the absence of closure domains at the edges due to the surface charges being removed there.

    Voronoi tessellation

    In this example we use regions to specify grains in a material. The built-in extension ext_makegrains is used to define grain-like regions using Voronoi tessellation. We vary the material parameters in each grain. {{.Example ` N := 256 c := 4e-9 d := 40e-9 setgridsize(N, N, 1) setcellsize(c, c, d) setGeom(circle(N*c)) // define grains with region number 0-255 grainSize := 40e-9 // m randomSeed := 1234567 maxRegion := 255 ext_makegrains(grainSize, maxRegion, randomSeed) defregion(256, circle(N*c).inverse()) // region 256 is outside, not really needed alpha = 3 Kc1 = 1000 Aex = 13e-12 Msat = 860e3 // set random parameters per region for i:=0; i

    RKKY

    Scaling the exchange coupling between regions can be used to obtain antiferromagnetic coupling like the RKKY interaction. In that case we only model the magnetic layers and do not explicitly add a spacer layer (which is negligibly thin). We scale the exchange coupling to get the desired RKKY strength: scale = (RKKY * cellsize_z) / (2 * Aex). {{.Example ` N := 10 setgridsize(N, N, 2) c := 1e-9 setcellsize(c, c, c) defRegion(0, layer(0)) defRegion(1, layer(1)) Msat = 1e6 Aex = 10e-12 RKKY := -1e-3 // 1mJ/m2 scale := (RKKY * c) / (2 * Aex.Average()) ext_scaleExchange(0, 1, scale) tableAdd(E_total) m.setRegion(0, uniform(1, 0, 0)) for ang:=0; ang<360; ang++{ m.setRegion(1, uniform(cos(ang*pi/180), sin(ang*pi/180), 0)) t = ang * 1e-9 // output "time" is really angle tablesave() } `}} {{.Output}}

    Slonczewski STT

    Example of a spin-torque MRAM stack consisting of a fixed layer, spacer and free layer. Only the free layer magnetization is explicitly modeled, so we use a 2D grid. The fixed layer polarization is set with FixedLayer = ..., which can be space-dependent. The spacer layer properties are modeled by setting the parameters Lambda and EpsilonPrime. Finally Pol sets the current polarization and J the current density, which should be along z in this case. Below we switch an MRAM bit. {{.Example ` // geometry sizeX := 160e-9 sizeY := 80e-9 sizeZ := 5e-9 Nx := 64 Ny := 32 setgridsize(Nx, Ny, 1) setcellsize(sizeX/Nx, sizeY/Ny, sizeZ) setGeom(ellipse(sizeX, sizeY)) // set up free layer Msat = 800e3 Aex = 13e-12 alpha = 0.01 m = uniform(1, 0, 0) // set up spacer layer parameters lambda = 1 Pol = 0.5669 epsilonprime = 0 // set up fixed layer polarization angle := 20 px := cos(angle * pi/180) py := sin(angle * pi/180) fixedlayer = vector(px, py, 0) // send current Jtot := -0.008 // total current in A area := sizeX*sizeY*pi/4 jc := Jtot / area // current density in A/m2 J = vector(0, 0, jc) // schedule output & run autosave(m, 100e-12) tableautosave(10e-12) run(1e-9) `}} {{.Output}}

    Spinning hard disk

    Using the Shift function, we can shift the system (magnetization, regions and geometry) by a given number of cells. Here we use this feature to simulate a moving hard disk platter. A time-dependent gaussian field profile mimics the write field. {{.Example ` Nx := 512 Ny := 128 c := 5e-9 setgridsize(Nx, Ny, 1) setcellsize(c, c, c) ext_makegrains(30e-9, 256, 0) // PMA material Ku1 = 0.4e6 Aex = 10e-12 Msat = 600e3 alpha = 1 delta := 0.2 // anisotropy variation for i:=0; i<256; i++{ // random cubic anisotropy direction AnisU.SetRegion(i, vector(delta*(rand()-0.5), delta*(rand()-0.5), 1)) // strongly reduce exchange coupling between grains for j:=i+1; j<256; j++{ ext_scaleExchange(i, j, 0.1) } } m = uniform(0, 0, 1) // Gaussian external field profile mask := newVectorMask(Nx, Ny, 1) for i:=0; i 3-3.11.1/doc/templates/head.html000066400000000000000000000011351503346766200163050ustar00rootroot00000000000000 mumax3 3-3.11.1/doc/templates/header.html000066400000000000000000000015741503346766200166430ustar00rootroot00000000000000


    3-3.11.1/doc/templates/headerpage-template.html000066400000000000000000000001421503346766200212770ustar00rootroot00000000000000 {{.Include "head.html"}} {{.Include "header.html"}} 3-3.11.1/doc/templates/index-template.html000066400000000000000000000077531503346766200203400ustar00rootroot00000000000000 {{.Include "head.html"}} {{.Include "header.html"}}

    mumax3 is a GPU-accelerated micromagnetic simulation program developed and maintained at the DyNaMat group at Ghent University.

    A speed-up of the order of 100x compared to CPU-based simulations can easily be reached, even with relatively inexpensive gaming GPUs. Additionally, the software is optimized for low memory use and can handle about 16 million FD cells with 2GB of GPU RAM.

    Citations and licence

    If you use mumax3 in any work or publication, we kindly ask you to cite the references suggested for your specific simulation in the terminal window and in the "references.bib" file found in the output folder.

    mumax3 is open-source software. You are free to modify and distribute the source code under the GPLv3 licence.

    Web interface showing the spatial magnetization.

    Features

    • Landau-Lifshitz micromagnetic formalism
    • Magnetostatic field
    • Heisenberg exchange
    • Arbitrary inter-region exchange like RKKY coupling
    • Dzyaloshinskii-Moriya interaction
    • Spin-transfer torque (Zhang-Li and Slonczewski)
    • Uniaxial and cubic magnetocrystalline anisotropy
    • Thermal fluctuations (Brown)
    • Voronoi tessellation
    • Time- and space-dependent material parameters
    • Arbitrary complex excitation (field, current)
    • Simulation window can automatically follow a moving domain wall
    • Edge charges can be removed to simulate an infinitely long geometry
    • Optional 1D, 2D or 3D periodic boundary conditions

    Web GUI

    mumax3 includes a browser-based user interface that lets you follow a running simulation or modify it on-the-fly, be it on your local machine or remotely.

    Simple scripting

    mumax3 provides simple yet powerful input scripting.
    E.g., the following example applies a time-dependent external field to a uniform magnet (FMR experiment).

    SetGridSize(128, 32, 4)
    SetCellSize(5e-9, 5e-9, 5e-9)
    Msat  = 860e3
    Aex   = 13e-12
    alpha = 0.2
    m = Uniform(1, 1, 0)
    
    f := 1e9  // 1GHz
    A := 0.01 // 10mT
    B_ext = Vector(0.1, A*sin(2*pi*f*t), 0)
    
    run(10e-9)
    
    Web interface can view and set parameters on-the-fly.

    GPU/driver requirements

    mumax3 is cross-platform and runs on Linux, Windows and Mac platforms. You need an NVIDIA GPU with compute capability 5.0 or higher, as listed here. You also need to use NVIDIA's proprietary graphics driver, which may already be installed on your system. The benchmark below may guide your GPU choice.

    mumax3 GPU performance for 2D simulations containing 4 million cells.

    3-3.11.1/doc/tex/000077500000000000000000000000001503346766200133205ustar00rootroot000000000000003-3.11.1/doc/tex/.gitignore000066400000000000000000000000141503346766200153030ustar00rootroot00000000000000*.aux *.pdf 3-3.11.1/doc/tex/Makefile000066400000000000000000000002761503346766200147650ustar00rootroot00000000000000mumax3.pdf: mumax3.tex pdflatex -halt-on-error mumax3.tex pdflatex -halt-on-error mumax3.tex .PHONY: clean clean: rm -f *.aux *.bbl *.blg *.ind *.ilg *.log *.toc *.out mumax3.pdf *.idx 3-3.11.1/doc/tex/mumax3.tex000066400000000000000000000067511503346766200152650ustar00rootroot00000000000000\documentclass[12pt]{article} \usepackage{a4wide} \usepackage{amsmath} \newcommand{\vc}[1]{\ensuremath{\vec{\textbf{#1}}}} \newcommand{\ofrt}{\ensuremath{\left(\vc{r},t \right)}} \newcommand{\m}{\vc{m}} \newcommand{\M}{\vc{M}} \newcommand{\Ms}{M_\mathrm{s}} \newcommand{\B}[1]{\vc{B}_\mathrm{#1}} \newcommand{\Beff}{\B{eff}} \newcommand{\tq}[1]{\vc{\ensuremath{\tau}}\ensuremath{_\mathrm{#1}}} \newcommand{\damp}{\ensuremath{\alpha}} \newcommand{\Kern}{\vec{\vec{\textbf{K}}}} \newcommand{\FFT}{\mathcal{F}} \newcommand{\hspin}{(\vc{u}\cdot\nabla)\vc{m}} \begin{document} \pagestyle{empty} We solve: \begin{eqnarray*} \frac{\partial \m}{\partial t} &=& \gamma_0 \left( \tq{LL} + \tq{STT} \right) \end{eqnarray*} \vspace{3cm} With boundary conditions: \begin{eqnarray*} \left.\frac{\partial m_x}{\partial x}\right|_{\partial V} &=& -\frac{D}{2A}m_z \\ \left.\frac{\partial m_y}{\partial x}\right|_{\partial V} &=& 0\\ \left.\frac{\partial m_z}{\partial x}\right|_{\partial V} &=& \frac{D}{2A}m_x\\ \left.\frac{\partial m_x}{\partial y}\right|_{\partial V} &=& 0\\ \left.\frac{\partial m_y}{\partial y}\right|_{\partial V} &=& -\frac{D}{2A}m_z \\ \left.\frac{\partial m_z}{\partial y}\right|_{\partial V} &=& \frac{D}{2A}m_y\\ \left.\frac{\partial m_x}{\partial z}\right|_{\partial V} &=& 0\\ \left.\frac{\partial m_y}{\partial z}\right|_{\partial V} &=& 0\\ \left.\frac{\partial 
m_z}{\partial z}\right|_{\partial V} &=& 0\\ \end{eqnarray*} Where: \begin{eqnarray*} \tq{LL} &=& \frac{1}{1+\damp^2} \left( \m \times \Beff +\damp\left( \m \times \left( \m \times \Beff \right)\right) \right)\\ \alpha &=& \alpha\ofrt \\ \m &=& \frac{\M\ofrt}{\Ms} \\ \Ms &=& {\left| \M\ofrt\right|} \\ \Beff &=& \B{d} + \B{ex} + \B{z} + \B{a} + \B{th} \\ \B{d}\ofrt &=& \iiint_V \Kern(\vc{r} - \vc{r}') \cdot \mu_0 \M(\vc{r'}, t) \mathrm{d}^3\vc{r}' \\ & = & \FFT^{-1} \left( \FFT\left(\Kern(\vc{r})\right) \cdot \FFT\left({\mu_0\M\ofrt}\right) \right) \\ \vc{K}_i(\vc{r}) & = & \frac{1}{4\pi}\left(\frac{3(\vc{e}_i\cdot\hat{\vc{r}})\hat{\vc{r}}-\vc{e}_i}{r^3}\right) + \frac{2}{3}\vc{e}_i\delta^3(\vc{r}) \\ \B{ex} &=& \frac{2 A}{\Ms} \Delta \m + \frac{2D}{\Ms} \left(\frac{\partial m_z}{\partial x},\ \frac{\partial m_z}{\partial y},\ -\frac{\partial m_x}{\partial x}-\frac{\partial m_y}{\partial y}\right) \\ A &=& A\ofrt \\ \B{z} &=& \B{z}\ofrt \\ \B{a} &=& \B{u} + \B{c} \\ \B{u} &=& 2 K_\mathrm{u1} \left( \m \cdot \vc{u} \right) \vc{u} \\ K_\mathrm{u1} &=& K_\mathrm{u1}\ofrt\\ \vc{u} &=& \vc{u}\ofrt \\ \vc{B}_{\mathrm{c}i} &=& \left( A_{cx} c_{1i} + A_{cy} c_{2i} + A_{cz} c_{3i} \right) \\ \vc{A}_{c} &=& K_{c1} \left(a_1(a_2^2+a_3^2),\ a_2(a_1^2+a_3^2),\ a_3(a_1^2+a_2^2)\right)\\ a_i &=& \vc{c}_i \cdot \m \\ \vc{c}_1 &=& \vc{c}_1 \ofrt \\ \vc{c}_2 &=& \vc{c}_2 \ofrt \\ \vc{c}_3 &=& \vc{c}_1 \times \vc{c}_2 \\ \B{th}\ofrt &=& \eta \ofrt \sqrt{ \frac{k_B^2 \alpha T} {\mu_0\gamma_0\Ms \Delta V \Delta t }} \\ \tq{STT} &=& \tq{ZL} + \tq{SL} \\ \tq{ZL} &=& \frac{1}{1+\alpha^2} \left( \left(1+\xi\alpha\right) \m \times \left(\m \times \hspin \right) + \left(\xi-\alpha\right)\vc{m}\times \hspin \right) \\ \vc{u} &=& \frac{\mu_B \mu_0}{2 e \gamma_0 B_\mathrm{s} (1 + \xi^2)} \vc{j}\\ \vc{j} &=& \vc{j}\ofrt \\ \tq{SL} &=& \beta\epsilon (\m \times \m_P \times \m) - \beta\epsilon' \m\times \m_P \\ \beta &=& \frac{j_z \hbar}{ \Ms e d} \\ \epsilon &=& \frac{P\ofrt \Lambda^2}{(\Lambda^2 + 
1)+ (\Lambda^2-1)(\m\cdot\m_P)} \\ \epsilon' &=& \epsilon'\ofrt\\ \end{eqnarray*} \end{document} 3-3.11.1/draw/000077500000000000000000000000001503346766200127105ustar00rootroot000000000000003-3.11.1/draw/Makefile000066400000000000000000000000241503346766200143440ustar00rootroot00000000000000all: go install -v 3-3.11.1/draw/arrows.go000066400000000000000000000041301503346766200145520ustar00rootroot00000000000000package draw import ( "github.com/mumax/3/data" "github.com/mumax/3/freetype/raster" "image" "image/color" "math" ) func drawArrows(img *image.RGBA, arr [3][][][]float32, sub int) { c := NewCanvas(img) Na := data.SizeOf(arr[0]) // number of arrows h := Na[Y] // orignal image height Na[X] = imax(Na[X]/sub, 1) Na[Y] = imax(Na[Y]/sub, 1) Na[Z] = 1 small := data.Downsample(arr[:], Na) S := float32(sub) for iy := 0; iy < Na[Y]; iy++ { Ay := float32(h) - (float32(iy)+0.5)*S for ix := 0; ix < Na[X]; ix++ { Ax := (float32(ix) + 0.5) * S mx := small[X][0][iy][ix] my := small[Y][0][iy][ix] mz := small[Z][0][iy][ix] c.Arrow(Ax, Ay, mx, my, mz, float32(sub)) } } c.rasterizer.Rasterize(c.RGBAPainter) c.rasterizer.Clear() } // A Canvas is used to draw on. type Canvas struct { *image.RGBA *raster.RGBAPainter rasterizer *raster.Rasterizer } // Make a new canvas of size w x h. 
// ColorMap maps value onto a piecewise-linear color gradient.
//
// value is rescaled linearly so that min maps to the first color of colormap
// and max to the last one; values outside [min, max] are clamped to the
// nearest end. Colors in between are interpolated per channel (including
// alpha) between the two neighboring entries of colormap.
//
// Special cases:
//   - without colormap, a black-to-white gradient is used;
//   - with a single color, that color is returned for every value;
//   - if the rescaled value is NaN (value is NaN, or value == min == max),
//     the zero color.RGBA (fully transparent black) is returned, so that the
//     result never depends on an implementation-defined NaN-to-integer
//     conversion.
func ColorMap(min, max, value float32, colormap ...color.RGBA) color.RGBA {
	// default colormap: black-white
	if len(colormap) < 1 {
		colormap = []color.RGBA{{0, 0, 0, 255}, {255, 255, 255, 255}}
	}

	// map value to interval [0,1]
	val := float64((value - min) / (max - min))
	if val != val { // NaN is the only value that differs from itself
		return color.RGBA{}
	}
	if val > 1 {
		val = 1
	}
	if val < 0 {
		val = 0
	}

	// A single color leaves nothing to interpolate between. Without this
	// guard the neighbor lookup below would index colormap[-1].
	if len(colormap) == 1 {
		return colormap[0]
	}

	// find index of color below our value
	maxIndex := float64(len(colormap) - 1)
	index := val * maxIndex
	// corner case val==max:
	if index == maxIndex {
		index--
	}

	// get two neighboring colors
	i := int(index)
	if i < 0 {
		i = 0
	}
	if i >= len(colormap)-1 {
		i = len(colormap) - 2
	}
	c1 := colormap[i]
	c2 := colormap[i+1]

	// location between two neighboring colors [0..1]
	x := (val - float64(i)/maxIndex) * maxIndex
	if x < 0 || x > 1 {
		// internal invariant: val is clamped and i selects the segment
		// containing val
		panic(fmt.Sprint("x=", x))
	}

	// interpolate between colors
	r := (1-x)*float64(c1.R) + x*float64(c2.R)
	g := (1-x)*float64(c1.G) + x*float64(c2.G)
	b := (1-x)*float64(c1.B) + x*float64(c2.B)
	a := (1-x)*float64(c1.A) + x*float64(c2.A)
	return color.RGBA{bte(r), bte(g), bte(b), bte(a)}
}

// bte converts x to a byte, saturating at 0 and 255. The fractional part of
// in-range values is truncated.
func bte(x float64) uint8 {
	if x < 0 {
		return 0
	}
	if x > 255 {
		return 255
	}
	return uint8(x)
}
// HSLMap is the colormap for 3D vector data. The in-plane angle of (x, y)
// selects the hue, the vector length the saturation and the z component the
// lightness (z = -1 is black, z = +1 is white).
func HSLMap(x, y, z float32) color.RGBA {
	s := sqrtf(x*x + y*y + z*z)
	l := 0.5*z + 0.5
	h := float32(math.Atan2(float64(y), float64(x)))
	return HSLtoRGB(h, s, l)
}

// HSLtoRGB converts a hue/saturation/lightness triple to an opaque RGBA color.
//
// h = 0..2pi, s=0..1, l=0..1
//
// h is an angle in radians and is wrapped into one turn, so any value is
// accepted; a non-finite hue is treated as 0. s and l are clamped to [0, 1]
// and NaN is treated as 0, so that no channel can leave the byte range.
func HSLtoRGB(h, s, l float32) color.RGBA {
	// !(v >= 0) is true for negative values as well as for NaN
	if !(s >= 0) {
		s = 0
	} else if s > 1 {
		s = 1
	}
	if !(l >= 0) {
		l = 0
	} else if l > 1 {
		l = 1
	}

	// express the hue in units of 60 degrees and wrap it to [0, 6)
	h = fmod(h*(180.0/math.Pi/60.0), 6)

	var c float32 // chroma
	if l <= 0.5 {
		c = 2 * l * s
	} else {
		c = (2 - 2*l) * s
	}

	x := c * (1 - abs(fmod(h, 2)-1))

	var r, g, b float32
	switch {
	case 0 <= h && h < 1:
		r, g, b = c, x, 0.
	case 1 <= h && h < 2:
		r, g, b = x, c, 0.
	case 2 <= h && h < 3:
		r, g, b = 0., c, x
	case 3 <= h && h < 4:
		r, g, b = 0, x, c
	case 4 <= h && h < 5:
		r, g, b = x, 0., c
	case 5 <= h && h < 6:
		r, g, b = c, 0., x
	}

	m := l - 0.5*c

	// scale a channel to 0..255, saturating against rounding overshoot
	toByte := func(v float32) uint8 {
		v *= 255
		if !(v >= 0) {
			return 0
		}
		if v > 255 {
			return 255
		}
		return uint8(v)
	}
	return color.RGBA{toByte(r + m), toByte(g + m), toByte(b + m), 255}
}

// fmod returns number modulo mod as a value in [0, mod).
// It returns 0 when the result is undefined (non-finite number, or mod <= 0).
func fmod(number, mod float32) float32 {
	r := float32(math.Mod(float64(number), float64(mod)))
	if r < 0 {
		r += mod // math.Mod keeps the sign of number
	}
	if !(r >= 0 && r < mod) {
		// NaN, or a tiny negative remainder that rounded up to mod
		return 0
	}
	return r
}

// abs returns the absolute value of number.
func abs(number float32) float32 {
	if number < 0 {
		return -number
	}
	return number
}

// sqrtf is the float32 square root.
func sqrtf(x float32) float32 {
	return float32(math.Sqrt(float64(x)))
}
// extrema returns the smallest and largest element of data.
// For empty input it returns 0, 0 instead of panicking on data[0].
func extrema(data []float32) (min, max float32) {
	if len(data) == 0 {
		return 0, 0
	}
	min, max = data[0], data[0]
	for _, d := range data[1:] {
		if d < min {
			min = d
		}
		if d > max {
			max = d
		}
	}
	return min, max
}
// recycle returns img itself when it is non-nil and already w x h pixels
// large; otherwise it returns a newly allocated w x h image.
func recycle(img *image.RGBA, w, h int) *image.RGBA {
	if img != nil {
		if size := img.Bounds().Size(); size.X == w && size.Y == h {
			return img
		}
	}
	return image.NewRGBA(image.Rect(0, 0, w, h))
}
// hex formats a color channel as two upper-case hexadecimal digits after
// darkening it by 32 levels; channels of 32 or less become "00".
func hex(i uint8) string {
	var shade uint8
	if i > 32 {
		shade = i - 32 // make it a bit darker
	}
	return fmt.Sprintf("%02X", shade)
}
type reader struct { in io.Reader crc hash.Hash64 err error } func newReader(in io.Reader) *reader { r := new(reader) r.in = in r.crc = crc64.New(table) return r } func (r *reader) readSlice() (s *data.Slice, info data.Meta, err error) { r.err = nil // clear previous error, if any magic := r.readString() if r.err != nil { return nil, data.Meta{}, r.err } if magic != MAGIC { r.err = fmt.Errorf("dump: bad magic number:%v", magic) return nil, data.Meta{}, r.err } nComp := r.readInt() size := [3]int{} size[2] = r.readInt() // backwards compatible coordinates! size[1] = r.readInt() size[0] = r.readInt() cell := [3]float64{} cell[2] = r.readFloat64() cell[1] = r.readFloat64() cell[0] = r.readFloat64() info.CellSize = cell info.MeshUnit = r.readString() info.Time = r.readFloat64() _ = r.readString() // time unit s = data.NewSlice(nComp, size) info.Name = r.readString() info.Unit = r.readString() precission := r.readUint64() util.AssertMsg(precission == 4, "only single precission supported") if r.err != nil { return } host := s.Tensors() ncomp := s.NComp() for c := 0; c < ncomp; c++ { for iz := 0; iz < size[2]; iz++ { for iy := 0; iy < size[1]; iy++ { for ix := 0; ix < size[0]; ix++ { host[c][iz][iy][ix] = r.readFloat32() } } } } // Check CRC var mycrc uint64 // checksum by this reader if r.crc != nil { mycrc = r.crc.Sum64() } storedcrc := r.readUint64() // checksum from data stream. 
0 means not set if r.err != nil { return nil, data.Meta{}, r.err } if r.crc != nil { r.crc.Reset() // reset for next frame } if r.crc != nil && storedcrc != 0 && mycrc != storedcrc { r.err = fmt.Errorf("dump CRC error: expected %16x, got %16x", storedcrc, mycrc) return nil, data.Meta{}, r.err } return s, info, nil } func (r *reader) readInt() int { x := r.readUint64() if uint64(int(x)) != x { r.err = fmt.Errorf("value overflows int: %v", x) } return int(x) } // read until the buffer is full func (r *reader) read(buf []byte) { _, err := io.ReadFull(r.in, buf[:]) if err != nil { r.err = err } if r.crc != nil { r.crc.Write(buf) } } // read a maximum 8-byte string func (r *reader) readString() string { var buf [8]byte r.read(buf[:]) // trim trailing NULs. i := 0 for i = 0; i < len(buf); i++ { if buf[i] == 0 { break } } return string(buf[:i]) } func (r *reader) readFloat64() float64 { return math.Float64frombits(r.readUint64()) } func (r *reader) readUint64() uint64 { var buf [8]byte r.read(buf[:]) return *((*uint64)(unsafe.Pointer(&buf[0]))) } func (r *reader) readFloat32() float32 { var buf [4]byte r.read(buf[:]) return *((*float32)(unsafe.Pointer(&buf[0]))) } 3-3.11.1/dump/write.go000066400000000000000000000054161503346766200144070ustar00rootroot00000000000000package dump import ( "bufio" "hash" "hash/crc64" "io" "math" "os" "unsafe" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Write the slice to out in binary format. Add time stamp. func Write(out io.Writer, s *data.Slice, info data.Meta) error { w := newWriter(out) // Writes the header. w.writeString(MAGIC) w.writeUInt64(uint64(s.NComp())) size := s.Size() w.writeUInt64(uint64(size[2])) // backwards compatible coordinates! 
w.writeUInt64(uint64(size[1])) w.writeUInt64(uint64(size[0])) cell := info.CellSize w.writeFloat64(cell[2]) w.writeFloat64(cell[1]) w.writeFloat64(cell[0]) w.writeString(info.MeshUnit) w.writeFloat64(info.Time) w.writeString("s") // time unit w.writeString(info.Name) w.writeString(info.Unit) w.writeUInt64(4) // precision // return header write error before writing data if w.err != nil { return w.err } w.writeData(s) w.writeHash() return w.err } // Write the slice to file in binary format. Add time stamp. func WriteFile(fname string, s *data.Slice, info data.Meta) error { f, err := os.OpenFile(fname, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666) if err != nil { return err } defer f.Close() w := bufio.NewWriter(f) defer w.Flush() return Write(w, s, info) } // Write the slice to file in binary format, panic on error. func MustWriteFile(fname string, s *data.Slice, info data.Meta) { err := WriteFile(fname, s, info) util.FatalErr(err) } var table = crc64.MakeTable(crc64.ISO) type writer struct { out io.Writer crc hash.Hash64 err error } func newWriter(out io.Writer) *writer { w := new(writer) w.crc = crc64.New(table) w.out = io.MultiWriter(w.crc, out) return w } const MAGIC = "#dump002" // identifies dump format // Writes the data. func (w *writer) writeData(array *data.Slice) { data := array.Tensors() size := array.Size() ncomp := array.NComp() for c := 0; c < ncomp; c++ { for iz := 0; iz < size[2]; iz++ { for iy := 0; iy < size[1]; iy++ { for ix := 0; ix < size[0]; ix++ { w.writeFloat32(data[c][iz][iy][ix]) } } } } } // Writes the accumulated hash of this frame, closing the frame. 
// prod returns the product of the three elements of size.
func prod(size [3]int) int {
	p := 1
	for _, n := range size {
		p *= n
	}
	return p
}
import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) // Anisotropy variables var ( Ku1 = NewScalarParam("Ku1", "J/m3", "1st order uniaxial anisotropy constant") Ku2 = NewScalarParam("Ku2", "J/m3", "2nd order uniaxial anisotropy constant") Kc1 = NewScalarParam("Kc1", "J/m3", "1st order cubic anisotropy constant") Kc2 = NewScalarParam("Kc2", "J/m3", "2nd order cubic anisotropy constant") Kc3 = NewScalarParam("Kc3", "J/m3", "3rd order cubic anisotropy constant") AnisU = NewVectorParam("anisU", "", "Uniaxial anisotropy direction") AnisC1 = NewVectorParam("anisC1", "", "Cubic anisotropy direction #1") AnisC2 = NewVectorParam("anisC2", "", "Cubic anisotropy direction #2") B_anis = NewVectorField("B_anis", "T", "Anisotropy field", AddAnisotropyField) Edens_anis = NewScalarField("Edens_anis", "J/m3", "Anisotropy energy density", AddAnisotropyEnergyDensity) E_anis = NewScalarValue("E_anis", "J", "total anisotropy energy", GetAnisotropyEnergy) ) var ( sZero = NewScalarParam("_zero", "", "utility zero parameter") ) func init() { registerEnergy(GetAnisotropyEnergy, AddAnisotropyEnergyDensity) } func addUniaxialAnisotropyFrom(dst *data.Slice, M magnetization, Msat, Ku1, Ku2 *RegionwiseScalar, AnisU *RegionwiseVector) { if Ku1.nonZero() || Ku2.nonZero() { ms := Msat.MSlice() defer ms.Recycle() ku1 := Ku1.MSlice() defer ku1.Recycle() ku2 := Ku2.MSlice() defer ku2.Recycle() u := AnisU.MSlice() defer u.Recycle() cuda.AddUniaxialAnisotropy2(dst, M.Buffer(), ms, ku1, ku2, u) } } func addCubicAnisotropyFrom(dst *data.Slice, M magnetization, Msat, Kc1, Kc2, Kc3 *RegionwiseScalar, AnisC1, AnisC2 *RegionwiseVector) { if Kc1.nonZero() || Kc2.nonZero() || Kc3.nonZero() { ms := Msat.MSlice() defer ms.Recycle() kc1 := Kc1.MSlice() defer kc1.Recycle() kc2 := Kc2.MSlice() defer kc2.Recycle() kc3 := Kc3.MSlice() defer kc3.Recycle() c1 := AnisC1.MSlice() defer c1.Recycle() c2 := AnisC2.MSlice() defer c2.Recycle() cuda.AddCubicAnisotropy2(dst, M.Buffer(), ms, kc1, kc2, kc3, c1, c2) } } 
// Add the anisotropy field to dst func AddAnisotropyField(dst *data.Slice) { addUniaxialAnisotropyFrom(dst, M, Msat, Ku1, Ku2, AnisU) addCubicAnisotropyFrom(dst, M, Msat, Kc1, Kc2, Kc3, AnisC1, AnisC2) } // Add the anisotropy energy density to dst func AddAnisotropyEnergyDensity(dst *data.Slice) { haveUnixial := Ku1.nonZero() || Ku2.nonZero() haveCubic := Kc1.nonZero() || Kc2.nonZero() || Kc3.nonZero() if !haveUnixial && !haveCubic { return } buf := cuda.Buffer(B_anis.NComp(), Mesh().Size()) defer cuda.Recycle(buf) // unnormalized magnetization: Mf := ValueOf(M_full) defer cuda.Recycle(Mf) if haveUnixial { // 1st cuda.Zero(buf) addUniaxialAnisotropyFrom(buf, M, Msat, Ku1, sZero, AnisU) cuda.AddDotProduct(dst, -1./2., buf, Mf) // 2nd cuda.Zero(buf) addUniaxialAnisotropyFrom(buf, M, Msat, sZero, Ku2, AnisU) cuda.AddDotProduct(dst, -1./4., buf, Mf) } if haveCubic { // 1st cuda.Zero(buf) addCubicAnisotropyFrom(buf, M, Msat, Kc1, sZero, sZero, AnisC1, AnisC2) cuda.AddDotProduct(dst, -1./4., buf, Mf) // 2nd cuda.Zero(buf) addCubicAnisotropyFrom(buf, M, Msat, sZero, Kc2, sZero, AnisC1, AnisC2) cuda.AddDotProduct(dst, -1./6., buf, Mf) // 3rd cuda.Zero(buf) addCubicAnisotropyFrom(buf, M, Msat, sZero, sZero, Kc3, AnisC1, AnisC2) cuda.AddDotProduct(dst, -1./8., buf, Mf) } } // Returns anisotropy energy in joules. func GetAnisotropyEnergy() float64 { buf := cuda.Buffer(1, Mesh().Size()) defer cuda.Recycle(buf) cuda.Zero(buf) AddAnisotropyEnergyDensity(buf) return cellVolume() * float64(cuda.Sum(buf)) } 3-3.11.1/engine/asyncio.go000066400000000000000000000022621503346766200152160ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/timer" "github.com/mumax/3/util" "time" ) // Asynchronous I/O queue flushes data to disk while simulation keeps running. 
// Finalizer function called upon program exit.
// Waits until all asynchronous output has been saved.
//
// drainOutput returns immediately when the save queue was never created.
// Otherwise it loops until the queue length counter drops to zero. In every
// round it either takes a pending task from saveQue and runs it itself, or,
// when no task is ready to be received, sleeps for a millisecond.
func drainOutput() {
	if saveQue == nil {
		return
	}
	for queLen.Load() > 0 {
		select {
		default:
			time.Sleep(1 * time.Millisecond) // other goroutine has the last job, wait for it to finish
		case f := <-saveQue:
			// run the task here and account for it, as runSaver would
			f()
			queLen.Add(-1)
		}
	}
}
func AutoSnapshot(q Quantity, period float64) { autoSave(q, period, Snapshot) } // register save(q) to be called every period func autoSave(q Quantity, period float64, save func(Quantity)) { if period == 0 { delete(output, q) } else { output[q] = &autosave{period, Time, -1, save} // init count to -1 allows save at t=0 } } // generate auto file name based on save count and FilenameFormat. E.g.: // // m000001.ovf func autoFname(name string, format OutputFormat, num int) string { return fmt.Sprintf(OD()+FilenameFormat+"."+StringFromOutputFormat[format], name, num) } // keeps info needed to decide when a quantity needs to be periodically saved type autosave struct { period float64 // How often to save start float64 // Starting point count int // Number of times it has been autosaved save func(Quantity) // called to do the actual save } // returns true when the time is right to save. func (a *autosave) needSave() bool { t := Time - a.start return a.period != 0 && t-float64(a.count)*a.period >= a.period } 3-3.11.1/engine/average.go000066400000000000000000000021661503346766200151660ustar00rootroot00000000000000package engine // Averaging of quantities over entire universe or just magnet. 
import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) // average of quantity over universe func qAverageUniverse(q Quantity) []float64 { s := ValueOf(q) defer cuda.Recycle(s) return sAverageUniverse(s) } // average of slice over universe func sAverageUniverse(s *data.Slice) []float64 { nCell := float64(prod(s.Size())) avg := make([]float64, s.NComp()) for i := range avg { avg[i] = float64(cuda.Sum(s.Comp(i))) / nCell checkNaN1(avg[i]) } return avg } // average of slice over the magnet volume func sAverageMagnet(s *data.Slice) []float64 { if geometry.Gpu().IsNil() { return sAverageUniverse(s) } else { avg := make([]float64, s.NComp()) for i := range avg { avg[i] = float64(cuda.Dot(s.Comp(i), geometry.Gpu())) / magnetNCell() checkNaN1(avg[i]) } return avg } } // number of cells in the magnet. // not necessarily integer as cells can have fractional volume. func magnetNCell() float64 { if geometry.Gpu().IsNil() { return float64(Mesh().NCell()) } else { return float64(cuda.Sum(geometry.Gpu())) } } 3-3.11.1/engine/backwardeuler.go000066400000000000000000000025011503346766200163600ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Implicit midpoint solver. type BackwardEuler struct { dy1 *data.Slice } // Euler method, can be used as solver.Step. 
// Step advances the magnetization by one fixed time step FixDt.
//
// The torque is evaluated at the half-way time t0 + 0.5*Dt_si, first for a
// predicted state and then once more for the state obtained from that torque.
// The final state is y0 + dt*dy1. The reported error is the largest
// difference between the two torque evaluations, multiplied by dt.
//
// Requires MaxErr > 0 and a fixed time step > 0.
func (s *BackwardEuler) Step() {
	util.AssertMsg(MaxErr > 0, "Backward euler solver requires MaxErr > 0")

	t0 := Time

	y := M.Buffer()

	// y0 keeps a copy of the magnetization at the start of the step
	y0 := cuda.Buffer(VECTOR, y.Size())
	defer cuda.Recycle(y0)
	data.Copy(y0, y)

	dy0 := cuda.Buffer(VECTOR, y.Size())
	defer cuda.Recycle(dy0)
	// dy1 lives in the solver so the torque of this step is still available
	// as predictor in the next one.
	// NOTE(review): on the very first step dy1 holds whatever cuda.Buffer
	// returns; confirm that this is acceptable as a predictor.
	if s.dy1 == nil {
		s.dy1 = cuda.Buffer(VECTOR, y.Size())
	}
	dy1 := s.dy1

	Dt_si = FixDt
	dt := float32(Dt_si * GammaLL)
	util.AssertMsg(dt > 0, "Backward Euler solver requires fixed time step > 0")

	// First guess
	Time = t0 + 0.5*Dt_si // 0.5 dt makes it implicit midpoint method

	// With temperature, previous torque cannot be used as predictor
	if Temp.isZero() {
		cuda.Madd2(y, y0, dy1, 1, dt) // predictor Euler step with previous torque
		M.normalize()
	}

	torqueFn(dy0)
	cuda.Madd2(y, y0, dy0, 1, dt) // y = y0 + dt * dy
	M.normalize()

	// One iteration
	torqueFn(dy1)
	cuda.Madd2(y, y0, dy1, 1, dt) // y = y0 + dt * dy1
	M.normalize()

	Time = t0 + Dt_si

	// difference between the two torque evaluations, scaled by the step
	err := cuda.MaxVecDiff(dy0, dy1) * float64(dt)

	NSteps++
	setLastErr(err)
	setMaxTorque(dy1)
}

// Free releases the torque buffer kept between steps.
func (s *BackwardEuler) Free() {
	s.dy1.Free()
	s.dy1 = nil
}
If you use the results of these simulations in any work or publication, we kindly ask you to cite them.` var ( bibfile io.WriteCloser library map[string]*bibEntry ) func init() { buildLibrary() } func initBib() { // inited in engine.InitIO if bibfile != nil { panic("bib already initialized") } var err error bibfile, err = httpfs.Create(OD() + "references.bib") if err != nil { panic(err) } util.FatalErr(err) fprintln(bibfile, bibheader) fprintln(bibfile, separationline) Refer("vansteenkiste2014") // Make sure that Mumax3 is always referenced } type bibEntry struct { reason string bibtex string shortref string used bool } func Refer(tag string) { bibentry, inLibrary := library[tag] if bibentry.used || !inLibrary { return } bibentry.used = true if bibfile != nil { fprintln(bibfile, bibentry.reason) fprintln(bibfile, bibentry.bibtex) fprintln(bibfile, separationline) } } func areRefsUsed() bool { for _, bibentry := range library { if bibentry.used { return true } } return false } func LogUsedRefs() { if !areRefsUsed() { return } LogOut("********************************************************************//") LogOut("Please cite the following references, relevant for your simulation. //") LogOut("See bibtex file in output folder for justification. //") LogOut("********************************************************************//") for _, bibentry := range library { if bibentry.used { LogOut(" * " + bibentry.shortref) } } } func buildLibrary() { library = make(map[string]*bibEntry) library["vansteenkiste2014"] = &bibEntry{ reason: "Main paper about Mumax3", shortref: "Vansteenkiste et al., AIP Adv. 
4, 107133 (2014).", bibtex: ` @article{Vansteenkiste2014, author = {Vansteenkiste, Arne and Leliaert, Jonathan and Dvornik, Mykola and Helsen, Mathias and Garcia-Sanchez, Felipe and {Van Waeyenberge}, Bartel}, title = {{The design and verification of Mumax3}}, journal = {AIP Advances}, number = {10}, pages = {107133}, volume = {4}, year = {2014}, doi = {10.1063/1.4899186}, url = {http://doi.org/10.1063/1.4899186} }`} library["exl2014"] = &bibEntry{ reason: "Mumax3 uses Exl's minimizer", shortref: "Exl et al., J. Appl. Phys. 115, 17D118 (2014).", bibtex: ` @article{Exl2014, author = {Exl, Lukas and Bance, Simon and Reichel, Franz and Schrefl, Thomas and {Peter Stimming}, Hans and Mauser, Norbert J.}, title = {{LaBonte's method revisited: An effective steepest descent method for micromagnetic energy minimization}}, journal = {Journal of Applied Physics}, number = {17}, pages = {17D118}, volume = {115}, year = {2014}, doi = {10.1063/1.4862839}, url = {http://doi.org/10.1063/1.4862839} }`} library["Lel2014"] = &bibEntry{ reason: "Mumax3 used function ext_makegrains", shortref: "Leliaert et al., J. Appl. Phys. 115, 233903 (2014)", bibtex: ` @article{Lel2014, author = {Leliaert, Jonathan and Van de Wiele, Ben and Vansteenkiste, Arne and Laurson, Lasse and Durin, Gianfranco and Dupr{\'e}, Luc and Van Waeyenberge, Bartel}, title = {{Current-driven domain wall mobility in polycrystalline permalloy nanowires: A numerical study}}, journal = {Journal of Applied Physics}, volume = {115}, number = {23}, pages = {233903}, year = {2014}, doi = {10.1063/1.4883297}, url = {http://dx.doi.org/10.1063/1.4883297} }`} library["mulkers2017"] = &bibEntry{ reason: "Simulated system has interfacially induced DMI", shortref: "Mulkers et al., Phys. Rev. 
B 95, 144401 (2017).", bibtex: ` @article{Mulkers2017, author = {Mulkers, Jeroen and Van Waeyenberge, Bartel and Milo{\v{s}}evi{\'{c}}, Milorad V.}, title = {{Effects of spatially-engineered Dzyaloshinskii-Moriya interaction in ferromagnetic films}}, journal = {Physical Review B}, number = {14}, pages = {144401}, volume = {95}, year = {2017}, doi = {10.1103/PhysRevB.95.144401}, url = {doi.org/10.1103/PhysRevB.95.144401}, }`} library["leliaert2017"] = &bibEntry{ reason: "Simulated nonzero temperatures with adaptive time steps", shortref: "Leliaert et al., AIP Adv. 7, 125010 (2017).", bibtex: ` @article{Leliaert2017, author = {Leliaert, Jonathan and Mulkers, Jeroen and De Clercq, Jonas and Coene, Annelies and Dvornik, Mykola and Van Waeyenberge, Bartel}, title = {{Adaptively time stepping the stochastic Landau-Lifshitz-Gilbert equation at nonzero temperature: implementation and validation in MuMax$^3$}}, journal = {AIP Advances}, number = {12}, pages = {125010}, volume = {7}, year = {2017}, doi = {doi.org/10.1063/1.5003957}, url = {http://aip.scitation.org/doi/10.1063/1.5003957}, }`} library["Berg1981"] = &bibEntry{ reason: "Computed the topological charge using the formula of Berg and Lüscher", shortref: "Berg et al., Nucl. Phys. B 190, 412–24 (1981)", bibtex: ` @article{Berg1981, author = {Berg, Bernd A Lüscher, Martin}, title = {{Definition and statistical distributions of a topological number in the lattice O(3) $\sigma$-model}}, journal = {Nuclear Physics B}, pages = {412-424}, volume = {190}, year = {1981}, doi = {doi.org/10.1016/0550-3213(81)90568-X}, url = {https://doi.org/10.1016/0550-3213(81)90568-X}, }`} library["Knapman2025"] = &bibEntry{ reason: "Computed the Hopf index", shortref: "Knapman et al., Phys. Rev. B 111, 134408 (2025)", bibtex: ` @article{Knapman2025, author = {Knapman, R. and Azhar, M. and Pignedoli, A. and Gallard, L. and Hertel, R. and Leliaert, J. 
and Everschor-Sitte, K.}, title = {{Numerical calculation of the Hopf index for three-dimensional magnetic textures}}, journal = {Phys. Rev. B}, pages = {134408}, volume = {111}, year = {2025}, doi = {10.1103/PhysRevB.111.134408}, url = {https://link.aps.org/doi/10.1103/PhysRevB.111.134408}, }`} } 3-3.11.1/engine/comp.go000066400000000000000000000020761503346766200145120ustar00rootroot00000000000000package engine // Comp is a Derived Quantity pointing to a single component of vector Quantity import ( "fmt" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" ) type component struct { parent Quantity comp int } // Comp returns vector component c of the parent Quantity func Comp(parent Quantity, c int) ScalarField { util.Argument(c >= 0 && c < parent.NComp()) return AsScalarField(&component{parent, c}) } func (q *component) NComp() int { return 1 } func (q *component) Name() string { return fmt.Sprint(NameOf(q.parent), "_", compname[q.comp]) } func (q *component) Unit() string { return UnitOf(q.parent) } func (q *component) Mesh() *data.Mesh { return MeshOf(q.parent) } func (q *component) Slice() (*data.Slice, bool) { p := q.parent src := ValueOf(p) defer cuda.Recycle(src) c := cuda.Buffer(1, src.Size()) return c, true } func (q *component) EvalTo(dst *data.Slice) { src := ValueOf(q.parent) defer cuda.Recycle(src) data.Copy(dst, src.Comp(q.comp)) } var compname = map[int]string{0: "x", 1: "y", 2: "z"} 3-3.11.1/engine/config.go000066400000000000000000000202611503346766200150150ustar00rootroot00000000000000package engine // Utilities for setting magnetic configurations. 
import (
	"github.com/mumax/3/data"
	"math"
	"math/rand"
)

// init declares every built-in magnetization configuration through DeclFunc.
// Each call passes a name, the Go constructor and a one-line description.
// NOTE(review): presumably these are the names exposed to input scripts --
// confirm against DeclFunc.
// NOTE(review): two of the descriptions contain typos ("Néél", "with with");
// they are left untouched here because they are runtime strings.
func init() {
	DeclFunc("Uniform", Uniform, "Uniform magnetization in given direction")
	DeclFunc("Vortex", Vortex, "Vortex magnetization with given circulation and core polarization")
	DeclFunc("Antivortex", AntiVortex, "Antivortex magnetization with given circulation and core polarization")
	DeclFunc("NeelSkyrmion", NeelSkyrmion, "Néél skyrmion magnetization with given charge and core polarization")
	DeclFunc("BlochSkyrmion", BlochSkyrmion, "Bloch skyrmion magnetization with given chirality and core polarization")
	DeclFunc("TwoDomain", TwoDomain, "Twodomain magnetization with with given magnetization in left domain, wall, and right domain")
	DeclFunc("VortexWall", VortexWall, "Vortex wall magnetization with given mx in left and right domain and core circulation and polarization")
	DeclFunc("RandomMag", RandomMag, "Random magnetization")
	DeclFunc("RandomMagSeed", RandomMagSeed, "Random magnetization with given seed")
	DeclFunc("Conical", Conical, "Conical state for given wave vector, cone direction, and cone angle")
	DeclFunc("Helical", Helical, "Helical state for given wave vector")
	DeclFunc("HopfionCompactSupport", HopfionCompactSupport, "Hopfion texture from skyrmion, with compact support (smooth and magnetization exactly along z-axis outside of finite region)")
}

// Config is a magnetic configuration: a function that returns the
// magnetization vector m for position (x, y, z).
type Config func(x, y, z float64) data.Vector

// RandomMag returns a random initial magnetization.
// It is RandomMagSeed with the fixed seed 0.
func RandomMag() Config {
	return RandomMagSeed(0)
}

// RandomMagSeed returns a random initial magnetization,
// generated from the given random seed.
func RandomMagSeed(seed int) Config { rng := rand.New(rand.NewSource(int64(seed))) return func(x, y, z float64) data.Vector { return randomDir(rng) } } // generate anisotropic random unit vector func randomDir(rng *rand.Rand) data.Vector { theta := 2 * rng.Float64() * math.Pi z := 2 * (rng.Float64() - 0.5) b := math.Sqrt(1 - z*z) x := b * math.Cos(theta) y := b * math.Sin(theta) return data.Vector{x, y, z} } // Returns a uniform magnetization state. E.g.: // // M = Uniform(1, 0, 0)) // saturated along X func Uniform(mx, my, mz float64) Config { return func(x, y, z float64) data.Vector { return data.Vector{mx, my, mz} } } // Make a vortex magnetization with given circulation and core polarization (+1 or -1). // The core is smoothed over a few exchange lengths and should easily relax to its ground state. func Vortex(circ, pol int) Config { diam2 := 2 * sqr64(Mesh().CellSize()[X]) return func(x, y, z float64) data.Vector { r2 := x*x + y*y r := math.Sqrt(r2) mx := -y * float64(circ) / r my := x * float64(circ) / r mz := 1.5 * float64(pol) * math.Exp(-r2/diam2) return noNaN(data.Vector{mx, my, mz}, pol) } } func NeelSkyrmion(charge, pol int) Config { w := 8 * Mesh().CellSize()[X] w2 := w * w return func(x, y, z float64) data.Vector { r2 := x*x + y*y r := math.Sqrt(r2) mz := 2 * float64(pol) * (math.Exp(-r2/w2) - 0.5) mx := (x * float64(charge) / r) * (1 - math.Abs(mz)) my := (y * float64(charge) / r) * (1 - math.Abs(mz)) return noNaN(data.Vector{mx, my, mz}, pol) } } func BlochSkyrmion(charge, pol int) Config { w := 8 * Mesh().CellSize()[X] w2 := w * w return func(x, y, z float64) data.Vector { r2 := x*x + y*y r := math.Sqrt(r2) mz := 2 * float64(pol) * (math.Exp(-r2/w2) - 0.5) mx := (-y * float64(charge) / r) * (1 - math.Abs(mz)) my := (x * float64(charge) / r) * (1 - math.Abs(mz)) return noNaN(data.Vector{mx, my, mz}, pol) } } func AntiVortex(circ, pol int) Config { diam2 := 2 * sqr64(Mesh().CellSize()[X]) return func(x, y, z float64) data.Vector { r2 := x*x + y*y r 
:= math.Sqrt(r2) mx := -x * float64(circ) / r my := y * float64(circ) / r mz := 1.5 * float64(pol) * math.Exp(-r2/diam2) return noNaN(data.Vector{mx, my, mz}, pol) } } // Make a vortex wall configuration. func VortexWall(mleft, mright float64, circ, pol int) Config { h := Mesh().WorldSize()[Y] v := Vortex(circ, pol) return func(x, y, z float64) data.Vector { if x < -h/2 { return data.Vector{mleft, 0, 0} } if x > h/2 { return data.Vector{mright, 0, 0} } return v(x, y, z) } } func noNaN(v data.Vector, pol int) data.Vector { if math.IsNaN(v[X]) || math.IsNaN(v[Y]) || math.IsNaN(v[Z]) { return data.Vector{0, 0, float64(pol)} } else { return v } } // Make a 2-domain configuration with domain wall. // (mx1, my1, mz1) and (mx2, my2, mz2) are the magnetizations in the left and right domain, respectively. // (mxwall, mywall, mzwall) is the magnetization in the wall. The wall is smoothed over a few cells so it will // easily relax to its ground state. // E.g.: // // TwoDomain(1,0,0, 0,1,0, -1,0,0) // head-to-head domains with transverse (Néel) wall // TwoDomain(1,0,0, 0,0,1, -1,0,0) // head-to-head domains with perpendicular (Bloch) wall // TwoDomain(0,0,1, 1,0,0, 0,0,-1)// up-down domains with Bloch wall func TwoDomain(mx1, my1, mz1, mxwall, mywall, mzwall, mx2, my2, mz2 float64) Config { ww := 2 * Mesh().CellSize()[X] // wall width in cells return func(x, y, z float64) data.Vector { var m data.Vector if x < 0 { m = data.Vector{mx1, my1, mz1} } else { m = data.Vector{mx2, my2, mz2} } gauss := math.Exp(-sqr64(x / ww)) m[X] = (1-gauss)*m[X] + gauss*mxwall m[Y] = (1-gauss)*m[Y] + gauss*mywall m[Z] = (1-gauss)*m[Z] + gauss*mzwall return m } } // Conical magnetization configuration. // The magnetization rotates on a cone defined by coneAngle and coneDirection. // q is the wave vector of the conical magnetization configuration. 
// The magnetization is // // m = u*cos(coneAngle) + sin(coneAngle)*( ua*cos(q*r) + ub*sin(q*r) ) // // with ua and ub unit vectors perpendicular to u (normalized coneDirection) func Conical(q, coneDirection data.Vector, coneAngle float64) Config { u := coneDirection.Div(coneDirection.Len()) // two unit vectors perpendicular to each other and to the cone direction u p := math.Sqrt(1 - u[Z]*u[Z]) ua := data.Vector{u[X] * u[Z], u[Y] * u[Z], u[Z]*u[Z] - 1}.Div(p) ub := data.Vector{-u[Y], u[X], 0}.Div(p) // cone direction along z direction? -> oops devided by zero, let's fix this if u[Z]*u[Z] == 1 { ua = data.Vector{1, 0, 0} ub = data.Vector{0, 1, 0} } sina, cosa := math.Sincos(coneAngle) return func(x, y, z float64) data.Vector { sinqr, cosqr := math.Sincos(q[X]*x + q[Y]*y + q[Z]*z) return u.Mul(cosa).MAdd(sina*cosqr, ua).MAdd(sina*sinqr, ub) } } func Helical(q data.Vector) Config { return Conical(q, q, math.Pi/2) } func HopfionCompactSupport(major_radius, minor_radius float64) Config { return func(x, y, z float64) data.Vector { psi := math.Atan2(y, x) rho := math.Sqrt(math.Pow(z, 2) + math.Pow(x*math.Cos(psi)+y*math.Sin(psi)-major_radius, 2)) Theta := 0.0 Phi := 0.0 if rho < minor_radius { phi := math.Atan2(z, x*math.Cos(psi)+y*math.Sin(psi)-major_radius) Phi = -phi + psi Theta = math.Pi * math.Exp(1.0-1.0/(1.0-math.Pow(rho/minor_radius, 2))) } mx := math.Cos(Phi) * math.Sin(Theta) my := math.Sin(Phi) * math.Sin(Theta) mz := math.Cos(Theta) return data.Vector{mx, my, mz} } } // Transl returns a translated copy of configuration c. E.g.: // // M = Vortex(1, 1).Transl(100e-9, 0, 0) // vortex with center at x=100nm func (c Config) Transl(dx, dy, dz float64) Config { return func(x, y, z float64) data.Vector { return c(x-dx, y-dy, z-dz) } } // Scale returns a scaled copy of configuration c. 
func (c Config) Scale(sx, sy, sz float64) Config { return func(x, y, z float64) data.Vector { return c(x/sx, y/sy, z/sz) } } // Rotates the configuration around the Z-axis, over θ radians. func (c Config) RotZ(θ float64) Config { cos := math.Cos(θ) sin := math.Sin(θ) return func(x, y, z float64) data.Vector { x_ := x*cos + y*sin y_ := -x*sin + y*cos m := c(x_, y_, z) mx_ := m[X]*cos - m[Y]*sin my_ := m[X]*sin + m[Y]*cos return data.Vector{mx_, my_, m[Z]} } } // Returns a new magnetization equal to c + weight * other. // E.g.: // // Uniform(1, 0, 0).Add(0.2, RandomMag()) // // for a uniform state with 20% random distortion. func (c Config) Add(weight float64, other Config) Config { return func(x, y, z float64) data.Vector { return c(x, y, z).MAdd(weight, other(x, y, z)) } } 3-3.11.1/engine/crop.go000066400000000000000000000071631503346766200145210ustar00rootroot00000000000000package engine // Cropped quantity refers to a cut-out piece of a large quantity import ( "fmt" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" ) func init() { DeclFunc("Crop", Crop, "Crops a quantity to cell ranges [x1,x2[, [y1,y2[, [z1,z2[") DeclFunc("CropX", CropX, "Crops a quantity to cell ranges [x1,x2[") DeclFunc("CropY", CropY, "Crops a quantity to cell ranges [y1,y2[") DeclFunc("CropZ", CropZ, "Crops a quantity to cell ranges [z1,z2[") DeclFunc("CropLayer", CropLayer, "Crops a quantity to a single layer") DeclFunc("CropRegion", CropRegion, "Crops a quantity to a region") } type cropped struct { parent Quantity name string x1, x2, y1, y2, z1, z2 int } // Crop quantity to a box enclosing the given region. // Used to output a region of interest, even if the region is non-rectangular. 
func CropRegion(parent Quantity, region int) *cropped { n := MeshOf(parent).Size() // use -1 for unset values x1, y1, z1 := -1, -1, -1 x2, y2, z2 := -1, -1, -1 r := regions.HostArray() for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { for ix := 0; ix < n[X]; ix++ { if r[iz][iy][ix] == byte(region) { // initialize all indices if unset if x1 == -1 { x1, y1, z1 = ix, iy, iz x2, y2, z2 = ix, iy, iz } if ix < x1 { x1 = ix } if iy < y1 { y1 = iy } if iz < z1 { z1 = iz } if ix > x2 { x2 = ix } if iy > y2 { y2 = iy } if iz > z2 { z2 = iz } } } } } return Crop(parent, x1, x2+1, y1, y2+1, z1, z2+1) } func CropLayer(parent Quantity, layer int) *cropped { n := MeshOf(parent).Size() return Crop(parent, 0, n[X], 0, n[Y], layer, layer+1) } func CropX(parent Quantity, x1, x2 int) *cropped { n := MeshOf(parent).Size() return Crop(parent, x1, x2, 0, n[Y], 0, n[Z]) } func CropY(parent Quantity, y1, y2 int) *cropped { n := MeshOf(parent).Size() return Crop(parent, 0, n[X], y1, y2, 0, n[Z]) } func CropZ(parent Quantity, z1, z2 int) *cropped { n := MeshOf(parent).Size() return Crop(parent, 0, n[X], 0, n[Y], z1, z2) } func Crop(parent Quantity, x1, x2, y1, y2, z1, z2 int) *cropped { n := MeshOf(parent).Size() util.Argument(x1 < x2 && y1 < y2 && z1 < z2) util.Argument(x1 >= 0 && y1 >= 0 && z1 >= 0) util.Argument(x2 <= n[X] && y2 <= n[Y] && z2 <= n[Z]) name := NameOf(parent) + "_" if x1 != 0 || x2 != n[X] { name += "xrange" + rangeStr(x1, x2) } if y1 != 0 || y2 != n[Y] { name += "yrange" + rangeStr(y1, y2) } if z1 != 0 || z2 != n[Z] { name += "zrange" + rangeStr(z1, z2) } return &cropped{parent, name, x1, x2, y1, y2, z1, z2} } func rangeStr(a, b int) string { if a+1 == b { return fmt.Sprint(a, "_") } else { return fmt.Sprint(a, "-", b, "_") } // (trailing underscore to separate from subsequent autosave number) } func (q *cropped) NComp() int { return q.parent.NComp() } func (q *cropped) Name() string { return q.name } func (q *cropped) Unit() string { return UnitOf(q.parent) } 
func (q *cropped) EvalTo(dst *data.Slice) { EvalTo(q, dst) } func (q *cropped) Mesh() *data.Mesh { c := MeshOf(q.parent).CellSize() return data.NewMesh(q.x2-q.x1, q.y2-q.y1, q.z2-q.z1, c[X], c[Y], c[Z]) } func (q *cropped) average() []float64 { return qAverageUniverse(q) } // needed for table func (q *cropped) Average() []float64 { return q.average() } // handy for script func (q *cropped) Slice() (*data.Slice, bool) { src := ValueOf(q.parent) defer cuda.Recycle(src) dst := cuda.Buffer(q.NComp(), q.Mesh().Size()) cuda.Crop(dst, src, q.x1, q.y1, q.z1) return dst, true } 3-3.11.1/engine/customfield.go000066400000000000000000000320771503346766200160760ustar00rootroot00000000000000package engine // Add arbitrary terms to B_eff, Edens_total. import ( "fmt" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" ) var ( B_custom = NewVectorField("B_custom", "T", "User-defined field", AddCustomField) Edens_custom = NewScalarField("Edens_custom", "J/m3", "Energy density of user-defined field.", AddCustomEnergyDensity) E_custom = NewScalarValue("E_custom", "J", "total energy of user-defined field", GetCustomEnergy) customTerms []Quantity // vector customEnergies []Quantity // scalar ) func init() { registerEnergy(GetCustomEnergy, AddCustomEnergyDensity) DeclFunc("AddFieldTerm", AddFieldTerm, "Add an expression to B_eff.") DeclFunc("AddEdensTerm", AddEdensTerm, "Add an expression to Edens.") DeclFunc("Add", Add, "Add two quantities") DeclFunc("Madd", Madd, "Weighted addition: Madd(Q1,Q2,c1,c2) = c1*Q1 + c2*Q2") DeclFunc("Dot", Dot, "Dot product of two vector quantities") DeclFunc("Cross", Cross, "Cross product of two vector quantities") DeclFunc("Mul", Mul, "Point-wise product of two quantities") DeclFunc("MulMV", MulMV, "Matrix-Vector product: MulMV(AX, AY, AZ, m) = (AX·m, AY·m, AZ·m). 
"+ "The arguments Ax, Ay, Az and m are quantities with 3 componets.") DeclFunc("Div", Div, "Point-wise division of two quantities") DeclFunc("Const", Const, "Constant, uniform number") DeclFunc("ConstVector", ConstVector, "Constant, uniform vector") DeclFunc("Shifted", Shifted, "Shifted quantity") DeclFunc("Masked", Masked, "Mask quantity with shape") DeclFunc("Normalized", Normalized, "Normalize quantity") DeclFunc("RemoveCustomFields", RemoveCustomFields, "Removes all custom fields again") DeclFunc("RemoveCustomEnergies", RemoveCustomEnergies, "Removes all custom energies") DeclFunc("RunningAverage", RunningAverage, "Records the time-average of a quantity from the moment this function is called.
    Note: this may impact performance since the Quantity will be evaluated after every step.") DeclFunc("Sum", Sum, "Sum of Quantity over all cells in the grid. For a vector Quantity, all components are added together.") DeclFunc("SumVector", SumVector, "Sum of vector Quantity over all cells in the grid.") } // Removes all customfields func RemoveCustomFields() { customTerms = nil } // Removes all customenergies func RemoveCustomEnergies() { customEnergies = nil } // AddFieldTerm adds an effective field function (returning Teslas) to B_eff. // Be sure to also add the corresponding energy term using AddEnergyTerm. func AddFieldTerm(b Quantity) { customTerms = append(customTerms, b) } // AddEnergyTerm adds an energy density function (returning Joules/m³) to Edens_total. // Needed when AddFieldTerm was used and a correct energy is needed // (e.g. for Relax, Minimize, ...). func AddEdensTerm(e Quantity) { customEnergies = append(customEnergies, e) } // AddCustomField evaluates the user-defined custom field terms // and adds the result to dst. func AddCustomField(dst *data.Slice) { for _, term := range customTerms { buf := ValueOf(term) cuda.Add(dst, dst, buf) cuda.Recycle(buf) } } // Adds the custom energy densities (defined with AddEdensTerm) func AddCustomEnergyDensity(dst *data.Slice) { for _, term := range customEnergies { buf := ValueOf(term) cuda.Add(dst, dst, buf) cuda.Recycle(buf) } } func GetCustomEnergy() float64 { buf := cuda.Buffer(1, Mesh().Size()) defer cuda.Recycle(buf) cuda.Zero(buf) AddCustomEnergyDensity(buf) return cellVolume() * float64(cuda.Sum(buf)) } type constValue struct { value []float64 } func (c *constValue) NComp() int { return len(c.value) } func (d *constValue) EvalTo(dst *data.Slice) { for c, v := range d.value { cuda.Memset(dst.Comp(c), float32(v)) } } // Const returns a constant (uniform) scalar quantity, // that can be used to construct custom field terms. 
func Const(v float64) Quantity { return &constValue{[]float64{v}} } // ConstVector returns a constant (uniform) vector quantity, // that can be used to construct custom field terms. func ConstVector(x, y, z float64) Quantity { return &constValue{[]float64{x, y, z}} } // fieldOp holds the abstract functionality for operations // (like add, multiply, ...) on space-dependent quantites // (like M, B_sat, ...) type fieldOp struct { a, b Quantity nComp int } func (o fieldOp) NComp() int { return o.nComp } type dotProduct struct { fieldOp } type crossProduct struct { fieldOp } type addition struct { fieldOp } type mAddition struct { fieldOp fac1, fac2 float64 } type mulmv struct { ax, ay, az, b Quantity } // MulMV returns a new Quantity that evaluates to the // matrix-vector product (Ax·b, Ay·b, Az·b). func MulMV(Ax, Ay, Az, b Quantity) Quantity { util.Argument(Ax.NComp() == 3 && Ay.NComp() == 3 && Az.NComp() == 3 && b.NComp() == 3) return &mulmv{Ax, Ay, Az, b} } func (q *mulmv) EvalTo(dst *data.Slice) { util.Argument(dst.NComp() == 3) cuda.Zero(dst) b := ValueOf(q.b) defer cuda.Recycle(b) { Ax := ValueOf(q.ax) cuda.AddDotProduct(dst.Comp(X), 1, Ax, b) cuda.Recycle(Ax) } { Ay := ValueOf(q.ay) cuda.AddDotProduct(dst.Comp(Y), 1, Ay, b) cuda.Recycle(Ay) } { Az := ValueOf(q.az) cuda.AddDotProduct(dst.Comp(Z), 1, Az, b) cuda.Recycle(Az) } } func (q *mulmv) NComp() int { return 3 } // DotProduct creates a new quantity that is the dot product of // quantities a and b. E.g.: // // DotProct(&M, &B_ext) func Dot(a, b Quantity) Quantity { return &dotProduct{fieldOp{a, b, 1}} } func (d *dotProduct) EvalTo(dst *data.Slice) { A := ValueOf(d.a) defer cuda.Recycle(A) B := ValueOf(d.b) defer cuda.Recycle(B) cuda.Zero(dst) cuda.AddDotProduct(dst, 1, A, B) } // CrossProduct creates a new quantity that is the cross product of // quantities a and b. 
E.g.: // // CrossProct(&M, &B_ext) func Cross(a, b Quantity) Quantity { return &crossProduct{fieldOp{a, b, 3}} } func (d *crossProduct) EvalTo(dst *data.Slice) { A := ValueOf(d.a) defer cuda.Recycle(A) B := ValueOf(d.b) defer cuda.Recycle(B) cuda.Zero(dst) cuda.CrossProduct(dst, A, B) } func Add(a, b Quantity) Quantity { if a.NComp() != b.NComp() { panic(fmt.Sprintf("Cannot point-wise Add %v components by %v components", a.NComp(), b.NComp())) } return &addition{fieldOp{a, b, a.NComp()}} } func (d *addition) EvalTo(dst *data.Slice) { A := ValueOf(d.a) defer cuda.Recycle(A) B := ValueOf(d.b) defer cuda.Recycle(B) cuda.Zero(dst) cuda.Add(dst, A, B) } type pointwiseMul struct { fieldOp } func Madd(a, b Quantity, fac1, fac2 float64) *mAddition { if a.NComp() != b.NComp() { panic(fmt.Sprintf("Cannot point-wise add %v components by %v components", a.NComp(), b.NComp())) } return &mAddition{fieldOp{a, b, a.NComp()}, fac1, fac2} } func (o *mAddition) EvalTo(dst *data.Slice) { A := ValueOf(o.a) defer cuda.Recycle(A) B := ValueOf(o.b) defer cuda.Recycle(B) cuda.Zero(dst) cuda.Madd2(dst, A, B, float32(o.fac1), float32(o.fac2)) } // Mul returns a new quantity that evaluates to the pointwise product a and b. 
func Mul(a, b Quantity) Quantity { nComp := -1 switch { case a.NComp() == b.NComp(): nComp = a.NComp() // vector*vector, scalar*scalar case a.NComp() == 1: nComp = b.NComp() // scalar*something case b.NComp() == 1: nComp = a.NComp() // something*scalar default: panic(fmt.Sprintf("Cannot point-wise multiply %v components by %v components", a.NComp(), b.NComp())) } return &pointwiseMul{fieldOp{a, b, nComp}} } func (d *pointwiseMul) EvalTo(dst *data.Slice) { cuda.Zero(dst) a := ValueOf(d.a) defer cuda.Recycle(a) b := ValueOf(d.b) defer cuda.Recycle(b) switch { case a.NComp() == b.NComp(): mulNN(dst, a, b) // vector*vector, scalar*scalar case a.NComp() == 1: mul1N(dst, a, b) case b.NComp() == 1: mul1N(dst, b, a) default: panic(fmt.Sprintf("Cannot point-wise multiply %v components by %v components", a.NComp(), b.NComp())) } } // mulNN pointwise multiplies two N-component vectors, // yielding an N-component vector stored in dst. func mulNN(dst, a, b *data.Slice) { cuda.Mul(dst, a, b) } // mul1N pointwise multiplies a scalar (1-component) with an N-component vector, // yielding an N-component vector stored in dst. func mul1N(dst, a, b *data.Slice) { util.Assert(a.NComp() == 1) util.Assert(dst.NComp() == b.NComp()) for c := 0; c < dst.NComp(); c++ { cuda.Mul(dst.Comp(c), a, b.Comp(c)) } } type pointwiseDiv struct { fieldOp } // Div returns a new quantity that evaluates to the pointwise product a and b. 
func Div(a, b Quantity) Quantity { nComp := -1 switch { case a.NComp() == b.NComp(): nComp = a.NComp() // vector/vector, scalar/scalar case b.NComp() == 1: nComp = a.NComp() // something/scalar default: panic(fmt.Sprintf("Cannot point-wise divide %v components by %v components", a.NComp(), b.NComp())) } return &pointwiseDiv{fieldOp{a, b, nComp}} } func (d *pointwiseDiv) EvalTo(dst *data.Slice) { a := ValueOf(d.a) defer cuda.Recycle(a) b := ValueOf(d.b) defer cuda.Recycle(b) switch { case a.NComp() == b.NComp(): divNN(dst, a, b) // vector*vector, scalar*scalar case b.NComp() == 1: divN1(dst, a, b) default: panic(fmt.Sprintf("Cannot point-wise divide %v components by %v components", a.NComp(), b.NComp())) } } func divNN(dst, a, b *data.Slice) { cuda.Div(dst, a, b) } func divN1(dst, a, b *data.Slice) { util.Assert(dst.NComp() == a.NComp()) util.Assert(b.NComp() == 1) for c := 0; c < dst.NComp(); c++ { cuda.Div(dst.Comp(c), a.Comp(c), b) } } type shifted struct { orig Quantity dx, dy, dz int } // Shifted returns a new Quantity that evaluates to // the original, shifted over dx, dy, dz cells. 
func Shifted(q Quantity, dx, dy, dz int) Quantity { util.Assert(dx != 0 || dy != 0 || dz != 0) return &shifted{q, dx, dy, dz} } func (q *shifted) EvalTo(dst *data.Slice) { orig := ValueOf(q.orig) defer cuda.Recycle(orig) for i := 0; i < q.NComp(); i++ { dsti := dst.Comp(i) origi := orig.Comp(i) if q.dx != 0 { cuda.ShiftX(dsti, origi, q.dx, 0, 0) data.Copy(origi, dsti) } if q.dy != 0 { cuda.ShiftY(dsti, origi, q.dy, 0, 0) data.Copy(origi, dsti) } if q.dz != 0 { cuda.ShiftZ(dsti, origi, q.dz, 0, 0) } } } func (q *shifted) NComp() int { return q.orig.NComp() } // Masks a quantity with a shape // The shape will only be evaluated once on the mesh, // and will be re-evaluated after mesh change, // because otherwise too slow func Masked(q Quantity, shape Shape) Quantity { return &masked{q, shape, nil, data.Mesh{}} } type masked struct { orig Quantity shape Shape mask *data.Slice mesh data.Mesh } func (q *masked) EvalTo(dst *data.Slice) { if q.mesh != *Mesh() { // When mesh is changed, mask needs an update q.createMask() } orig := ValueOf(q.orig) defer cuda.Recycle(orig) mul1N(dst, q.mask, orig) } func (q *masked) NComp() int { return q.orig.NComp() } func (q *masked) createMask() { size := Mesh().Size() // Prepare mask on host maskhost := data.NewSlice(SCALAR, size) defer maskhost.Free() maskScalars := maskhost.Scalars() for iz := 0; iz < size[Z]; iz++ { for iy := 0; iy < size[Y]; iy++ { for ix := 0; ix < size[X]; ix++ { r := Index2Coord(ix, iy, iz) if q.shape(r[X], r[Y], r[Z]) { maskScalars[iz][iy][ix] = 1 } } } } // Update mask q.mask.Free() q.mask = cuda.NewSlice(SCALAR, size) data.Copy(q.mask, maskhost) q.mesh = *Mesh() // Remove mask from host } // Normalized returns a quantity that evaluates to the unit vector of q func Normalized(q Quantity) Quantity { return &normalized{q} } type normalized struct { orig Quantity } func (q *normalized) NComp() int { return 3 } func (q *normalized) EvalTo(dst *data.Slice) { util.Assert(dst.NComp() == q.NComp()) q.orig.EvalTo(dst) 
cuda.Normalize(dst, nil) } // RunningAverage returns the running average of a quantity // over time, starting at the moment RunningAverage() is called. // This value is updated after every Step() and depends on the time step. func RunningAverage(q Quantity) Quantity { ra := runningAverage{q, nil, Time, 0} ra.avg = cuda.Buffer(q.NComp(), SizeOf(q)) cuda.Zero(ra.avg) PostStep(func() { dt := Time - ra.prev_t if dt < 0 { // Don't update the time average if we went back in time since the last step return } ra.prev_t = Time ra.total_t += dt val := ValueOf(q) defer cuda.Recycle(val) cuda.Madd2(ra.avg, ra.avg, val, float32((ra.total_t-dt)/ra.total_t), float32(dt/ra.total_t)) }) return &ra } type runningAverage struct { orig Quantity avg *data.Slice prev_t float64 total_t float64 } func (ra *runningAverage) EvalTo(dst *data.Slice) { util.Assert(dst.NComp() == ra.NComp()) data.Copy(dst, ra.avg) } func (ra *runningAverage) NComp() int { return ra.orig.NComp() } // Sum of Quantity over all cells in the grid. // For a vector Quantity, all components are added together. func Sum(q Quantity) float64 { val := ValueOf(q) defer cuda.Recycle(val) total := 0. for i := 0; i < q.NComp(); i++ { total += float64(cuda.Sum(val.Comp(i))) } return total } // Sum of vector Quantity over all cells in the grid. 
func SumVector(q Quantity) data.Vector { util.Assert(q.NComp() == 3) val := ValueOf(q) defer cuda.Recycle(val) var v [3]float64 for i := 0; i < 3; i++ { v[i] = float64(cuda.Sum(val.Comp(i))) } return Vector(v[0], v[1], v[2]) } 3-3.11.1/engine/demag.go000066400000000000000000000070771503346766200146370ustar00rootroot00000000000000package engine // Calculation of magnetostatic field import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/mag" ) // Demag variables var ( Msat = NewScalarParam("Msat", "A/m", "Saturation magnetization", &lex2, &din2, &dbulk2) M_full = NewVectorField("m_full", "A/m", "Unnormalized magnetization", SetMFull) B_demag = NewVectorField("B_demag", "T", "Magnetostatic field", SetDemagField) Edens_demag = NewScalarField("Edens_demag", "J/m3", "Magnetostatic energy density", AddEdens_demag) E_demag = NewScalarValue("E_demag", "J", "Magnetostatic energy", GetDemagEnergy) EnableDemag = true // enable/disable global demag field NoDemagSpins = NewScalarParam("NoDemagSpins", "", "Disable magnetostatic interaction per region (default=0, set to 1 to disable). 
"+ "E.g.: NoDemagSpins.SetRegion(5, 1) disables the magnetostatic interaction in region 5.") conv_ *cuda.DemagConvolution // does the heavy lifting DemagAccuracy = 6.0 // Demag accuracy (divide cubes in at most N^3 points) ) var AddEdens_demag = makeEdensAdder(&B_demag, -0.5) func init() { DeclVar("EnableDemag", &EnableDemag, "Enables/disables demag (default=true)") DeclVar("DemagAccuracy", &DemagAccuracy, "Controls accuracy of demag kernel") registerEnergy(GetDemagEnergy, AddEdens_demag) } // Sets dst to the current demag field func SetDemagField(dst *data.Slice) { if EnableDemag { msat := Msat.MSlice() defer msat.Recycle() if NoDemagSpins.isZero() && !Msat.hasZero() { // Normal demag, everywhere demagConv().Exec(dst, M.Buffer(), geometry.Gpu(), msat) } else { setMaskedDemagField(dst, msat) } } else { cuda.Zero(dst) // will ADD other terms to it } } // Sets dst to the demag field, but cells where NoDemagSpins != 0 do not generate nor recieve field. func setMaskedDemagField(dst *data.Slice, msat cuda.MSlice) { // No-demag spins: mask-out geometry with zeros where NoDemagSpins is set, // so these spins do not generate a field buf := cuda.Buffer(SCALAR, geometry.Gpu().Size()) // masked-out geometry defer cuda.Recycle(buf) // obtain a copy of the geometry mask, which we can overwrite geom, r := geometry.Slice() if r { defer cuda.Recycle(geom) } data.Copy(buf, geom) // mask-out cuda.ZeroMask(buf, NoDemagSpins.gpuLUT1(), regions.Gpu()) // convolution with masked-out cells. demagConv().Exec(dst, M.Buffer(), buf, msat) // After convolution, mask-out the field in the NoDemagSpins or Msat=0 cells // so they don't feel the field generated by others. cuda.ZeroMask(dst, NoDemagSpins.gpuLUT1(), regions.Gpu()) cuda.ZeroMaskInv(dst, Msat.gpuLUT1(), regions.Gpu()) } // Sets dst to the full (unnormalized) magnetization in A/m func SetMFull(dst *data.Slice) { // scale m by Msat... 
msat, rM := Msat.Slice() if rM { defer cuda.Recycle(msat) } for c := 0; c < 3; c++ { cuda.Mul(dst.Comp(c), M.Buffer().Comp(c), msat) } // ...and by cell volume if applicable vol, rV := geometry.Slice() if rV { defer cuda.Recycle(vol) } if !vol.IsNil() { for c := 0; c < 3; c++ { cuda.Mul(dst.Comp(c), dst.Comp(c), vol) } } } // returns demag convolution, making sure it's initialized func demagConv() *cuda.DemagConvolution { if conv_ == nil { SetBusy(true) defer SetBusy(false) kernel := mag.DemagKernel(Mesh().Size(), Mesh().PBC(), Mesh().CellSize(), DemagAccuracy, *Flag_cachedir) conv_ = cuda.NewDemag(Mesh().Size(), Mesh().PBC(), kernel, *Flag_selftest) } return conv_ } // Returns the current demag energy in Joules. func GetDemagEnergy() float64 { return -0.5 * cellVolume() * dot(&M_full, &B_demag) } 3-3.11.1/engine/effectivefield.go000066400000000000000000000010511503346766200165100ustar00rootroot00000000000000package engine // Effective field import "github.com/mumax/3/data" var B_eff = NewVectorField("B_eff", "T", "Effective field", SetEffectiveField) // Sets dst to the current effective field, in Tesla. // This is the sum of all effective field terms, // like demag, exchange, ... func SetEffectiveField(dst *data.Slice) { SetDemagField(dst) // set to B_demag... AddExchangeField(dst) // ...then add other terms AddAnisotropyField(dst) AddMagnetoelasticField(dst) B_ext.AddTo(dst) if !relaxing { B_therm.AddTo(dst) } AddCustomField(dst) } 3-3.11.1/engine/energy.go000066400000000000000000000032151503346766200150410ustar00rootroot00000000000000package engine // Total energy calculation import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) // TODO: Integrate(Edens) // TODO: consistent naming SetEdensTotal, ... 
var ( energyTerms []func() float64 // all contributions to total energy edensTerms []func(dst *data.Slice) // all contributions to total energy density (add to dst) Edens_total = NewScalarField("Edens_total", "J/m3", "Total energy density", SetTotalEdens) E_total = NewScalarValue("E_total", "J", "total energy", GetTotalEnergy) ) // add energy term to global energy func registerEnergy(term func() float64, dens func(*data.Slice)) { energyTerms = append(energyTerms, term) edensTerms = append(edensTerms, dens) } // Returns the total energy in J. func GetTotalEnergy() float64 { E := 0. for _, f := range energyTerms { E += f() } checkNaN1(E) return E } // Set dst to total energy density in J/m3 func SetTotalEdens(dst *data.Slice) { cuda.Zero(dst) for _, addTerm := range edensTerms { addTerm(dst) } } // volume of one cell in m3 func cellVolume() float64 { c := Mesh().CellSize() return c[0] * c[1] * c[2] } // returns a function that adds to dst the energy density: // // prefactor * dot (M_full, field) func makeEdensAdder(field Quantity, prefactor float64) func(*data.Slice) { return func(dst *data.Slice) { B := ValueOf(field) defer cuda.Recycle(B) m := ValueOf(M_full) defer cuda.Recycle(m) factor := float32(prefactor) cuda.AddDotProduct(dst, factor, B, m) } } // vector dot product func dot(a, b Quantity) float64 { A := ValueOf(a) defer cuda.Recycle(A) B := ValueOf(b) defer cuda.Recycle(B) return float64(cuda.Dot(A, B)) } 3-3.11.1/engine/engine.go000066400000000000000000000023221503346766200150130ustar00rootroot00000000000000/* engine does the simulation bookkeeping, I/O and GUI. 
space-dependence: value: space-independent param: region-dependent parameter (always input) field: fully space-dependent field TODO: godoc everything */ package engine import ( "fmt" "os" "runtime" "sync" "time" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" ) const VERSION = "mumax 3.11.1" var UNAME = fmt.Sprintf("%s [%s_%s %s(%s) CUDA-%d.%d]", VERSION, runtime.GOOS, runtime.GOARCH, runtime.Version(), runtime.Compiler, cu.CUDA_VERSION/1000, (cu.CUDA_VERSION%1000)/10) var StartTime = time.Now() var ( busyLock sync.Mutex busy bool // are we so busy we can't respond from run loop? (e.g. calc kernel) ) // We set SetBusy(true) when the simulation is too busy to accept GUI input on Inject channel. // E.g. during kernel init. func SetBusy(b bool) { busyLock.Lock() defer busyLock.Unlock() busy = b } func GetBusy() bool { busyLock.Lock() defer busyLock.Unlock() return busy } // Cleanly exits the simulation, assuring all output is flushed. func Close() { drainOutput() LogUsedRefs() Table.flush() if logfile != nil { logfile.Close() } if bibfile != nil { bibfile.Close() } if *Flag_sync { timer.Print(os.Stdout) } } 3-3.11.1/engine/euler.go000066400000000000000000000014551503346766200146700ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/util" ) type Euler struct{} // Euler method, can be used as solver.Step. 
func (*Euler) Step() { y := M.Buffer() dy0 := cuda.Buffer(VECTOR, y.Size()) defer cuda.Recycle(dy0) torqueFn(dy0) setMaxTorque(dy0) // Adaptive time stepping: treat MaxErr as the maximum magnetization delta // (proportional to the error, but an overestimation for sure) var dt float32 if FixDt != 0 { Dt_si = FixDt dt = float32(Dt_si * GammaLL) } else { dt = float32(MaxErr / LastTorque) Dt_si = float64(dt) / GammaLL } util.AssertMsg(dt > 0, "Euler solver requires fixed time step > 0") setLastErr(float64(dt) * LastTorque) cuda.Madd2(y, y, dy0, 1, dt) // y = y + dt * dy M.normalize() Time += Dt_si NSteps++ } func (*Euler) Free() {} 3-3.11.1/engine/exchange.go000066400000000000000000000160521503346766200153350ustar00rootroot00000000000000package engine // Exchange interaction (Heisenberg + Dzyaloshinskii-Moriya) // See also cuda/exchange.cu and cuda/dmi.cu import ( "math" "unsafe" "github.com/mumax/3/cuda" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/data" "github.com/mumax/3/util" ) var ( Aex = NewScalarParam("Aex", "J/m", "Exchange stiffness", &lex2) Dind = NewScalarParam("Dind", "J/m2", "Interfacial Dzyaloshinskii-Moriya strength", &din2) Dbulk = NewScalarParam("Dbulk", "J/m2", "Bulk Dzyaloshinskii-Moriya strength", &dbulk2) lex2 exchParam // inter-cell Aex din2 exchParam // inter-cell Dind dbulk2 exchParam // inter-cell Dbulk B_exch = NewVectorField("B_exch", "T", "Exchange field", AddExchangeField) E_exch = NewScalarValue("E_exch", "J", "Total exchange energy (including the DMI energy)", GetExchangeEnergy) Edens_exch = NewScalarField("Edens_exch", "J/m3", "Total exchange energy density (including the DMI energy density)", AddExchangeEnergyDensity) // Average exchange coupling with neighbors. 
Useful to debug inter-region exchange ExchCoupling = NewScalarField("ExchCoupling", "arb.", "Average exchange coupling with neighbors", exchangeDecode) DindCoupling = NewScalarField("DindCoupling", "arb.", "Average DMI coupling with neighbors", dindDecode) OpenBC = false ) var AddExchangeEnergyDensity = makeEdensAdder(&B_exch, -0.5) // TODO: normal func func init() { registerEnergy(GetExchangeEnergy, AddExchangeEnergyDensity) DeclFunc("ext_ScaleExchange", ScaleInterExchange, "Re-scales exchange coupling between two regions.") DeclFunc("ext_InterExchange", InterExchange, "Sets exchange coupling between two regions.") DeclFunc("ext_ScaleDind", ScaleInterDind, "Re-scales Dind coupling between two regions.") DeclFunc("ext_InterDind", InterDind, "Sets Dind coupling between two regions.") DeclVar("OpenBC", &OpenBC, "Use open boundary conditions (default=false)") lex2.init(Aex) din2.init(Dind) dbulk2.init(Dbulk) } // Adds the current exchange field to dst func AddExchangeField(dst *data.Slice) { inter := !Dind.isZero() bulk := !Dbulk.isZero() ms := Msat.MSlice() defer ms.Recycle() switch { case !inter && !bulk: cuda.AddExchange(dst, M.Buffer(), lex2.Gpu(), ms, regions.Gpu(), M.Mesh()) case inter && !bulk: Refer("mulkers2017") cuda.AddDMI(dst, M.Buffer(), lex2.Gpu(), din2.Gpu(), ms, regions.Gpu(), M.Mesh(), OpenBC) // dmi+exchange case bulk && !inter: cuda.AddDMIBulk(dst, M.Buffer(), lex2.Gpu(), dbulk2.Gpu(), ms, regions.Gpu(), M.Mesh(), OpenBC) // dmi+exchange // TODO: add ScaleInterDbulk and InterDbulk case inter && bulk: util.Fatal("Cannot have interfacial-induced DMI and bulk DMI at the same time") } } // Set dst to the average exchange coupling per cell (average of lex2 with all neighbors). func exchangeDecode(dst *data.Slice) { cuda.ExchangeDecode(dst, lex2.Gpu(), regions.Gpu(), M.Mesh()) } // Set dst to the average dmi coupling per cell (average of din2 with all neighbors). 
func dindDecode(dst *data.Slice) {
	// same decode kernel as exchangeDecode, fed with the DMI look-up table
	cuda.ExchangeDecode(dst, din2.Gpu(), regions.Gpu(), M.Mesh())
}

// Returns the current exchange energy in Joules,
// computed as -0.5 * cellVolume * dot(M_full, B_exch).
func GetExchangeEnergy() float64 {
	return -0.5 * cellVolume() * dot(&M_full, &B_exch)
}

// Scales the heisenberg exchange interaction between region1 and 2.
// Scale = 1 means the harmonic mean over the regions of Aex.
// Any value previously set with InterExchange for this pair is reset to 0.
func ScaleInterExchange(region1, region2 int, scale float64) {
	lex2.setScale(region1, region2, scale)
}

// Sets the exchange interaction between region 1 and 2.
// The scale factor for this pair is reset to 0, so only value is used.
func InterExchange(region1, region2 int, value float64) {
	lex2.setInter(region1, region2, value)
}

// Scales the DMI interaction between region 1 and 2.
// Any value previously set with InterDind for this pair is reset to 0.
func ScaleInterDind(region1, region2 int, scale float64) {
	din2.setScale(region1, region2, scale)
}

// Sets the DMI interaction between region 1 and 2.
// The scale factor for this pair is reset to 0, so only value is used.
func InterDind(region1, region2 int, value float64) {
	din2.setInter(region1, region2, value)
}

// stores interregion exchange stiffness and DMI
// the interregion exchange/DMI by default is the harmonic mean (scale=1, inter=0)
//
// The tables hold one entry per unordered region pair, NREGION*(NREGION+1)/2
// in total, addressed through symmidx.
type exchParam struct {
	parent         *RegionwiseScalar                    // per-region parameter the table is derived from
	lut            [NREGION * (NREGION + 1) / 2]float32 // harmonic mean of regions (i,j)
	scale          [NREGION * (NREGION + 1) / 2]float32 // extra scale factor for lut[SymmIdx(i, j)]
	inter          [NREGION * (NREGION + 1) / 2]float32 // extra term for lut[SymmIdx(i, j)]
	gpu            cuda.SymmLUT                         // gpu copy of lut, lazily transferred when needed
	gpu_ok, cpu_ok bool                                 // gpu cache up-to-date with lut source
}

// to be called after Aex, Dind, Msat or scaling changed
// Marks both the CPU table and its GPU copy as stale.
func (p *exchParam) invalidate() {
	p.cpu_ok = false
	p.gpu_ok = false
}

// init resets every region pair to the default coupling (scale=1, inter=0)
// and stores the parent parameter.
func (p *exchParam) init(parent *RegionwiseScalar) {
	for i := range p.scale {
		p.scale[i] = 1 // default scaling
		p.inter[i] = 0 // default additional interexchange term
	}
	p.parent = parent
}

// Get a GPU mirror of the look-up table.
// Copies to GPU first only if needed.
func (p *exchParam) Gpu() cuda.SymmLUT { p.update() if !p.gpu_ok { p.upload() } return p.gpu } // sets the interregion exchange/DMI using a specified value (scale = 0) func (p *exchParam) setInter(region1, region2 int, value float64) { p.scale[symmidx(region1, region2)] = float32(0.) p.inter[symmidx(region1, region2)] = float32(value) p.invalidate() } // sets the interregion exchange/DMI by rescaling the harmonic mean (inter = 0) func (p *exchParam) setScale(region1, region2 int, scale float64) { p.scale[symmidx(region1, region2)] = float32(scale) p.inter[symmidx(region1, region2)] = float32(0.) p.invalidate() } func (p *exchParam) update() { if !p.cpu_ok { ex := p.parent.cpuLUT() msat := Msat.cpuLUT() for i := 0; i < NREGION; i++ { exi := ex[0][i] * sign32(msat[0][i]) for j := i; j < NREGION; j++ { exj := ex[0][j] * sign32(msat[0][j]) I := symmidx(i, j) p.lut[I] = p.scale[I]*exchAverage(exi, exj) + p.inter[I] } } p.gpu_ok = false p.cpu_ok = true } } func (p *exchParam) upload() { // alloc if needed if p.gpu == nil { p.gpu = cuda.SymmLUT(cuda.MemAlloc(int64(len(p.lut)) * cu.SIZEOF_FLOAT32)) } lut := p.lut // Copy, to work around Go 1.6 cgo pointer limitations. cuda.MemCpyHtoD(unsafe.Pointer(p.gpu), unsafe.Pointer(&lut[0]), cu.SIZEOF_FLOAT32*int64(len(p.lut))) p.gpu_ok = true } // Index in symmetric matrix where only one half is stored. // (!) Code duplicated in exchange.h func symmidx(i, j int) int { if j <= i { return i*(i+1)/2 + j } else { return j*(j+1)/2 + i } } // Returns the intermediate value of two exchange/dmi strengths. // If both arguments have the same sign, the average mean is returned. If the arguments differ in sign // (which is possible in the case of DMI), the geometric mean of the geometric and arithmetic mean is // used. This average is continuous everywhere, monotonic increasing, and bounded by the argument values. 
func exchAverage(exi, exj float32) float32 { if exi*exj >= 0.0 { return 2 / (1/exi + 1/exj) } else { exi_, exj_ := float64(exi), float64(exj) sign := math.Copysign(1, exi_+exj_) magn := math.Sqrt(math.Sqrt(-exi_*exj_) * math.Abs(exi_+exj_) / 2) return float32(sign * magn) } } 3-3.11.1/engine/excitation.go000066400000000000000000000102451503346766200157200ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/script" "github.com/mumax/3/util" "math" "reflect" ) // An excitation, typically field or current, // can be defined region-wise plus extra mask*multiplier terms. type Excitation struct { name string perRegion RegionwiseVector // Region-based excitation extraTerms []mulmask // add extra mask*multiplier terms } // space-dependent mask plus time dependent multiplier type mulmask struct { mul func() float64 mask *data.Slice } func NewExcitation(name, unit, desc string) *Excitation { e := new(Excitation) e.name = name e.perRegion.init(3, "_"+name+"_perRegion", unit, nil) // name starts with underscore: unexported DeclLValue(name, e, cat(desc, unit)) return e } func (p *Excitation) MSlice() cuda.MSlice { buf, r := p.Slice() util.Assert(r == true) return cuda.ToMSlice(buf) } func (e *Excitation) AddTo(dst *data.Slice) { if !e.perRegion.isZero() { cuda.RegionAddV(dst, e.perRegion.gpuLUT(), regions.Gpu()) } for _, t := range e.extraTerms { var mul float32 = 1 if t.mul != nil { mul = float32(t.mul()) } cuda.Madd2(dst, dst, t.mask, 1, mul) } } func (e *Excitation) isZero() bool { return e.perRegion.isZero() && len(e.extraTerms) == 0 } func (e *Excitation) Slice() (*data.Slice, bool) { buf := cuda.Buffer(e.NComp(), e.Mesh().Size()) cuda.Zero(buf) e.AddTo(buf) return buf, true } // After resizing the mesh, the extra terms don't fit the grid anymore // and there is no reasonable way to resize them. So remove them and have // the user re-add them. 
func (e *Excitation) RemoveExtraTerms() { if len(e.extraTerms) == 0 { return } LogOut("REMOVING EXTRA TERMS FROM", e.Name()) for _, m := range e.extraTerms { m.mask.Free() } e.extraTerms = nil } // Add an extra mask*multiplier term to the excitation. func (e *Excitation) Add(mask *data.Slice, f script.ScalarFunction) { var mul func() float64 if f != nil { if IsConst(f) { val := f.Float() mul = func() float64 { return val } } else { mul = func() float64 { return f.Float() } } } e.AddGo(mask, mul) } // An Add(mask, f) equivalent for Go use func (e *Excitation) AddGo(mask *data.Slice, mul func() float64) { if mask != nil { checkNaN(mask, e.Name()+".add()") // TODO: in more places mask = data.Resample(mask, e.Mesh().Size()) mask = assureGPU(mask) } e.extraTerms = append(e.extraTerms, mulmask{mul, mask}) } func (e *Excitation) SetRegion(region int, f script.VectorFunction) { e.perRegion.SetRegion(region, f) } func (e *Excitation) SetValue(v interface{}) { e.perRegion.SetValue(v) } func (e *Excitation) Set(v data.Vector) { e.perRegion.setRegions(0, NREGION, slice(v)) } func (e *Excitation) getRegion(region int) []float64 { return e.perRegion.getRegion(region) } // for gui func (e *Excitation) SetRegionFn(region int, f func() [3]float64) { e.perRegion.setFunc(region, region+1, func() []float64 { return slice(f()) }) } func (e *Excitation) average() []float64 { return qAverageUniverse(e) } func (e *Excitation) Average() data.Vector { return unslice(qAverageUniverse(e)) } func (e *Excitation) IsUniform() bool { return e.perRegion.IsUniform() } func (e *Excitation) Name() string { return e.name } func (e *Excitation) Unit() string { return e.perRegion.Unit() } func (e *Excitation) NComp() int { return e.perRegion.NComp() } func (e *Excitation) Mesh() *data.Mesh { return Mesh() } func (e *Excitation) Region(r int) *vOneReg { return vOneRegion(e, r) } func (e *Excitation) Comp(c int) ScalarField { return Comp(e, c) } func (e *Excitation) Eval() interface{} { return e } func (e 
*Excitation) Type() reflect.Type { return reflect.TypeOf(new(Excitation)) } func (e *Excitation) InputType() reflect.Type { return script.VectorFunction_t } func (e *Excitation) EvalTo(dst *data.Slice) { EvalTo(e, dst) } func checkNaN(s *data.Slice, name string) { h := s.Host() for _, h := range h { for _, v := range h { if math.IsNaN(float64(v)) || math.IsInf(float64(v), 0) { util.Fatal("NaN or Inf in", name) } } } } 3-3.11.1/engine/ext_angles.go000066400000000000000000000005601503346766200157010ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var ( ext_phi = NewScalarField("ext_phi", "rad", "Azimuthal angle", SetPhi) ext_theta = NewScalarField("ext_theta", "rad", "Polar angle", SetTheta) ) func SetPhi(dst *data.Slice) { cuda.SetPhi(dst, M.Buffer()) } func SetTheta(dst *data.Slice) { cuda.SetTheta(dst, M.Buffer()) } 3-3.11.1/engine/ext_bubblepos.go000066400000000000000000000051211503346766200164030ustar00rootroot00000000000000package engine import ( "math" ) var ( BubblePos = NewVectorValue("ext_bubblepos", "m", "Bubble core position", bubblePos) BubbleDist = NewScalarValue("ext_bubbledist", "m", "Bubble traveled distance", bubbleDist) BubbleSpeed = NewScalarValue("ext_bubblespeed", "m/s", "Bubble velocity", bubbleSpeed) BubbleMz = 1.0 BackGroundTilt = 0.25 ) func init() { DeclVar("ext_BubbleMz", &BubbleMz, "Center magnetization 1.0 or -1.0 (default = 1.0)") DeclVar("ext_BackGroundTilt", &BackGroundTilt, "Size of in-plane component of background magnetization. All values below this one are rounded down to perfectly out-of-plane to improve position calculation (default = 0.25)") } func bubblePos() []float64 { m := M.Buffer() n := Mesh().Size() c := Mesh().CellSize() g := geometry.Gpu() var geo [][]float32 if !g.IsNil() { geo = g.Comp(0).HostCopy().Scalars()[0] // geometry[Y, X] } mz := m.Comp(Z).HostCopy().Scalars()[0] posx, posy := 0., 0. 
if BubbleMz != -1.0 && BubbleMz != 1.0 { panic("ext_BubbleMz should be 1.0 or -1.0") } { var mag float64 var magsum float64 var weightedsumx float64 var weightedsumy float64 for ix := range mz[0] { for iy := range mz { mag = backgroundAdjust(mz[iy][ix]*float32(BubbleMz) + 1.) // 1/2 is divided out // weight cells according to geometry: 0 weight outside if !g.IsNil() { mag *= float64(geo[iy][ix]) } magsum += mag weightedsumx += mag * float64(ix) weightedsumy += mag * float64(iy) } } posx = float64(weightedsumx / magsum) posy = float64(weightedsumy / magsum) } return []float64{(posx-float64(n[X]/2))*c[X] + GetShiftPos(), (posy-float64(n[Y]/2))*c[Y] + GetShiftYPos(), 0.} } var ( prevBpos = [2]float64{-1e99, -1e99} bdist = 0.0 ) func bubbleDist() float64 { pos := bubblePos() if prevBpos == [2]float64{-1e99, -1e99} { prevBpos = [2]float64{pos[X], pos[Y]} return 0 } w := Mesh().WorldSize() dx := pos[X] - prevBpos[X] dy := pos[Y] - prevBpos[Y] prevBpos = [2]float64{pos[X], pos[Y]} // PBC wrap if dx > w[X]/2 { dx -= w[X] } if dx < -w[X]/2 { dx += w[X] } if dy > w[Y]/2 { dy -= w[Y] } if dy < -w[Y]/2 { dy += w[Y] } bdist += math.Sqrt(dx*dx + dy*dy) return bdist } var ( prevBdist = 0.0 prevBt = -999.0 ) func bubbleSpeed() float64 { dist := bubbleDist() if prevBt < 0 { prevBdist = dist prevBt = Time return 0 } v := (dist - prevBdist) / (Time - prevBt) prevBt = Time prevBdist = dist return v } func backgroundAdjust(arg float32) float64 { if float64(arg) < BackGroundTilt { return float64(0) } return float64(arg) } 3-3.11.1/engine/ext_centerbubble.go000066400000000000000000000025211503346766200170630ustar00rootroot00000000000000package engine import ( "math" "github.com/mumax/3/data" ) var ( ext_enableCenterBubbleX = true ext_enableCenterBubbleY = true ) func init() { DeclFunc("ext_centerBubble", CenterBubble, "centerBubble shifts m after each step to keep the bubble position close to the center of the window") DeclVar("ext_enableCenterBubbleX", &ext_enableCenterBubbleX, "Enables 
centering along the X-axis during ext_centerBubble (default=true)") DeclVar("ext_enableCenterBubbleY", &ext_enableCenterBubbleY, "Enables centering along the Y-axis during ext_centerBubble (default=true)") } func centerBubble() { c := Mesh().CellSize() position := bubblePos() var centerIdx [2]int centerIdx[X] = int(math.Floor((position[X] - GetShiftPos()) / c[X])) centerIdx[Y] = int(math.Floor((position[Y] - GetShiftYPos()) / c[Y])) zero := data.Vector{0, 0, 0} if ShiftMagL == zero || ShiftMagR == zero || ShiftMagD == zero || ShiftMagU == zero { ShiftMagL[Z] = -BubbleMz ShiftMagR[Z] = -BubbleMz ShiftMagD[Z] = -BubbleMz ShiftMagU[Z] = -BubbleMz } //put bubble to center if ext_enableCenterBubbleX && (centerIdx[X] != 0) { Shift(-centerIdx[X]) } if ext_enableCenterBubbleY && (centerIdx[Y] != 0) { YShift(-centerIdx[Y]) } } // This post-step function centers the simulation window on a bubble func CenterBubble() { PostStep(func() { centerBubble() }) } 3-3.11.1/engine/ext_centerwall.go000066400000000000000000000073011503346766200165700ustar00rootroot00000000000000package engine import ( "fmt" "github.com/mumax/3/data" ) var ( DWPos = NewScalarValue("ext_dwpos", "m", "Position of the simulation window while following a domain wall", GetShiftPos) // TODO: make more accurate DWxPos = NewScalarValue("ext_dwxpos", "m", "Position of the simulation window while following a domain wall", GetDWxPos) DWSpeed = NewScalarValue("ext_dwspeed", "m/s", "Speed of the simulation window while following a domain wall", getShiftSpeed) ) func init() { DeclFunc("ext_centerWall", CenterWall, "centerWall(c) shifts m after each step to keep m_c close to zero") DeclFunc("ext_centerWallInRegion", CenterWallInRegion, "centerWallInRegion(R, c) shifts m after each step to keep m_c in region R close to zero") DeclFunc("ext_centerWallInLayer", CenterWallInLayer, "centerWallInLayer(L, c) shifts m after each step to keep m_c in layer L close to zero") } func centerWall(c int) { M := &M mc := 
sAverageUniverse(M.Buffer().Comp(c))[0] n := Mesh().Size() tolerance := 4 / float64(n[X]) // x*2 * expected change for 1 cell shift zero := data.Vector{0, 0, 0} if ShiftMagL == zero || ShiftMagR == zero { sign := magsign(M.GetCell(0, n[Y]/2, n[Z]/2)[c]) ShiftMagL[c] = float64(sign) ShiftMagR[c] = -float64(sign) } sign := magsign(ShiftMagL[c]) if mc < -tolerance { Shift(sign) } else if mc > tolerance { Shift(-sign) } } // This post-step function centers the simulation window on a domain wall // between up-down (or down-up) domains (like in perpendicular media). E.g.: // // PostStep(CenterPMAWall) func CenterWall(magComp int) { PostStep(func() { centerWall(magComp) }) } // The same functions as above, now for just one layer func centerWallInLayerProc(layer, c int) { M := &M mc := CropLayer(M, layer).average()[c] n := Mesh().Size() tolerance := 4 / float64(n[X]) // x*2 * expected change for 1 cell shift zero := data.Vector{0, 0, 0} if ShiftMagL == zero || ShiftMagR == zero { sign := magsign(M.GetCell(0, n[Y]/2, layer)[c]) ShiftMagL[c] = float64(sign) ShiftMagR[c] = -float64(sign) } sign := magsign(ShiftMagL[c]) if mc < -tolerance { Shift(sign) } else if mc > tolerance { Shift(-sign) } } func CenterWallInLayer(layer, magComp int) { PostStep(func() { centerWallInLayerProc(layer, magComp) }) } // The same functions as above, now for just one region func centerWallInRegionProc(region, c int) { M := &M mc := M.Region(region).Average()[c] n := Mesh().Size() tolerance := 4 / float64(n[X]) // x*2 * expected change for 1 cell shift zero := data.Vector{0, 0, 0} if ShiftMagL == zero || ShiftMagR == zero { sign := magsign(M.GetCell(0, n[Y]/2, n[Z]/2)[c]) ShiftMagL[c] = float64(sign) ShiftMagR[c] = -float64(sign) } sign := magsign(ShiftMagL[c]) if mc < -tolerance { Shift(sign) } else if mc > tolerance { Shift(-sign) } } func CenterWallInRegion(region, magComp int) { PostStep(func() { centerWallInRegionProc(region, magComp) }) } func magsign(x float64) int { if x > 0.1 { return 1 } 
if x < -0.1 { return -1 } panic(fmt.Errorf("center wall: unclear in which direction to shift: magnetization at border=%v. Set ShiftMagL, ShiftMagR", x)) } // used for speed var ( lastShift float64 // shift the last time we queried speed lastT float64 // time the last time we queried speed lastV float64 // speed the last time we queried speed ) func getShiftSpeed() float64 { if lastShift != GetShiftPos() { lastV = (GetShiftPos() - lastShift) / (Time - lastT) lastShift = GetShiftPos() lastT = Time } return lastV } func GetDWxPos() float64 { M := &M mx := sAverageUniverse(M.Buffer().Comp(0))[0] c := Mesh().CellSize() n := Mesh().Size() position := mx * c[0] * float64(n[0]) / 2. return GetShiftPos() + position } 3-3.11.1/engine/ext_corepos.go000066400000000000000000000025611503346766200161050ustar00rootroot00000000000000package engine var CorePos = NewVectorValue("ext_corepos", "m", "Vortex core position (x,y) + polarization (z)", corePos) func corePos() []float64 { m := M.Buffer() m_z := m.Comp(Z).HostCopy().Scalars() s := m.Size() Nx, Ny, Nz := s[X], s[Y], s[Z] max := float32(-1.0) var maxX, maxY, maxZ int for z := 0; z < Nz; z++ { // Avoid the boundaries so the neighbor interpolation can't go out of bounds. 
for y := 1; y < Ny-1; y++ { for x := 1; x < Nx-1; x++ { m := abs(m_z[z][y][x]) if m > max { maxX, maxY, maxZ = x, y, z max = m } } } } pos := make([]float64, 3) mz := m_z[maxZ] // sub-cell interpolation in X and Y, but not Z pos[X] = float64(maxX) + interpolate_maxpos( max, -1, abs(mz[maxY][maxX-1]), 1, abs(mz[maxY][maxX+1])) - float64(Nx)/2 + 0.5 pos[Y] = float64(maxY) + interpolate_maxpos( max, -1, abs(mz[maxY-1][maxX]), 1, abs(mz[maxY+1][maxX])) - float64(Ny)/2 + 0.5 c := Mesh().CellSize() pos[X] *= c[X] pos[Y] *= c[Y] pos[Z] = float64(m_z[maxZ][maxY][maxX]) // 3rd coordinate is core polarization pos[X] += GetShiftPos() // add simulation window shift return pos } func interpolate_maxpos(f0, d1, f1, d2, f2 float32) float64 { b := (f2 - f1) / (d2 - d1) a := ((f2-f0)/d2 - (f0-f1)/(-d1)) / (d2 - d1) return float64(-b / (2 * a)) } func abs(x float32) float32 { if x > 0 { return x } else { return -x } } 3-3.11.1/engine/ext_dwtilt.go000066400000000000000000000011571503346766200157420ustar00rootroot00000000000000package engine import ( "math" ) // PMA domain wall tilt assuming straight wall. 
var DWTiltPMA = NewScalarValue("ext_dwtilt", "rad", "PMA domain wall tilt", dwTiltPMA) func dwTiltPMA() float64 { m := Download(&M) mz := m.Vectors()[Z][0] // slice0 nx := Mesh().Size()[X] ny := Mesh().Size()[Y] // find domain wall at these y positions: y1 := 4 y2 := ny - 5 // search for x values where mz = 0 (=wall) x1, x2 := 0, 0 for i := 1; i < nx; i++ { if mz[y1][i-1]*mz[y1][i] < 0 { x1 = i } if mz[y2][i-1]*mz[y2][i] < 0 { x2 = i } } angle := math.Atan(float64(x1-x2) / float64(y1-y2)) return angle } 3-3.11.1/engine/ext_hopfindex.go000066400000000000000000000073161503346766200164220ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var ( Ext_HopfIndex_TwoPointStencil = NewScalarValue("ext_hopfindex_twopointstencil", "", "Hopf index calculated using two-point stencil", GetHopfIndex_TwoPointStencil) Ext_HopfIndexDensity_TwoPointStencil = NewScalarField("ext_hopfindexdensity_twopointstencil", "1/m3", "Hopf index density calculated using two-point stencil", SetHopfIndexDensity_TwoPointStencil) Ext_EmergentMagneticField_TwoPointStencil = NewVectorField("ext_emergentmagneticfield_twopointstencil", "1/m2", "Emergent magnetic field calculated using two-point stencil", SetEmergentMagneticField_TwoPointStencil) Ext_HopfIndex_FivePointStencil = NewScalarValue("ext_hopfindex_fivepointstencil", "", "Hopf index calculated using five-point stencil", GetHopfIndex_FivePointStencil) Ext_HopfIndexDensity_FivePointStencil = NewScalarField("ext_hopfindexdensity_fivepointstencil", "1/m3", "Hopf index density calculated using five-point stencil", SetHopfIndexDensity_FivePointStencil) Ext_EmergentMagneticField_FivePointStencil = NewVectorField("ext_emergentmagneticfield_fivepointstencil", "1/m2", "Emergent magnetic field calculated using five-point stencil", SetEmergentMagneticField_FivePointStencil) Ext_HopfIndex_SolidAngle = NewScalarValue("ext_hopfindex_solidangle", "", "Hopf index calculated using Berg-Lüscher lattice method", 
GetHopfIndex_SolidAngle) Ext_HopfIndexDensity_SolidAngle = NewScalarField("ext_hopfindexdensity_solidangle", "1/m3", "Hopf index density computed using Berg-Lüscher lattice method", SetHopfIndexDensity_SolidAngle) Ext_EmergentMagneticField_SolidAngle = NewVectorField("ext_emergentmagneticfield_solidangle", "1/m2", "Emergent magnetic field computed using Berg-Lüscher lattice method", SetEmergentMagneticField_SolidAngle) Ext_HopfIndex_SolidAngleFourier = NewScalarValue("ext_hopfindex_solidanglefourier", "", "Hopf index calculated using Berg-Lüscher lattice method to calculate emergent field, with emergent field Fourier transformed", GetHopfIndex_SolidAngleFourier) ) func GetHopfIndex_TwoPointStencil() float64 { Refer("Knapman2025") h := ValueOf(Ext_HopfIndexDensity_TwoPointStencil) defer cuda.Recycle(h) c := Mesh().CellSize() return -c[X] * c[Y] * c[Z] * float64(cuda.Sum(h)) } func SetHopfIndexDensity_TwoPointStencil(dst *data.Slice) { Refer("Knapman2025") cuda.SetHopfIndexDensity_TwoPointStencil(dst, M.Buffer(), M.Mesh()) } func SetEmergentMagneticField_TwoPointStencil(dst *data.Slice) { cuda.SetEmergentMagneticField_TwoPointStencil(dst, M.Buffer(), M.Mesh()) } func GetHopfIndex_FivePointStencil() float64 { Refer("Knapman2025") h := ValueOf(Ext_HopfIndexDensity_FivePointStencil) defer cuda.Recycle(h) c := Mesh().CellSize() return -c[X] * c[Y] * c[Z] * float64(cuda.Sum(h)) } func SetHopfIndexDensity_FivePointStencil(dst *data.Slice) { Refer("Knapman2025") cuda.SetHopfIndexDensity_FivePointStencil(dst, M.Buffer(), M.Mesh()) } func SetEmergentMagneticField_FivePointStencil(dst *data.Slice) { cuda.SetEmergentMagneticField_FivePointStencil(dst, M.Buffer(), M.Mesh()) } func GetHopfIndex_SolidAngle() float64 { Refer("Knapman2025") h := ValueOf(Ext_HopfIndexDensity_SolidAngle) defer cuda.Recycle(h) c := Mesh().CellSize() return -c[X] * c[Y] * c[Z] * float64(cuda.Sum(h)) } func SetHopfIndexDensity_SolidAngle(dst *data.Slice) { Refer("Knapman2025") 
cuda.SetHopfIndexDensity_SolidAngle(dst, M.Buffer(), M.Mesh()) } func SetEmergentMagneticField_SolidAngle(dst *data.Slice) { cuda.SetEmergentMagneticField_SolidAngle(dst, M.Buffer(), M.Mesh()) } func GetHopfIndex_SolidAngleFourier() float64 { Refer("Knapman2025") return cuda.GetHopfIndex_SolidAngleFourier(M.Buffer(), M.Mesh()) } 3-3.11.1/engine/ext_magnetoelastic.go000066400000000000000000000072271503346766200174360ustar00rootroot00000000000000package engine // Mangeto-elastic coupling. import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" ) var ( B1 = NewScalarParam("B1", "J/m3", "First magneto-elastic coupling constant") B2 = NewScalarParam("B2", "J/m3", "Second magneto-elastic coupling constant") exx = NewScalarExcitation("exx", "", "exx component of the strain tensor") eyy = NewScalarExcitation("eyy", "", "eyy component of the strain tensor") ezz = NewScalarExcitation("ezz", "", "ezz component of the strain tensor") exy = NewScalarExcitation("exy", "", "exy component of the strain tensor") exz = NewScalarExcitation("exz", "", "exz component of the strain tensor") eyz = NewScalarExcitation("eyz", "", "eyz component of the strain tensor") B_mel = NewVectorField("B_mel", "T", "Magneto-elastic filed", AddMagnetoelasticField) F_mel = NewVectorField("F_mel", "N/m3", "Magneto-elastic force density", GetMagnetoelasticForceDensity) Edens_mel = NewScalarField("Edens_mel", "J/m3", "Magneto-elastic energy density", AddMagnetoelasticEnergyDensity) E_mel = NewScalarValue("E_mel", "J", "Magneto-elastic energy", GetMagnetoelasticEnergy) ) var ( zeroMel = NewScalarParam("_zeroMel", "", "utility zero parameter") ) func init() { registerEnergy(GetMagnetoelasticEnergy, AddMagnetoelasticEnergyDensity) } func AddMagnetoelasticField(dst *data.Slice) { haveMel := B1.nonZero() || B2.nonZero() if !haveMel { return } Exx := exx.MSlice() defer Exx.Recycle() Eyy := eyy.MSlice() defer Eyy.Recycle() Ezz := ezz.MSlice() defer Ezz.Recycle() Exy := exy.MSlice() 
defer Exy.Recycle() Exz := exz.MSlice() defer Exz.Recycle() Eyz := eyz.MSlice() defer Eyz.Recycle() b1 := B1.MSlice() defer b1.Recycle() b2 := B2.MSlice() defer b2.Recycle() ms := Msat.MSlice() defer ms.Recycle() cuda.AddMagnetoelasticField(dst, M.Buffer(), Exx, Eyy, Ezz, Exy, Exz, Eyz, b1, b2, ms) } func GetMagnetoelasticForceDensity(dst *data.Slice) { haveMel := B1.nonZero() || B2.nonZero() if !haveMel { return } util.AssertMsg(B1.IsUniform() && B2.IsUniform(), "Magnetoelastic: B1, B2 must be uniform") b1 := B1.MSlice() defer b1.Recycle() b2 := B2.MSlice() defer b2.Recycle() cuda.GetMagnetoelasticForceDensity(dst, M.Buffer(), b1, b2, M.Mesh()) } func AddMagnetoelasticEnergyDensity(dst *data.Slice) { haveMel := B1.nonZero() || B2.nonZero() if !haveMel { return } buf := cuda.Buffer(B_mel.NComp(), B_mel.Mesh().Size()) defer cuda.Recycle(buf) // unnormalized magnetization: Mf := ValueOf(M_full) defer cuda.Recycle(Mf) Exx := exx.MSlice() defer Exx.Recycle() Eyy := eyy.MSlice() defer Eyy.Recycle() Ezz := ezz.MSlice() defer Ezz.Recycle() Exy := exy.MSlice() defer Exy.Recycle() Exz := exz.MSlice() defer Exz.Recycle() Eyz := eyz.MSlice() defer Eyz.Recycle() b1 := B1.MSlice() defer b1.Recycle() b2 := B2.MSlice() defer b2.Recycle() ms := Msat.MSlice() defer ms.Recycle() zeromel := zeroMel.MSlice() defer zeromel.Recycle() // 1st cuda.Zero(buf) cuda.AddMagnetoelasticField(buf, M.Buffer(), Exx, Eyy, Ezz, Exy, Exz, Eyz, b1, zeromel, ms) cuda.AddDotProduct(dst, -1./2., buf, Mf) // 2nd cuda.Zero(buf) cuda.AddMagnetoelasticField(buf, M.Buffer(), Exx, Eyy, Ezz, Exy, Exz, Eyz, zeromel, b2, ms) cuda.AddDotProduct(dst, -1./1., buf, Mf) } // Returns magneto-ell energy in joules. 
func GetMagnetoelasticEnergy() float64 { haveMel := B1.nonZero() || B2.nonZero() if !haveMel { return float64(0.0) } buf := cuda.Buffer(1, Mesh().Size()) defer cuda.Recycle(buf) cuda.Zero(buf) AddMagnetoelasticEnergyDensity(buf) return cellVolume() * float64(cuda.Sum(buf)) } 3-3.11.1/engine/ext_make3dgrains.go000066400000000000000000000114101503346766200167740ustar00rootroot00000000000000// 3D Voronoi tessellation. Contributed by Peyton Murray. package engine import ( "math" "math/rand" ) var GrainCutShape = false // complete all voronoi grains whose centre lies within the shape. Warning, this also cuts away parts of the shape whose closest voronoi centre lies outside the shape func init() { DeclFunc("ext_make3dgrains", Voronoi3d, "3D Voronoi tesselation over shape (grain size, starting region number, num regions, shape, seed)") DeclVar("ext_grainCutShape", &GrainCutShape, "Whether to add the complete (3D) voronoi grain, only if its centre lies within the shape (default=false)") } func Voronoi3d(grainsize float64, startRegion int, numRegions int, inputShape Shape, seed int) { Refer("Lel2014") SetBusy(true) defer SetBusy(false) t := newTesselation3d(grainsize, numRegions, int64(seed), startRegion, inputShape) regions.hist = append(regions.hist, t.RegionOf) regions.render(t.RegionOf) } type tesselation3d struct { grainsize float64 maxRegion int rnd *rand.Rand startRegion int shape Shape //Shape of the tesselated region centers []center3d //List of Voronoi centers } // Stores location of each Voronoi center type center3d struct { x, y, z float64 // center position (m) region byte // region for all cells near center } // Stores location of each cell type cellLocs struct{ x, y, z float64 } // nRegion exclusive func newTesselation3d(grainsize float64, nRegion int, seed int64, startRegion int, inputShape Shape) *tesselation3d { t := tesselation3d{grainsize, nRegion, rand.New(rand.NewSource(seed)), startRegion, inputShape, make([]center3d, 0)} t.makeRandomCenters() return 
&t } // Permutes the slice of cell locations. I don't understand why this needs to be done if we're choosing // random (Intn()) cells out of the slice of cell locations, but hey, it seems to do the trick. func (t *tesselation3d) shuffleCells(src []cellLocs) []cellLocs { dest := make([]cellLocs, len(src)) perm := t.rnd.Perm(len(src)) for i, v := range perm { dest[v] = src[i] } return dest } func (t *tesselation3d) makeRandomCenters() { //Make a list of all the cells in the shape. cells := t.tabulateCells() cells = t.shuffleCells(cells) //Choose number of grains to make. Assume volume of grain is given by (4/3)*pi*r^3 shapeVolume := cellVolume() * float64(len(cells)) grainVolume := (float64(1) / 6) * math.Pi * t.grainsize * t.grainsize * t.grainsize nAvgGrains := shapeVolume / grainVolume nGrains := t.truncNorm(nAvgGrains) //TODO: same cell can be chosen twice by random chance t.centers = make([]center3d, nGrains) for p := 0; p < nGrains; p++ { rndCell := cells[t.rnd.Intn(nGrains)] t.centers[p].x = rndCell.x t.centers[p].y = rndCell.y t.centers[p].z = rndCell.z randRegion := t.startRegion + t.rnd.Intn(t.maxRegion) t.centers[p].region = byte(randRegion) } } // Creates a slice of all cells which fall in the shape specified in the constructor. 
func (t *tesselation3d) tabulateCells() []cellLocs { //Initialze array of cells cells := make([]cellLocs, 0) //Get the mesh size meshSize := MeshSize() //Iterate across all cells in the mesh, and append those that are inside the shape for ix := 0; ix < meshSize[0]; ix++ { for iy := 0; iy < meshSize[1]; iy++ { for iz := 0; iz < meshSize[2]; iz++ { cell := Index2Coord(ix, iy, iz) x := cell.X() y := cell.Y() z := cell.Z() if t.shape(x, y, z) || GrainCutShape { cells = append(cells, cellLocs{x, y, z}) } } } } print("Number of cells in region: ", len(cells), "\n") print("Number of cells in universe: ", meshSize[0]*meshSize[1]*meshSize[2], "\n") return cells } func (t *tesselation3d) RegionOf(x, y, z float64) int { // Check if the point is within the shape or if we're cutting the shape along the grains if !(t.shape(x, y, z) || GrainCutShape) { return -1 // Regions < 0 won't be rastered } // Find the nearest center point to the (x, y, z) position nearest := center3d{x, y, z, 0} mindist := math.Inf(1) for _, c := range t.centers { dist := sqr(x-c.x) + sqr(y-c.y) + sqr(z-c.z) if dist < mindist { nearest = c mindist = dist } } // Check if the nearest point's region should be returned if (t.shape(x, y, z) && !GrainCutShape) || (t.shape(nearest.x, nearest.y, nearest.z) && GrainCutShape) { return int(nearest.region) } return -1 } // Generate normally distributed numbers; mean = lambda, variance = lambda. If generated number < 0, return 1. // Equivalent to Poisson distribution (with mean = lambda) for large lambda (which is usually true, since the volume // of a grain is usually much less than the simulation volume. 
func (t *tesselation3d) truncNorm(lambda float64) int { ret := lambda + math.Sqrt(lambda)*t.rnd.NormFloat64() if ret <= 0 { return 1 } else { return int(ret + 0.5) } } 3-3.11.1/engine/ext_makegrains.go000066400000000000000000000060011503346766200165450ustar00rootroot00000000000000package engine import ( "math" "math/rand" ) func init() { DeclFunc("ext_makegrains", Voronoi, "Voronoi tesselation (grain size, num regions)") } func Voronoi(grainsize float64, numRegions, seed int) { Refer("Lel2014") SetBusy(true) defer SetBusy(false) t := newTesselation(grainsize, numRegions, int64(seed)) regions.hist = append(regions.hist, t.RegionOf) regions.render(t.RegionOf) } type tesselation struct { grainsize float64 tilesize float64 maxRegion int cache map[int2][]center seed int64 rnd *rand.Rand } // integer tile coordinate type int2 struct{ x, y int } // Voronoi center info type center struct { x, y float64 // center position (m) region byte // region for all cells near center } // nRegion exclusive func newTesselation(grainsize float64, nRegion int, seed int64) *tesselation { return &tesselation{grainsize, float64(float32(grainsize * TILE)), // expect 4 grains/block, 36 per 3x3 blocks = safe, relatively round number nRegion, make(map[int2][]center), seed, rand.New(rand.NewSource(0))} } const ( TILE = 2 // tile size in grains LAMBDA = TILE * TILE // expected grains per tile ) // Returns the region of the grain where cell at x,y,z belongs to func (t *tesselation) RegionOf(x, y, z float64) int { tile := t.tileOf(x, y) // tile containing x,y // look for nearest center in tile + neighbors nearest := center{x, y, 0} // dummy initial value, but safe should the infinite impossibility strike. 
mindist := math.Inf(1) for tx := tile.x - 1; tx <= tile.x+1; tx++ { for ty := tile.y - 1; ty <= tile.y+1; ty++ { centers := t.centersInTile(tx, ty) for _, c := range centers { dist := sqr(x-c.x) + sqr(y-c.y) if dist < mindist { nearest = c mindist = dist } } } } //fmt.Println("nearest", x, y, ":", nearest) return int(nearest.region) } // Returns the list of Voronoi centers in tile(ix, iy), using only ix,iy to seed the random generator func (t *tesselation) centersInTile(tx, ty int) []center { pos := int2{tx, ty} if c, ok := t.cache[pos]; ok { return c } else { // tile-specific seed that works for positive and negative tx, ty seed := (int64(ty)+(1<<24))*(1<<24) + (int64(tx) + (1 << 24)) t.rnd.Seed(seed ^ t.seed) N := t.poisson(LAMBDA) c := make([]center, N) // absolute position of tile (m) x0, y0 := float64(tx)*t.tilesize, float64(ty)*t.tilesize for i := range c { // random position inside tile c[i].x = x0 + t.rnd.Float64()*t.tilesize c[i].y = y0 + t.rnd.Float64()*t.tilesize c[i].region = byte(t.rnd.Intn(t.maxRegion)) } t.cache[pos] = c return c } } func sqr(x float64) float64 { return x * x } func (t *tesselation) tileOf(x, y float64) int2 { ix := int(math.Floor(x / t.tilesize)) iy := int(math.Floor(y / t.tilesize)) return int2{ix, iy} } // Generate poisson distributed numbers (according to Knuth) func (t *tesselation) poisson(lambda float64) int { L := math.Exp(-lambda) k := 1 p := t.rnd.Float64() for p > L { k++ p *= t.rnd.Float64() } return k - 1 } 3-3.11.1/engine/ext_rmsurfacecharge.go000066400000000000000000000052001503346766200175650ustar00rootroot00000000000000package engine import ( "fmt" "github.com/mumax/3/data" "github.com/mumax/3/mag" "github.com/mumax/3/util" "math" ) func init() { DeclFunc("ext_rmSurfaceCharge", RemoveLRSurfaceCharge, "Compensate magnetic charges on the left and right sides of an in-plane magnetized wire. 
Arguments: region, mx on left and right side, resp.") } // For a nanowire magnetized in-plane, with mx = mxLeft on the left end and // mx = mxRight on the right end (both -1 or +1), add a B field needed to compensate // for the surface charges on the left and right edges. // This will mimic an infinitely long wire. func RemoveLRSurfaceCharge(region int, mxLeft, mxRight float64) { SetBusy(true) defer SetBusy(false) util.Argument(mxLeft == 1 || mxLeft == -1) util.Argument(mxRight == 1 || mxRight == -1) bsat := Msat.GetRegion(region) * mag.Mu0 util.AssertMsg(bsat != 0, "RemoveSurfaceCharges: Msat is zero in region "+fmt.Sprint(region)) B_ext.Add(compensateLRSurfaceCharges(Mesh(), mxLeft, mxRight, bsat), nil) } func compensateLRSurfaceCharges(m *data.Mesh, mxLeft, mxRight float64, bsat float64) *data.Slice { h := data.NewSlice(3, m.Size()) H := h.Vectors() world := m.WorldSize() cell := m.CellSize() size := m.Size() q := cell[Z] * cell[Y] * bsat q1 := q * mxLeft q2 := q * (-mxRight) prog, maxProg := 0, (size[Z]+1)*(size[Y]+1) // surface loop (source) for I := 0; I < size[Z]; I++ { for J := 0; J < size[Y]; J++ { prog++ util.Progress(prog, maxProg, "removing surface charges") y := (float64(J) + 0.5) * cell[Y] z := (float64(I) + 0.5) * cell[Z] source1 := [3]float64{0, y, z} // left surface source source2 := [3]float64{world[X], y, z} // right surface source // volume loop (destination) for iz := range H[0] { for iy := range H[0][iz] { for ix := range H[0][iz][iy] { dst := [3]float64{ // destination coordinate (float64(ix) + 0.5) * cell[X], (float64(iy) + 0.5) * cell[Y], (float64(iz) + 0.5) * cell[Z]} h1 := hfield(q1, source1, dst) h2 := hfield(q2, source2, dst) // add this surface charges' field to grand total for c := 0; c < 3; c++ { H[c][iz][iy][ix] += float32(h1[c] + h2[c]) } } } } } } return h } // H field of charge at location source, evaluated in location dest. 
func hfield(charge float64, source, dest [3]float64) [3]float64 { var R [3]float64 R[0] = dest[0] - source[0] R[1] = dest[1] - source[1] R[2] = dest[2] - source[2] r := math.Sqrt(R[0]*R[0] + R[1]*R[1] + R[2]*R[2]) qr3pi4 := charge / ((4 * math.Pi) * r * r * r) var h [3]float64 h[0] = R[0] * qr3pi4 h[1] = R[1] * qr3pi4 h[2] = R[2] * qr3pi4 return h } 3-3.11.1/engine/ext_topologicalcharge.go000066400000000000000000000013451503346766200201200ustar00rootroot00000000000000package engine import ( "math" "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var ( Ext_TopologicalCharge = NewScalarValue("ext_topologicalcharge", "", "2D topological charge", GetTopologicalCharge) Ext_TopologicalChargeDensity = NewScalarField("ext_topologicalchargedensity", "1/m2", "2D topological charge density m·(∂m/∂x ✕ ∂m/∂y)", SetTopologicalChargeDensity) ) func SetTopologicalChargeDensity(dst *data.Slice) { cuda.SetTopologicalCharge(dst, M.Buffer(), M.Mesh()) } func GetTopologicalCharge() float64 { s := ValueOf(Ext_TopologicalChargeDensity) defer cuda.Recycle(s) c := Mesh().CellSize() N := Mesh().Size() return (0.25 * c[X] * c[Y] / math.Pi / float64(N[Z])) * float64(cuda.Sum(s)) } 3-3.11.1/engine/ext_topologicalchargelattice.go000066400000000000000000000015371503346766200214710ustar00rootroot00000000000000package engine import ( "math" "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var ( Ext_TopologicalChargeLattice = NewScalarValue("ext_topologicalchargelattice", "", "2D topological charge according to Berg and Lüscher", GetTopologicalChargeLattice) Ext_TopologicalChargeDensityLattice = NewScalarField("ext_topologicalchargedensitylattice", "1/m2", "2D topological charge density according to Berg and Lüscher", SetTopologicalChargeDensityLattice) ) func SetTopologicalChargeDensityLattice(dst *data.Slice) { Refer("Berg1981") cuda.SetTopologicalChargeLattice(dst, M.Buffer(), M.Mesh()) } func GetTopologicalChargeLattice() float64 { s := ValueOf(Ext_TopologicalChargeDensityLattice) 
defer cuda.Recycle(s) c := Mesh().CellSize() N := Mesh().Size() return (0.25 * c[X] * c[Y] / math.Pi / float64(N[Z])) * float64(cuda.Sum(s)) } 3-3.11.1/engine/functionfromfile.go000066400000000000000000000062531503346766200171260ustar00rootroot00000000000000package engine import ( "encoding/csv" "io" "os" "strconv" "strings" "github.com/mumax/3/util" ) func init() { DeclFunc("FunctionFromDatafile", FunctionFromDatafile, "Creates an interpolation function using data from two columns in a csv file. "+ "Arguments: filename, xColumnIdx, yColumnIdx, method (\"linear\", \"nearest\" or \"step\").") } func isStrictlyIncreasing(x []float64) bool { for i := 1; i < len(x); i++ { if x[i] <= x[i-1] { return false } } return true } func InterpolationFunction(xData, yData []float64, method string) func(float64) float64 { util.AssertMsg(len(xData) == len(yData), "Interpolation error: given data slices do not have the same length") util.AssertMsg(len(xData) != 0, "Interpolation error: data slices are empty") util.AssertMsg(isStrictlyIncreasing(xData), "Interpolation error: X values are not strictly increasing") switch method { case "nearest": return nearestInterpolationFunction(xData, yData) case "step": return stepInterpolationFunction(xData, yData) case "linear": return linearInterpolationFunction(xData, yData) default: util.Fatal("Interpolation method \"" + method + "\" is not implemented") return nil } } func nearestInterpolationFunction(xData, yData []float64) func(float64) float64 { return func(x float64) float64 { ib := 0 // index for the smallest xData value larger than x for ; ib < len(xData); ib++ { if x < xData[ib] { break } } if ib == 0 { return yData[0] } if ib == len(xData) { return yData[len(xData)-1] } ia := ib - 1 // index for the largest xData value smaller than x xa, ya := xData[ia], yData[ia] xb, yb := xData[ib], yData[ib] if x-xa < xb-x { return ya } else { return yb } } } func stepInterpolationFunction(xData, yData []float64) func(float64) float64 { return 
func(x float64) float64 { if x < xData[0] { return 0.0 } for i := 0; i < len(xData)-1; i++ { if x >= xData[i] && x < xData[i+1] { return yData[i] } } return yData[len(yData)-1] } } func linearInterpolationFunction(xData, yData []float64) func(float64) float64 { return func(x float64) float64 { ib := 0 // index for the smallest xData value larger than x for ; ib < len(xData); ib++ { if x < xData[ib] { break } } if ib == 0 { return yData[0] } if ib == len(xData) { return yData[len(xData)-1] } ia := ib - 1 // index for the largest xData value smaller than x xa, ya := xData[ia], yData[ia] xb, yb := xData[ib], yData[ib] return ya + (x-xa)*(yb-ya)/(xb-xa) } } func FunctionFromDatafile(fname string, xCol, yCol int, method string) func(float64) float64 { csvfile, err := os.Open(fname) util.FatalErr(err) defer csvfile.Close() r := csv.NewReader(csvfile) r.Comment = '#' xData := make([]float64, 0) yData := make([]float64, 0) for { line, err := r.Read() if err == io.EOF { break } else { util.FatalErr(err) } x_, err := strconv.ParseFloat(strings.TrimSpace(line[xCol]), 64) util.FatalErr(err) y_, err := strconv.ParseFloat(strings.TrimSpace(line[yCol]), 64) util.FatalErr(err) xData = append(xData, x_) yData = append(yData, y_) } return InterpolationFunction(xData, yData, method) } 3-3.11.1/engine/geom.go000066400000000000000000000173631503346766200145100ustar00rootroot00000000000000package engine import ( "math/rand" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/httpfs" "github.com/mumax/3/oommf" "github.com/mumax/3/util" ) func init() { DeclFunc("SetGeom", SetGeom, "Sets the geometry to a given shape") DeclFunc("ext_InitGeomFromOVF", InitGeomFromOVF, "Initialize geometry, cell count and cell size given a pattern from OVF") DeclVar("EdgeSmooth", &edgeSmooth, "Geometry edge smoothing with edgeSmooth^3 samples per cell, 0=staircase, ~8=very smooth") geometry.init() } var ( geometry geom edgeSmooth int = 0 // disabled by default ) type geom struct { info 
buffer *data.Slice shape Shape } func (g *geom) init() { g.buffer = nil g.info = info{1, "geom", ""} DeclROnly("geom", g, "Cell fill fraction (0..1)") } func spaceFill() float64 { if geometry.Gpu().IsNil() { return 1 } else { return float64(cuda.Sum(geometry.buffer)) / float64(geometry.Mesh().NCell()) } } func (g *geom) Gpu() *data.Slice { if g.buffer == nil { g.buffer = data.NilSlice(1, g.Mesh().Size()) } return g.buffer } func (g *geom) Slice() (*data.Slice, bool) { s := g.Gpu() if s.IsNil() { s := cuda.Buffer(g.NComp(), g.Mesh().Size()) cuda.Memset(s, 1) return s, true } else { return s, false } } func (q *geom) EvalTo(dst *data.Slice) { EvalTo(q, dst) } var _ Quantity = &geometry func (g *geom) average() []float64 { s, r := g.Slice() if r { defer cuda.Recycle(s) } return sAverageUniverse(s) } func (g *geom) Average() float64 { return g.average()[0] } func SetGeom(s Shape) { geometry.setGeom(s) } func isNonEmpty(geomSlice *data.Slice) bool { arrDim := geomSlice.Size() for z := 0; z < arrDim[Z]; z++ { for y := 0; y < arrDim[Y]; y++ { for x := 0; x < arrDim[X]; x++ { //optimal empty volume check, quit first time you see non-zero value if geomSlice.Get(0, x, y, z) != 0 { return true } } } } return false } func cleanMagnetization(geomSlice *data.Slice) { // M inside geom but previously outside needs to be re-inited needupload := false geomlist := geomSlice.Host()[0] mhost := M.Buffer().HostCopy() m := mhost.Host() rng := rand.New(rand.NewSource(0)) for i := range m[0] { if geomlist[i] != 0 { mx, my, mz := m[X][i], m[Y][i], m[Z][i] if mx == 0 && my == 0 && mz == 0 { needupload = true rnd := randomDir(rng) m[X][i], m[Y][i], m[Z][i] = float32(rnd[X]), float32(rnd[Y]), float32(rnd[Z]) } } } if needupload { data.Copy(M.Buffer(), mhost) } M.normalize() // removes m outside vol } func InitGeomFromOVF(fname string) { in, err := httpfs.Open(fname) util.FatalErr(err) geomSlice, meta, _ := oommf.Read(in) arrDim := geomSlice.Size() step := meta.CellSize //check the geometry 
file for sanity if geomSlice.NComp() != 1 { util.Fatal("Geometry initialization file should have point dimension of 1!") } if !isNonEmpty(geomSlice) { util.Fatal("ext_InitGeomFromOVF: provided geometry is completely empty!") } //set mesh from imported file, should refresh it by itself SetMesh(arrDim[X], arrDim[Y], arrDim[Z], step[X], step[Y], step[Z], 0, 0, 0) SetBusy(true) defer SetBusy(false) //first time initialization if needed if geometry.Gpu().IsNil() { geometry.buffer = cuda.NewSlice(1, geomSlice.Size()) } //copy data into geometry array data.Copy(geometry.buffer, geomSlice) //make a makeshift function to represent imported geometry isInterpd := false pred := VoxelShape(geomSlice, step[0], step[1], step[2]) geometry.shape = func(x, y, z float64) bool { if !isInterpd { util.Log("Warning! Geometry imported through ext_InitGeomFromOVF is about to be reinterpolated! Possible changes in geometry!") isInterpd = true } return pred(x, y, z) } cleanMagnetization(geomSlice) } func (geometry *geom) setGeom(s Shape) { SetBusy(true) defer SetBusy(false) if s == nil { // TODO: would be nice not to save volume if entirely filled s = universe } geometry.shape = s if geometry.Gpu().IsNil() { geometry.buffer = cuda.NewSlice(1, geometry.Mesh().Size()) } host := data.NewSlice(1, geometry.Gpu().Size()) array := host.Scalars() V := host v := array n := geometry.Mesh().Size() c := geometry.Mesh().CellSize() cx, cy, cz := c[X], c[Y], c[Z] progress, progmax := 0, n[Y]*n[Z] var ok bool for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { progress++ util.Progress(progress, progmax, "Initializing geometry") for ix := 0; ix < n[X]; ix++ { r := Index2Coord(ix, iy, iz) x0, y0, z0 := r[X], r[Y], r[Z] // check if center and all vertices lie inside or all outside allIn, allOut := true, true if s(x0, y0, z0) { allOut = false } else { allIn = false } if edgeSmooth != 0 { // center is sufficient if we're not really smoothing for _, Δx := range []float64{-cx / 2, cx / 2} { for _, Δy := 
range []float64{-cy / 2, cy / 2} { for _, Δz := range []float64{-cz / 2, cz / 2} { if s(x0+Δx, y0+Δy, z0+Δz) { // inside allOut = false } else { allIn = false } } } } } switch { case allIn: v[iz][iy][ix] = 1 ok = true case allOut: v[iz][iy][ix] = 0 default: v[iz][iy][ix] = geometry.cellVolume(ix, iy, iz) ok = ok || (v[iz][iy][ix] != 0) } } } } if !ok { util.Fatal("SetGeom: geometry completely empty") } data.Copy(geometry.buffer, V) cleanMagnetization(host) } // Sample edgeSmooth^3 points inside the cell to estimate its volume. func (g *geom) cellVolume(ix, iy, iz int) float32 { r := Index2Coord(ix, iy, iz) x0, y0, z0 := r[X], r[Y], r[Z] c := geometry.Mesh().CellSize() cx, cy, cz := c[X], c[Y], c[Z] s := geometry.shape var vol float32 N := edgeSmooth S := float64(edgeSmooth) for dx := 0; dx < N; dx++ { Δx := -cx/2 + (cx / (2 * S)) + (cx/S)*float64(dx) for dy := 0; dy < N; dy++ { Δy := -cy/2 + (cy / (2 * S)) + (cy/S)*float64(dy) for dz := 0; dz < N; dz++ { Δz := -cz/2 + (cz / (2 * S)) + (cz/S)*float64(dz) if s(x0+Δx, y0+Δy, z0+Δz) { // inside vol++ } } } } return vol / float32(N*N*N) } func (g *geom) GetCell(ix, iy, iz int) float64 { return float64(cuda.GetCell(g.Gpu(), 0, ix, iy, iz)) } func (g *geom) shift(dx int) { // empty mask, nothing to do if g == nil || g.buffer.IsNil() { return } // allocated mask: shift s := g.buffer s2 := cuda.Buffer(1, g.Mesh().Size()) defer cuda.Recycle(s2) newv := float32(1) // initially fill edges with 1's cuda.ShiftX(s2, s, dx, newv, newv) data.Copy(s, s2) n := Mesh().Size() x1, x2 := shiftDirtyRange(dx, X) for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { for ix := x1; ix < x2; ix++ { r := Index2Coord(ix, iy, iz) // includes shift if !g.shape(r[X], r[Y], r[Z]) { cuda.SetCell(g.buffer, 0, ix, iy, iz, 0) // a bit slowish, but hardly reached } } } } } func (g *geom) shiftY(dy int) { // empty mask, nothing to do if g == nil || g.buffer.IsNil() { return } // allocated mask: shift s := g.buffer s2 := cuda.Buffer(1, 
g.Mesh().Size()) defer cuda.Recycle(s2) newv := float32(1) // initially fill edges with 1's cuda.ShiftY(s2, s, dy, newv, newv) data.Copy(s, s2) n := Mesh().Size() y1, y2 := shiftDirtyRange(dy, Y) for iz := 0; iz < n[Z]; iz++ { for ix := 0; ix < n[X]; ix++ { for iy := y1; iy < y2; iy++ { r := Index2Coord(ix, iy, iz) // includes shift if !g.shape(r[X], r[Y], r[Z]) { cuda.SetCell(g.buffer, 0, ix, iy, iz, 0) // a bit slowish, but hardly reached } } } } } // range along component that needs to be refreshed after shift over d func shiftDirtyRange(d, comp int) (p1, p2 int) { n := Mesh().Size()[comp] util.Argument(d != 0) if d < 0 { p1 = n + d p2 = n } else { p1 = 0 p2 = d } return } func (g *geom) Mesh() *data.Mesh { return Mesh() } 3-3.11.1/engine/gofiles.go000066400000000000000000000032671503346766200152070ustar00rootroot00000000000000package engine // support for running Go files as if they were mx3 files. import ( "flag" "os" "path" "github.com/mumax/3/cuda" "github.com/mumax/3/util" ) var ( // These flags are shared between cmd/mumax3 and Go input files. 
Flag_cachedir = flag.String("cache", os.TempDir(), "Kernel cache directory (empty disables caching)") Flag_gpu = flag.Int("gpu", 0, "Specify a single GPU (use CUDA_AVAILABLE_DEVICES environment variable for advanced selection)") Flag_interactive = flag.Bool("i", false, "Open interactive browser session") Flag_od = flag.String("o", "", "Override output directory") Flag_port = flag.String("http", ":35367", "Port to serve web gui") Flag_selftest = flag.Bool("paranoid", false, "Enable convolution self-test for cuFFT sanity.") Flag_silent = flag.Bool("s", false, "Silent") // provided for backwards compatibility Flag_sync = flag.Bool("sync", false, "Synchronize all CUDA calls (debug)") Flag_forceclean = flag.Bool("f", false, "Force start, clean existing output directory") ) func FlagPassed(name string) bool { passed := false flag.Visit(func(f *flag.Flag) { if f.Name == name { passed = true } }) return passed } // Usage: in every Go input file, write: // // func main(){ // defer InitAndClose()() // // ... // } // // This initialises the GPU, output directory, etc, // and makes sure pending output will get flushed. func InitAndClose() func() { flag.Parse() cuda.Init(*Flag_gpu) cuda.Synchronous = *Flag_sync od := *Flag_od if od == "" { od = path.Base(os.Args[0]) + ".out" } inFile := util.NoExt(od) InitIO(inFile, od, *Flag_forceclean) GoServe(*Flag_port) return func() { Close() } } 3-3.11.1/engine/gui.go000066400000000000000000000364241503346766200143440ustar00rootroot00000000000000package engine import ( "fmt" "math/rand" "net" "net/http" "path" "reflect" "strconv" "sync" "time" "github.com/mumax/3/cuda" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/gui" "github.com/mumax/3/httpfs" "github.com/mumax/3/util" ) // global GUI state stores what is currently shown in the web page. 
var (
	gui_    = guistate{Quants: make(map[string]Quantity), Params: make(map[string]Param)}
	Timeout = 3 * time.Second // exit finished simulation this long after browser was closed
)

type guistate struct {
	*gui.Page                              // GUI elements (buttons...)
	Quants             map[string]Quantity // displayable quantities by name
	Params             map[string]Param    // displayable parameters by name
	render                                 // renders displayed quantity
	mutex              sync.Mutex          // protects eventCacheBreaker and keepalive
	_eventCacheBreaker int                 // changed on any event to make sure display is updated
	keepalive          time.Time
}

// Returns the time when updateKeepAlive was called.
func (g *guistate) KeepAlive() time.Time {
	g.mutex.Lock()
	defer g.mutex.Unlock()
	return g.keepalive
}

// Called on each http request to signal browser is still open.
func (g *guistate) UpdateKeepAlive() {
	g.mutex.Lock()
	defer g.mutex.Unlock()
	g.keepalive = time.Now()
}

// nop is the no-op command injected periodically by RunInteractive.
func nop() {}

// Enter interactive mode. Simulation is now exclusively controlled by web GUI
func (g *guistate) RunInteractive() {

	// periodically wake up Run so it may exit on timeout
	go func() {
		for {
			Inject <- nop
			time.Sleep(1 * time.Second)
		}
	}()

	fmt.Println("//entering interactive mode")
	g.UpdateKeepAlive()
	// Execute injected commands until no keep-alive was seen for Timeout.
	for time.Since(g.KeepAlive()) < Timeout {
		f := <-Inject
		f()
	}
	fmt.Println("//browser disconnected, exiting")
}

// displayable quantity in GUI Parameters section
type Param interface {
	NComp() int
	Name() string
	Unit() string
	getRegion(int) []float64
	IsUniform() bool
}

// GUIAdd registers value under name with the global GUI state.
func GUIAdd(name string, value interface{}) {
	gui_.Add(name, value)
}

// Internal:add a quantity to the GUI, will be visible in web interface.
// Automatically called by Decl*(), still before PrepareServer()
// A value implementing both Param and Quantity is stored in both maps.
func (g *guistate) Add(name string, value interface{}) {
	if v, ok := value.(Param); ok {
		g.Params[name] = v
	}
	if v, ok := value.(Quantity); ok {
		g.Quants[name] = v
	}
}

// Once Params/Quants have been declared and added,
// initialize the GUI Page (pre-renders template) and register http handlers
func (g *guistate) PrepareServer() {
	g.Page = gui.NewPage(templText, g)
	util.SetProgress(gui_.Prog)
	g.OnAnyEvent(func() {
		g.incCacheBreaker()
	})

	http.Handle("/", g)
	http.HandleFunc("/render/", g.ServeRender)
	http.HandleFunc("/plot/", g.servePlot)

	// Title is OD() with its last character dropped and the extension removed.
	g.Set("title", util.NoExt(OD()[:len(OD())-1]))
	g.prepareConsole()
	g.prepareMesh()
	g.prepareGeom()
	g.prepareM()
	g.prepareSolver()
	g.prepareDisplay()
	g.prepareParam()
	g.prepareOnUpdate()
}

// see prepareServer
func (g *guistate) prepareConsole() {
	g.OnEvent("cli", func() {
		cmd := g.StringValue("cli")
		Inject <- func() { g.EvalGUI(cmd) }
		g.Set("cli", "") // clear the input box once the command is queued
	})
}

// see prepareServer
func (g *guistate) prepareMesh() {
	//g.Disable("setmesh", true) // button only enabled if pressing makes sense
	const MESHWARN = "&#x26a0; Click to update mesh (may take some time)"
	warnmesh := func() {
		//g.Disable("setmesh", false)
		g.Set("setmeshwarn", MESHWARN)
	}

	// Edits only update the lazy_* values; the mesh is rebuilt on "setmesh".
	g.OnEvent("nx", func() { Inject <- func() { lazy_gridsize[X] = g.IntValue("nx"); warnmesh() } })
	g.OnEvent("ny", func() { Inject <- func() { lazy_gridsize[Y] = g.IntValue("ny"); warnmesh() } })
	g.OnEvent("nz", func() { Inject <- func() { lazy_gridsize[Z] = g.IntValue("nz"); warnmesh() } })
	g.OnEvent("cx", func() { Inject <- func() { lazy_cellsize[X] = g.FloatValue("cx"); warnmesh() } })
	g.OnEvent("cy", func() { Inject <- func() { lazy_cellsize[Y] = g.FloatValue("cy"); warnmesh() } })
	g.OnEvent("cz", func() { Inject <- func() { lazy_cellsize[Z] = g.FloatValue("cz"); warnmesh() } })
	g.OnEvent("px", func() { Inject <- func() { lazy_pbc[X] = g.IntValue("px"); warnmesh() } })
	g.OnEvent("py", func() { Inject <- func() { lazy_pbc[Y] = g.IntValue("py"); warnmesh() } })
	g.OnEvent("pz", func() { Inject <- func() { lazy_pbc[Z] = g.IntValue("pz"); warnmesh() } })

	g.OnEvent("setmesh", func() {
		//g.Disable("setmesh", true)
		Inject <- (func() {
			g.EvalGUI(fmt.Sprintf("SetMesh(%v, %v, %v, %v, %v, %v, %v, %v, %v)",
				g.Value("nx"),
				g.Value("ny"),
				g.Value("nz"),
				g.Value("cx"),
				g.Value("cy"),
				g.Value("cz"),
				g.Value("px"),
				g.Value("py"),
				g.Value("pz")))
			// update lazy_* sizes to be up-to date with proper mesh
			n := Mesh().Size()
			c := Mesh().CellSize()
			p := Mesh().PBC()
			lazy_gridsize = []int{n[X], n[Y], n[Z]}
			lazy_cellsize = []float64{c[X], c[Y], c[Z]}
			lazy_pbc = []int{p[X], p[Y], p[Z]}

		})
		g.Set("setmeshwarn", "mesh up to date")
	})
}

// IntValue evaluates the text of GUI element id and converts the result to int.
// A conversion failure is ignored and yields 0.
func (g *guistate) IntValue(id string) int {
	s := g.StringValue(id)
	r := fmt.Sprint(Eval1Line(s))
	i, _ := strconv.Atoi(r)
	return i
}

// FloatValue evaluates the text of GUI element id and converts the result to
// float64. A conversion failure is ignored.
func (g *guistate) FloatValue(id string) float64 {
	s := g.StringValue(id)
	r := fmt.Sprint(Eval1Line(s))
	f, _ := strconv.ParseFloat(r, 64)
	return f
}

// see prepareServer
func (g *guistate) prepareGeom() {
	g.OnEvent("geomselect", func() {
		ident := g.StringValue("geomselect")
		t := World.Resolve(ident).Type()
		// set sensible args: world size
		args := "("
		for i := 0; i < t.NumIn(); i++ {
			val := 0.0
			if i < 3 {
				val = Mesh().WorldSize()[i]
			}
			if i > 0 {
				args += ", "
			}
			args += fmt.Sprint(val)
		}
		args += ")"
		// overwrite args for special cases
		switch {
		case ident == "Cell":
			args = "(0, 0, 0)"
		case ident == "XRange" || ident == "YRange" || ident == "ZRange":
			args = "(0, inf)"
		case ident == "Layers":
			args = "(0, 1)"
		case ident == "ImageShape":
			args = `("filename.png")`
		}
		g.Set("geomargs", args)
		g.Set("geomdoc", g.Doc(ident))
	})
	g.OnEvent("setgeom", func() {
		Inject <- (func() {
			g.EvalGUI(fmt.Sprint("SetGeom(", g.StringValue("geomselect"), g.StringValue("geomargs"), ")"))
		})
	})
}

// see prepareServer
func (g *guistate) prepareM() {
	g.OnEvent("mselect", func() {
		ident := g.StringValue("mselect")
		t := World.Resolve(ident).Type()
		// default argument list: one "1" per input parameter
		args := "("
		for i := 0; i < t.NumIn(); i++ {
			if i > 0 {
				args += ", "
			}
			args += "1"
		}
		args += ")"
		// overwrite args for special cases
		switch ident {
		case "VortexWall":
			args = "(1, -1, 1, 1)"
		}
		g.Set("margs", args)
		g.Set("mdoc", g.Doc(ident))
	})
	g.OnEvent("setm", func() {
		Inject <- (func() {
			g.EvalGUI(fmt.Sprint("m = ", g.StringValue("mselect"), g.StringValue("margs")))
		})
	})
}

// Mapping between the solver names shown in the GUI and the solver type numbers.
var (
	solvertypes = map[string]int{"bw_euler": -1, "euler": 1, "heun": 2, "rk23": 3, "rk4": 4, "rk45": 5, "rkf56": 6}
	solvernames = map[int]string{-1: "bw_euler", 1: "euler", 2: "heun", 3: "rk23", 4: "rk4", 5: "rk45", 6: "rkf56"}
)

// Break injects a command that sets the pause flag.
func Break() {
	Inject <- func() { pause = true }
}

// see prepareServer
func (g *guistate) prepareSolver() {
	// run/steps/relax first inject a pause, then the new command.
	g.OnEvent("run", func() { Break(); Inject <- func() { g.EvalGUI(sprint("Run(", g.StringValue("runtime"), ")")) } })
	g.OnEvent("steps", func() { Break(); Inject <- func() { g.EvalGUI(sprint("Steps(", g.StringValue("runsteps"), ")")) } })
	g.OnEvent("break", Break)
	g.OnEvent("relax", func() { Break(); Inject <- func() { g.EvalGUI("relax()") } })
	g.OnEvent("mindt", func() { Inject <- func() { g.EvalGUI("MinDt=" + g.StringValue("mindt")) } })
	g.OnEvent("maxdt", func() { Inject <- func() { g.EvalGUI("MaxDt=" + g.StringValue("maxdt")) } })
	g.OnEvent("fixdt", func() { Inject <- func() { g.EvalGUI("FixDt=" + g.StringValue("fixdt")) } })
	g.OnEvent("maxerr", func() { Inject <- func() { g.EvalGUI("MaxErr=" + g.StringValue("maxerr")) } })
	g.OnEvent("solvertype", func() {
		Inject <- func() {
			typ := solvertypes[g.StringValue("solvertype")]

			// euler must have fixed time step
			if typ == EULER && FixDt == 0 {
				g.EvalGUI("FixDt = 1e-15")
			}
			if typ == BACKWARD_EULER && FixDt == 0 {
				g.EvalGUI("FixDt = 1e-13")
			}

			g.EvalGUI(fmt.Sprint("SetSolver(", typ, ")"))
		}
	})
}

// see prepareServer
func (g *guistate) prepareParam() {
	for _, p := range g.Params {
		p := p // per-iteration copy captured by the handler below
		n := p.Name()
		g.OnEvent(n, func() {
			// Build "Name = value" for region -1, else "Name.SetRegion(r, value)".
			cmd := p.Name()
			r := g.Value("region")
			if r == -1 {
				cmd += " = "
			} else {
				cmd += fmt.Sprint(".SetRegion(", r, ", ")
			}
			if p.NComp() == 3 {
				cmd += "vector " // space needed
			}
			cmd += g.StringValue(p.Name())
			if r != -1 {
				cmd += ")"
			}
			Inject <- func() {
				g.EvalGUI(cmd)
			}
		})
	}
	// overwrite handler for temperature
	// do not crash when we enter bogus values (see temperature.go)
	g.OnEvent("Temp", func() {
		Inject <- func() {
			if FixDt == 0 {
				g.EvalGUI("FixDt = 10e-14") // finite temperature requires fixed time step
			}
			g.EvalGUI("Temp = " + g.StringValue("Temp"))
		}
	})
}

// see prepareServer
func (g *guistate) prepareDisplay() {
	// plot
	g.OnEvent("tableAutoSave", func() {
		Inject <- func() {
			g.EvalGUI("TableAutosave(" + g.StringValue("tableAutoSave") + ")")
		}
	})

	// render
	// The render handlers below update g.render under g.render.mutex.
	g.OnEvent("renderQuant", func() {
		g.render.mutex.Lock()
		defer g.render.mutex.Unlock()
		name := g.StringValue("renderQuant")
		q := g.Quants[name]
		if q == nil {
			LogErr("display: unknown quantity:", name)
			return
		}
		g.render.quant = q
		g.Set("renderDoc", g.Doc(g.StringValue("renderQuant")))
	})
	g.OnEvent("renderComp", func() {
		g.render.mutex.Lock()
		defer g.render.mutex.Unlock()
		g.render.comp = g.StringValue("renderComp")
		// TODO: set to "" if q.Ncomp < 3
	})
	g.OnEvent("renderLayer", func() {
		g.render.mutex.Lock()
		defer g.render.mutex.Unlock()
		g.render.layer = g.IntValue("renderLayer")
		g.Set("renderLayerLabel", fmt.Sprint(g.render.layer, "/", Mesh().Size()[Z]))
	})
	g.OnEvent("renderScale", func() {
		g.render.mutex.Lock()
		defer g.render.mutex.Unlock()
		g.render.scale = maxScale - g.IntValue("renderScale")
		g.Set("renderScaleLabel", fmt.Sprint("1/", g.render.scale))
	})
}

// see prepareServer
func (g *guistate) prepareOnUpdate() {
	g.OnUpdate(func() {
		g.UpdateKeepAlive() // keep track of when browser was last seen alive

		if GetBusy() { // busy, e.g., calculating kernel, run loop will not accept commands.
			return
		}

		Inject <- (func() { // sends to run loop to be executed in between time steps
			g.Set("console", hist)

			// mesh
			g.Set("nx", lazy_gridsize[X])
			g.Set("ny", lazy_gridsize[Y])
			g.Set("nz", lazy_gridsize[Z])
			g.Set("cx", lazy_cellsize[X])
			g.Set("cy", lazy_cellsize[Y])
			g.Set("cz", lazy_cellsize[Z])
			g.Set("px", lazy_pbc[X])
			g.Set("py", lazy_pbc[Y])
			g.Set("pz", lazy_pbc[Z])
			// world size = cell size * cell count, scaled by 1e9
			g.Set("wx", printf(lazy_cellsize[X]*float64(lazy_gridsize[X])*1e9))
			g.Set("wy", printf(lazy_cellsize[Y]*float64(lazy_gridsize[Y])*1e9))
			g.Set("wz", printf(lazy_cellsize[Z]*float64(lazy_gridsize[Z])*1e9))

			// solver
			g.Set("nsteps", NSteps)
			g.Set("time", fmt.Sprintf("%1.5e", Time))
			g.Set("dt", fmt.Sprintf("%1.3e", Dt_si))
			g.Set("lasterr", fmt.Sprintf("%1.3e", LastErr))
			g.Set("maxerr", MaxErr)
			g.Set("mindt", MinDt)
			g.Set("maxdt", MaxDt)
			g.Set("fixdt", FixDt)
			g.Set("solvertype", fmt.Sprint(solvernames[solvertype]))
			if pause {
				g.Set("busy", "Paused")
			} else {
				g.Set("busy", "Running")
				// Don't re-evaluate all the time if not running
				g.Set("maxtorque", fmt.Sprintf("%1.3e T", LastTorque))
			}

			// display
			g.Set("tableAutoSave", Table.autosave.period)
			quant := g.StringValue("renderQuant")
			comp := g.StringValue("renderComp")
			// query string that changes with the step count and on any GUI event
			cachebreaker := "?" + g.StringValue("nsteps") + "_" + fmt.Sprint(g.cacheBreaker())
			g.Attr("renderLayer", "max", Mesh().Size()[Z]-1)
			g.Set("display", "/render/"+quant+"/"+comp+cachebreaker)

			// plot
			gui_.Set("plot", "/plot/"+cachebreaker)

			// parameters
			for _, p := range g.Params {
				n := p.Name()
				r := g.IntValue("region")
				if r == -1 && !p.IsUniform() {
					g.Set(n, "")
				} else {
					if r == -1 {
						r = 0 // uniform, so pick one
					}
					v := p.getRegion(r)
					if p.NComp() == 1 {
						g.Set(n, float32(v[0]))
					} else {
						g.Set(n, fmt.Sprintf("(%v, %v, %v)", float32(v[X]), float32(v[Y]), float32(v[Z])))
					}
				}
			}

			// gpu
			memfree, _ := cu.MemGetInfo()
			memfree /= (1024 * 1024)
			g.Set("memfree", memfree)
		})
	})
}

// Returns documentation string for quantity name.
E.g.: // // "m" -> "Reduced magnetization" func (g *guistate) Doc(quant string) string { doc, ok := World.Doc[quant] if !ok { LogErr("no doc for", quant) } return doc } // Returns unit for quantity name. E.g.: // // "Msat" -> "A/m" func (g *guistate) UnitOf(quant string) string { p := g.Params[quant] if p != nil { return p.Unit() } else { return "" } } // renders page title for PrepareServer func (g *guistate) Title() string { return util.NoExt(path.Base(OD())) } func (g *guistate) Version() string { return UNAME } func (g *guistate) GPUInfo() string { return cuda.GPUInfo } func (g *guistate) incCacheBreaker() { g.mutex.Lock() defer g.mutex.Unlock() g._eventCacheBreaker++ } func (g *guistate) cacheBreaker() int { g.mutex.Lock() defer g.mutex.Unlock() return g._eventCacheBreaker } func (g *guistate) QuantNames() []string { names := make([]string, 0, len(g.Quants)) for k := range g.Quants { names = append(names, k) } sortNoCase(names) return names } // List all available shapes func (g *guistate) Shapes() []string { return g.apifilter("Shape") } func (g *guistate) Configs() []string { return g.apifilter("Config") } // List all api functions that return outputtype (Shape, Config, ...) func (g *guistate) apifilter(outputtype string) []string { var match []string for k := range World.Doc { v := World.Resolve(k) t := v.Type() if t.Kind() == reflect.Func && t.NumOut() == 1 && t.Out(0).Name() == outputtype { match = append(match, k) } } sortNoCase(match) return match } func (g *guistate) Parameters() []string { var params []string for _, v := range g.Params { params = append(params, v.Name()) } sortNoCase(params) return params } // renders a
    that toggles visibility on click for PrepareServer func (g *guistate) Div(heading string) string { id := fmt.Sprint("div_", rand.Int()) return fmt.Sprintf(`▾ %v
    `, id, heading, id) } func GoServe(addr string) string { gui_.PrepareServer() // find a free port starting from the usual number l, err := net.Listen("tcp", addr) for err != nil { h, p, _ := net.SplitHostPort(addr) addr = fmt.Sprint(h, ":", atoi(p)+1) l, err = net.Listen("tcp", addr) } go func() { LogErr(http.Serve(l, nil)) }() httpfs.Put(OD()+"gui", []byte(l.Addr().String())) return addr } func atoi(a string) int { i, err := strconv.Atoi(a) util.PanicErr(err) return i } // Prog advances the GUI progress bar to fraction a/total and displays message. func (g *guistate) Prog(a, total int, msg string) { g.Set("progress", (a*100)/total) g.Set("busy", msg) util.PrintProgress(a, total, msg) } // Eval code + update keepalive in case the code runs long func (g *guistate) EvalGUI(code string) { defer func() { if err := recover(); err != nil { if userErr, ok := err.(UserErr); ok { LogErr(userErr) } else { panic(err) } } }() Eval(code) g.UpdateKeepAlive() } // //// round duration to 1s accuracy //func roundt(t time.Duration) time.Duration { // return t - t%1e9 //} // 3-3.11.1/engine/heun.go000066400000000000000000000020531503346766200145060ustar00rootroot00000000000000package engine import ( "math" "github.com/mumax/3/cuda" "github.com/mumax/3/util" ) // Adaptive Heun solver. 
type Heun struct{}

// Adaptive Heun method, can be used as solver.Step
//
// One call attempts a single step on the magnetization buffer: a first stage
// using the torque dy0 at the current state, then a second torque evaluation
// dy at the advanced time. The error estimate is MaxVecDiff(dy0, dy) * dt.
// The step is kept when the error is below MaxErr, when Dt_si has reached
// MinDt, or when a fixed time step is in use; otherwise it is undone.
// In both cases adaptDt is called with a factor derived from MaxErr/err.
func (*Heun) Step() {
	y := M.Buffer()

	dy0 := cuda.Buffer(VECTOR, y.Size())
	defer cuda.Recycle(dy0)

	// a non-zero FixDt overrides the adaptive time step
	if FixDt != 0 {
		Dt_si = FixDt
	}

	dt := float32(Dt_si * GammaLL)
	util.Assert(dt > 0)

	// stage 1
	torqueFn(dy0)
	cuda.Madd2(y, y, dy0, 1, dt) // y = y + dt * dy

	// stage 2
	dy := cuda.Buffer(3, y.Size())
	defer cuda.Recycle(dy)
	Time += Dt_si
	torqueFn(dy)

	err := cuda.MaxVecDiff(dy0, dy) * float64(dt)

	// adjust next time step
	if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop
		// step OK
		// replaces the stage-1 increment dt*dy0 by 0.5*dt*(dy0+dy)
		cuda.Madd3(y, y, dy, dy0, 1, 0.5*dt, -0.5*dt)
		M.normalize()
		NSteps++
		// NOTE(review): err can be exactly 0 here, making the argument +Inf;
		// presumably adaptDt bounds the result — confirm.
		adaptDt(math.Pow(MaxErr/err, 1./2.))
		setLastErr(err)
		setMaxTorque(dy)
	} else {
		// undo bad step
		util.Assert(FixDt == 0)
		Time -= Dt_si
		cuda.Madd2(y, y, dy0, 1, -dt)
		NUndone++
		adaptDt(math.Pow(MaxErr/err, 1./3.))
	}
}

// Free is a no-op: Heun holds no state between steps.
func (*Heun) Free() {}

3-3.11.1/engine/html.go000066400000000000000000000245301503346766200145170ustar00rootroot00000000000000package engine const templText = ` mumax3 ` + CSS + ` {{.JS}} {{.Span "title" "mumax3"}}   {{.Progress "progress" 100 0}} {{.Span "busy" "" }}   {{.ErrorBox}}

    {{.Data.Div "console"}} {{.Console "console" 16 84 "" "onfocus=\"console_focus=true\"" "onblur=\"console_focus=false\"" "onmouseover=\"console_focus=true\"" "onmouseout=\"console_focus=false\"" "readonly" "style=\"font-family:monospace; font-size:0.8em;\"" }}
    {{.CliBox "cli" "" "onkeydown=\"clikeydown(event);\"" "placeholder=\"type commands here, or up/down\"" "size=86" "style=\"font-family:monospace; font-size:0.8em;\"" }}
    {{.Data.Div "mesh"}}
    gridsize: {{.TextBox "nx" "" "size=8"}} × {{.TextBox "ny" "" "size=8"}} × {{.TextBox "nz" "" "size=8"}} cells
    cellsize: {{.TextBox "cx" "" "size=8"}} × {{.TextBox "cy" "" "size=8"}} × {{.TextBox "cz" "" "size=8"}} m3
    PBC: {{.TextBox "px" "" "size=8"}} × {{.TextBox "py" "" "size=8"}} × {{.TextBox "pz" "" "size=8"}} repetitions
    worldsize: {{.Span "wx" ""}} × {{.Span "wy" ""}} × {{.Span "wz" ""}} nm3
    {{.Button "setmesh" "update"}} {{.Span "setmeshwarn" ""}}
    {{.Data.Div "geometry"}} SetGeom( {{.Data.Shapes | .SelectArray "geomselect" "Universe"}} {{.TextBox "geomargs" "()" }} ) {{.Button "setgeom" "Set"}}
    {{.Span "geomdoc" "" "style=\"color:gray\""}} {{.Data.Div "initial m"}} m = {{.Data.Configs | .SelectArray "mselect" "Uniform"}} {{.TextBox "margs" "(1, 0, 0)" }} {{.Button "setm" "Set"}}
    {{.Span "mdoc" "" "style=\"color:gray\""}} {{.Data.Div "solver"}} Type: {{.Select "solvertype" "rk45" "bw_euler" "euler" "heun" "rk4" "rk23" "rk45" "rkf56"}}
    {{.Button "run" "Run" }} {{.TextBox "runtime" 1e-9 "size=8"}}s
    {{.Button "steps" "Steps"}} {{.TextBox "runsteps" "1000" "size=8"}}
    {{.Button "relax" "Relax"}}
    {{.Button "break" "Break"}}
           
    step: {{.Span "nsteps" "0"}}
    time: {{.Span "time" "0"}} s
    dt: {{.Span "dt" "0"}} s
    err/step: {{.Span "lasterr" "0"}}
    MaxTorque:{{.Span "maxtorque" "--"}}
           
    fixdt: {{.TextBox "fixdt" "0" "size=8"}} s
    mindt: {{.TextBox "mindt" "0" "size=8"}} s
    maxdt: {{.TextBox "maxdt" "0" "size=8"}} s
    maxerr: {{.TextBox "maxerr" "0" "size=8"}}/step
    {{.Data.Div "display"}}

    Quantity: {{.Data.QuantNames | .SelectArray "renderQuant" "m"}} {{.Select "renderComp" "" "" "x" "y" "z"}} {{.Span "renderDoc" "" "style=\"color:gray\""}}
    Slice: {{.Range "renderLayer" 0 0 0 }} {{.Span "renderLayerLabel" "0"}}
    Scale: {{.Range "renderScale" 0 31 31}} {{.Span "renderScaleLabel" "1/1"}}

    {{.Img "display" "/render/m" "alt=\"display\""}}

    {{.Data.Div "gnuplot"}}

    TableAutosave: {{.TextBox "tableAutoSave" "0" }} s

    Plot of "table.txt", provided table is being autosaved and gnuplot installed.
    plot "table.txt" using {{.TextBox "usingx" "1"}} : {{.TextBox "usingy" "2"}} with lines

    {{.Span "plotErr" ""}}

    {{.Img "plot" "/plot/"}} {{.Data.Div "parameters"}} Region: {{.Number "region" -1 255 -1}}
    {{range .Data.Parameters}} {{end}}
    {{.}} {{$.TextBox . ""}} {{$.Data.UnitOf . }}

    {{.Data.Version}}
    {{.Data.GPUInfo}} ({{.Span "memfree" ""}} MB free)
    © 2013 Arne Vansteenkiste, DyNaMat LAB, UGent.
    ` const CSS = ` ` 3-3.11.1/engine/log.go000066400000000000000000000031601503346766200143300ustar00rootroot00000000000000package engine

import (
	"fmt"
	"io"
	"os"
	"unicode/utf8"

	"github.com/mumax/3/httpfs"
	"github.com/mumax/3/util"
)

var (
	hist    string         // console history for GUI
	logfile io.WriteCloser // saves history of input commands + output
)

// Special error that is not fatal when paniced on and called from GUI
// E.g.: try to set bad grid size: panic on UserErr, recover, print error, carry on.
type UserErr string

// Error returns the message text, so UserErr satisfies the error interface.
func (e UserErr) Error() string { return string(e) }

// CheckRecoverable panics with a UserErr wrapping err's message when err is
// non-nil, and does nothing otherwise.
func CheckRecoverable(err error) {
	if err != nil {
		panic(UserErr(err.Error()))
	}
}

// LogIn records msg, unmodified, in the GUI history, the log file and on stdout.
func LogIn(msg ...interface{}) {
	str := sprint(msg...)
	log2GUI(str)
	log2File(str)
	fmt.Println(str)
}

// LogOut records msg, prefixed with "//", in the GUI history, the log file
// and on stdout.
func LogOut(msg ...interface{}) {
	str := "//" + sprint(msg...)
	log2GUI(str)
	log2File(str)
	fmt.Println(str)
}

// LogErr records msg, prefixed with "//", in the GUI history, the log file
// and on stderr.
func LogErr(msg ...interface{}) {
	str := "//" + sprint(msg...)
	log2GUI(str)
	log2File(str)
	fprintln(os.Stderr, str)
}

// log2File appends msg to the log file; it is a no-op until initLog has run.
func log2File(msg string) {
	if logfile != nil {
		fprintln(logfile, msg)
	}
}

// initLog creates "log.txt" in the output directory and writes the history
// collected so far into it. It panics when called twice or when the file
// cannot be created.
func initLog() {
	if logfile != nil {
		panic("log already inited")
	}
	// open log file and flush what was logged before the file existed
	var err error
	logfile, err = httpfs.Create(OD() + "log.txt")
	if err != nil {
		panic(err)
	}
	util.FatalErr(err) // err is always nil here, see the check above
	logfile.Write(([]byte)(hist))
	logfile.Write([]byte{'\n'})
}

// log2GUI appends msg to the console history shown in the GUI, separated
// from earlier entries by a newline. Messages longer than 1000 bytes are
// shortened and terminated with "...". The cut is moved back to the start
// of a rune so that a multi-byte UTF-8 character is never split.
func log2GUI(msg string) {
	const maxLen = 1000
	if len(msg) > maxLen {
		cut := maxLen - len("...")
		for cut > 0 && !utf8.RuneStart(msg[cut]) {
			cut--
		}
		msg = msg[:cut] + "..."
	}
	if hist != "" { // prepend newline
		hist += "\n"
	}
	hist += msg
	// TODO: push to web ?
}

// like fmt.Sprint but with spaces between args
func sprint(msg ...interface{}) string {
	str := fmt.Sprintln(msg...)
	str = str[:len(str)-1] // strip newline
	return str
}

3-3.11.1/engine/lutdata.go000066400000000000000000000046721503346766200152160ustar00rootroot00000000000000package engine

import (
	"unsafe"

	"github.com/mumax/3/cuda"
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/data"
	"github.com/mumax/3/util"
)

// look-up table for region based parameters
type lut struct {
	gpu_buf cuda.LUTPtrs       // gpu copy of cpu buffer, only transferred when needed
	gpu_ok  bool               // gpu cache up-to date with cpu source?
	cpu_buf [][NREGION]float32 // table data on cpu
	source  updater            // updates cpu data
}

type updater interface {
	update() // updates cpu lookup table
}

// init sizes both tables for nComp components and remembers source as the
// object asked to refresh the cpu data before each read.
func (p *lut) init(nComp int, source updater) {
	p.gpu_buf = make(cuda.LUTPtrs, nComp)
	p.cpu_buf = make([][NREGION]float32, nComp)
	p.source = source
}

// get an up-to-date version of the lookup-table on CPU
func (p *lut) cpuLUT() [][NREGION]float32 {
	p.source.update()
	return p.cpu_buf
}

// get an up-to-date version of the lookup-table on GPU
func (p *lut) gpuLUT() cuda.LUTPtrs {
	p.source.update()
	if !p.gpu_ok {
		// upload to GPU
		p.assureAlloc()
		cuda.Sync() // sync previous kernels, may still be using gpu lut
		for c := range p.gpu_buf {
			cuda.MemCpyHtoD(p.gpu_buf[c], unsafe.Pointer(&p.cpu_buf[c][0]), cu.SIZEOF_FLOAT32*NREGION)
		}
		p.gpu_ok = true
		cuda.Sync() //sync upload
	}
	return p.gpu_buf
}

// utility for LUT of single-component data
func (p *lut) gpuLUT1() cuda.LUTPtr {
	util.Assert(len(p.gpu_buf) == 1)
	return cuda.LUTPtr(p.gpuLUT()[0])
}

// all data is 0?
func (p *lut) isZero() bool {
	v := p.cpuLUT()
	for c := range v {
		for i := 0; i < NREGION; i++ {
			if v[c][i] != 0 {
				return false
			}
		}
	}
	return true
}

// nonZero reports whether at least one table entry differs from 0.
func (p *lut) nonZero() bool { return !p.isZero() }

// some data is 0?
func (p *lut) hasZero() bool { v := p.cpuLUT() for c := range v { for i := 0; i < NREGION; i++ { if v[c][i] == 0 { return true } } } return false } func (p *lut) assureAlloc() { if p.gpu_buf[0] == nil { for i := range p.gpu_buf { p.gpu_buf[i] = cuda.MemAlloc(NREGION * cu.SIZEOF_FLOAT32) } } } func (b *lut) NComp() int { return len(b.cpu_buf) } // uncompress the table to a full array with parameter values per cell. func (p *lut) Slice() (*data.Slice, bool) { b := cuda.Buffer(p.NComp(), Mesh().Size()) p.EvalTo(b) return b, true } // uncompress the table to a full array in the dst Slice with parameter values per cell. func (p *lut) EvalTo(dst *data.Slice) { gpu := p.gpuLUT() for c := 0; c < p.NComp(); c++ { cuda.RegionDecode(dst.Comp(c), cuda.LUTPtr(gpu[c]), regions.Gpu()) } } 3-3.11.1/engine/magnetization.go000066400000000000000000000104471503346766200164260ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" "reflect" ) var M magnetization // reduced magnetization (unit length) func init() { DeclLValue("m", &M, `Reduced magnetization (unit length)`) } // Special buffered quantity to store magnetization // makes sure it's normalized etc. type magnetization struct { buffer_ *data.Slice } func (m *magnetization) Mesh() *data.Mesh { return Mesh() } func (m *magnetization) NComp() int { return 3 } func (m *magnetization) Name() string { return "m" } func (m *magnetization) Unit() string { return "" } func (m *magnetization) Buffer() *data.Slice { return m.buffer_ } // todo: rename Gpu()? 
// Comp returns component c of the magnetization as a scalar field.
func (m *magnetization) Comp(c int) ScalarField { return Comp(m, c) }

// SetValue sets the magnetization everywhere from v, which must be a Config
// (the type assertion panics otherwise).
func (m *magnetization) SetValue(v interface{}) { m.SetInShape(nil, v.(Config)) }

// InputType returns the reflect type of Config.
func (m *magnetization) InputType() reflect.Type { return reflect.TypeOf(Config(nil)) }

// Type returns the reflect type of *magnetization.
func (m *magnetization) Type() reflect.Type { return reflect.TypeOf(new(magnetization)) }

// Eval returns the receiver itself.
func (m *magnetization) Eval() interface{} { return m }

// average is computed on the buffer of the global M, not on the receiver's.
func (m *magnetization) average() []float64 { return sAverageMagnet(M.Buffer()) }

// Average returns average() converted to a data.Vector.
func (m *magnetization) Average() data.Vector { return unslice(m.average()) }

// normalize passes the buffer and the geometry to cuda.Normalize.
func (m *magnetization) normalize() { cuda.Normalize(m.Buffer(), geometry.Gpu()) }

// allocate storage (not done by init, as mesh size may not yet be known then)
func (m *magnetization) alloc() {
	m.buffer_ = cuda.NewSlice(3, m.Mesh().Size())
	m.Set(RandomMag()) // sane starting config
}

// SetArray copies src into the magnetization buffer and normalizes it.
// A src whose size differs from the mesh is resampled to the mesh size first.
func (b *magnetization) SetArray(src *data.Slice) {
	if src.Size() != b.Mesh().Size() {
		src = data.Resample(src, b.Mesh().Size())
	}
	data.Copy(b.Buffer(), src)
	b.normalize()
}

// Set applies configuration c to the whole mesh; the mesh must have been set.
func (m *magnetization) Set(c Config) {
	checkMesh()
	m.SetInShape(nil, c)
}

// LoadFile sets the magnetization from the data loaded from fname.
func (m *magnetization) LoadFile(fname string) {
	m.SetArray(LoadFile(fname))
}

// Slice returns the underlying buffer itself; recycle is false, so the
// caller must not recycle it.
func (m *magnetization) Slice() (s *data.Slice, recycle bool) {
	return m.Buffer(), false
}

// EvalTo copies the magnetization buffer into dst.
func (m *magnetization) EvalTo(dst *data.Slice) {
	data.Copy(dst, m.buffer_)
}

// Region returns the magnetization restricted to region r.
func (m *magnetization) Region(r int) *vOneReg {
	return vOneRegion(m, r)
}

// String formats a host copy of the buffer with util.Sprint.
func (m *magnetization) String() string { return util.Sprint(m.Buffer().HostCopy()) }

// Set the value of one cell.
// The cell is left untouched when a geometry shape is set and the cell's
// coordinate lies outside it. v is divided by its length before being stored.
// NOTE(review): a zero-length v makes vNorm 0, so the stored components
// become NaN — confirm whether callers guarantee a non-zero vector.
func (m *magnetization) SetCell(ix, iy, iz int, v data.Vector) {
	r := Index2Coord(ix, iy, iz)
	if geometry.shape != nil && !geometry.shape(r[X], r[Y], r[Z]) {
		return
	}
	vNorm := v.Len()
	for c := 0; c < 3; c++ {
		cuda.SetCell(m.Buffer(), c, ix, iy, iz, float32(v[c]/vNorm))
	}
}

// Get the value of one cell.
func (m *magnetization) GetCell(ix, iy, iz int) data.Vector { mx := float64(cuda.GetCell(m.Buffer(), X, ix, iy, iz)) my := float64(cuda.GetCell(m.Buffer(), Y, ix, iy, iz)) mz := float64(cuda.GetCell(m.Buffer(), Z, ix, iy, iz)) return Vector(mx, my, mz) } func (m *magnetization) Quantity() []float64 { return slice(m.Average()) } // Sets the magnetization inside the shape func (m *magnetization) SetInShape(region Shape, conf Config) { checkMesh() if region == nil { region = universe } host := m.Buffer().HostCopy() h := host.Vectors() n := m.Mesh().Size() for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { for ix := 0; ix < n[X]; ix++ { r := Index2Coord(ix, iy, iz) x, y, z := r[X], r[Y], r[Z] if region(x, y, z) { // inside m := conf(x, y, z) h[X][iz][iy][ix] = float32(m[X]) h[Y][iz][iy][ix] = float32(m[Y]) h[Z][iz][iy][ix] = float32(m[Z]) } } } } m.SetArray(host) } // set m to config in region func (m *magnetization) SetRegion(region int, conf Config) { host := m.Buffer().HostCopy() h := host.Vectors() n := m.Mesh().Size() r := byte(region) regionsArr := regions.HostArray() for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { for ix := 0; ix < n[X]; ix++ { pos := Index2Coord(ix, iy, iz) x, y, z := pos[X], pos[Y], pos[Z] if regionsArr[iz][iy][ix] == r { m := conf(x, y, z) h[X][iz][iy][ix] = float32(m[X]) h[Y][iz][iy][ix] = float32(m[Y]) h[Z][iz][iy][ix] = float32(m[Z]) } } } } m.SetArray(host) } func (m *magnetization) resize() { backup := m.Buffer().HostCopy() s2 := Mesh().Size() resized := data.Resample(backup, s2) m.buffer_.Free() m.buffer_ = cuda.NewSlice(VECTOR, s2) data.Copy(m.buffer_, resized) } 3-3.11.1/engine/maxangle.go000066400000000000000000000010751503346766200153460ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var ( MaxAngle = NewScalarValue("MaxAngle", "rad", "maximum angle between neighboring spins", GetMaxAngle) SpinAngle = NewScalarField("spinAngle", "rad", "Angle between 
neighboring spins", SetSpinAngle) ) func SetSpinAngle(dst *data.Slice) { cuda.SetMaxAngle(dst, M.Buffer(), lex2.Gpu(), regions.Gpu(), M.Mesh()) } func GetMaxAngle() float64 { s := ValueOf(SpinAngle) defer cuda.Recycle(s) return float64(cuda.MaxAbs(s)) // just a max would be fine, but not currently implemented } 3-3.11.1/engine/mesh.go000066400000000000000000000106271503346766200145110ustar00rootroot00000000000000package engine import ( "fmt" "slices" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" ) var globalmesh_ data.Mesh // mesh for m and everything that has the same size func init() { DeclFunc("SetGridSize", SetGridSize, `Sets the number of cells for X,Y,Z`) DeclFunc("SetCellSize", SetCellSize, `Sets the X,Y,Z cell size in meters`) DeclFunc("SetMesh", SetMesh, `Sets GridSize, CellSize and PBC at the same time`) DeclFunc("SetPBC", SetPBC, "Sets the number of repetitions in X,Y,Z to create periodic boundary "+ "conditions. The number of repetitions determines the cutoff range for the demagnetization.") } func Mesh() *data.Mesh { checkMesh() return &globalmesh_ } func arg(msg string, test bool) { if !test { panic(UserErr(msg + ": illegal arugment")) } } // Set the simulation mesh to Nx x Ny x Nz cells of given size. // Can be set only once at the beginning of the simulation. // TODO: dedup arguments from globals func SetMesh(Nx, Ny, Nz int, cellSizeX, cellSizeY, cellSizeZ float64, pbcx, pbcy, pbcz int) { SetBusy(true) defer SetBusy(false) arg("GridSize", Nx > 0 && Ny > 0 && Nz > 0) arg("CellSize", cellSizeX > 0 && cellSizeY > 0 && cellSizeZ > 0) arg("PBC", pbcx >= 0 && pbcy >= 0 && pbcz >= 0) sizeChanged := globalmesh_.Size() != [3]int{Nx, Ny, Nz} cellSizeChanged := globalmesh_.CellSize() != [3]float64{cellSizeX, cellSizeY, cellSizeZ} pbc := []int{pbcx, pbcy, pbcz} if sizeChanged { warnStr := "// WARNING: %s-axis is not 7-smooth. 
It has %d cells, with prime\n" + "// factors %v, at least one of which is greater than 7.\n" + "// Prime factors >7 may reduce performance significantly, and\n" + "// prime factors >127 may cause a CUDA_ERROR_INVALID_VALUE error." if factorsx := primeFactors(Nx); slices.Max(factorsx) > 7 { util.Log(fmt.Sprintf(warnStr, "x", Nx, factorsx)) } if factorsy := primeFactors(Ny); slices.Max(factorsy) > 7 { util.Log(fmt.Sprintf(warnStr, "y", Ny, factorsy)) } if factorsz := primeFactors(Nz); slices.Max(factorsz) > 7 { util.Log(fmt.Sprintf(warnStr, "z", Nz, factorsz)) } } if cellSizeChanged { warnStr := "// WARNING: cell size was set to a high aspect ratio.\n" + "// Calculation of demag kernel may take longer than usual." if min(cellSizeX, cellSizeY, cellSizeZ) < max(cellSizeX, cellSizeY, cellSizeZ)/4 { util.Log(warnStr) } } if globalmesh_.Size() == [3]int{0, 0, 0} { // first time mesh is set globalmesh_ = *data.NewMesh(Nx, Ny, Nz, cellSizeX, cellSizeY, cellSizeZ, pbc...) M.alloc() regions.alloc() } else { // here be dragons LogOut("resizing...") // free everything to trigger kernel recalculation, etc conv_.Free() conv_ = nil mfmconv_.Free() mfmconv_ = nil cuda.FreeBuffers() // resize everything globalmesh_ = *data.NewMesh(Nx, Ny, Nz, cellSizeX, cellSizeY, cellSizeZ, pbc...) 
if sizeChanged || cellSizeChanged { M.resize() regions.resize() geometry.buffer.Free() geometry.buffer = data.NilSlice(1, Mesh().Size()) geometry.setGeom(geometry.shape) // remove excitation extra terms if they don't fit anymore // up to the user to add them again B_ext.RemoveExtraTerms() J.RemoveExtraTerms() B_therm.noise.Free() B_therm.noise = nil } } lazy_gridsize = []int{Nx, Ny, Nz} lazy_cellsize = []float64{cellSizeX, cellSizeY, cellSizeZ} lazy_pbc = []int{pbcx, pbcy, pbcz} } func printf(f float64) float32 { return float32(f) } // for lazy setmesh: set gridsize and cellsize in separate calls var ( lazy_gridsize []int lazy_cellsize []float64 lazy_pbc = []int{0, 0, 0} ) func SetGridSize(Nx, Ny, Nz int) { lazy_gridsize = []int{Nx, Ny, Nz} if lazy_cellsize != nil { SetMesh(Nx, Ny, Nz, lazy_cellsize[X], lazy_cellsize[Y], lazy_cellsize[Z], lazy_pbc[X], lazy_pbc[Y], lazy_pbc[Z]) } } func SetCellSize(cx, cy, cz float64) { lazy_cellsize = []float64{cx, cy, cz} if lazy_gridsize != nil { SetMesh(lazy_gridsize[X], lazy_gridsize[Y], lazy_gridsize[Z], cx, cy, cz, lazy_pbc[X], lazy_pbc[Y], lazy_pbc[Z]) } } func SetPBC(nx, ny, nz int) { lazy_pbc = []int{nx, ny, nz} if lazy_gridsize != nil && lazy_cellsize != nil { SetMesh(lazy_gridsize[X], lazy_gridsize[Y], lazy_gridsize[Z], lazy_cellsize[X], lazy_cellsize[Y], lazy_cellsize[Z], lazy_pbc[X], lazy_pbc[Y], lazy_pbc[Z]) } } // check if mesh is set func checkMesh() { if globalmesh_.Size() == [3]int{0, 0, 0} { panic("need to set mesh first") } } 3-3.11.1/engine/mfm.go000066400000000000000000000020311503346766200143220ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var ( MFM = NewScalarField("MFM", "arb.", "MFM image", SetMFM) MFMLift inputValue MFMTipSize inputValue mfmconv_ *cuda.MFMConvolution ) func init() { MFMLift = numParam(50e-9, "MFMLift", "m", reinitmfmconv) MFMTipSize = numParam(1e-3, "MFMDipole", "m", reinitmfmconv) DeclLValue("MFMLift", &MFMLift, "MFM lift 
height") DeclLValue("MFMDipole", &MFMTipSize, "Height of vertically magnetized part of MFM tip") } func SetMFM(dst *data.Slice) { buf := cuda.Buffer(3, Mesh().Size()) defer cuda.Recycle(buf) if mfmconv_ == nil { reinitmfmconv() } msat := Msat.MSlice() defer msat.Recycle() mfmconv_.Exec(buf, M.Buffer(), geometry.Gpu(), msat) cuda.Madd3(dst, buf.Comp(0), buf.Comp(1), buf.Comp(2), 1, 1, 1) } func reinitmfmconv() { SetBusy(true) defer SetBusy(false) if mfmconv_ == nil { mfmconv_ = cuda.NewMFM(Mesh(), MFMLift.v, MFMTipSize.v, *Flag_cachedir) } else { mfmconv_.Reinit(MFMLift.v, MFMTipSize.v, *Flag_cachedir) } } 3-3.11.1/engine/minimizer.go000066400000000000000000000064331503346766200155600ustar00rootroot00000000000000package engine // Minimize follows the steepest descent method as per Exl et al., JAP 115, 17D118 (2014). import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var ( DmSamples int = 10 // number of dm to keep for convergence check StopMaxDm float64 = 1e-6 // stop minimizer if sampled dm is smaller than this ) func init() { DeclFunc("Minimize", Minimize, "Use steepest conjugate gradient method to minimize the total energy") DeclVar("MinimizerStop", &StopMaxDm, "Stopping max dM for Minimize") DeclVar("MinimizerSamples", &DmSamples, "Number of max dM to collect for Minimize convergence check.") } // fixed length FIFO. Items can be added but not removed type fifoRing struct { count int tail int // index to put next item. 
Will loop to 0 after exceeding length data []float64 } func FifoRing(length int) fifoRing { return fifoRing{data: make([]float64, length)} } func (r *fifoRing) Add(item float64) { r.data[r.tail] = item r.count++ r.tail = (r.tail + 1) % len(r.data) if r.count > len(r.data) { r.count = len(r.data) } } func (r *fifoRing) Max() float64 { max := r.data[0] for i := 1; i < r.count; i++ { if r.data[i] > max { max = r.data[i] } } return max } type Minimizer struct { k *data.Slice // torque saved to calculate time step lastDm fifoRing h float32 } func (mini *Minimizer) Step() { m := M.Buffer() size := m.Size() if mini.k == nil { mini.k = cuda.Buffer(3, size) torqueFn(mini.k) } k := mini.k h := mini.h // save original magnetization m0 := cuda.Buffer(3, size) defer cuda.Recycle(m0) data.Copy(m0, m) // make descent cuda.Minimize(m, m0, k, h) // calculate new torque for next step k0 := cuda.Buffer(3, size) defer cuda.Recycle(k0) data.Copy(k0, k) torqueFn(k) setMaxTorque(k) // report to user // just to make the following readable dm := m0 dk := k0 // calculate step difference of m and k cuda.Madd2(dm, m, m0, 1., -1.) cuda.Madd2(dk, k, k0, -1., 1.) // reversed due to LLNoPrecess sign // get maxdiff and add to list max_dm := cuda.MaxVecNorm(dm) mini.lastDm.Add(max_dm) setLastErr(mini.lastDm.Max()) // report maxDm to user as LastErr // adjust next time step var nom, div float32 if NSteps%2 == 0 { nom = cuda.Dot(dm, dm) div = cuda.Dot(dm, dk) } else { nom = cuda.Dot(dm, dk) div = cuda.Dot(dk, dk) } if div != 0. { mini.h = nom / div } else { // in case of division by zero mini.h = 1e-4 } M.normalize() // as a convention, time does not advance during relax NSteps++ } func (mini *Minimizer) Free() { mini.k.Free() } func Minimize() { Refer("exl2014") SanityCheck() // Save the settings we are changing... 
prevType := solvertype prevFixDt := FixDt prevPrecess := Precess t0 := Time relaxing = true // disable temperature noise // ...to restore them later defer func() { SetSolver(prevType) FixDt = prevFixDt Precess = prevPrecess Time = t0 relaxing = false }() Precess = false // disable precession for torque calculation // remove previous stepper if stepper != nil { stepper.Free() } // set stepper to the minimizer mini := Minimizer{ h: 1e-4, k: nil, lastDm: FifoRing(DmSamples)} stepper = &mini cond := func() bool { return (mini.lastDm.count < DmSamples || mini.lastDm.Max() > StopMaxDm) } RunWhile(cond) pause = true } 3-3.11.1/engine/number.go000066400000000000000000000016031503346766200150370ustar00rootroot00000000000000package engine import ( "reflect" ) // TODO: wrap around outputValue // inputValue is like outputValue, but settable type inputValue struct { v float64 onSet func() name, unit string } func numParam(v float64, name, unit string, onSet func()) inputValue { return inputValue{v: v, onSet: onSet, name: name, unit: unit} } func (p *inputValue) NComp() int { return 1 } func (p *inputValue) Name() string { return p.name } func (p *inputValue) Unit() string { return p.unit } func (p *inputValue) getRegion(int) []float64 { return []float64{float64(p.v)} } func (p *inputValue) Type() reflect.Type { return reflect.TypeOf(float64(0)) } func (p *inputValue) IsUniform() bool { return true } func (p *inputValue) Eval() interface{} { return p.v } func (p *inputValue) SetValue(v interface{}) { p.v = v.(float64) p.onSet() } 3-3.11.1/engine/od.go000066400000000000000000000014461503346766200141560ustar00rootroot00000000000000package engine // Management of output directory. import ( "github.com/mumax/3/httpfs" "strings" ) var ( outputdir string // Output directory InputFile string ) func OD() string { if outputdir == "" { panic("output not yet initialized") } return outputdir } // SetOD sets the output directory where auto-saved files will be stored. 
// The -o flag can also be used for this purpose.
//
// InitIO records inputfile, makes od end in "/", and stores it as the output
// directory. When force is set, od is removed first; the directory is then
// created and the log and bibliography are initialized. It panics when the
// output directory has already been set.
func InitIO(inputfile, od string, force bool) {
	if outputdir != "" {
		panic("output directory already set")
	}

	InputFile = inputfile
	if !strings.HasSuffix(od, "/") {
		od += "/"
	}
	outputdir = od
	// for an http:// output directory, the httpfs working directory is set
	// to its parent
	if strings.HasPrefix(outputdir, "http://") {
		httpfs.SetWD(outputdir + "/../")
	}
	LogOut("output directory:", outputdir)

	if force {
		httpfs.Remove(od)
	}

	// the result of Mkdir is deliberately discarded
	_ = httpfs.Mkdir(od)

	initLog()
	initBib()
}

3-3.11.1/engine/oneregion.go000066400000000000000000000035761503346766200155470ustar00rootroot00000000000000package engine

import (
	"fmt"
	"github.com/mumax/3/cuda"
	"github.com/mumax/3/data"
	"github.com/mumax/3/util"
)

// sInRegion restricts q to region r and wraps the result as a ScalarField.
func sInRegion(q Quantity, r int) ScalarField {
	return AsScalarField(inRegion(q, r))
}

// vInRegion restricts q to region r and wraps the result as a VectorField.
func vInRegion(q Quantity, r int) VectorField {
	return AsVectorField(inRegion(q, r))
}

// sOneRegion restricts the single-component quantity q to region r.
func sOneRegion(q Quantity, r int) *sOneReg {
	util.Argument(q.NComp() == 1)
	return &sOneReg{oneReg{q, r}}
}

// vOneRegion restricts the three-component quantity q to region r.
func vOneRegion(q Quantity, r int) *vOneReg {
	util.Argument(q.NComp() == 3)
	return &vOneReg{oneReg{q, r}}
}

type sOneReg struct{ oneReg }

// Average returns the first (only) component of the region average.
func (q *sOneReg) Average() float64 { return q.average()[0] }

type vOneReg struct{ oneReg }

// Average returns the region average as a vector.
func (q *vOneReg) Average() data.Vector { return unslice(q.average()) }

// represents a new quantity equal to q in the given region, 0 outside.
type oneReg struct {
	parent Quantity
	region int
}

// inRegion wraps q so that it is restricted to region.
func inRegion(q Quantity, region int) Quantity {
	return &oneReg{q, region}
}

func (q *oneReg) NComp() int             { return q.parent.NComp() }
func (q *oneReg) Name() string           { return fmt.Sprint(NameOf(q.parent), ".region", q.region) }
func (q *oneReg) Unit() string           { return UnitOf(q.parent) }
func (q *oneReg) Mesh() *data.Mesh       { return MeshOf(q.parent) }
func (q *oneReg) EvalTo(dst *data.Slice) { EvalTo(q, dst) }

// returns a new slice equal to q in the given region, 0 outside.
// The second result is always true: the returned buffer comes from
// cuda.Buffer and should be recycled by the caller.
func (q *oneReg) Slice() (*data.Slice, bool) {
	src := ValueOf(q.parent)
	defer cuda.Recycle(src)
	out := cuda.Buffer(q.NComp(), q.Mesh().Size())
	cuda.RegionSelect(out, src, regions.Gpu(), byte(q.region))
	return out, true
}

// average divides the universe average of the masked slice by the volume of
// the region.
// NOTE(review): a region with zero volume makes this a division by zero —
// confirm how callers treat the resulting non-finite values.
func (q *oneReg) average() []float64 {
	slice, r := q.Slice()
	if r {
		defer cuda.Recycle(slice)
	}
	avg := sAverageUniverse(slice)
	sDiv(avg, regions.volume(q.region))
	return avg
}

// Average is the exported form of average.
func (q *oneReg) Average() []float64 { return q.average() }

// slice division
// Every element of v is divided by x in place.
func sDiv(v []float64, x float64) {
	for i := range v {
		v[i] /= x
	}
}

3-3.11.1/engine/outputquantities.go000066400000000000000000000131101503346766200172120ustar00rootroot00000000000000package engine

/*
The metadata layer wraps basic micromagnetic functions (e.g. func SetDemagField())
in objects that provide:

- additional information (Name, Unit, ...) used for saving output,
- additional methods (Comp, Region, ...) handy for input scripting.
*/

import (
	"fmt"
	"github.com/mumax/3/cuda"
	"github.com/mumax/3/data"
)

// The Info interface defines the bare minimum methods a quantity must implement
// to be accessible for scripting and I/O.
type Info interface {
	Name() string // name used for output file (e.g. "m")
	Unit() string // unit, e.g. "A/m"
	NComp() int   // number of components (scalar, vector, ...)
}

// info provides an Info implementation intended for embedding in other types.
type info struct {
	nComp int
	name  string
	unit  string
}

func (i *info) Name() string { return i.name }
func (i *info) Unit() string { return i.unit }
func (i *info) NComp() int   { return i.nComp }

// valueFunc is an outputValue implementation where a function provides the output value.
// It can be scalar or vector.
// Used internally by NewScalarValue and NewVectorValue.
type valueFunc struct {
	info
	f func() []float64
}

// get calls the value function.
func (g *valueFunc) get() []float64 { return g.f() }

// average returns the value itself.
func (g *valueFunc) average() []float64 { return g.get() }

// EvalTo fills each component of dst uniformly with the matching component
// of the value.
func (g *valueFunc) EvalTo(dst *data.Slice) {
	v := g.get()
	for c, v := range v {
		cuda.Memset(dst.Comp(c), float32(v))
	}
}

// ScalarValue enhances an outputValue with methods specific to
// a space-independent scalar quantity (e.g. total energy).
type ScalarValue struct {
	*valueFunc
}

// NewScalarValue constructs an outputable space-independent scalar quantity whose
// value is provided by function f.
// The new value is registered through Export with description desc.
func NewScalarValue(name, unit, desc string, f func() float64) *ScalarValue {
	// adapt the scalar function to the []float64 form valueFunc expects
	g := func() []float64 { return []float64{f()} }
	v := &ScalarValue{&valueFunc{info{1, name, unit}, g}}
	Export(v, desc)
	return v
}

// Get returns the current scalar value.
func (s ScalarValue) Get() float64 { return s.average()[0] }

// Average is the same as Get.
func (s ScalarValue) Average() float64 { return s.Get() }

// VectorValue enhances an outputValue with methods specific to
// a space-independent vector quantity (e.g. averaged magnetization).
type VectorValue struct {
	*valueFunc
}

// NewVectorValue constructs an outputable space-independent vector quantity whose
// value is provided by function f.
// The new value is registered through Export with description desc.
func NewVectorValue(name, unit, desc string, f func() []float64) *VectorValue {
	v := &VectorValue{&valueFunc{info{3, name, unit}, f}}
	Export(v, desc)
	return v
}

// Get returns the current value as a vector.
func (v *VectorValue) Get() data.Vector { return unslice(v.average()) }

// Average is the same as Get.
func (v *VectorValue) Average() data.Vector { return v.Get() }

// NewVectorField constructs an outputable space-dependent vector quantity whose
// value is provided by function f.
// The field is declared read-only under name via DeclROnly.
func NewVectorField(name, unit, desc string, f func(dst *data.Slice)) VectorField {
	v := AsVectorField(&fieldFunc{info{3, name, unit}, f})
	DeclROnly(name, v, cat(desc, unit))
	return v
}

// NewScalarField constructs an outputable space-dependent scalar quantity whose
// value is provided by function f.
// The field is declared read-only under name via DeclROnly.
func NewScalarField(name, unit, desc string, f func(dst *data.Slice)) ScalarField {
	q := AsScalarField(&fieldFunc{info{1, name, unit}, f})
	DeclROnly(name, q, cat(desc, unit))
	return q
}

// fieldFunc is a space-dependent quantity whose values are written by f
// into a destination slice.
type fieldFunc struct {
	info
	f func(*data.Slice)
}

// Mesh returns the global simulation mesh.
func (c *fieldFunc) Mesh() *data.Mesh { return Mesh() }

// average returns the average of the field over the universe.
func (c *fieldFunc) average() []float64 { return qAverageUniverse(c) }

// EvalTo delegates to the package-level EvalTo.
func (c *fieldFunc) EvalTo(dst *data.Slice) { EvalTo(c, dst) }

// Calculates and returns the quantity.
// recycle is true: slice needs to be recycled.
// The buffer is zeroed before f writes into it.
func (q *fieldFunc) Slice() (s *data.Slice, recycle bool) {
	buf := cuda.Buffer(q.NComp(), q.Mesh().Size())
	cuda.Zero(buf)
	q.f(buf)
	return buf, true
}

// ScalarField enhances an outputField with methods specific to
// a space-dependent scalar quantity.
type ScalarField struct {
	Quantity
}

// AsScalarField promotes a quantity to a ScalarField,
// enabling convenience methods particular to scalars.
// It panics when q does not have exactly 1 component.
func AsScalarField(q Quantity) ScalarField {
	if q.NComp() != 1 {
		panic(fmt.Errorf("ScalarField(%v): need 1 component, have: %v", NameOf(q), q.NComp()))
	}
	return ScalarField{q}
}

func (s ScalarField) average() []float64       { return AverageOf(s.Quantity) }
func (s ScalarField) Average() float64         { return s.average()[0] }
func (s ScalarField) Region(r int) ScalarField { return AsScalarField(inRegion(s.Quantity, r)) }
func (s ScalarField) Name() string             { return NameOf(s.Quantity) }
func (s ScalarField) Unit() string             { return UnitOf(s.Quantity) }

// VectorField enhances an outputField with methods specific to
// a space-dependent vector quantity.
type VectorField struct {
	Quantity
}

// AsVectorField promotes a quantity to a VectorField,
// enabling convenience methods particular to vectors.
func AsVectorField(q Quantity) VectorField { if q.NComp() != 3 { panic(fmt.Errorf("VectorField(%v): need 3 components, have: %v", NameOf(q), q.NComp())) } return VectorField{q} } func (v VectorField) average() []float64 { return AverageOf(v.Quantity) } func (v VectorField) Average() data.Vector { return unslice(v.average()) } func (v VectorField) Region(r int) VectorField { return AsVectorField(inRegion(v.Quantity, r)) } func (v VectorField) Comp(c int) ScalarField { return AsScalarField(Comp(v.Quantity, c)) } func (v VectorField) Mesh() *data.Mesh { return MeshOf(v.Quantity) } func (v VectorField) Name() string { return NameOf(v.Quantity) } func (v VectorField) Unit() string { return UnitOf(v.Quantity) } func (v VectorField) HostCopy() *data.Slice { s := ValueOf(v.Quantity) defer cuda.Recycle(s) return s.HostCopy() } 3-3.11.1/engine/parameter.go000066400000000000000000000221421503346766200155300ustar00rootroot00000000000000package engine /* parameters are region- and time dependent input values, like material parameters. 
*/

import (
	"fmt"
	"math"
	"reflect"
	"strings"

	"github.com/mumax/3/cuda"
	"github.com/mumax/3/data"
	"github.com/mumax/3/script"
	"github.com/mumax/3/util"
)

// input parameter, settable by user
type regionwise struct {
	lut
	upd_reg    [NREGION]func() []float64 // time-dependent values
	timestamp  float64                   // used not to double-evaluate f(t)
	children   []derived                 // derived parameters
	name, unit string
}

// init sets up the look-up table with nComp components and records name, unit
// and the derived parameters to invalidate when this parameter changes.
func (p *regionwise) init(nComp int, name, unit string, children []derived) {
	p.lut.init(nComp, p)
	p.name = name
	p.unit = unit
	p.children = children
	p.timestamp = math.Inf(-1) // guarantees the first update() call evaluates
}

// MSlice returns the parameter as a cuda.MSlice. A uniform parameter is
// represented by a nil slice plus the value of region 0; otherwise the
// parameter is evaluated into a buffer.
func (p *regionwise) MSlice() cuda.MSlice {
	if p.IsUniform() {
		return cuda.MakeMSlice(data.NilSlice(p.NComp(), Mesh().Size()), p.getRegion(0))
	}
	buf, recycle := p.Slice()
	util.Assert(recycle)
	return cuda.ToMSlice(buf)
}

func (p *regionwise) Name() string     { return p.name }
func (p *regionwise) Unit() string     { return p.unit }
func (p *regionwise) Mesh() *data.Mesh { return Mesh() }

// addChild registers derived parameters to be invalidated when p changes.
// Children that are already registered are skipped.
func (p *regionwise) addChild(children ...derived) {
	for _, c := range children {
		if contains(p.children, c) {
			continue
		}
		p.children = append(p.children, c)
		// NOTE(review): this prints to stdout on every registration; looks like
		// a debugging leftover, kept to preserve the existing output.
		fmt.Println(p, ".addChild", c)
	}
}

// contains reports whether x is an element of s.
func contains(s []derived, x derived) bool {
	for _, y := range s {
		if y == x {
			return true
		}
	}
	return false
}

// update re-evaluates the time-dependent regions, at most once per value of Time.
func (p *regionwise) update() {
	if p.timestamp == Time {
		return // already evaluated for this time
	}
	changed := false
	// update functions of time
	for r := 0; r < NREGION; r++ {
		if updFunc := p.upd_reg[r]; updFunc != nil {
			p.bufset_(r, updFunc())
			changed = true
		}
	}
	p.timestamp = Time
	if changed {
		p.invalidate()
	}
}

// set in one region; region == -1 means all regions
func (p *regionwise) setRegion(region int, v []float64) {
	if region == -1 {
		p.setUniform(v)
		return
	}
	p.setRegions(region, region+1, v)
}

// set in all regions
func (p *regionwise) setUniform(v []float64) {
	p.setRegions(0, NREGION, v)
}

// set in regions r1..r2(excl)
// Any time-dependent function previously installed for these regions is removed.
func (p *regionwise) setRegions(r1, r2 int, v []float64) {
	util.Argument(len(v) == len(p.cpu_buf))
	util.Argument(r1 < r2)                  // exclusive upper bound
	util.Argument(r1 >= 0 && r2 <= NREGION) // validate before touching any region
	for r := r1; r < r2; r++ {
		p.upd_reg[r] = nil
		p.bufset_(r, v)
	}
	p.invalidate()
}

// bufset_ stores v in the CPU look-up table for one region, without invalidating.
func (p *regionwise) bufset_(region int, v []float64) {
	for c := range p.cpu_buf {
		p.cpu_buf[c][region] = float32(v[c])
	}
}

// setFunc installs f as time-dependent value for regions r1..r2(excl).
func (p *regionwise) setFunc(r1, r2 int, f func() []float64) {
	util.Argument(r1 < r2)                  // exclusive upper bound
	util.Argument(r1 >= 0 && r2 <= NREGION) // validate before touching any region
	for r := r1; r < r2; r++ {
		p.upd_reg[r] = f
	}
	p.invalidate()
}

// mark my GPU copy and my children as invalid (need update)
func (p *regionwise) invalidate() {
	p.gpu_ok = false
	for _, c := range p.children {
		c.invalidate()
	}
}

// getRegion returns the value (one entry per component) in the given region.
func (p *regionwise) getRegion(region int) []float64 {
	cpu := p.cpuLUT()
	v := make([]float64, p.NComp())
	for i := range v {
		v[i] = float64(cpu[i][region])
	}
	return v
}

// IsUniform reports whether all regions hold the same value as region 0.
func (p *regionwise) IsUniform() bool {
	cpu := p.cpuLUT()
	v1 := p.getRegion(0)
	for r := 1; r < NREGION; r++ {
		for c := range v1 {
			if cpu[c][r] != float32(v1[c]) {
				return false
			}
		}
	}
	return true
}

func (p *regionwise) average() []float64 { return qAverageUniverse(p) }

// parameter derived from others (not directly settable). E.g.: Bsat derived from Msat
type DerivedParam struct {
	lut                          // GPU storage
	updater  func(*DerivedParam) // called to update my value
	uptodate bool                // cleared if parents' value changes
	parents  []updater           // parents updated before I'm updated
}

// any parameter that depends on an inputParam
type derived interface {
	invalidate()
}

type parent interface {
	update()
	addChild(...derived)
}

// NewDerivedParam constructs a derived parameter with nComp components that is
// recalculated by updater whenever one of its parents has changed.
// The new parameter registers itself as child of every parent, exactly like
// DerivedParam.init; without that registration it would never be invalidated
// and would keep its first value forever.
func NewDerivedParam(nComp int, parents []parent, updater func(*DerivedParam)) *DerivedParam {
	p := new(DerivedParam)
	p.init(nComp, parents, updater)
	return p
}

// init sets up the look-up table and registers d as child of all parents.
func (d *DerivedParam) init(nComp int, parents []parent, updater func(*DerivedParam)) {
	d.lut.init(nComp, d) // pass myself to update me if needed
	d.updater = updater
	for _, p := range parents {
		d.parents = append(d.parents, p)
		p.addChild(d)
	}
}

func (p *DerivedParam) invalidate() {
	p.uptodate = false
}

// update first updates all parents and then, if needed, recalculates p.
func (p *DerivedParam) update() {
	for _, par := range p.parents {
		par.update() // may invalidate me
	}
	if !p.uptodate {
		p.updater(p)
		p.gpu_ok = false
		p.uptodate = true
	}
}

// Get value in region r.
func (p *DerivedParam) GetRegion(r int) []float64 {
	lut := p.cpuLUT() // updates me if needed
	v := make([]float64, p.NComp())
	for c := range v {
		v[c] = float64(lut[c][r])
	}
	return v
}

// specialized param with 1 component
type RegionwiseScalar struct {
	regionwise
}

// init initializes the parameter and, unless name starts with "_",
// declares it as a settable script variable.
func (p *RegionwiseScalar) init(name, unit, desc string, children []derived) {
	p.regionwise.init(SCALAR, name, unit, children)
	if !strings.HasPrefix(name, "_") { // don't export names beginning with "_" (e.g. from excitation)
		DeclLValue(name, p, cat(desc, unit))
	}
}

// NewScalarParam constructs a scalar input parameter, see RegionwiseScalar.init.
// TODO: auto derived
func NewScalarParam(name, unit, desc string, children ...derived) *RegionwiseScalar {
	p := new(RegionwiseScalar)
	p.init(name, unit, desc, children)
	return p
}

// SetRegion sets the value in one region, or in all regions if region == -1.
func (p *RegionwiseScalar) SetRegion(region int, f script.ScalarFunction) {
	if region == -1 {
		p.setRegionsFunc(0, NREGION, f) // uniform
		return
	}
	p.setRegionsFunc(region, region+1, f) // upper bound exclusive
}

// SetValue sets the value in all regions. v must be a script.ScalarFunction.
func (p *RegionwiseScalar) SetValue(v interface{}) {
	f := v.(script.ScalarFunction)
	p.setRegionsFunc(0, NREGION, f)
}

// Set sets a constant value in all regions.
func (p *RegionwiseScalar) Set(v float64) {
	p.setRegions(0, NREGION, []float64{v})
}

// setRegionsFunc stores f in regions r1..r2(excl): as a plain number when f does
// not depend on time, otherwise as a function that is re-evaluated by update().
func (p *RegionwiseScalar) setRegionsFunc(r1, r2 int, f script.ScalarFunction) {
	if IsConst(f) {
		p.setRegions(r1, r2, []float64{f.Float()})
		return
	}
	fixed := f.Fix() // fix values of all variables except t
	p.setFunc(r1, r2, func() []float64 {
		return []float64{fixed.Eval().(script.ScalarFunction).Float()}
	})
}

func (p *RegionwiseScalar) GetRegion(region int) float64 {
	return p.getRegion(region)[0]
}

func (p *RegionwiseScalar) Eval() interface{}       { return p }
func (p *RegionwiseScalar) Type() reflect.Type      { return reflect.TypeOf(new(RegionwiseScalar)) }
func (p *RegionwiseScalar) InputType() reflect.Type { return script.ScalarFunction_t }
func (p *RegionwiseScalar) Average() float64        { return qAverageUniverse(p)[0] }
func (p *RegionwiseScalar) Region(r int) *sOneReg   { return sOneRegion(p, r) }

// IsConst reports whether script expression e does NOT contain t (time).
func IsConst(e script.Expr) bool {
	t := World.Resolve("t")
	return !script.Contains(e, t)
}

// cat appends the unit, between parentheses, to a description.
// An empty unit leaves the description unchanged.
func cat(desc, unit string) string {
	if unit == "" {
		return desc
	}
	return desc + " (" + unit + ")"
}

// these methods should only be accesible from Go

// SetRegionValueGo sets a constant value in one region, or everywhere if region == -1.
func (p *RegionwiseScalar) SetRegionValueGo(region int, v float64) {
	p.setRegion(region, []float64{v})
}

// SetRegionFuncGo installs a time-dependent Go function in one region,
// or everywhere if region == -1.
func (p *RegionwiseScalar) SetRegionFuncGo(region int, f func() float64) {
	r1, r2 := region, region+1
	if region == -1 {
		r1, r2 = 0, NREGION
	}
	p.setFunc(r1, r2, func() []float64 {
		return []float64{f()}
	})
}

// vector input parameter, settable by user
type RegionwiseVector struct {
	regionwise
}

// NewVectorParam constructs a vector input parameter and, unless name starts
// with "_", declares it as a settable script variable.
func NewVectorParam(name, unit, desc string) *RegionwiseVector {
	p := new(RegionwiseVector)
	p.regionwise.init(VECTOR, name, unit, nil) // no vec param has children (yet)
	if !strings.HasPrefix(name, "_") {         // don't export names beginning with "_" (e.g. from excitation)
		DeclLValue(name, p, cat(desc, unit))
	}
	return p
}

// SetRegion sets the value in one region, or in all regions if region == -1.
func (p *RegionwiseVector) SetRegion(region int, f script.VectorFunction) {
	if region == -1 {
		p.setRegionsFunc(0, NREGION, f) //uniform
		return
	}
	p.setRegionsFunc(region, region+1, f)
}

// SetValue sets the value in all regions. v must be a script.VectorFunction.
func (p *RegionwiseVector) SetValue(v interface{}) {
	f := v.(script.VectorFunction)
	p.setRegionsFunc(0, NREGION, f)
}

// setRegionsFunc stores f in regions r1..r2(excl): as a plain vector when f does
// not depend on time, otherwise as a function that is re-evaluated by update().
func (p *RegionwiseVector) setRegionsFunc(r1, r2 int, f script.VectorFunction) {
	if IsConst(f) {
		p.setRegions(r1, r2, slice(f.Float3()))
		return
	}
	fixed := f.Fix() // fix values of all variables except t
	p.setFunc(r1, r2, func() []float64 {
		return slice(fixed.Eval().(script.VectorFunction).Float3())
	})
}

// SetRegionFn installs a time-dependent Go function in one region, or in all
// regions if region == -1 (same convention as SetRegion; previously -1 caused
// an out-of-range index).
func (p *RegionwiseVector) SetRegionFn(region int, f func() [3]float64) {
	r1, r2 := region, region+1
	if region == -1 {
		r1, r2 = 0, NREGION
	}
	p.setFunc(r1, r2, func() []float64 {
		return slice(f())
	})
}

func (p *RegionwiseVector) GetRegion(region int) [3]float64 {
	v := p.getRegion(region)
	return unslice(v)
}

func (p *RegionwiseVector) Eval() interface{}       { return p }
func (p *RegionwiseVector) Type() reflect.Type      { return reflect.TypeOf(new(RegionwiseVector)) }
func (p *RegionwiseVector) InputType() reflect.Type { return script.VectorFunction_t }
func (p *RegionwiseVector) Region(r int) *vOneReg   { return vOneRegion(p, r) }
func (p *RegionwiseVector) Average() data.Vector    { return unslice(qAverageUniverse(p)) }
func (p *RegionwiseVector) Comp(c int) ScalarField  { return Comp(p, c) }
3-3.11.1/engine/plot.go000066400000000000000000000035461503346766200145350ustar00rootroot00000000000000package engine

import (
	"bytes"
	"errors"
	"fmt"
	"image"
	"image/png"
	"io"
	"net/http"
	"os/exec"
	"sync/atomic"

	"github.com/mumax/3/httpfs"
)

var nPlots int32       // counts number of active gnuplot processes
const MAX_GNUPLOTS = 5 // maximum allowed number of gnuplot processes

// servePlot runs gnuplot on the data table in the output directory and writes
// the resulting SVG to w. On failure it writes an empty image instead and
// stores the error message in the GUI element "plotErr".
func (g *guistate) servePlot(w http.ResponseWriter, r *http.Request) {
	out := []byte{}

	// handle error and return whether err != nil.
	handle := func(err error) bool {
		if err == nil {
			return false
		}
		w.Write(emptyIMG())
		g.Set("plotErr", err.Error()+string(out))
		return true
	}

	// limit max processes
	active := atomic.AddInt32(&nPlots, 1)
	defer atomic.AddInt32(&nPlots, -1)
	if active > MAX_GNUPLOTS {
		handle(errors.New("too many gnuplot processes"))
		return
	}

	// NOTE(review): usingx/usingy come from the web GUI and are pasted into the
	// gnuplot script unescaped. Confirm they are restricted to column numbers,
	// since gnuplot scripts can execute shell commands.
	a := g.StringValue("usingx")
	b := g.StringValue("usingy")

	// Read the table before creating any pipes, so that a read error
	// does not leave unused pipe file descriptors behind.
	data, err := httpfs.Read(fmt.Sprintf(`%vtable.txt`, OD()))
	if handle(err) {
		return
	}

	cmd := "gnuplot"
	args := []string{"-e", fmt.Sprintf(`set format x "%%g"; set key off; set format y "%%g"; set term svg size 480,320 font 'Arial,10'; plot "-" u %v:%v w li; set output;exit;`, a, b)}
	excmd := exec.Command(cmd, args...)

	stdin, err := excmd.StdinPipe()
	if handle(err) {
		return
	}

	stdout, err := excmd.StdoutPipe()
	if handle(err) {
		return
	}

	if handle(excmd.Start()) {
		return
	}
	defer func() {
		// If we bail out early, stdin is still open and gnuplot keeps waiting
		// for input, which would block Wait forever. Closing twice is harmless.
		stdin.Close()
		excmd.Wait()
	}()

	_, err = stdin.Write(data)
	if handle(err) {
		return
	}
	err = stdin.Close()
	if handle(err) {
		return
	}

	out, err = io.ReadAll(stdout)
	if handle(err) {
		return
	}

	w.Header().Set("Content-Type", "image/svg+xml")
	w.Write(out)
	g.Set("plotErr", "")
}

// empty image to show if there's no plot...
// Encoded once at package initialization: servePlot runs concurrently for
// different HTTP requests, so lazy initialization would be a data race.
var empty_img = encodeEmptyIMG()

// emptyIMG returns a PNG-encoded, fully transparent 4x4 image.
func emptyIMG() []byte {
	return empty_img
}

// encodeEmptyIMG PNG-encodes a blank 4x4 NRGBA image.
func encodeEmptyIMG() []byte {
	o := bytes.NewBuffer(nil)
	// Writing to a bytes.Buffer does not fail and a small NRGBA image
	// is always encodable, so the error can safely be ignored.
	_ = png.Encode(o, image.NewNRGBA(image.Rect(0, 0, 4, 4)))
	return o.Bytes()
}
3-3.11.1/engine/quantity.go000066400000000000000000000032131503346766200154240ustar00rootroot00000000000000package engine

import (
	"github.com/mumax/3/cuda"
	"github.com/mumax/3/data"
	"reflect"
)

// Arbitrary physical quantity.
type Quantity interface { NComp() int EvalTo(dst *data.Slice) } func MeshSize() [3]int { return Mesh().Size() } func SizeOf(q Quantity) [3]int { // quantity defines its own, custom, implementation: if s, ok := q.(interface { Mesh() *data.Mesh }); ok { return s.Mesh().Size() } // otherwise: default mesh return MeshSize() } func AverageOf(q Quantity) []float64 { // quantity defines its own, custom, implementation: if s, ok := q.(interface { average() []float64 }); ok { return s.average() } // otherwise: default mesh buf := ValueOf(q) defer cuda.Recycle(buf) return sAverageMagnet(buf) } func NameOf(q Quantity) string { // quantity defines its own, custom, implementation: if s, ok := q.(interface { Name() string }); ok { return s.Name() } return "unnamed." + reflect.TypeOf(q).String() } func UnitOf(q Quantity) string { // quantity defines its own, custom, implementation: if s, ok := q.(interface { Unit() string }); ok { return s.Unit() } return "?" } func MeshOf(q Quantity) *data.Mesh { // quantity defines its own, custom, implementation: if s, ok := q.(interface { Mesh() *data.Mesh }); ok { return s.Mesh() } return Mesh() } func ValueOf(q Quantity) *data.Slice { // TODO: check for Buffered() implementation buf := cuda.Buffer(q.NComp(), SizeOf(q)) q.EvalTo(buf) return buf } // Temporary shim to fit Slice into EvalTo func EvalTo(q interface { Slice() (*data.Slice, bool) }, dst *data.Slice) { v, r := q.Slice() if r { defer cuda.Recycle(v) } data.Copy(dst, v) } 3-3.11.1/engine/regions.go000066400000000000000000000171701503346766200152230ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" ) var regions = Regions{info: info{1, "regions", ""}} // global regions map const NREGION = 256 // maximum number of regions, limited by size of byte. 
func init() { DeclFunc("DefRegion", DefRegion, "Define a material region with given index (0-255) and shape") DeclFunc("RedefRegion", RedefRegion, "Reassign all cells with a given region (first argument) to a new region (second argument)") DeclROnly("regions", ®ions, "Outputs the region index for each cell") DeclFunc("DefRegionCell", DefRegionCell, "Set a material region (first argument) in one cell "+ "by the index of the cell (last three arguments)") } // stores the region index for each cell type Regions struct { gpuCache *cuda.Bytes // TODO: rename: buffer hist []func(x, y, z float64) int // history of region set operations info } func (r *Regions) alloc() { mesh := r.Mesh() r.gpuCache = cuda.NewBytes(mesh.NCell()) DefRegion(0, universe) } func (r *Regions) resize() { newSize := Mesh().Size() r.gpuCache.Free() r.gpuCache = cuda.NewBytes(prod(newSize)) for _, f := range r.hist { r.render(f) } } // Define a region with id (0-255) to be inside the Shape. func DefRegion(id int, s Shape) { defRegionId(id) f := func(x, y, z float64) int { if s(x, y, z) { return id } else { return -1 } } regions.render(f) regions.hist = append(regions.hist, f) } // Redefine a region with a given ID to a new ID func RedefRegion(startId, endId int) { // Checks validity of input region IDs defRegionId(startId) defRegionId(endId) hist_len := len(regions.hist) // Only consider hist before this Redef to avoid recursion f := func(x, y, z float64) int { value := -1 for i := hist_len - 1; i >= 0; i-- { f_other := regions.hist[i] region := f_other(x, y, z) if region >= 0 { value = region break } } if value == startId { return endId } else { return value } } regions.redefine(startId, endId) regions.hist = append(regions.hist, f) } // renders (rasterizes) shape, filling it with region number #id, between x1 and x2 // TODO: a tidbit expensive func (r *Regions) render(f func(x, y, z float64) int) { n := Mesh().Size() l := r.HostList() // need to start from previous state arr := reshapeBytes(l, 
r.Mesh().Size()) for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { for ix := 0; ix < n[X]; ix++ { r := Index2Coord(ix, iy, iz) region := f(r[X], r[Y], r[Z]) if region >= 0 { arr[iz][iy][ix] = byte(region) } } } } //log.Print("regions.upload") r.gpuCache.Upload(l) } func (r *Regions) redefine(startId, endId int) { // Loop through all cells, if their region ID matches startId, change it to endId n := Mesh().Size() l := r.HostList() // need to start from previous state arr := reshapeBytes(l, r.Mesh().Size()) for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { for ix := 0; ix < n[X]; ix++ { if arr[iz][iy][ix] == byte(startId) { arr[iz][iy][ix] = byte(endId) } } } } r.gpuCache.Upload(l) } // get the region for position R based on the history func (r *Regions) get(R data.Vector) int { // reverse order, last one set wins. for i := len(r.hist) - 1; i >= 0; i-- { f := r.hist[i] region := f(R[X], R[Y], R[Z]) if region >= 0 { return region } } return 0 } func (r *Regions) HostArray() [][][]byte { return reshapeBytes(r.HostList(), r.Mesh().Size()) } func (r *Regions) HostList() []byte { regionsList := make([]byte, r.Mesh().NCell()) regions.gpuCache.Download(regionsList) return regionsList } func DefRegionCell(id int, x, y, z int) { defRegionId(id) index := data.Index(Mesh().Size(), x, y, z) regions.gpuCache.Set(index, byte(id)) } // Load regions from ovf file, use first component. 
// Regions should be between 0 and 256 func (r *Regions) LoadFile(fname string) { inSlice := LoadFile(fname) n := r.Mesh().Size() inSlice = data.Resample(inSlice, n) inArr := inSlice.Tensors()[0] l := r.HostList() arr := reshapeBytes(l, n) for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { for ix := 0; ix < n[X]; ix++ { val := inArr[iz][iy][ix] if val < 0 || val > 256 { util.Fatal("regions.LoadFile(", fname, "): all values should be between 0 & 256, have: ", val) } arr[iz][iy][ix] = byte(val) } } } r.gpuCache.Upload(l) } func (r *Regions) average() []float64 { s, recycle := r.Slice() if recycle { defer cuda.Recycle(s) } return sAverageUniverse(s) } func (r *Regions) Average() float64 { return r.average()[0] } // Set the region of one cell func (r *Regions) SetCell(ix, iy, iz int, region int) { size := Mesh().Size() i := data.Index(size, ix, iy, iz) r.gpuCache.Set(i, byte(region)) } func (r *Regions) GetCell(ix, iy, iz int) int { size := Mesh().Size() i := data.Index(size, ix, iy, iz) return int(r.gpuCache.Get(i)) } func defRegionId(id int) { if id < 0 || id > NREGION { util.Fatalf("region id should be 0 -%v, have: %v", NREGION, id) } checkMesh() } // normalized volume (0..1) of region. // TODO: a tidbit too expensive func (r *Regions) volume(region_ int) float64 { region := byte(region_) vol := 0 list := r.HostList() for _, reg := range list { if reg == region { vol++ } } V := float64(vol) / float64(r.Mesh().NCell()) return V } // Get the region data on GPU func (r *Regions) Gpu() *cuda.Bytes { return r.gpuCache } var unitMap regionwise // unit map used to output regions quantity func init() { unitMap.init(1, "unit", "", nil) for r := 0; r < NREGION; r++ { unitMap.setRegion(r, []float64{float64(r)}) } } // Get returns the regions as a slice of floats, so it can be output. 
func (r *Regions) Slice() (*data.Slice, bool) { buf := cuda.Buffer(1, r.Mesh().Size()) cuda.RegionDecode(buf, unitMap.gpuLUT1(), regions.Gpu()) return buf, true } func (r *Regions) EvalTo(dst *data.Slice) { EvalTo(r, dst) } var _ Quantity = ®ions // Re-interpret a contiguous array as a multi-dimensional array of given size. func reshapeBytes(array []byte, size [3]int) [][][]byte { Nx, Ny, Nz := size[X], size[Y], size[Z] util.Argument(Nx*Ny*Nz == len(array)) sliced := make([][][]byte, Nz) for i := range sliced { sliced[i] = make([][]byte, Ny) } for i := range sliced { for j := range sliced[i] { sliced[i][j] = array[(i*Ny+j)*Nx+0 : (i*Ny+j)*Nx+Nx] } } return sliced } func (b *Regions) shift(dx int) { // TODO: return if no regions defined r1 := b.Gpu() r2 := cuda.NewBytes(b.Mesh().NCell()) // TODO: somehow recycle defer r2.Free() newreg := byte(0) // new region at edge cuda.ShiftBytes(r2, r1, b.Mesh(), dx, newreg) r1.Copy(r2) n := Mesh().Size() x1, x2 := shiftDirtyRange(dx, X) for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { for ix := x1; ix < x2; ix++ { r := Index2Coord(ix, iy, iz) // includes shift reg := b.get(r) if reg != 0 { b.SetCell(ix, iy, iz, reg) // a bit slowish, but hardly reached } } } } } func (b *Regions) shiftY(dy int) { // TODO: return if no regions defined r1 := b.Gpu() r2 := cuda.NewBytes(b.Mesh().NCell()) // TODO: somehow recycle defer r2.Free() newreg := byte(0) // new region at edge cuda.ShiftBytesY(r2, r1, b.Mesh(), dy, newreg) r1.Copy(r2) n := Mesh().Size() y1, y2 := shiftDirtyRange(dy, Y) for iz := 0; iz < n[Z]; iz++ { for ix := 0; ix < n[X]; ix++ { for iy := y1; iy < y2; iy++ { r := Index2Coord(ix, iy, iz) // includes shift reg := b.get(r) if reg != 0 { b.SetCell(ix, iy, iz, reg) // a bit slowish, but hardly reached } } } } } func (r *Regions) Mesh() *data.Mesh { return Mesh() } func prod(s [3]int) int { return s[0] * s[1] * s[2] } 
3-3.11.1/engine/relax.go000066400000000000000000000055751503346766200146760ustar00rootroot00000000000000package engine // Relax tries to find the minimum energy state. import ( "math" "github.com/mumax/3/cuda" ) // Stopping relax Maxtorque in T. The user can check MaxTorque for sane values (e.g. 1e-3). // If set to <=0, relax() will stop when the average torque is steady or increasing. var RelaxTorqueThreshold float64 = -1. func init() { DeclFunc("Relax", Relax, "Try to minimize the total energy") DeclVar("RelaxTorqueThreshold", &RelaxTorqueThreshold, "MaxTorque threshold for relax(). If set to -1 (default), relax() will stop when the average torque is steady or increasing.") } // are we relaxing? var relaxing = false func Relax() { SanityCheck() pause = false // Save the settings we are changing... prevType := solvertype prevErr := MaxErr prevFixDt := FixDt prevPrecess := Precess // ...to restore them later defer func() { SetSolver(prevType) MaxErr = prevErr FixDt = prevFixDt Precess = prevPrecess relaxing = false // Temp.upd_reg = prevTemp // Temp.invalidate() // Temp.update() }() // Set good solver for relax SetSolver(BOGACKISHAMPINE) FixDt = 0 Precess = false relaxing = true // Minimize energy: take steps as long as energy goes down. // This stops when energy reaches the numerical noise floor. const N = 3 // evaluate energy (expensive) every N steps relaxSteps(N) E0 := GetTotalEnergy() relaxSteps(N) E1 := GetTotalEnergy() for E1 < E0 && !pause { relaxSteps(N) E0, E1 = E1, GetTotalEnergy() } // Now we are already close to equilibrium, but energy is too noisy to be used any further. // So now we minimize the torque which is less noisy. solver := stepper.(*RK23) defer stepper.Free() // purge previous rk.k1 because FSAL will be dead wrong. maxTorque := func() float64 { return cuda.MaxVecNorm(solver.k1) } avgTorque := func() float32 { return cuda.Dot(solver.k1, solver.k1) } if RelaxTorqueThreshold > 0 { // run as long as the max torque is above threshold. 
Then increase the accuracy and step more. for !pause { for maxTorque() > RelaxTorqueThreshold && !pause { relaxSteps(N) } MaxErr /= math.Sqrt2 if MaxErr < 1e-9 { break } } } else { // previous ( 1e-9 && !pause { MaxErr /= math.Sqrt2 relaxSteps(N) // TODO: Play with other values T0, T1 = T1, avgTorque() for T1 < T0 && !pause { relaxSteps(N) // TODO: Play with other values T0, T1 = T1, avgTorque() } } } pause = true } // take n steps without setting pause when done or advancing time func relaxSteps(n int) { t0 := Time stop := NSteps + n cond := func() bool { return NSteps < stop } const output = false runWhile(cond, output) Time = t0 } 3-3.11.1/engine/render.go000066400000000000000000000063401503346766200150310ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/draw" "image" "image/jpeg" "math" "net/http" "sync" ) type render struct { mutex sync.Mutex quant Quantity comp string layer, scale int saveCount int // previous max slider value of time rescaleBuf *data.Slice // GPU imgBuf *data.Slice // CPU img_ *image.RGBA } const ( maxScale = 32 // maximum zoom-out setting maxImgSize = 512 // maximum render image size ) // Render image of quantity. 
func (g *guistate) ServeRender(w http.ResponseWriter, r *http.Request) { g.render.mutex.Lock() defer g.render.mutex.Unlock() g.render.render() jpeg.Encode(w, g.render.img_, &jpeg.Options{Quality: 100}) } // rescale and download quantity, save in rescaleBuf func (ren *render) download() { InjectAndWait(func() { if ren.quant == nil { // not yet set, default = m ren.quant = &M } quant := ren.quant size := MeshOf(quant).Size() // don't slice out of bounds renderLayer := ren.layer if renderLayer >= size[Z] { renderLayer = size[Z] - 1 } if renderLayer < 0 { renderLayer = 0 } // scaling sanity check if ren.scale < 1 { ren.scale = 1 } if ren.scale > maxScale { ren.scale = maxScale } // Don't render too large images or we choke for size[X]/ren.scale > maxImgSize { ren.scale++ } for size[Y]/ren.scale > maxImgSize { ren.scale++ } for i := range size { size[i] /= ren.scale if size[i] == 0 { size[i] = 1 } } size[Z] = 1 // selects one layer // make sure buffers are there if ren.imgBuf.Size() != size { ren.imgBuf = data.NewSlice(3, size) // always 3-comp, may be re-used } buf := ValueOf(quant) defer cuda.Recycle(buf) if !buf.GPUAccess() { ren.imgBuf = Download(quant) // fallback (no zoom) return } // make sure buffers are there (in CUDA context) if ren.rescaleBuf.Size() != size { ren.rescaleBuf.Free() ren.rescaleBuf = cuda.NewSlice(1, size) } for c := 0; c < quant.NComp(); c++ { cuda.Resize(ren.rescaleBuf, buf.Comp(c), renderLayer) data.Copy(ren.imgBuf.Comp(c), ren.rescaleBuf) } }) } var arrowSize = 16 func (ren *render) render() { ren.download() // imgBuf always has 3 components, we may need just one... d := ren.imgBuf comp := ren.comp quant := ren.quant if comp == "" { normalize(d) } if comp != "" && quant.NComp() > 1 { // ... if one has been selected by gui d = d.Comp(compstr[comp]) } if quant.NComp() == 1 { // ...or if the original data only had one (!) 
d = d.Comp(0) } if ren.img_ == nil { ren.img_ = new(image.RGBA) } draw.On(ren.img_, d, "auto", "auto", arrowSize) } var compstr = map[string]int{"x": 0, "y": 1, "z": 2} func normalize(f *data.Slice) { a := f.Vectors() maxnorm := 0. for i := range a[0] { for j := range a[0][i] { for k := range a[0][i][j] { x, y, z := a[0][i][j][k], a[1][i][j][k], a[2][i][j][k] norm := math.Sqrt(float64(x*x + y*y + z*z)) if norm > maxnorm { maxnorm = norm } } } } factor := float32(1 / maxnorm) for i := range a[0] { for j := range a[0][i] { for k := range a[0][i][j] { a[0][i][j][k] *= factor a[1][i][j][k] *= factor a[2][i][j][k] *= factor } } } } 3-3.11.1/engine/rk23.go000066400000000000000000000050571503346766200143370ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" "math" ) // Bogacki-Shampine solver. 3rd order, 3 evaluations per step, adaptive step. // // http://en.wikipedia.org/wiki/Bogacki-Shampine_method // // k1 = f(tn, yn) // k2 = f(tn + 1/2 h, yn + 1/2 h k1) // k3 = f(tn + 3/4 h, yn + 3/4 h k2) // y{n+1} = yn + 2/9 h k1 + 1/3 h k2 + 4/9 h k3 // 3rd order // k4 = f(tn + h, y{n+1}) // z{n+1} = yn + 7/24 h k1 + 1/4 h k2 + 1/3 h k3 + 1/8 h k4 // 2nd order type RK23 struct { k1 *data.Slice // torque at end of step is kept for beginning of next step } func (rk *RK23) Step() { m := M.Buffer() size := m.Size() if FixDt != 0 { Dt_si = FixDt } // upon resize: remove wrongly sized k1 if rk.k1.Size() != m.Size() { rk.Free() } // first step ever: one-time k1 init and eval if rk.k1 == nil { rk.k1 = cuda.NewSlice(3, size) torqueFn(rk.k1) } // FSAL cannot be used with temperature if !Temp.isZero() { torqueFn(rk.k1) } t0 := Time // backup magnetization m0 := cuda.Buffer(3, size) defer cuda.Recycle(m0) data.Copy(m0, m) k2, k3, k4 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size) defer cuda.Recycle(k2) defer cuda.Recycle(k3) defer cuda.Recycle(k4) h := float32(Dt_si * GammaLL) // internal time 
step = Dt * gammaLL // there is no explicit stage 1: k1 from previous step // stage 2 Time = t0 + (1./2.)*Dt_si cuda.Madd2(m, m, rk.k1, 1, (1./2.)*h) // m = m*1 + k1*h/2 M.normalize() torqueFn(k2) // stage 3 Time = t0 + (3./4.)*Dt_si cuda.Madd2(m, m0, k2, 1, (3./4.)*h) // m = m0*1 + k2*3/4 M.normalize() torqueFn(k3) // 3rd order solution cuda.Madd4(m, m0, rk.k1, k2, k3, 1, (2./9.)*h, (1./3.)*h, (4./9.)*h) M.normalize() // error estimate Time = t0 + Dt_si torqueFn(k4) Err := k2 // re-use k2 as error // difference of 3rd and 2nd order torque without explicitly storing them first cuda.Madd4(Err, rk.k1, k2, k3, k4, (7./24.)-(2./9.), (1./4.)-(1./3.), (1./3.)-(4./9.), (1. / 8.)) // determine error err := cuda.MaxVecNorm(Err) * float64(h) // adjust next time step if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop // step OK setLastErr(err) setMaxTorque(k4) NSteps++ Time = t0 + Dt_si adaptDt(math.Pow(MaxErr/err, 1./3.)) data.Copy(rk.k1, k4) // FSAL } else { // undo bad step //util.Println("Bad step at t=", t0, ", err=", err) util.Assert(FixDt == 0) Time = t0 data.Copy(m, m0) NUndone++ adaptDt(math.Pow(MaxErr/err, 1./4.)) } } func (rk *RK23) Free() { rk.k1.Free() rk.k1 = nil } 3-3.11.1/engine/rk4.go000066400000000000000000000030341503346766200142470ustar00rootroot00000000000000package engine import ( "math" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Classical 4th order RK solver. 
type RK4 struct {
}

// Step attempts one step of the classical 4th order Runge-Kutta method.
// The error is estimated from the difference between the first and last
// torque evaluation. An accepted step applies the 4th order solution and
// increments NSteps; a rejected step restores the magnetization and Time
// and increments NUndone. In both cases the next time step is adapted.
func (rk *RK4) Step() {
	m := M.Buffer()
	size := m.Size()

	if FixDt != 0 {
		Dt_si = FixDt
	}

	t0 := Time
	// backup magnetization
	m0 := cuda.Buffer(3, size)
	defer cuda.Recycle(m0)
	data.Copy(m0, m)

	k1, k2, k3, k4 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size)

	defer cuda.Recycle(k1)
	defer cuda.Recycle(k2)
	defer cuda.Recycle(k3)
	defer cuda.Recycle(k4)

	h := float32(Dt_si * GammaLL) // internal time step = Dt * gammaLL

	// stage 1
	torqueFn(k1)

	// stage 2
	Time = t0 + (1./2.)*Dt_si
	cuda.Madd2(m, m, k1, 1, (1./2.)*h) // m = m*1 + k1*h/2
	M.normalize()
	torqueFn(k2)

	// stage 3
	// (Time is still t0 + Dt_si/2, as set in stage 2)
	cuda.Madd2(m, m0, k2, 1, (1./2.)*h) // m = m0*1 + k2*1/2
	M.normalize()
	torqueFn(k3)

	// stage 4
	Time = t0 + Dt_si
	cuda.Madd2(m, m0, k3, 1, 1.*h) // m = m0*1 + k3*1
	M.normalize()
	torqueFn(k4)

	err := cuda.MaxVecDiff(k1, k4) * float64(h)

	// adjust next time step
	if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop
		// step OK
		// 4th order solution
		cuda.Madd5(m, m0, k1, k2, k3, k4, 1, (1./6.)*h, (1./3.)*h, (1./3.)*h, (1./6.)*h)
		M.normalize()
		NSteps++
		adaptDt(math.Pow(MaxErr/err, 1./4.))
		setLastErr(err)
		setMaxTorque(k4)
	} else {
		// undo bad step
		util.Assert(FixDt == 0)
		Time = t0
		data.Copy(m, m0)
		NUndone++
		adaptDt(math.Pow(MaxErr/err, 1./5.))
	}
}

// Free does nothing: RK4 keeps no state between steps.
func (*RK4) Free() {}
3-3.11.1/engine/rk45dp.go000066400000000000000000000057761503346766200146770ustar00rootroot00000000000000package engine

import (
	"github.com/mumax/3/cuda"
	"github.com/mumax/3/data"
	"github.com/mumax/3/util"
	"math"
)

// RK45DP is an adaptive-step solver with seven torque stages per step; the
// torque of the last stage is re-used as first stage of the next step (FSAL).
// NOTE(review): the file name and coefficients suggest the Dormand-Prince
// method; confirm against the published tableau.
type RK45DP struct {
	k1 *data.Slice // torque at end of step is kept for beginning of next step
}

// Step attempts one adaptive step. An accepted step keeps the 5th order
// solution, updates Time and NSteps and stores the last torque in k1;
// a rejected step restores the magnetization and Time and increments NUndone.
// In both cases the next time step is adapted to the estimated error.
func (rk *RK45DP) Step() {
	m := M.Buffer()
	size := m.Size()

	if FixDt != 0 {
		Dt_si = FixDt
	}

	// upon resize: remove wrongly sized k1
	if rk.k1.Size() != m.Size() {
		rk.Free()
	}

	// first step ever: one-time k1 init and eval
	if rk.k1 == nil {
		rk.k1 = cuda.NewSlice(3, size)
		torqueFn(rk.k1)
	}

	// FSAL cannot be used with finite temperature
	if !Temp.isZero() {
		torqueFn(rk.k1)
	}

	t0 := Time
	// backup magnetization
	m0 := cuda.Buffer(3, size)
	defer cuda.Recycle(m0)
	data.Copy(m0, m)

	k2, k3, k4, k5, k6 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size)
	defer cuda.Recycle(k2)
	defer cuda.Recycle(k3)
	defer cuda.Recycle(k4)
	defer cuda.Recycle(k5)
	defer cuda.Recycle(k6)
	// k2 will be re-used as k7

	h := float32(Dt_si * GammaLL) // internal time step = Dt * gammaLL

	// there is no explicit stage 1: k1 from previous step

	// stage 2
	Time = t0 + (1./5.)*Dt_si
	cuda.Madd2(m, m, rk.k1, 1, (1./5.)*h) // m = m*1 + k1*h/5
	M.normalize()
	torqueFn(k2)

	// stage 3
	Time = t0 + (3./10.)*Dt_si
	cuda.Madd3(m, m0, rk.k1, k2, 1, (3./40.)*h, (9./40.)*h)
	M.normalize()
	torqueFn(k3)

	// stage 4
	Time = t0 + (4./5.)*Dt_si
	cuda.Madd4(m, m0, rk.k1, k2, k3, 1, (44./45.)*h, (-56./15.)*h, (32./9.)*h)
	M.normalize()
	torqueFn(k4)

	// stage 5
	Time = t0 + (8./9.)*Dt_si
	cuda.Madd5(m, m0, rk.k1, k2, k3, k4, 1, (19372./6561.)*h, (-25360./2187.)*h, (64448./6561.)*h, (-212./729.)*h)
	M.normalize()
	torqueFn(k5)

	// stage 6
	Time = t0 + (1.)*Dt_si
	cuda.Madd6(m, m0, rk.k1, k2, k3, k4, k5, 1, (9017./3168.)*h, (-355./33.)*h, (46732./5247.)*h, (49./176.)*h, (-5103./18656.)*h)
	M.normalize()
	torqueFn(k6)

	// stage 7: 5th order solution
	Time = t0 + (1.)*Dt_si
	// no k2
	cuda.Madd6(m, m0, rk.k1, k3, k4, k5, k6, 1, (35./384.)*h, (500./1113.)*h, (125./192.)*h, (-2187./6784.)*h, (11./84.)*h) // 5th
	M.normalize()
	k7 := k2     // re-use k2
	torqueFn(k7) // next torque if OK

	// error estimate
	Err := cuda.Buffer(3, size) //k3 // re-use k3 as error estimate
	defer cuda.Recycle(Err)
	cuda.Madd6(Err, rk.k1, k3, k4, k5, k6, k7, (35./384.)-(5179./57600.), (500./1113.)-(7571./16695.), (125./192.)-(393./640.), (-2187./6784.)-(-92097./339200.), (11./84.)-(187./2100.), (0.)-(1./40.))

	// determine error
	err := cuda.MaxVecNorm(Err) * float64(h)

	// adjust next time step
	if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop
		// step OK
		setLastErr(err)
		setMaxTorque(k7)
		NSteps++
		Time = t0 + Dt_si
		adaptDt(math.Pow(MaxErr/err, 1./5.))
		data.Copy(rk.k1, k7) // FSAL
	} else {
		// undo bad step
		//util.Println("Bad step at t=", t0, ", err=", err)
		util.Assert(FixDt == 0)
		Time = t0
		data.Copy(m, m0)
		NUndone++
		adaptDt(math.Pow(MaxErr/err, 1./6.))
	}
}

// Free releases the stored torque k1, so the next Step re-evaluates it.
func (rk *RK45DP) Free() {
	rk.k1.Free()
	rk.k1 = nil
}
3-3.11.1/engine/rk56.go000066400000000000000000000056601503346766200143450ustar00rootroot00000000000000package engine

import (
	"math"

	"github.com/mumax/3/cuda"
	"github.com/mumax/3/data"
	"github.com/mumax/3/util"
)

// RK56 is an adaptive-step solver with eight torque evaluations per step.
// It keeps no state between steps.
type RK56 struct {
}

// Step attempts one adaptive step. An accepted step keeps the 6th order
// solution and updates Time and NSteps; a rejected step restores the
// magnetization and Time and increments NUndone. In both cases the next
// time step is adapted to the estimated error.
func (rk *RK56) Step() {

	m := M.Buffer()
	size := m.Size()

	if FixDt != 0 {
		Dt_si = FixDt
	}

	t0 := Time
	// backup magnetization
	m0 := cuda.Buffer(3, size)
	defer cuda.Recycle(m0)
	data.Copy(m0, m)

	k1, k2, k3, k4, k5, k6, k7, k8 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size)
	defer cuda.Recycle(k1)
	defer cuda.Recycle(k2)
	defer cuda.Recycle(k3)
	defer cuda.Recycle(k4)
	defer cuda.Recycle(k5)
	defer cuda.Recycle(k6)
	defer cuda.Recycle(k7)
	defer cuda.Recycle(k8)

	h := float32(Dt_si * GammaLL) // internal time step = Dt * gammaLL

	// stage 1
	torqueFn(k1)

	// stage 2
	Time = t0 + (1./6.)*Dt_si
	cuda.Madd2(m, m, k1, 1, (1./6.)*h) // m = m*1 + k1*h/6
	M.normalize()
	torqueFn(k2)

	// stage 3
	Time = t0 + (4./15.)*Dt_si
	cuda.Madd3(m, m0, k1, k2, 1, (4./75.)*h, (16./75.)*h)
	M.normalize()
	torqueFn(k3)

	// stage 4
	Time = t0 + (2./3.)*Dt_si
	cuda.Madd4(m, m0, k1, k2, k3, 1, (5./6.)*h, (-8./3.)*h, (5./2.)*h)
	M.normalize()
	torqueFn(k4)

	// stage 5
	Time = t0 + (4./5.)*Dt_si
	cuda.Madd5(m, m0, k1, k2, k3, k4, 1, (-8./5.)*h, (144./25.)*h, (-4.)*h, (16./25.)*h)
	M.normalize()
	torqueFn(k5)

	// stage 6
	Time = t0 + (1.)*Dt_si
	cuda.Madd6(m, m0, k1, k2, k3, k4, k5, 1, (361./320.)*h, (-18./5.)*h, (407./128.)*h, (-11./80.)*h, (55./128.)*h)
	M.normalize()
	torqueFn(k6)

	// stage 7
	// NOTE(review): this stage is evaluated at the start time t0 — presumably
	// the tableau's time coefficient for it is 0; confirm against the reference.
	Time = t0
	cuda.Madd5(m, m0, k1, k3, k4, k5, 1, (-11./640.)*h, (11./256.)*h, (-11/160.)*h, (11./256.)*h)
	M.normalize()
	torqueFn(k7)

	// stage 8
	Time = t0 + (1.)*Dt_si
	cuda.Madd7(m, m0, k1, k2, k3, k4, k5, k7, 1, (93./640.)*h, (-18./5.)*h, (803./256.)*h, (-11./160.)*h, (99./256.)*h, (1.)*h)
	M.normalize()
	torqueFn(k8)

	// stage 9: 6th order solution
	Time = t0 + (1.)*Dt_si
	//madd6(m, m0, k1, k3, k4, k5, k6, 1, (31./384.)*h, (1125./2816.)*h, (9./32.)*h, (125./768.)*h, (5./66.)*h)
	cuda.Madd7(m, m0, k1, k3, k4, k5, k7, k8, 1, (7./1408.)*h, (1125./2816.)*h, (9./32.)*h, (125./768.)*h, (5./66.)*h, (5./66.)*h)
	M.normalize()
	// No need for torqueFn(k9) as k9 wouldn't be used (except in setMaxTorque, which is irrelevant)

	// error estimate
	Err := cuda.Buffer(3, size)
	defer cuda.Recycle(Err)
	cuda.Madd4(Err, k1, k6, k7, k8, (-5. / 66.), (-5. / 66.), (5. / 66.), (5. / 66.))

	// determine error
	err := cuda.MaxVecNorm(Err) * float64(h)

	// adjust next time step
	if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop
		// step OK
		setLastErr(err)
		setMaxTorque(k8)
		NSteps++
		Time = t0 + Dt_si
		adaptDt(math.Pow(MaxErr/err, 1./6.))
	} else {
		// undo bad step
		//util.Println("Bad step at t=", t0, ", err=", err)
		util.Assert(FixDt == 0)
		Time = t0
		data.Copy(m, m0)
		NUndone++
		adaptDt(math.Pow(MaxErr/err, 1./7.))
	}
}

// Free does nothing: RK56 keeps no state between steps.
func (rk *RK56) Free() {
}
3-3.11.1/engine/run.go000066400000000000000000000152321503346766200143560ustar00rootroot00000000000000package engine

import (
	"fmt"
	"math"
	"os"

	"github.com/mumax/3/cuda"
	"github.com/mumax/3/data"
	"github.com/mumax/3/util"
)

// Solver globals
var (
	Time     float64             // time in seconds
	alarm    float64             // alarm clock marks end time of run, dt adaptation must not cross it!
	pause    = true              // set pause at any time to stop running after the current step
	postStep []func()            // called on after every full time step
	Inject   = make(chan func()) // injects code in between time steps. Used by web interface.
Dt_si float64 = 1e-15 // time step = dt_si (seconds) *dt_mul, which should be nice float32 MinDt, MaxDt float64 // minimum and maximum time step MaxErr float64 = 1e-5 // maximum error/step Headroom float64 = 0.8 // solver headroom, (Gustafsson, 1992, Control of Error and Convergence in ODE Solvers) LastErr, PeakErr float64 // error of last step, highest error ever LastTorque float64 // maxTorque of last time step NSteps, NUndone, NEvals int // number of good steps, undone steps FixDt float64 // fixed time step? stepper Stepper // generic step, can be EulerStep, HeunStep, etc solvertype int ) func init() { DeclFunc("Run", Run, "Run the simulation for a time in seconds") DeclFunc("Steps", Steps, "Run the simulation for a number of time steps") DeclFunc("RunWhile", RunWhile, "Run while condition function is true") DeclFunc("SetSolver", SetSolver, "Set solver type.
    1: Euler
    2: Heun
    3: Bogacki-Shampine
    4: Runge-Kutta (RK4)
    5: Dormand-Prince
    6: Fehlberg
    -1: Backward Euler") DeclFunc("ClearPostSteps", func() { postStep = nil }, "Clear the postStep array, which contains functions that are executed after each solver step. This includes running averages, centering routines to track skyrmions and domain walls etc.") DeclTVar("t", &Time, "Total simulated time (s)") DeclVar("step", &NSteps, "Total number of time steps taken") DeclVar("MinDt", &MinDt, "Minimum time step the solver can take (s)") DeclVar("MaxDt", &MaxDt, "Maximum time step the solver can take (s)") DeclVar("MaxErr", &MaxErr, "Maximum error per step the solver can tolerate (default = 1e-5)") DeclVar("Headroom", &Headroom, "Solver headroom (default = 0.8)") DeclVar("FixDt", &FixDt, "Set a fixed time step, 0 disables fixed step (which is the default)") DeclFunc("Exit", Exit, "Exit from the program") SetSolver(DORMANDPRINCE) _ = NewScalarValue("dt", "s", "Time Step", func() float64 { return Dt_si }) _ = NewScalarValue("LastErr", "", "Error of last step", func() float64 { return LastErr }) _ = NewScalarValue("PeakErr", "", "Overall maxium error per step", func() float64 { return PeakErr }) _ = NewScalarValue("NEval", "", "Total number of torque evaluations", func() float64 { return float64(NEvals) }) } // Time stepper like Euler, Heun, RK23 type Stepper interface { Step() // take time step using solver globals Free() // free resources, if any (e.g.: RK23 previous torque) } // Arguments for SetSolver const ( BACKWARD_EULER = -1 EULER = 1 HEUN = 2 BOGACKISHAMPINE = 3 RUNGEKUTTA = 4 DORMANDPRINCE = 5 FEHLBERG = 6 ) func SetSolver(typ int) { // free previous solver, if any if stepper != nil { stepper.Free() } switch typ { default: util.Fatalf("SetSolver: unknown solver type: %v", typ) case BACKWARD_EULER: stepper = new(BackwardEuler) case EULER: stepper = new(Euler) case HEUN: stepper = new(Heun) case BOGACKISHAMPINE: stepper = new(RK23) case RUNGEKUTTA: stepper = new(RK4) case DORMANDPRINCE: stepper = new(RK45DP) case FEHLBERG: stepper = new(RK56) } solvertype 
= typ } // write torque to dst and increment NEvals func torqueFn(dst *data.Slice) { SetTorque(dst) NEvals++ } // returns number of torque evaluations func getNEval() int { return NEvals } // update lastErr and peakErr func setLastErr(err float64) { LastErr = err if err > PeakErr { PeakErr = err } } func setMaxTorque(τ *data.Slice) { LastTorque = cuda.MaxVecNorm(τ) } // adapt time step: dt *= corr, but limited to sensible values. func adaptDt(corr float64) { if FixDt != 0 { Dt_si = FixDt return } // corner case triggered by err = 0: just keep time step. // see test/regression017.mx3 if math.IsNaN(corr) { corr = 1 } util.AssertMsg(corr != 0, "Time step too small, check if parameters are sensible") corr *= Headroom if corr > 2 { corr = 2 } if corr < 1./2. { corr = 1. / 2. } Dt_si *= corr if MinDt != 0 && Dt_si < MinDt { Dt_si = MinDt } if MaxDt != 0 && Dt_si > MaxDt { Dt_si = MaxDt } if Dt_si == 0 { util.Fatal("time step too small") } // do not cross alarm time if Time < alarm && Time+Dt_si > alarm { Dt_si = alarm - Time } util.AssertMsg(Dt_si > 0, fmt.Sprint("Time step too small: ", Dt_si)) } // Run the simulation for a number of seconds. func Run(seconds float64) { stop := Time + seconds alarm = stop // don't have dt adapt to go over alarm RunWhile(func() bool { return Time < stop }) } // Run the simulation for a number of steps. func Steps(n int) { stop := NSteps + n RunWhile(func() bool { return NSteps < stop }) } // Runs as long as condition returns true, saves output. func RunWhile(condition func() bool) { SanityCheck() pause = false // may be set by <-Inject const output = true stepper.Free() // start from a clean state runWhile(condition, output) pause = true } func runWhile(condition func() bool, output bool) { DoOutput() // allow t=0 output for condition() && !pause { select { default: step(output) // accept tasks form Inject channel case f := <-Inject: f() } } } // Runs as long as browser is connected to gui. 
func RunInteractive() { gui_.RunInteractive() } // take one time step func step(output bool) { stepper.Step() for _, f := range postStep { f() } if output { DoOutput() } } // Register function f to be called after every time step. // Typically used, e.g., to manipulate the magnetization. func PostStep(f func()) { postStep = append(postStep, f) } // inject code into engine and wait for it to complete. func InjectAndWait(task func()) { ready := make(chan int) Inject <- func() { task(); ready <- 1 } <-ready } func SanityCheck() { if Msat.isZero() { util.Log("Note: Msat = 0") } if Aex.isZero() { util.Log("Note: Aex = 0") } } func Exit() { Close() os.Exit(0) } 3-3.11.1/engine/save.go000066400000000000000000000106701503346766200145110ustar00rootroot00000000000000package engine import ( "fmt" "path" "reflect" "strings" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/draw" "github.com/mumax/3/dump" "github.com/mumax/3/httpfs" "github.com/mumax/3/oommf" "github.com/mumax/3/util" ) func init() { DeclFunc("Save", Save, "Save space-dependent quantity once, with auto filename") DeclFunc("SaveAs", SaveAs, "Save space-dependent quantity with custom filename") DeclLValue("FilenameFormat", &fformat{}, "printf formatting string for output filenames.") DeclLValue("OutputFormat", &oformat{}, "Format for data files: OVF1_TEXT, OVF1_BINARY, OVF2_TEXT or OVF2_BINARY") DeclROnly("OVF1_BINARY", OVF1_BINARY, "OutputFormat = OVF1_BINARY sets binary OVF1 output") DeclROnly("OVF2_BINARY", OVF2_BINARY, "OutputFormat = OVF2_BINARY sets binary OVF2 output") DeclROnly("OVF1_TEXT", OVF1_TEXT, "OutputFormat = OVF1_TEXT sets text OVF1 output") DeclROnly("OVF2_TEXT", OVF2_TEXT, "OutputFormat = OVF2_TEXT sets text OVF2 output") DeclROnly("DUMP", DUMP, "OutputFormat = DUMP sets text DUMP output") DeclFunc("Snapshot", Snapshot, "Save image of quantity") DeclFunc("SnapshotAs", SnapshotAs, "Save image of quantity with custom filename") DeclVar("SnapshotFormat", &SnapshotFormat, 
"Image format for snapshots: jpg, png or gif.") } var ( FilenameFormat = "%s%06d" // formatting string for auto filenames. SnapshotFormat = "jpg" // user-settable snapshot format outputFormat = OVF2_BINARY // user-settable output format ) type fformat struct{} func (*fformat) Eval() interface{} { return FilenameFormat } func (*fformat) SetValue(v interface{}) { drainOutput(); FilenameFormat = v.(string) } func (*fformat) Type() reflect.Type { return reflect.TypeOf("") } type oformat struct{} func (*oformat) Eval() interface{} { return outputFormat } func (*oformat) SetValue(v interface{}) { drainOutput(); outputFormat = v.(OutputFormat) } func (*oformat) Type() reflect.Type { return reflect.TypeOf(OutputFormat(OVF2_BINARY)) } // Save once, with auto file name func Save(q Quantity) { qname := NameOf(q) fname := autoFname(NameOf(q), outputFormat, autonum[qname]) SaveAs(q, fname) autonum[qname]++ } // Save under given file name (transparent async I/O). func SaveAs(q Quantity, fname string) { if !strings.HasPrefix(fname, OD()) { fname = OD() + fname // don't clean, turns http:// in http:/ } if path.Ext(fname) == "" { fname += ("." 
+ StringFromOutputFormat[outputFormat]) }

	buffer := ValueOf(q) // TODO: check and optimize for Buffer()
	defer cuda.Recycle(buffer)
	info := data.Meta{Time: Time, Name: NameOf(q), Unit: UnitOf(q), CellSize: MeshOf(q).CellSize()}
	data := buffer.HostCopy() // must be copy (async io)
	queOutput(func() { saveAs_sync(fname, data, info, outputFormat) })
}

// Snapshot saves an image of q once, under an automatically numbered file
// name built from FilenameFormat and SnapshotFormat inside the output
// directory. The per-quantity counter autonum is incremented afterwards.
func Snapshot(q Quantity) {
	qname := NameOf(q)
	// Only FilenameFormat is a printf format. The output directory and the
	// image extension are appended verbatim so that a '%' in either of them
	// cannot be misread as a formatting verb.
	fname := OD() + fmt.Sprintf(FilenameFormat, qname, autonum[qname]) + "." + SnapshotFormat
	s := ValueOf(q)
	defer cuda.Recycle(s)
	data := s.HostCopy() // must be copy (asyncio)
	queOutput(func() { snapshot_sync(fname, data) })
	autonum[qname]++
}

// SnapshotAs saves an image of q under the given file name. The name is
// placed in the output directory unless it already starts with it. When the
// name has no extension, the user-settable SnapshotFormat is appended.
func SnapshotAs(q Quantity, fname string) {
	if !strings.HasPrefix(fname, OD()) {
		fname = OD() + fname // don't clean, turns http:// in http:/
	}

	if path.Ext(fname) == "" {
		// An image needs an image extension: snapshot_sync hands path.Ext(fname)
		// to the renderer as the image format. The data-file extension
		// (StringFromOutputFormat[outputFormat]) is only meaningful for SaveAs.
		fname += "." + SnapshotFormat
	}
	s := ValueOf(q)
	defer cuda.Recycle(s)
	data := s.HostCopy() // must be copy (asyncio)
	queOutput(func() { snapshot_sync(fname, data) })
}

// snapshot_sync synchronously renders output to the image file fname.
// The image format is taken from the extension of fname.
func snapshot_sync(fname string, output *data.Slice) {
	f, err := httpfs.Create(fname)
	util.FatalErr(err)
	defer f.Close()
	draw.RenderFormat(f, output, "auto", "auto", arrowSize, path.Ext(fname))
}

// saveAs_sync synchronously writes s, with metadata info, to the file fname
// in the requested output format. It panics on a format it does not know.
func saveAs_sync(fname string, s *data.Slice, info data.Meta, format OutputFormat) {
	f, err := httpfs.Create(fname)
	util.FatalErr(err)
	defer f.Close()

	switch format {
	case OVF1_TEXT:
		oommf.WriteOVF1(f, s, info, "text")
	case OVF1_BINARY:
		oommf.WriteOVF1(f, s, info, "binary 4")
	case OVF2_TEXT:
		oommf.WriteOVF2(f, s, info, "text")
	case OVF2_BINARY:
		oommf.WriteOVF2(f, s, info, "binary 4")
	case DUMP:
		dump.Write(f, s, info)
	default:
		panic("invalid output format")
	}
}

// OutputFormat selects the on-disk format used for data files.
type OutputFormat int

// Available output formats. Numbering starts at 1, so the zero value of
// OutputFormat is not a valid format.
const (
	OVF1_TEXT OutputFormat = iota + 1
	OVF1_BINARY
	OVF2_TEXT
	OVF2_BINARY
	DUMP
)

var (
	// StringFromOutputFormat maps each output format to the file extension
	// (without the dot) appended to extension-less data file names.
	StringFromOutputFormat = map[OutputFormat]string{
		OVF1_TEXT:   "ovf",
		OVF1_BINARY: "ovf",
		OVF2_TEXT:   "ovf",
		OVF2_BINARY: "ovf",
		DUMP:        "dump"}
)
3-3.11.1/engine/scalar_excitation.go000066400000000000000000000077151503346766200172550ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/script" "github.com/mumax/3/util" "reflect" ) // An excitation, typically field or current, // can be defined region-wise plus extra mask*multiplier terms. type ScalarExcitation struct { name string perRegion RegionwiseScalar // Region-based excitation extraTerms []mulmask // add extra mask*multiplier terms } func NewScalarExcitation(name, unit, desc string) *ScalarExcitation { e := new(ScalarExcitation) e.name = name e.perRegion.init("_"+name+"_perRegion", unit, desc, nil) // name starts with underscore: unexported DeclLValue(name, e, cat(desc, unit)) return e } func (p *ScalarExcitation) MSlice() cuda.MSlice { buf, r := p.Slice() util.Assert(r == true) return cuda.ToMSlice(buf) } func (e *ScalarExcitation) AddTo(dst *data.Slice) { if !e.perRegion.isZero() { cuda.RegionAddS(dst, e.perRegion.gpuLUT1(), regions.Gpu()) } for _, t := range e.extraTerms { var mul float32 = 1 if t.mul != nil { mul = float32(t.mul()) } cuda.Madd2(dst, dst, t.mask, 1, mul) } } func (e *ScalarExcitation) isZero() bool { return e.perRegion.isZero() && len(e.extraTerms) == 0 } func (e *ScalarExcitation) Slice() (*data.Slice, bool) { buf := cuda.Buffer(e.NComp(), e.Mesh().Size()) cuda.Zero(buf) e.AddTo(buf) return buf, true } // After resizing the mesh, the extra terms don't fit the grid anymore // and there is no reasonable way to resize them. So remove them and have // the user re-add them. func (e *ScalarExcitation) RemoveExtraTerms() { if len(e.extraTerms) == 0 { return } LogOut("REMOVING EXTRA TERMS FROM", e.Name()) for _, m := range e.extraTerms { m.mask.Free() } e.extraTerms = nil } // Add an extra mask*multiplier term to the excitation. 
func (e *ScalarExcitation) Add(mask *data.Slice, f script.ScalarFunction) { var mul func() float64 if f != nil { if IsConst(f) { val := f.Float() mul = func() float64 { return val } } else { mul = func() float64 { return f.Float() } } } e.AddGo(mask, mul) } // An Add(mask, f) equivalent for Go use func (e *ScalarExcitation) AddGo(mask *data.Slice, mul func() float64) { if mask != nil { checkNaN(mask, e.Name()+".add()") // TODO: in more places mask = data.Resample(mask, e.Mesh().Size()) mask = assureGPU(mask) } e.extraTerms = append(e.extraTerms, mulmask{mul, mask}) } func (e *ScalarExcitation) SetRegion(region int, f script.ScalarFunction) { e.perRegion.SetRegion(region, f) } func (e *ScalarExcitation) SetValue(v interface{}) { e.perRegion.SetValue(v) } func (e *ScalarExcitation) Set(v float64) { e.perRegion.setRegions(0, NREGION, []float64{v}) } func (e *ScalarExcitation) getRegion(region int) []float64 { return e.perRegion.getRegion(region) } // for gui func (e *ScalarExcitation) SetRegionFn(region int, f func() [3]float64) { e.perRegion.setFunc(region, region+1, func() []float64 { return slice(f()) }) } func (e *ScalarExcitation) average() float64 { return qAverageUniverse(e)[0] } func (e *ScalarExcitation) Average() float64 { return e.average() } func (e *ScalarExcitation) IsUniform() bool { return e.perRegion.IsUniform() } func (e *ScalarExcitation) Name() string { return e.name } func (e *ScalarExcitation) Unit() string { return e.perRegion.Unit() } func (e *ScalarExcitation) NComp() int { return e.perRegion.NComp() } func (e *ScalarExcitation) Mesh() *data.Mesh { return Mesh() } func (e *ScalarExcitation) Region(r int) *vOneReg { return vOneRegion(e, r) } func (e *ScalarExcitation) Comp(c int) ScalarField { return Comp(e, c) } func (e *ScalarExcitation) Eval() interface{} { return e } func (e *ScalarExcitation) Type() reflect.Type { return reflect.TypeOf(new(ScalarExcitation)) } func (e *ScalarExcitation) InputType() reflect.Type { return 
script.ScalarFunction_t } func (e *ScalarExcitation) EvalTo(dst *data.Slice) { EvalTo(e, dst) } 3-3.11.1/engine/script.go000066400000000000000000000055771503346766200150710ustar00rootroot00000000000000package engine // declare functionality for interpreted input scripts import ( "github.com/mumax/3/httpfs" "github.com/mumax/3/script" "reflect" ) func CompileFile(fname string) (*script.BlockStmt, error) { bytes, err := httpfs.Read(fname) if err != nil { return nil, err } return World.Compile(string(bytes)) } func Eval(code string) { tree, err := World.Compile(code) if err != nil { LogIn(code) LogErr(err.Error()) return } LogIn(rmln(tree.Format())) tree.Eval() } func Eval1Line(code string) interface{} { tree, err := World.Compile(code) if err != nil { LogErr(err.Error()) return nil } if len(tree.Children) != 1 { LogErr("expected single statement:" + code) return nil } return tree.Children[0].Eval() } // holds the script state (variables etc) var World = script.NewWorld() // Add a function to the script world func DeclFunc(name string, f interface{}, doc string) { World.Func(name, f, doc) } // Add a constant to the script world func DeclConst(name string, value float64, doc string) { World.Const(name, value, doc) } // Add a read-only variable to the script world. // It can be changed, but not by the user. func DeclROnly(name string, value interface{}, doc string) { World.ROnly(name, value, doc) GUIAdd(name, value) } func Export(q interface { Name() string Unit() string }, doc string) { DeclROnly(q.Name(), q, cat(doc, q.Unit())) } // Add a (pointer to) variable to the script world func DeclVar(name string, value interface{}, doc string) { World.Var(name, value, doc) GUIAdd(name, value) } // Hack for fixing the closure caveat: // Defines "t", the time variable, handled specially by Fix() func DeclTVar(name string, value interface{}, doc string) { World.TVar(name, value, doc) GUIAdd(name, value) } // Add an LValue to the script world. 
// Assign to LValue invokes SetValue() func DeclLValue(name string, value LValue, doc string) { World.LValue(name, newLValueWrapper(value), doc) GUIAdd(name, value) } // LValue is settable type LValue interface { SetValue(interface{}) // assigns a new value Eval() interface{} // evaluate and return result (nil for void) Type() reflect.Type // type that can be assigned and will be returned by Eval } // evaluate code, exit on error (behavior for input files) func EvalFile(code *script.BlockStmt) { for i := range code.Children { formatted := rmln(script.Format(code.Node[i])) LogIn(formatted) code.Children[i].Eval() } } // wraps LValue and provides empty Child() type lValueWrapper struct { LValue } func newLValueWrapper(lv LValue) script.LValue { return &lValueWrapper{lv} } func (w *lValueWrapper) Child() []script.Expr { return nil } func (w *lValueWrapper) Fix() script.Expr { return script.NewConst(w) } func (w *lValueWrapper) InputType() reflect.Type { if i, ok := w.LValue.(interface { InputType() reflect.Type }); ok { return i.InputType() } else { return w.Type() } } 3-3.11.1/engine/shape.go000066400000000000000000000233351503346766200146550ustar00rootroot00000000000000package engine import ( "image" _ "image/jpeg" _ "image/png" "math" "github.com/mumax/3/data" "github.com/mumax/3/httpfs" "github.com/mumax/3/util" ) func init() { DeclFunc("Ellipsoid", Ellipsoid, "3D Ellipsoid with axes in meter") DeclFunc("Ellipse", Ellipse, "2D Ellipse with axes in meter") DeclFunc("Cone", Cone, "3D Cone with diameter and height in meter. The base is at z=0. 
If the height is positive, the tip points in the +z direction.") DeclFunc("Cylinder", Cylinder, "3D Cylinder with diameter and height in meter") DeclFunc("Circle", Circle, "2D Circle with diameter in meter") DeclFunc("Cuboid", Cuboid, "Cuboid with sides in meter") DeclFunc("Rect", Rect, "2D rectangle with size in meter") DeclFunc("Square", Square, "2D square with size in meter") DeclFunc("Triangle", Triangle, "2D triangle with vertices (x0, y0), (x1, y1) and (x2, y2)") DeclFunc("XRange", XRange, "Part of space between x1 (inclusive) and x2 (exclusive), in meter") DeclFunc("YRange", YRange, "Part of space between y1 (inclusive) and y2 (exclusive), in meter") DeclFunc("ZRange", ZRange, "Part of space between z1 (inclusive) and z2 (exclusive), in meter") DeclFunc("Layers", Layers, "Part of space between cell layer1 (inclusive) and layer2 (exclusive), in integer indices") DeclFunc("Layer", Layer, "Single layer (along z), by integer index starting from 0") DeclFunc("Universe", Universe, "Entire space") DeclFunc("Cell", Cell, "Single cell with given integer index (i, j, k)") DeclFunc("ImageShape", ImageShape, "Use black/white image as shape") DeclFunc("GrainRoughness", GrainRoughness, "Grainy surface with different heights per grain "+ "with a typical grain size (first argument), minimal height (second argument), and maximal "+ "height (third argument). The last argument is a seed for the random number generator.") } // geometrical shape for setting sample geometry type Shape func(x, y, z float64) bool // Ellipsoid with given diameters func Ellipsoid(diamx, diamy, diamz float64) Shape { return func(x, y, z float64) bool { return sqr64(x/diamx)+sqr64(y/diamy)+sqr64(z/diamz) <= 0.25 } } func Ellipse(diamx, diamy float64) Shape { return Ellipsoid(diamx, diamy, math.Inf(1)) } // 3D Cone with base at z=0 and vertex at z=height. 
func Cone(diam, height float64) Shape { return func(x, y, z float64) bool { return (height-z)*z >= 0 && sqr64(x/diam)+sqr64(y/diam) <= 0.25*sqr64(1-z/height) } } func Circle(diam float64) Shape { return Cylinder(diam, math.Inf(1)) } // cylinder along z. func Cylinder(diam, height float64) Shape { return func(x, y, z float64) bool { return z <= height/2 && z >= -height/2 && sqr64(x/diam)+sqr64(y/diam) <= 0.25 } } // 3D Rectangular slab with given sides. func Cuboid(sidex, sidey, sidez float64) Shape { return func(x, y, z float64) bool { rx, ry, rz := sidex/2, sidey/2, sidez/2 return x < rx && x > -rx && y < ry && y > -ry && z < rz && z > -rz } } // 2D Rectangle with given sides. func Rect(sidex, sidey float64) Shape { return func(x, y, z float64) bool { rx, ry := sidex/2, sidey/2 return x < rx && x > -rx && y < ry && y > -ry } } // 2D square with given side. func Square(side float64) Shape { return Rect(side, side) } // 2D triangle with given vertices. func Triangle(x0, y0, x1, y1, x2, y2 float64) Shape { denom := x0*(y1-y2) + x1*(y2-y0) + x2*(y0-y1) // 2 * area if denom == 0 { return func(x, y, z float64) bool { return false } } A2m1 := 1 / denom Sc := A2m1 * (y0*x2 - x0*y2) Sx := A2m1 * (y2 - y0) Sy := A2m1 * (x0 - x2) Tc := A2m1 * (x0*y1 - y0*x1) Tx := A2m1 * (y0 - y1) Ty := A2m1 * (x1 - x0) return func(x, y, z float64) bool { // barycentric coordinates s := Sc + Sx*x + Sy*y t := Tc + Tx*x + Ty*y return ((0 <= s) && (0 <= t) && (s+t <= 1)) } } // All cells with x-coordinate between a and b func XRange(a, b float64) Shape { return func(x, y, z float64) bool { return x >= a && x < b } } // All cells with y-coordinate between a and b func YRange(a, b float64) Shape { return func(x, y, z float64) bool { return y >= a && y < b } } // All cells with z-coordinate between a and b func ZRange(a, b float64) Shape { return func(x, y, z float64) bool { return z >= a && z < b } } // Cell layers #a (inclusive) up to #b (exclusive). 
func Layers(a, b int) Shape { Nz := Mesh().Size()[Z] if a < 0 || a > Nz || b < 0 || b < a { util.Fatal("layers ", a, ":", b, " out of bounds (0 - ", Nz, ")") } c := Mesh().CellSize()[Z] z1 := Index2Coord(0, 0, a)[Z] - c/2 z2 := Index2Coord(0, 0, b)[Z] - c/2 return ZRange(z1, z2) } func Layer(index int) Shape { return Layers(index, index+1) } // Single cell with given index func Cell(ix, iy, iz int) Shape { c := Mesh().CellSize() pos := Index2Coord(ix, iy, iz) x1 := pos[X] - c[X]/2 y1 := pos[Y] - c[Y]/2 z1 := pos[Z] - c[Z]/2 x2 := pos[X] + c[X]/2 y2 := pos[Y] + c[Y]/2 z2 := pos[Z] + c[Z]/2 return func(x, y, z float64) bool { return x > x1 && x < x2 && y > y1 && y < y2 && z > z1 && z < z2 } } func Universe() Shape { return universe } // The entire space. func universe(x, y, z float64) bool { return true } func ImageShape(fname string) Shape { r, err1 := httpfs.Open(fname) CheckRecoverable(err1) defer r.Close() img, _, err2 := image.Decode(r) CheckRecoverable(err2) width := img.Bounds().Max.X height := img.Bounds().Max.Y // decode image into bool matrix for fast pixel lookup inside := make([][]bool, height) for iy := range inside { inside[iy] = make([]bool, width) } for iy := 0; iy < height; iy++ { for ix := 0; ix < width; ix++ { r, g, b, a := img.At(ix, height-1-iy).RGBA() if a > 128 && r+g+b < (0xFFFF*3)/2 { inside[iy][ix] = true } } } // stretch the image onto the gridsize c := Mesh().CellSize() cx, cy := c[X], c[Y] N := Mesh().Size() nx, ny := float64(N[X]), float64(N[Y]) w, h := float64(width), float64(height) return func(x, y, z float64) bool { ix := int((w/nx)*(x/cx) + 0.5*w) iy := int((h/ny)*(y/cy) + 0.5*h) if ix < 0 || ix >= width || iy < 0 || iy >= height { return false } else { return inside[iy][ix] } } } func VoxelShape(voxels *data.Slice, a, b, c float64) Shape { //component dimension check, expect 1D points if voxels.NComp() != 1 { util.Fatal("Voxel array fed has a wrong value dimension: ", voxels.NComp(), ", Aborting!") } //cut FP array into bool array 
arrSize := voxels.Size() voxelArr := make([]bool, arrSize[0]*arrSize[1]*arrSize[2]) for ix := 0; ix < arrSize[0]; ix++ { for iy := 0; iy < arrSize[1]; iy++ { for iz := 0; iz < arrSize[2]; iz++ { voxelArr[iz*arrSize[0]*arrSize[1]+iy*arrSize[0]+ix] = voxels.Get(0, ix, iy, iz) > 0.5 } } } //the predicate voxelSize := [3]float64{a, b, c} return func(x, y, z float64) bool { var ind [3]int coord := [3]float64{x, y, z} for c := 0; c < 3; c++ { //truncation applies floor by default ind[c] = int(coord[c]/voxelSize[c] + float64(arrSize[c])/2) if ind[c] < 0 || ind[c] >= arrSize[c] { //there is no geometry outside of the imported array return false } } //if not fallen through check against the previous array return voxelArr[ind[2]*arrSize[0]*arrSize[1]+ind[1]*arrSize[0]+ind[0]] } } func GrainRoughness(grainsize, zmin, zmax float64, seed int) Shape { t := newTesselation(grainsize, 256, int64(seed)) return func(x, y, z float64) bool { if z <= zmin { return true } if z >= zmax { return false } r := t.RegionOf(x, y, z) return (z-zmin)/(zmax-zmin) < (float64(r) / 256) } } // Transl returns a translated copy of the shape. func (s Shape) Transl(dx, dy, dz float64) Shape { return func(x, y, z float64) bool { return s(x-dx, y-dy, z-dz) } } // Infinitely repeats the shape with given period in x, y, z. // A period of 0 or infinity means no repetition. func (s Shape) Repeat(periodX, periodY, periodZ float64) Shape { return func(x, y, z float64) bool { return s(fmod(x, periodX), fmod(y, periodY), fmod(z, periodZ)) } } func fmod(a, b float64) float64 { if b == 0 || math.IsInf(b, 1) { return a } if math.Abs(a) > b/2 { return sign(a) * (math.Mod(math.Abs(a+b/2), b) - b/2) } else { return a } } // Scale returns a scaled copy of the shape. func (s Shape) Scale(sx, sy, sz float64) Shape { return func(x, y, z float64) bool { return s(x/sx, y/sy, z/sz) } } // Rotates the shape around the Z-axis, over θ radians. 
func (s Shape) RotZ(θ float64) Shape { cos := math.Cos(θ) sin := math.Sin(θ) return func(x, y, z float64) bool { x_ := x*cos + y*sin y_ := -x*sin + y*cos return s(x_, y_, z) } } // Rotates the shape around the Y-axis, over θ radians. func (s Shape) RotY(θ float64) Shape { cos := math.Cos(θ) sin := math.Sin(θ) return func(x, y, z float64) bool { x_ := x*cos - z*sin z_ := x*sin + z*cos return s(x_, y, z_) } } // Rotates the shape around the X-axis, over θ radians. func (s Shape) RotX(θ float64) Shape { cos := math.Cos(θ) sin := math.Sin(θ) return func(x, y, z float64) bool { y_ := y*cos + z*sin z_ := -y*sin + z*cos return s(x, y_, z_) } } // Union of shapes a and b (logical OR). func (a Shape) Add(b Shape) Shape { return func(x, y, z float64) bool { return a(x, y, z) || b(x, y, z) } } // Intersection of shapes a and b (logical AND). func (a Shape) Intersect(b Shape) Shape { return func(x, y, z float64) bool { return a(x, y, z) && b(x, y, z) } } // Inverse (outside) of shape (logical NOT). func (s Shape) Inverse() Shape { return func(x, y, z float64) bool { return !s(x, y, z) } } // Removes b from a (logical a AND NOT b) func (a Shape) Sub(b Shape) Shape { return func(x, y, z float64) bool { return a(x, y, z) && !b(x, y, z) } } // Logical XOR of shapes a and b func (a Shape) Xor(b Shape) Shape { return func(x, y, z float64) bool { A, B := a(x, y, z), b(x, y, z) return (A || B) && !(A && B) } } func sqr64(x float64) float64 { return x * x } 3-3.11.1/engine/shift.go000066400000000000000000000061131503346766200146650ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var ( TotalShift, TotalYShift float64 // accumulated window shift (X and Y) in meter ShiftMagL, ShiftMagR, ShiftMagU, ShiftMagD data.Vector // when shifting m, put these value at the left/right edge. ShiftM, ShiftGeom, ShiftRegions bool = true, true, true // should shift act on magnetization, geometry, regions? 
EdgeCarryShift bool = false // Use the values of M at the border for the new cells ) func init() { DeclFunc("Shift", Shift, "Shifts the simulation by +1/-1 cells along X") DeclVar("EdgeCarryShift", &EdgeCarryShift, "Whether to use the current magnetization at the border for the cells inserted by Shift (default=false)") DeclVar("ShiftMagL", &ShiftMagL, "Upon shift, insert this magnetization from the left") DeclVar("ShiftMagR", &ShiftMagR, "Upon shift, insert this magnetization from the right") DeclVar("ShiftMagU", &ShiftMagU, "Upon shift, insert this magnetization from the top") DeclVar("ShiftMagD", &ShiftMagD, "Upon shift, insert this magnetization from the bottom") DeclVar("ShiftM", &ShiftM, "Whether Shift() acts on magnetization") DeclVar("ShiftGeom", &ShiftGeom, "Whether Shift() acts on geometry") DeclVar("ShiftRegions", &ShiftRegions, "Whether Shift() acts on regions") DeclVar("TotalShift", &TotalShift, "Amount by which the simulation has been shifted along the x-axis (m).") } // position of the window lab frame func GetShiftPos() float64 { return -TotalShift } func GetShiftYPos() float64 { return -TotalYShift } // shift the simulation window over dx cells in X direction func Shift(dx int) { TotalShift += float64(dx) * Mesh().CellSize()[X] // needed to re-init geom, regions if ShiftM { shiftMag(M.Buffer(), dx) // TODO: M.shift? } if ShiftRegions { regions.shift(dx) } if ShiftGeom { geometry.shift(dx) } M.normalize() } func shiftMag(m *data.Slice, dx int) { m2 := cuda.Buffer(1, m.Size()) defer cuda.Recycle(m2) for c := 0; c < m.NComp(); c++ { comp := m.Comp(c) if EdgeCarryShift { cuda.ShiftEdgeCarryX(m2, comp, m.Comp((c+1)%3), m.Comp((c+2)%3), dx, float32(ShiftMagL[c]), float32(ShiftMagR[c])) } else { cuda.ShiftX(m2, comp, dx, float32(ShiftMagL[c]), float32(ShiftMagR[c])) } data.Copy(comp, m2) // str0 ? 
} } // shift the simulation window over dy cells in Y direction func YShift(dy int) { TotalYShift += float64(dy) * Mesh().CellSize()[Y] // needed to re-init geom, regions if ShiftM { shiftMagY(M.Buffer(), dy) } if ShiftRegions { regions.shiftY(dy) } if ShiftGeom { geometry.shiftY(dy) } M.normalize() } func shiftMagY(m *data.Slice, dy int) { m2 := cuda.Buffer(1, m.Size()) defer cuda.Recycle(m2) for c := 0; c < m.NComp(); c++ { comp := m.Comp(c) if EdgeCarryShift { cuda.ShiftEdgeCarryY(m2, comp, m.Comp((c+1)%3), m.Comp((c+2)%3), dy, float32(ShiftMagU[c]), float32(ShiftMagD[c])) } else { cuda.ShiftY(m2, comp, dy, float32(ShiftMagU[c]), float32(ShiftMagD[c])) } data.Copy(comp, m2) // str0 ? } } 3-3.11.1/engine/table.go000066400000000000000000000102061503346766200146350ustar00rootroot00000000000000package engine import ( "fmt" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/httpfs" "github.com/mumax/3/script" "github.com/mumax/3/timer" "github.com/mumax/3/util" "io" "sync" "time" ) var Table = *newTable("table") // output handle for tabular data (average magnetization etc.) const TableAutoflushRate = 5 // auto-flush table every X seconds func init() { DeclFunc("TableAdd", TableAdd, "Add quantity as a column to the data table.") DeclFunc("TableAddVar", TableAddVariable, "Add user-defined variable + name + unit to data table.") DeclFunc("TableSave", TableSave, "Save the data table right now (appends one line).") DeclFunc("TableAutoSave", TableAutoSave, "Auto-save the data table every period (s). 
Zero disables save.") DeclFunc("TablePrint", TablePrint, "Print anyting in the data table") Table.Add(&M) } type DataTable struct { output interface { io.Writer Flush() error } info outputs []Quantity autosave flushlock sync.Mutex } func (t *DataTable) Write(p []byte) (int, error) { n, err := t.output.Write(p) util.FatalErr(err) return n, err } func (t *DataTable) Flush() error { if t.output == nil { return nil } if cuda.Synchronous { timer.Start("io") } err := t.output.Flush() if cuda.Synchronous { timer.Stop("io") } util.FatalErr(err) return err } func newTable(name string) *DataTable { t := new(DataTable) t.name = name return t } func TableAdd(col Quantity) { Table.Add(col) } func TableAddVariable(x script.ScalarFunction, name, unit string) { Table.AddVariable(x, name, unit) } func (t *DataTable) AddVariable(x script.ScalarFunction, name, unit string) { TableAdd(&userVar{x, name, unit}) } type userVar struct { value script.ScalarFunction name, unit string } func (x *userVar) Name() string { return x.name } func (x *userVar) NComp() int { return 1 } func (x *userVar) Unit() string { return x.unit } func (x *userVar) average() []float64 { return []float64{x.value.Float()} } func (x *userVar) EvalTo(dst *data.Slice) { avg := x.average() for c := 0; c < x.NComp(); c++ { cuda.Memset(dst.Comp(c), float32(avg[c])) } } func TableSave() { Table.Save() } func TableAutoSave(period float64) { Table.autosave = autosave{period, Time, -1, nil} // count -1 allows output on t=0 } func (t *DataTable) Add(output Quantity) { if t.inited() { util.Fatal("data table add ", NameOf(output), ": need to add quantity before table is output the first time") } t.outputs = append(t.outputs, output) } func (t *DataTable) Save() { t.flushlock.Lock() // flush during write gives errShortWrite defer t.flushlock.Unlock() if cuda.Synchronous { timer.Start("io") } t.init() fprint(t, Time) for _, o := range t.outputs { vec := AverageOf(o) for _, v := range vec { fprint(t, "\t", float32(v)) } } 
fprintln(t) //t.flush() t.count++ if cuda.Synchronous { timer.Stop("io") } } func (t *DataTable) Println(msg ...interface{}) { t.init() fprintln(t, msg...) } func TablePrint(msg ...interface{}) { Table.Println(msg...) } // open writer and write header func (t *DataTable) init() { if t.inited() { return } f, err := httpfs.Create(OD() + t.name + ".txt") util.FatalErr(err) t.output = f // write header fprint(t, "# t (s)") for _, o := range t.outputs { if o.NComp() == 1 { fprint(t, "\t", NameOf(o), " (", UnitOf(o), ")") } else { for c := 0; c < o.NComp(); c++ { fprint(t, "\t", NameOf(o)+string('x'+c), " (", UnitOf(o), ")") } } } fprintln(t) t.Flush() // periodically flush so GUI shows graph, // but don't flush after every output for performance // (httpfs flush is expensive) go func() { for { time.Sleep(TableAutoflushRate * time.Second) Table.flush() } }() } func (t *DataTable) inited() bool { return t.output != nil } func (t *DataTable) flush() { t.flushlock.Lock() defer t.flushlock.Unlock() t.Flush() } // Safe fmt.Fprint, will fail on error func fprint(out io.Writer, x ...interface{}) { _, err := fmt.Fprint(out, x...) util.FatalErr(err) } // Safe fmt.Fprintln, will fail on error func fprintln(out io.Writer, x ...interface{}) { _, err := fmt.Fprintln(out, x...) 
util.FatalErr(err) } 3-3.11.1/engine/temperature.go000066400000000000000000000105671503346766200161150ustar00rootroot00000000000000package engine import ( "fmt" "math" "github.com/mumax/3/cuda" "github.com/mumax/3/cuda/curand" "github.com/mumax/3/data" "github.com/mumax/3/mag" "github.com/mumax/3/util" ) var ( Temp = NewScalarParam("Temp", "K", "Temperature") E_therm = NewScalarValue("E_therm", "J", "Thermal energy", GetThermalEnergy) Edens_therm = NewScalarField("Edens_therm", "J/m3", "Thermal energy density", AddThermalEnergyDensity) B_therm thermField // Thermal effective field (T) ) var AddThermalEnergyDensity = makeEdensAdder(&B_therm, -1) var PrintedWarningTempOddGrid = false // Will be set to true if the warning about odd temperature has been printed already, to avoid spam. // thermField calculates and caches thermal noise. type thermField struct { seed int64 // seed for generator generator curand.Generator // noise *data.Slice // noise buffer step int // solver step corresponding to noise dt float64 // solver timestep corresponding to noise } func init() { DeclFunc("ThermSeed", ThermSeed, "Set a random seed for thermal noise") registerEnergy(GetThermalEnergy, AddThermalEnergyDensity) B_therm.step = -1 // invalidate noise cache DeclROnly("B_therm", &B_therm, "Thermal field (T)") } func (b *thermField) AddTo(dst *data.Slice) { if !Temp.isZero() { b.update() cuda.Add(dst, dst, b.noise) } } func (b *thermField) update() { // we need to fix the time step here because solver will not yet have done it before the first step. // FixDt as an lvalue that sets Dt_si on change might be cleaner. if FixDt != 0 { Dt_si = FixDt } if b.generator == 0 { b.generator = curand.CreateGenerator(curand.PSEUDO_DEFAULT) b.generator.SetSeed(b.seed) } if b.noise == nil { b.noise = cuda.NewSlice(b.NComp(), b.Mesh().Size()) // when noise was (re-)allocated it's invalid for sure. 
B_therm.step = -1 B_therm.dt = -1 } if Temp.isZero() { cuda.Memset(b.noise, 0, 0, 0) b.step = NSteps b.dt = Dt_si return } // keep constant during time step if NSteps == b.step && Dt_si == b.dt { return } // after a bad step the timestep is rescaled and the noise should be rescaled accordingly, instead of redrawing the random numbers if NSteps == b.step && Dt_si != b.dt { for c := 0; c < 3; c++ { cuda.Madd2(b.noise.Comp(c), b.noise.Comp(c), b.noise.Comp(c), float32(math.Sqrt(b.dt/Dt_si)), 0.) } b.dt = Dt_si return } if FixDt == 0 { Refer("leliaert2017") //uncomment to not allow adaptive step //util.Fatal("Finite temperature requires fixed time step. Set FixDt != 0.") } N := Mesh().NCell() if !PrintedWarningTempOddGrid && N%2 > 0 { // T is nonzero if we have gotten this far. As noted in issue #314, this means the grid size must be even. PrintedWarningTempOddGrid = true warnStr := "// WARNING: nonzero temperature requires an even amount of grid cells,\n" + "// but all axes have an odd number of cells: %v.\n" + "// This may cause a CURAND_STATUS_LENGTH_NOT_MULTIPLE error." 
// Error is likely when the largest factor is >127 util.Log(fmt.Sprintf(warnStr, Mesh().Size())) } k2_VgammaDt := 2 * mag.Kb / (GammaLL * cellVolume() * Dt_si) noise := cuda.Buffer(1, Mesh().Size()) defer cuda.Recycle(noise) const mean = 0 const stddev = 1 dst := b.noise ms := Msat.MSlice() defer ms.Recycle() temp := Temp.MSlice() defer temp.Recycle() alpha := Alpha.MSlice() defer alpha.Recycle() for i := 0; i < 3; i++ { b.generator.GenerateNormal(uintptr(noise.DevPtr(0)), int64(N), mean, stddev) cuda.SetTemperature(dst.Comp(i), noise, k2_VgammaDt, ms, temp, alpha) } b.step = NSteps b.dt = Dt_si } func GetThermalEnergy() float64 { if Temp.isZero() || relaxing { return 0 } else { return -cellVolume() * dot(&M_full, &B_therm) } } // Seeds the thermal noise generator func ThermSeed(seed int) { B_therm.seed = int64(seed) if B_therm.generator != 0 { B_therm.generator.SetSeed(B_therm.seed) } } func (b *thermField) Mesh() *data.Mesh { return Mesh() } func (b *thermField) NComp() int { return 3 } func (b *thermField) Name() string { return "Thermal field" } func (b *thermField) Unit() string { return "T" } func (b *thermField) average() []float64 { return qAverageUniverse(b) } func (b *thermField) EvalTo(dst *data.Slice) { EvalTo(b, dst) } func (b *thermField) Slice() (*data.Slice, bool) { b.update() return b.noise, false } 3-3.11.1/engine/torque.go000066400000000000000000000125331503346766200150720ustar00rootroot00000000000000package engine import ( "reflect" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" ) var ( Alpha = NewScalarParam("alpha", "", "Landau-Lifshitz damping constant") Xi = NewScalarParam("xi", "", "Non-adiabaticity of spin-transfer-torque") Pol = NewScalarParam("Pol", "", "Electrical current polarization") Lambda = NewScalarParam("Lambda", "", "Slonczewski Λ parameter") EpsilonPrime = NewScalarParam("EpsilonPrime", "", "Slonczewski secondairy STT term ε'") FrozenSpins = NewScalarParam("frozenspins", "", "Defines spins that 
should be fixed") // 1 - frozen, 0 - free. TODO: check if it only contains 0/1 values FreeLayerThickness = NewScalarParam("FreeLayerThickness", "m", "Slonczewski free layer thickness (if set to zero (default), then the thickness will be deduced from the mesh size)") FixedLayer = NewExcitation("FixedLayer", "", "Slonczewski fixed layer polarization") Torque = NewVectorField("torque", "T", "Total torque/γ0", SetTorque) LLTorque = NewVectorField("LLtorque", "T", "Landau-Lifshitz torque/γ0", SetLLTorque) STTorque = NewVectorField("STTorque", "T", "Spin-transfer torque/γ0", AddSTTorque) J = NewExcitation("J", "A/m2", "Electrical current density") MaxTorque = NewScalarValue("maxTorque", "T", "Maximum torque/γ0, over all cells", GetMaxTorque) GammaLL float64 = 1.7595e11 // Gyromagnetic ratio of spins, in rad/Ts Precess = true DisableZhangLiTorque = false DisableSlonczewskiTorque = false fixedLayerPosition = FIXEDLAYER_TOP // instructs mumax3 how free and fixed layers are stacked along +z direction ) func init() { Pol.setUniform([]float64{1}) // default spin polarization Lambda.Set(1) // sensible default value (?). 
DeclVar("GammaLL", &GammaLL, "Gyromagnetic ratio in rad/Ts") DeclVar("DisableZhangLiTorque", &DisableZhangLiTorque, "Disables Zhang-Li torque (default=false)") DeclVar("DisableSlonczewskiTorque", &DisableSlonczewskiTorque, "Disables Slonczewski torque (default=false)") DeclVar("DoPrecess", &Precess, "Enables LL precession (default=true)") DeclLValue("FixedLayerPosition", &flposition{}, "Position of the fixed layer: FIXEDLAYER_TOP, FIXEDLAYER_BOTTOM (default=FIXEDLAYER_TOP)") DeclROnly("FIXEDLAYER_TOP", FIXEDLAYER_TOP, "FixedLayerPosition = FIXEDLAYER_TOP instructs mumax3 that fixed layer is on top of the free layer") DeclROnly("FIXEDLAYER_BOTTOM", FIXEDLAYER_BOTTOM, "FixedLayerPosition = FIXEDLAYER_BOTTOM instructs mumax3 that fixed layer is underneath of the free layer") } // Sets dst to the current total torque func SetTorque(dst *data.Slice) { SetLLTorque(dst) AddSTTorque(dst) FreezeSpins(dst) } // Sets dst to the current Landau-Lifshitz torque func SetLLTorque(dst *data.Slice) { SetEffectiveField(dst) // calc and store B_eff alpha := Alpha.MSlice() defer alpha.Recycle() if Precess { cuda.LLTorque(dst, M.Buffer(), dst, alpha) // overwrite dst with torque } else { cuda.LLNoPrecess(dst, M.Buffer(), dst) } } // Adds the current spin transfer torque to dst func AddSTTorque(dst *data.Slice) { if J.isZero() { return } util.AssertMsg(!Pol.isZero(), "spin polarization should not be 0") jspin, rec := J.Slice() if rec { defer cuda.Recycle(jspin) } fl, rec := FixedLayer.Slice() if rec { defer cuda.Recycle(fl) } if !DisableZhangLiTorque { msat := Msat.MSlice() defer msat.Recycle() j := J.MSlice() defer j.Recycle() alpha := Alpha.MSlice() defer alpha.Recycle() xi := Xi.MSlice() defer xi.Recycle() pol := Pol.MSlice() defer pol.Recycle() cuda.AddZhangLiTorque(dst, M.Buffer(), msat, j, alpha, xi, pol, Mesh()) } if !DisableSlonczewskiTorque && !FixedLayer.isZero() { msat := Msat.MSlice() defer msat.Recycle() j := J.MSlice() defer j.Recycle() fixedP := FixedLayer.MSlice() defer 
fixedP.Recycle() alpha := Alpha.MSlice() defer alpha.Recycle() pol := Pol.MSlice() defer pol.Recycle() lambda := Lambda.MSlice() defer lambda.Recycle() epsPrime := EpsilonPrime.MSlice() defer epsPrime.Recycle() thickness := FreeLayerThickness.MSlice() defer thickness.Recycle() cuda.AddSlonczewskiTorque2(dst, M.Buffer(), msat, j, fixedP, alpha, pol, lambda, epsPrime, thickness, CurrentSignFromFixedLayerPosition[fixedLayerPosition], Mesh()) } } func FreezeSpins(dst *data.Slice) { if !FrozenSpins.isZero() { cuda.ZeroMask(dst, FrozenSpins.gpuLUT1(), regions.Gpu()) } } func GetMaxTorque() float64 { torque := ValueOf(Torque) defer cuda.Recycle(torque) return cuda.MaxVecNorm(torque) } type FixedLayerPosition int const ( FIXEDLAYER_TOP FixedLayerPosition = iota + 1 FIXEDLAYER_BOTTOM ) var ( CurrentSignFromFixedLayerPosition = map[FixedLayerPosition]float64{ FIXEDLAYER_TOP: 1.0, FIXEDLAYER_BOTTOM: -1.0, } ) type flposition struct{} func (*flposition) Eval() interface{} { return fixedLayerPosition } func (*flposition) SetValue(v interface{}) { drainOutput() fixedLayerPosition = v.(FixedLayerPosition) } func (*flposition) Type() reflect.Type { return reflect.TypeOf(FixedLayerPosition(FIXEDLAYER_TOP)) } 3-3.11.1/engine/unsafe.go000066400000000000000000000004311503346766200150260ustar00rootroot00000000000000package engine func init() { // There are no unsafe features since version 3.10, but we want maximal backwards compatibility DeclFunc("ext_EnableUnsafe", EnableUnsafe, "Deprecated. 
Only here to ensure maximal backwards compatibility with mumax3.9c.") } func EnableUnsafe() { } 3-3.11.1/engine/util.go000066400000000000000000000137411503346766200145320ustar00rootroot00000000000000package engine import ( "fmt" "math" "os" "path" "sort" "strings" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/dump" "github.com/mumax/3/httpfs" "github.com/mumax/3/mag" "github.com/mumax/3/oommf" "github.com/mumax/3/util" ) func init() { DeclFunc("Expect", Expect, "Used for automated tests: checks if a value is close enough to the expected value") DeclFunc("ExpectV", ExpectV, "Used for automated tests: checks if a vector is close enough to the expected value") DeclFunc("Fprintln", Fprintln, "Print to file") DeclFunc("Sign", sign, "Signum function") DeclFunc("Vector", Vector, "Constructs a vector with given components") DeclConst("Mu0", mag.Mu0, "Vacuum permeability (Tm/A)") DeclFunc("Print", myprint, "Print to standard output") DeclFunc("LoadFile", LoadFile, "Load a data file (ovf or dump)") DeclFunc("Index2Coord", Index2Coord, "Convert cell index to x,y,z coordinate in meter") DeclFunc("NewSlice", NewSlice, "Makes a 4D array with a specified number of components (first argument) "+ "and a specified size nx,ny,nz (remaining arguments)") DeclFunc("NewVectorMask", NewVectorMask, "Makes a 3D array of vectors") DeclFunc("NewScalarMask", NewScalarMask, "Makes a 3D array of scalars") } // Returns a new slice (3D array) with given number of components and size. func NewSlice(ncomp, Nx, Ny, Nz int) *data.Slice { return data.NewSlice(ncomp, [3]int{Nx, Ny, Nz}) } func NewVectorMask(Nx, Ny, Nz int) *data.Slice { return data.NewSlice(3, [3]int{Nx, Ny, Nz}) } func NewScalarMask(Nx, Ny, Nz int) *data.Slice { return data.NewSlice(1, [3]int{Nx, Ny, Nz}) } // Constructs a vector func Vector(x, y, z float64) data.Vector { return data.Vector{x, y, z} } // Test if have lies within want +/- maxError, // and print suited message. 
func Expect(msg string, have, want, maxError float64) { if math.IsNaN(have) || math.IsNaN(want) || math.Abs(have-want) > maxError { LogOut(msg, ":", " have: ", have, " want: ", want, "±", maxError) Close() os.Exit(1) } else { LogOut(msg, ":", have, "OK") } // note: we also check "want" for NaN in case "have" and "want" are switched. } func ExpectV(msg string, have, want data.Vector, maxErr float64) { for c := 0; c < 3; c++ { Expect(fmt.Sprintf("%v[%v]", msg, c), have[c], want[c], maxErr) } } // Append msg to file. Used to write aggregated output of many simulations in one file. func Fprintln(filename string, msg ...interface{}) { if !path.IsAbs(filename) { filename = OD() + filename } httpfs.Touch(filename) err := httpfs.Append(filename, []byte(fmt.Sprintln(myFmt(msg)...))) util.FatalErr(err) } // Read a magnetization state from .dump file. func LoadFile(fname string) *data.Slice { in, err := httpfs.Open(fname) util.FatalErr(err) var s *data.Slice if path.Ext(fname) == ".dump" { s, _, err = dump.Read(in) } else { s, _, err = oommf.Read(in) } util.FatalErr(err) return s } // Download a quantity to host, // or just return its data when already on host. func Download(q Quantity) *data.Slice { // TODO: optimize for Buffer() buf := ValueOf(q) defer cuda.Recycle(buf) if buf.CPUAccess() { return buf } else { return buf.HostCopy() } } // print with special formatting for some known types func myprint(msg ...interface{}) { LogOut(myFmt(msg)...) } // mumax specific formatting (Slice -> average, etc). 
func myFmt(msg []interface{}) []interface{} { for i, m := range msg { if e, ok := m.(*float64); ok { msg[i] = *e } // Tabledata: print average if m, ok := m.(Quantity); ok { str := fmt.Sprint(AverageOf(m)) msg[i] = str[1 : len(str)-1] // remove [ ] continue } } return msg } // converts cell index to coordinate, internal coordinates func Index2Coord(ix, iy, iz int) data.Vector { m := Mesh() n := m.Size() c := m.CellSize() x := c[X]*(float64(ix)-0.5*float64(n[X]-1)) - TotalShift y := c[Y]*(float64(iy)-0.5*float64(n[Y]-1)) - TotalYShift z := c[Z] * (float64(iz) - 0.5*float64(n[Z]-1)) return data.Vector{x, y, z} } func sign(x float64) float64 { switch { case x > 0: return 1 case x < 0: return -1 default: return 0 } } func sign32(x float32) float32 { switch { case x > 0: return 1 case x < 0: return -1 default: return 0 } } // returns a/b, or 0 when b == 0 func safediv(a, b float32) float32 { if b == 0 { return 0 } else { return a / b } } // dst = a/b, unless b == 0 func paramDiv(dst, a, b [][NREGION]float32) { util.Assert(len(dst) == 1 && len(a) == 1 && len(b) == 1) for i := 0; i < NREGION; i++ { // not regions.maxreg dst[0][i] = safediv(a[0][i], b[0][i]) } } // returns an array with the prime factors of n (n >= 1) func primeFactors(n int) (factors []int) { util.AssertMsg(n >= 1, "Can only determine prime factors of a positive integer.") for n%2 == 0 { // First, get all factors 2 factors = append(factors, 2) n = n / 2 } for i := 3; i*i <= n; i = i + 2 { // Since n is odd now, we can skip even divisors for n%i == 0 { // while i divides n, append i and divide n factors = append(factors, i) n = n / i } } if n > 1 || len(factors) == 0 { // Add any remaining factor factors = append(factors, n) } return } // shortcut for slicing unaddressable_vector()[:] func slice(v [3]float64) []float64 { return v[:] } func unslice(v []float64) [3]float64 { util.Assert(len(v) == 3) return [3]float64{v[0], v[1], v[2]} } func assureGPU(s *data.Slice) *data.Slice { if s.GPUAccess() { return s 
} else { return cuda.GPUCopy(s) } } type caseIndep []string func (s *caseIndep) Len() int { return len(*s) } func (s *caseIndep) Less(i, j int) bool { return strings.ToLower((*s)[i]) < strings.ToLower((*s)[j]) } func (s *caseIndep) Swap(i, j int) { (*s)[i], (*s)[j] = (*s)[j], (*s)[i] } func sortNoCase(s []string) { i := caseIndep(s) sort.Sort(&i) } func checkNaN1(x float64) { if math.IsNaN(x) { panic("NaN") } } // trim trailing newlines func rmln(a string) string { for strings.HasSuffix(a, "\n") { a = a[:len(a)-1] } return a } const ( X = 0 Y = 1 Z = 2 ) const ( SCALAR = 1 VECTOR = 3 ) 3-3.11.1/engine/zeeman.go000066400000000000000000000007331503346766200150310ustar00rootroot00000000000000package engine var ( B_ext = NewExcitation("B_ext", "T", "Externally applied field") Edens_zeeman = NewScalarField("Edens_Zeeman", "J/m3", "Zeeman energy density", AddEdens_zeeman) E_Zeeman = NewScalarValue("E_Zeeman", "J", "Zeeman energy", GetZeemanEnergy) ) var AddEdens_zeeman = makeEdensAdder(B_ext, -1) func init() { registerEnergy(GetZeemanEnergy, AddEdens_zeeman) } func GetZeemanEnergy() float64 { return -1 * cellVolume() * dot(&M_full, B_ext) } 3-3.11.1/freetype/000077500000000000000000000000001503346766200135765ustar00rootroot000000000000003-3.11.1/freetype/AUTHORS000066400000000000000000000011401503346766200146420ustar00rootroot00000000000000# This is the official list of Freetype-Go authors for copyright purposes. # This file is distinct from the CONTRIBUTORS files. # See the latter for an explanation. # # Freetype-Go is derived from Freetype, which is written in C. The latter # is copyright 1996-2010 David Turner, Robert Wilhelm, and Werner Lemberg. # Names should be added to this file as # Name or Organization # The email address is not required for organizations. # Please keep the list sorted. Google Inc. Jeff R. 
Allen Rémy Oudompheng Roger Peppe 3-3.11.1/freetype/CONTRIBUTORS000066400000000000000000000025771503346766200154710ustar00rootroot00000000000000# This is the official list of people who can contribute # (and typically have contributed) code to the Freetype-Go repository. # The AUTHORS file lists the copyright holders; this file # lists people. For example, Google employees are listed here # but not in AUTHORS, because Google holds the copyright. # # The submission process automatically checks to make sure # that people submitting code are listed in this file (by email address). # # Names should be added to this file only after verifying that # the individual or the individual's organization has agreed to # the appropriate Contributor License Agreement, found here: # # http://code.google.com/legal/individual-cla-v1.0.html # http://code.google.com/legal/corporate-cla-v1.0.html # # The agreement for individuals can be filled out on the web. # # When adding J Random Contributor's name to this file, # either J's name or J's organization's name should be # added to the AUTHORS file, depending on whether the # individual or corporate CLA was used. # Names should be added to this file like so: # Name # Please keep the list sorted. Andrew Gerrand Jeff R. Allen Nigel Tao Rémy Oudompheng Rob Pike Roger Peppe Russ Cox 3-3.11.1/freetype/LICENSE000066400000000000000000000010651503346766200146050ustar00rootroot00000000000000Use of the Freetype-Go software is subject to your choice of exactly one of the following two licenses: * The FreeType License, which is similar to the original BSD license with an advertising clause, or * The GNU General Public License (GPL), version 2 or later. The text of these licenses are available in the licenses/ftl.txt and the licenses/gpl.txt files respectively. They are also available at http://freetype.sourceforge.net/license.html The Luxi fonts in the testdata directory are licensed separately. See the testdata/COPYING file for details. 
3-3.11.1/freetype/README000066400000000000000000000016041503346766200144570ustar00rootroot00000000000000This is a port of the Freetype font rasterizer (www.freetype.org) to the Go programming language (golang.org). To download and install from source: $ go get code.google.com/p/freetype-go/freetype It is an incomplete port: * It only supports TrueType fonts, and not Type 1 fonts nor bitmap fonts. * It only supports the Unicode encoding. There are also some implementation differences: * It uses a 24.8 fixed point co-ordinate system everywhere internally, as opposed to the original Freetype's mix of 26.6 (or 10.6 for 16-bit systems) in some places, and 24.8 in the "smooth" rasterizer. Freetype-Go is derived from Freetype, which is written in C. Freetype is copyright 1996-2010 David Turner, Robert Wilhelm, and Werner Lemberg. Freetype-Go is copyright The Freetype-Go Authors, who are listed in the AUTHORS file. The Freetype-Go homepage is http://code.google.com/p/freetype-go/ 3-3.11.1/freetype/raster/000077500000000000000000000000001503346766200150765ustar00rootroot000000000000003-3.11.1/freetype/raster/geom.go000066400000000000000000000173531503346766200163650ustar00rootroot00000000000000// Copyright 2010 The Freetype-Go Authors. All rights reserved. // Use of this source code is governed by your choice of either the // FreeType License or the GNU General Public License version 2 (or // any later version), both of which can be found in the LICENSE file. package raster import ( "fmt" "math" ) // A Fix32 is a 24.8 fixed point number. type Fix32 int32 // A Fix64 is a 48.16 fixed point number. type Fix64 int64 // String returns a human-readable representation of a 24.8 fixed point number. // For example, the number one-and-a-quarter becomes "1:064". 
func (x Fix32) String() string { if x < 0 { x = -x return fmt.Sprintf("-%d:%03d", int32(x/256), int32(x%256)) } return fmt.Sprintf("%d:%03d", int32(x/256), int32(x%256)) } // String returns a human-readable representation of a 48.16 fixed point number. // For example, the number one-and-a-quarter becomes "1:16384". func (x Fix64) String() string { if x < 0 { x = -x return fmt.Sprintf("-%d:%05d", int64(x/65536), int64(x%65536)) } return fmt.Sprintf("%d:%05d", int64(x/65536), int64(x%65536)) } // maxAbs returns the maximum of abs(a) and abs(b). func maxAbs(a, b Fix32) Fix32 { if a < 0 { a = -a } if b < 0 { b = -b } if a < b { return b } return a } // A Point represents a two-dimensional point or vector, in 24.8 fixed point // format. type Point struct { X, Y Fix32 } // String returns a human-readable representation of a Point. func (p Point) String() string { return "(" + p.X.String() + ", " + p.Y.String() + ")" } // Add returns the vector p + q. func (p Point) Add(q Point) Point { return Point{p.X + q.X, p.Y + q.Y} } // Sub returns the vector p - q. func (p Point) Sub(q Point) Point { return Point{p.X - q.X, p.Y - q.Y} } // Mul returns the vector k * p. func (p Point) Mul(k Fix32) Point { return Point{p.X * k / 256, p.Y * k / 256} } // Neg returns the vector -p, or equivalently p rotated by 180 degrees. func (p Point) Neg() Point { return Point{-p.X, -p.Y} } // Dot returns the dot product p·q. func (p Point) Dot(q Point) Fix64 { px, py := int64(p.X), int64(p.Y) qx, qy := int64(q.X), int64(q.Y) return Fix64(px*qx + py*qy) } // Len returns the length of the vector p. func (p Point) Len() Fix32 { // TODO(nigeltao): use fixed point math. x := float64(p.X) y := float64(p.Y) return Fix32(math.Sqrt(x*x + y*y)) } // Norm returns the vector p normalized to the given length, or the zero Point // if p is degenerate. 
func (p Point) Norm(length Fix32) Point { d := p.Len() if d == 0 { return Point{0, 0} } s, t := int64(length), int64(d) x := int64(p.X) * s / t y := int64(p.Y) * s / t return Point{Fix32(x), Fix32(y)} } // Rot45CW returns the vector p rotated clockwise by 45 degrees. // Note that the Y-axis grows downwards, so {1, 0}.Rot45CW is {1/√2, 1/√2}. func (p Point) Rot45CW() Point { // 181/256 is approximately 1/√2, or sin(π/4). px, py := int64(p.X), int64(p.Y) qx := (+px - py) * 181 / 256 qy := (+px + py) * 181 / 256 return Point{Fix32(qx), Fix32(qy)} } // Rot90CW returns the vector p rotated clockwise by 90 degrees. // Note that the Y-axis grows downwards, so {1, 0}.Rot90CW is {0, 1}. func (p Point) Rot90CW() Point { return Point{-p.Y, p.X} } // Rot135CW returns the vector p rotated clockwise by 135 degrees. // Note that the Y-axis grows downwards, so {1, 0}.Rot135CW is {-1/√2, 1/√2}. func (p Point) Rot135CW() Point { // 181/256 is approximately 1/√2, or sin(π/4). px, py := int64(p.X), int64(p.Y) qx := (-px - py) * 181 / 256 qy := (+px - py) * 181 / 256 return Point{Fix32(qx), Fix32(qy)} } // Rot45CCW returns the vector p rotated counter-clockwise by 45 degrees. // Note that the Y-axis grows downwards, so {1, 0}.Rot45CCW is {1/√2, -1/√2}. func (p Point) Rot45CCW() Point { // 181/256 is approximately 1/√2, or sin(π/4). px, py := int64(p.X), int64(p.Y) qx := (+px + py) * 181 / 256 qy := (-px + py) * 181 / 256 return Point{Fix32(qx), Fix32(qy)} } // Rot90CCW returns the vector p rotated counter-clockwise by 90 degrees. // Note that the Y-axis grows downwards, so {1, 0}.Rot90CCW is {0, -1}. func (p Point) Rot90CCW() Point { return Point{p.Y, -p.X} } // Rot135CCW returns the vector p rotated counter-clockwise by 135 degrees. // Note that the Y-axis grows downwards, so {1, 0}.Rot135CCW is {-1/√2, -1/√2}. func (p Point) Rot135CCW() Point { // 181/256 is approximately 1/√2, or sin(π/4). 
px, py := int64(p.X), int64(p.Y) qx := (-px + py) * 181 / 256 qy := (-px - py) * 181 / 256 return Point{Fix32(qx), Fix32(qy)} } // An Adder accumulates points on a curve. type Adder interface { // Start starts a new curve at the given point. Start(a Point) // Add1 adds a linear segment to the current curve. Add1(b Point) // Add2 adds a quadratic segment to the current curve. Add2(b, c Point) // Add3 adds a cubic segment to the current curve. Add3(b, c, d Point) } // A Path is a sequence of curves, and a curve is a start point followed by a // sequence of linear, quadratic or cubic segments. type Path []Fix32 // String returns a human-readable representation of a Path. func (p Path) String() string { s := "" for i := 0; i < len(p); { if i != 0 { s += " " } switch p[i] { case 0: s += "S0" + fmt.Sprint([]Fix32(p[i+1:i+3])) i += 4 case 1: s += "A1" + fmt.Sprint([]Fix32(p[i+1:i+3])) i += 4 case 2: s += "A2" + fmt.Sprint([]Fix32(p[i+1:i+5])) i += 6 case 3: s += "A3" + fmt.Sprint([]Fix32(p[i+1:i+7])) i += 8 default: panic("freetype/raster: bad path") } } return s } // grow adds n elements to p. func (p *Path) grow(n int) { n += len(*p) if n > cap(*p) { old := *p *p = make([]Fix32, n, 2*n+8) copy(*p, old) return } *p = (*p)[0:n] } // Clear cancels any previous calls to p.Start or p.AddXxx. func (p *Path) Clear() { *p = (*p)[0:0] } // Start starts a new curve at the given point. func (p *Path) Start(a Point) { n := len(*p) p.grow(4) (*p)[n] = 0 (*p)[n+1] = a.X (*p)[n+2] = a.Y (*p)[n+3] = 0 } // Add1 adds a linear segment to the current curve. func (p *Path) Add1(b Point) { n := len(*p) p.grow(4) (*p)[n] = 1 (*p)[n+1] = b.X (*p)[n+2] = b.Y (*p)[n+3] = 1 } // Add2 adds a quadratic segment to the current curve. func (p *Path) Add2(b, c Point) { n := len(*p) p.grow(6) (*p)[n] = 2 (*p)[n+1] = b.X (*p)[n+2] = b.Y (*p)[n+3] = c.X (*p)[n+4] = c.Y (*p)[n+5] = 2 } // Add3 adds a cubic segment to the current curve. 
func (p *Path) Add3(b, c, d Point) { n := len(*p) p.grow(8) (*p)[n] = 3 (*p)[n+1] = b.X (*p)[n+2] = b.Y (*p)[n+3] = c.X (*p)[n+4] = c.Y (*p)[n+5] = d.X (*p)[n+6] = d.Y (*p)[n+7] = 3 } // AddPath adds the Path q to p. func (p *Path) AddPath(q Path) { n, m := len(*p), len(q) p.grow(m) copy((*p)[n:n+m], q) } // AddStroke adds a stroked Path. func (p *Path) AddStroke(q Path, width Fix32, cr Capper, jr Joiner) { Stroke(p, q, width, cr, jr) } // firstPoint returns the first point in a non-empty Path. func (p Path) firstPoint() Point { return Point{p[1], p[2]} } // lastPoint returns the last point in a non-empty Path. func (p Path) lastPoint() Point { return Point{p[len(p)-3], p[len(p)-2]} } // addPathReversed adds q reversed to p. // For example, if q consists of a linear segment from A to B followed by a // quadratic segment from B to C to D, then the values of q looks like: // index: 01234567890123 // value: 0AA01BB12CCDD2 // So, when adding q backwards to p, we want to Add2(C, B) followed by Add1(A). func addPathReversed(p Adder, q Path) { if len(q) == 0 { return } i := len(q) - 1 for { switch q[i] { case 0: return case 1: i -= 4 p.Add1(Point{q[i-2], q[i-1]}) case 2: i -= 6 p.Add2(Point{q[i+2], q[i+3]}, Point{q[i-2], q[i-1]}) case 3: i -= 8 p.Add3(Point{q[i+4], q[i+5]}, Point{q[i+2], q[i+3]}, Point{q[i-2], q[i-1]}) default: panic("freetype/raster: bad path") } } } 3-3.11.1/freetype/raster/paint.go000066400000000000000000000171451503346766200165500ustar00rootroot00000000000000// Copyright 2010 The Freetype-Go Authors. All rights reserved. // Use of this source code is governed by your choice of either the // FreeType License or the GNU General Public License version 2 (or // any later version), both of which can be found in the LICENSE file. package raster import ( "image" "image/color" "image/draw" "math" ) // A Span is a horizontal segment of pixels with constant alpha. X0 is an // inclusive bound and X1 is exclusive, the same as for slices. 
// A fully opaque Span has A == 1<<32 - 1.
type Span struct {
	Y, X0, X1 int
	A         uint32
}

// A Painter consumes batches of Spans. A single rasterization may deliver
// several batches; done is true only for the last one. Within one
// rasterization the Spans' Y values never decrease. An implementation is
// free to overwrite ss while Paint is running.
type Painter interface {
	Paint(ss []Span, done bool)
}

// PainterFunc lets a plain function be used wherever a Painter is expected.
type PainterFunc func(ss []Span, done bool)

// Paint calls f with the same arguments.
func (f PainterFunc) Paint(ss []Span, done bool) {
	f(ss, done)
}

// An AlphaOverPainter paints Spans onto an image.Alpha, combining each Span
// with the existing pixels using the Porter-Duff Over operator.
type AlphaOverPainter struct {
	Image *image.Alpha
}

// Paint implements Painter. Spans above the image are skipped, painting stops
// at the first Span at or below the bottom edge, and each remaining Span is
// clipped horizontally to the image bounds.
func (r AlphaOverPainter) Paint(ss []Span, done bool) {
	dst := r.Image
	clip := dst.Bounds()
	for _, s := range ss {
		switch {
		case s.Y < clip.Min.Y:
			continue
		case s.Y >= clip.Max.Y:
			return
		}
		if s.X0 < clip.Min.X {
			s.X0 = clip.Min.X
		}
		if s.X1 > clip.Max.X {
			s.X1 = clip.Max.X
		}
		if s.X0 >= s.X1 {
			continue
		}
		// rowOff maps an absolute X co-ordinate on row s.Y to an index in Pix.
		rowOff := (s.Y-dst.Rect.Min.Y)*dst.Stride - dst.Rect.Min.X
		row := dst.Pix[rowOff+s.X0 : rowOff+s.X1]
		// Only the top 8 bits of the 32-bit alpha are used.
		srcA := int(s.A >> 24)
		for i := range row {
			dstA := int(row[i])
			row[i] = uint8((dstA*255 + (255-dstA)*srcA) / 255)
		}
	}
}

// NewAlphaOverPainter returns an AlphaOverPainter that paints onto m.
func NewAlphaOverPainter(m *image.Alpha) AlphaOverPainter {
	return AlphaOverPainter{Image: m}
}

// An AlphaSrcPainter paints Spans onto an image.Alpha, replacing the existing
// pixels as per the Porter-Duff Src operator.
type AlphaSrcPainter struct {
	Image *image.Alpha
}

// Paint satisfies the Painter interface by painting ss onto an image.Alpha.
func (r AlphaSrcPainter) Paint(ss []Span, done bool) { b := r.Image.Bounds() for _, s := range ss { if s.Y < b.Min.Y { continue } if s.Y >= b.Max.Y { return } if s.X0 < b.Min.X { s.X0 = b.Min.X } if s.X1 > b.Max.X { s.X1 = b.Max.X } if s.X0 >= s.X1 { continue } base := (s.Y-r.Image.Rect.Min.Y)*r.Image.Stride - r.Image.Rect.Min.X p := r.Image.Pix[base+s.X0 : base+s.X1] color := uint8(s.A >> 24) for i := range p { p[i] = color } } } // NewAlphaSrcPainter creates a new AlphaSrcPainter for the given image. func NewAlphaSrcPainter(m *image.Alpha) AlphaSrcPainter { return AlphaSrcPainter{m} } type RGBAPainter struct { // The image to compose onto. Image *image.RGBA // The Porter-Duff composition operator. Op draw.Op // The 16-bit color to paint the spans. cr, cg, cb, ca uint32 } // Paint satisfies the Painter interface by painting ss onto an image.RGBA. func (r *RGBAPainter) Paint(ss []Span, done bool) { b := r.Image.Bounds() for _, s := range ss { if s.Y < b.Min.Y { continue } if s.Y >= b.Max.Y { return } if s.X0 < b.Min.X { s.X0 = b.Min.X } if s.X1 > b.Max.X { s.X1 = b.Max.X } if s.X0 >= s.X1 { continue } // This code is similar to drawGlyphOver in $GOROOT/src/pkg/image/draw/draw.go. 
ma := s.A >> 16 const m = 1<<16 - 1 i0 := (s.Y-r.Image.Rect.Min.Y)*r.Image.Stride + (s.X0-r.Image.Rect.Min.X)*4 i1 := i0 + (s.X1-s.X0)*4 if r.Op == draw.Over { for i := i0; i < i1; i += 4 { dr := uint32(r.Image.Pix[i+0]) dg := uint32(r.Image.Pix[i+1]) db := uint32(r.Image.Pix[i+2]) da := uint32(r.Image.Pix[i+3]) a := (m - (r.ca * ma / m)) * 0x101 r.Image.Pix[i+0] = uint8((dr*a + r.cr*ma) / m >> 8) r.Image.Pix[i+1] = uint8((dg*a + r.cg*ma) / m >> 8) r.Image.Pix[i+2] = uint8((db*a + r.cb*ma) / m >> 8) r.Image.Pix[i+3] = uint8((da*a + r.ca*ma) / m >> 8) } } else { for i := i0; i < i1; i += 4 { r.Image.Pix[i+0] = uint8(r.cr * ma / m >> 8) r.Image.Pix[i+1] = uint8(r.cg * ma / m >> 8) r.Image.Pix[i+2] = uint8(r.cb * ma / m >> 8) r.Image.Pix[i+3] = uint8(r.ca * ma / m >> 8) } } } } // SetColor sets the color to paint the spans. func (r *RGBAPainter) SetColor(c color.Color) { r.cr, r.cg, r.cb, r.ca = c.RGBA() } // NewRGBAPainter creates a new RGBAPainter for the given image. func NewRGBAPainter(m *image.RGBA) *RGBAPainter { return &RGBAPainter{Image: m} } // A MonochromePainter wraps another Painter, quantizing each Span's alpha to // be either fully opaque or fully transparent. type MonochromePainter struct { Painter Painter y, x0, x1 int } // Paint delegates to the wrapped Painter after quantizing each Span's alpha // value and merging adjacent fully opaque Spans. func (m *MonochromePainter) Paint(ss []Span, done bool) { // We compact the ss slice, discarding any Spans whose alpha quantizes to zero. j := 0 for _, s := range ss { if s.A >= 1<<31 { if m.y == s.Y && m.x1 == s.X0 { m.x1 = s.X1 } else { ss[j] = Span{m.y, m.x0, m.x1, 1<<32 - 1} j++ m.y, m.x0, m.x1 = s.Y, s.X0, s.X1 } } } if done { // Flush the accumulated Span. 
finalSpan := Span{m.y, m.x0, m.x1, 1<<32 - 1} if j < len(ss) { ss[j] = finalSpan j++ m.Painter.Paint(ss[0:j], true) } else if j == len(ss) { m.Painter.Paint(ss, false) if cap(ss) > 0 { ss = ss[0:1] } else { ss = make([]Span, 1) } ss[0] = finalSpan m.Painter.Paint(ss, true) } else { panic("unreachable") } // Reset the accumulator, so that this Painter can be re-used. m.y, m.x0, m.x1 = 0, 0, 0 } else { m.Painter.Paint(ss[0:j], false) } } // NewMonochromePainter creates a new MonochromePainter that wraps the given // Painter. func NewMonochromePainter(p Painter) *MonochromePainter { return &MonochromePainter{Painter: p} } // A GammaCorrectionPainter wraps another Painter, performing gamma-correction // on each Span's alpha value. type GammaCorrectionPainter struct { // The wrapped Painter. Painter Painter // Precomputed alpha values for linear interpolation, with fully opaque == 1<<16-1. a [256]uint16 // Whether gamma correction is a no-op. gammaIsOne bool } // Paint delegates to the wrapped Painter after performing gamma-correction // on each Span. func (g *GammaCorrectionPainter) Paint(ss []Span, done bool) { if !g.gammaIsOne { const ( M = 0x1010101 // 255*M == 1<<32-1 N = 0x8080 // N = M>>9, and N < 1<<16-1 ) for i, _ := range ss { if ss[i].A == 0 || ss[i].A == 1<<32-1 { continue } p, q := ss[i].A/M, (ss[i].A%M)>>9 // The resultant alpha is a linear interpolation of g.a[p] and g.a[p+1]. a := uint32(g.a[p])*(N-q) + uint32(g.a[p+1])*q a = (a + N/2) / N // Convert the alpha from 16-bit (which is g.a's range) to 32-bit. a |= a << 16 ss[i].A = a } } g.Painter.Paint(ss, done) } // SetGamma sets the gamma value. func (g *GammaCorrectionPainter) SetGamma(gamma float64) { if gamma == 1.0 { g.gammaIsOne = true return } g.gammaIsOne = false for i := 0; i < 256; i++ { a := float64(i) / 0xff a = math.Pow(a, gamma) g.a[i] = uint16(0xffff * a) } } // NewGammaCorrectionPainter creates a new GammaCorrectionPainter that wraps // the given Painter. 
func NewGammaCorrectionPainter(p Painter, gamma float64) *GammaCorrectionPainter { g := &GammaCorrectionPainter{Painter: p} g.SetGamma(gamma) return g } 3-3.11.1/freetype/raster/raster.go000066400000000000000000000351321503346766200167310ustar00rootroot00000000000000// Copyright 2010 The Freetype-Go Authors. All rights reserved. // Use of this source code is governed by your choice of either the // FreeType License or the GNU General Public License version 2 (or // any later version), both of which can be found in the LICENSE file. // The raster package provides an anti-aliasing 2-D rasterizer. // // It is part of the larger Freetype-Go suite of font-related packages, // but the raster package is not specific to font rasterization, and can // be used standalone without any other Freetype-Go package. // // Rasterization is done by the same area/coverage accumulation algorithm // as the Freetype "smooth" module, and the Anti-Grain Geometry library. // A description of the area/coverage algorithm is at // http://projects.tuxee.net/cl-vectors/section-the-cl-aa-algorithm package raster import ( "strconv" ) // A cell is part of a linked list (for a given yi co-ordinate) of accumulated // area/coverage for the pixel at (xi, yi). type cell struct { xi int area, cover int next int } type Rasterizer struct { // If false, the default behavior is to use the even-odd winding fill // rule during Rasterize. UseNonZeroWinding bool // An offset (in pixels) to the painted spans. Dx, Dy int // The width of the Rasterizer. The height is implicit in len(cellIndex). width int // splitScaleN is the scaling factor used to determine how many times // to decompose a quadratic or cubic segment into a linear approximation. splitScale2, splitScale3 int // The current pen position. a Point // The current cell and its area/coverage being accumulated. xi, yi int area, cover int // Saved cells. cell []cell // Linked list of cells, one per row. cellIndex []int // Buffers. 
cellBuf      [256]cell
	cellIndexBuf [64]int
	spanBuf      [64]Span
}

// findCell returns the index in r.cell for the cell corresponding to
// (r.xi, r.yi). The cell is created if necessary.
// It returns -1 if r.yi lies outside the rows tracked by r.cellIndex.
func (r *Rasterizer) findCell() int {
	if r.yi < 0 || r.yi >= len(r.cellIndex) {
		return -1
	}
	// Clamp the column: everything left of the raster shares column -1 and
	// everything beyond the right edge shares column r.width.
	xi := r.xi
	if xi < 0 {
		xi = -1
	} else if xi > r.width {
		xi = r.width
	}
	// Walk this row's linked list, which is kept in ascending xi order,
	// remembering the predecessor in case a new cell has to be linked in.
	i, prev := r.cellIndex[r.yi], -1
	for i != -1 && r.cell[i].xi <= xi {
		if r.cell[i].xi == xi {
			return i
		}
		i, prev = r.cell[i].next, i
	}
	// Not found: append a new cell, growing the backing array fourfold when
	// it is full.
	c := len(r.cell)
	if c == cap(r.cell) {
		buf := make([]cell, c, 4*c)
		copy(buf, r.cell)
		r.cell = buf[0 : c+1]
	} else {
		r.cell = r.cell[0 : c+1]
	}
	// Link the new cell between prev and i.
	r.cell[c] = cell{xi, 0, 0, i}
	if prev == -1 {
		r.cellIndex[r.yi] = c
	} else {
		r.cell[prev].next = c
	}
	return c
}

// saveCell saves any accumulated r.area/r.cover for (r.xi, r.yi).
// The accumulators are reset afterwards; contributions for rows that
// findCell rejects (index -1) are discarded.
func (r *Rasterizer) saveCell() {
	if r.area != 0 || r.cover != 0 {
		i := r.findCell()
		if i != -1 {
			r.cell[i].area += r.area
			r.cell[i].cover += r.cover
		}
		r.area = 0
		r.cover = 0
	}
}

// setCell sets the (xi, yi) cell that r is accumulating area/coverage for.
// If the cell changes, the pending accumulation is saved first.
func (r *Rasterizer) setCell(xi, yi int) {
	if r.xi != xi || r.yi != yi {
		r.saveCell()
		r.xi, r.yi = xi, yi
	}
}

// scan accumulates area/coverage for the yi'th scanline, going from
// x0 to x1 in the horizontal direction (in 24.8 fixed point co-ordinates)
// and from y0f to y1f fractional vertical units within that scanline.
func (r *Rasterizer) scan(yi int, x0, y0f, x1, y1f Fix32) {
	// Break the 24.8 fixed point X co-ordinates into integral and fractional parts.
	x0i := int(x0) / 256
	x0f := x0 - Fix32(256*x0i)
	x1i := int(x1) / 256
	x1f := x1 - Fix32(256*x1i)

	// A perfectly horizontal scan.
	// It contributes no area or cover; only the current cell moves.
	if y0f == y1f {
		r.setCell(x1i, yi)
		return
	}
	dx, dy := x1-x0, y1f-y0f
	// A single cell scan.
	if x0i == x1i {
		r.area += int((x0f + x1f) * dy)
		r.cover += int(dy)
		return
	}
	// There are at least two cells. Apart from the first and last cells,
	// all intermediate cells go through the full width of the cell,
	// or 256 units in 24.8 fixed point format.
	var (
		p, q, edge0, edge1 Fix32
		xiDelta            int
	)
	// edge0 and edge1 are the fractional X positions at which the segment
	// enters and leaves a cell, depending on its horizontal direction.
	if dx > 0 {
		p, q = (256-x0f)*dy, dx
		edge0, edge1, xiDelta = 0, 256, 1
	} else {
		p, q = x0f*dy, -dx
		edge0, edge1, xiDelta = 256, 0, -1
	}
	// yDelta is the vertical advance across the first cell. Go's / and %
	// truncate towards zero, so adjust to obtain a non-negative remainder.
	yDelta, yRem := p/q, p%q
	if yRem < 0 {
		yDelta -= 1
		yRem += q
	}
	// Do the first cell.
	xi, y := x0i, y0f
	r.area += int((x0f + edge1) * yDelta)
	r.cover += int(yDelta)
	xi, y = xi+xiDelta, y+yDelta
	r.setCell(xi, yi)
	if xi != x1i {
		// Do all the intermediate cells.
		// fullDelta/fullRem is the vertical advance across one full-width
		// cell; yRem carries the accumulated remainder from cell to cell.
		p = 256 * (y1f - y + yDelta)
		fullDelta, fullRem := p/q, p%q
		if fullRem < 0 {
			fullDelta -= 1
			fullRem += q
		}
		yRem -= q
		for xi != x1i {
			yDelta = fullDelta
			yRem += fullRem
			if yRem >= 0 {
				yDelta += 1
				yRem -= q
			}
			r.area += int(256 * yDelta)
			r.cover += int(yDelta)
			xi, y = xi+xiDelta, y+yDelta
			r.setCell(xi, yi)
		}
	}
	// Do the last cell.
	yDelta = y1f - y
	r.area += int((edge0 + x1f) * yDelta)
	r.cover += int(yDelta)
}

// Start starts a new curve at the given point.
func (r *Rasterizer) Start(a Point) {
	r.setCell(int(a.X/256), int(a.Y/256))
	r.a = a
}

// Add1 adds a linear segment to the current curve.
// The segment runs from the current pen position r.a to b, and b becomes the
// new pen position.
func (r *Rasterizer) Add1(b Point) {
	x0, y0 := r.a.X, r.a.Y
	x1, y1 := b.X, b.Y
	dx, dy := x1-x0, y1-y0
	// Break the 24.8 fixed point Y co-ordinates into integral and fractional parts.
	y0i := int(y0) / 256
	y0f := y0 - Fix32(256*y0i)
	y1i := int(y1) / 256
	y1f := y1 - Fix32(256*y1i)

	if y0i == y1i {
		// There is only one scanline.
		r.scan(y0i, x0, y0f, x1, y1f)
	} else if dx == 0 {
		// This is a vertical line segment. We avoid calling r.scan and instead
		// manipulate r.area and r.cover directly.
		var (
			edge0, edge1 Fix32
			yiDelta      int
		)
		if dy > 0 {
			edge0, edge1, yiDelta = 0, 256, 1
		} else {
			edge0, edge1, yiDelta = 256, 0, -1
		}
		x0i, yi := int(x0)/256, y0i
		x0fTimes2 := (int(x0) - (256 * x0i)) * 2
		// Do the first pixel.
		dcover := int(edge1 - y0f)
		darea := int(x0fTimes2 * dcover)
		r.area += darea
		r.cover += dcover
		yi += yiDelta
		r.setCell(x0i, yi)
		// Do all the intermediate pixels.
		// Each one spans the full row height, so its contribution is constant.
		dcover = int(edge1 - edge0)
		darea = int(x0fTimes2 * dcover)
		for yi != y1i {
			r.area += darea
			r.cover += dcover
			yi += yiDelta
			r.setCell(x0i, yi)
		}
		// Do the last pixel.
		dcover = int(y1f - edge0)
		darea = int(x0fTimes2 * dcover)
		r.area += darea
		r.cover += dcover
	} else {
		// There are at least two scanlines. Apart from the first and last scanlines,
		// all intermediate scanlines go through the full height of the row, or 256
		// units in 24.8 fixed point format.
		var (
			p, q, edge0, edge1 Fix32
			yiDelta            int
		)
		if dy > 0 {
			p, q = (256-y0f)*dx, dy
			edge0, edge1, yiDelta = 0, 256, 1
		} else {
			p, q = y0f*dx, -dy
			edge0, edge1, yiDelta = 256, 0, -1
		}
		// xDelta is the horizontal advance across the first scanline, with
		// the remainder adjusted to be non-negative.
		xDelta, xRem := p/q, p%q
		if xRem < 0 {
			xDelta -= 1
			xRem += q
		}
		// Do the first scanline.
		x, yi := x0, y0i
		r.scan(yi, x, y0f, x+xDelta, edge1)
		x, yi = x+xDelta, yi+yiDelta
		r.setCell(int(x)/256, yi)
		if yi != y1i {
			// Do all the intermediate scanlines.
			p = 256 * dx
			fullDelta, fullRem := p/q, p%q
			if fullRem < 0 {
				fullDelta -= 1
				fullRem += q
			}
			xRem -= q
			for yi != y1i {
				xDelta = fullDelta
				xRem += fullRem
				if xRem >= 0 {
					xDelta += 1
					xRem -= q
				}
				r.scan(yi, x, edge0, x+xDelta, edge1)
				x, yi = x+xDelta, yi+yiDelta
				r.setCell(int(x)/256, yi)
			}
		}
		// Do the last scanline.
		r.scan(yi, x, edge0, x1, y1f)
	}
	// The next lineTo starts from b.
	r.a = b
}

// Add2 adds a quadratic segment to the current curve.
// The curve is flattened into linear segments, which are added via Add1.
func (r *Rasterizer) Add2(b, c Point) {
	// Calculate nSplit (the number of recursive decompositions) based on how `curvy' it is.
	// Specifically, how much the middle point b deviates from (a+c)/2.
	dev := maxAbs(r.a.X-2*b.X+c.X, r.a.Y-2*b.Y+c.Y) / Fix32(r.splitScale2)
	nsplit := 0
	for dev > 0 {
		dev /= 4
		nsplit++
	}
	// dev is 32-bit, and nsplit++ every time we shift off 2 bits, so maxNsplit is 16.
	const maxNsplit = 16
	if nsplit > maxNsplit {
		panic("freetype/raster: Add2 nsplit too large: " + strconv.Itoa(nsplit))
	}
	// Recursively decompose the curve nSplit levels deep.
	// The recursion is done with explicit stacks: pStack holds control points
	// (in reverse order, so the curve's end point is p[0]) and sStack holds
	// the number of splits still to do at each level.
	var (
		pStack [2*maxNsplit + 3]Point
		sStack [maxNsplit + 1]int
		i      int
	)
	sStack[0] = nsplit
	pStack[0] = c
	pStack[1] = b
	pStack[2] = r.a
	for i >= 0 {
		s := sStack[i]
		p := pStack[2*i:]
		if s > 0 {
			// Split the quadratic curve p[0:3] into an equivalent set of two shorter curves:
			// p[0:3] and p[2:5]. The new p[4] is the old p[2], and p[0] is unchanged.
			mx := p[1].X
			p[4].X = p[2].X
			p[3].X = (p[4].X + mx) / 2
			p[1].X = (p[0].X + mx) / 2
			p[2].X = (p[1].X + p[3].X) / 2
			my := p[1].Y
			p[4].Y = p[2].Y
			p[3].Y = (p[4].Y + my) / 2
			p[1].Y = (p[0].Y + my) / 2
			p[2].Y = (p[1].Y + p[3].Y) / 2
			// The two shorter curves have one less split to do.
			sStack[i] = s - 1
			sStack[i+1] = s - 1
			i++
		} else {
			// Replace the level-0 quadratic with a two-linear-piece approximation.
			midx := (p[0].X + 2*p[1].X + p[2].X) / 4
			midy := (p[0].Y + 2*p[1].Y + p[2].Y) / 4
			r.Add1(Point{midx, midy})
			r.Add1(p[0])
			i--
		}
	}
}

// Add3 adds a cubic segment to the current curve.
// The curve is flattened into linear segments, which are added via Add1.
func (r *Rasterizer) Add3(b, c, d Point) {
	// Calculate nSplit (the number of recursive decompositions) based on how `curvy' it is.
	dev2 := maxAbs(r.a.X-3*(b.X+c.X)+d.X, r.a.Y-3*(b.Y+c.Y)+d.Y) / Fix32(r.splitScale2)
	dev3 := maxAbs(r.a.X-2*b.X+d.X, r.a.Y-2*b.Y+d.Y) / Fix32(r.splitScale3)
	nsplit := 0
	for dev2 > 0 || dev3 > 0 {
		dev2 /= 8
		dev3 /= 4
		nsplit++
	}
	// devN is 32-bit, and nsplit++ every time we shift off 2 bits, so maxNsplit is 16.
	const maxNsplit = 16
	if nsplit > maxNsplit {
		panic("freetype/raster: Add3 nsplit too large: " + strconv.Itoa(nsplit))
	}
	// Recursively decompose the curve nSplit levels deep.
	// As in Add2, explicit stacks replace recursion and control points are
	// stored in reverse order, so the curve's end point is p[0].
	var (
		pStack [3*maxNsplit + 4]Point
		sStack [maxNsplit + 1]int
		i      int
	)
	sStack[0] = nsplit
	pStack[0] = d
	pStack[1] = c
	pStack[2] = b
	pStack[3] = r.a
	for i >= 0 {
		s := sStack[i]
		p := pStack[3*i:]
		if s > 0 {
			// Split the cubic curve p[0:4] into an equivalent set of two shorter curves:
			// p[0:4] and p[3:7]. The new p[6] is the old p[3], and p[0] is unchanged.
			m01x := (p[0].X + p[1].X) / 2
			m12x := (p[1].X + p[2].X) / 2
			m23x := (p[2].X + p[3].X) / 2
			p[6].X = p[3].X
			p[5].X = m23x
			p[1].X = m01x
			p[2].X = (m01x + m12x) / 2
			p[4].X = (m12x + m23x) / 2
			p[3].X = (p[2].X + p[4].X) / 2
			m01y := (p[0].Y + p[1].Y) / 2
			m12y := (p[1].Y + p[2].Y) / 2
			m23y := (p[2].Y + p[3].Y) / 2
			p[6].Y = p[3].Y
			p[5].Y = m23y
			p[1].Y = m01y
			p[2].Y = (m01y + m12y) / 2
			p[4].Y = (m12y + m23y) / 2
			p[3].Y = (p[2].Y + p[4].Y) / 2
			// The two shorter curves have one less split to do.
			sStack[i] = s - 1
			sStack[i+1] = s - 1
			i++
		} else {
			// Replace the level-0 cubic with a two-linear-piece approximation.
			midx := (p[0].X + 3*(p[1].X+p[2].X) + p[3].X) / 8
			midy := (p[0].Y + 3*(p[1].Y+p[2].Y) + p[3].Y) / 8
			r.Add1(Point{midx, midy})
			r.Add1(p[0])
			i--
		}
	}
}

// AddPath adds the given Path.
// Each record of p is replayed through Start, Add1, Add2 or Add3 according to
// its leading kind value; any other kind value causes a panic.
func (r *Rasterizer) AddPath(p Path) {
	for i := 0; i < len(p); {
		switch p[i] {
		case 0:
			r.Start(Point{p[i+1], p[i+2]})
			i += 4
		case 1:
			r.Add1(Point{p[i+1], p[i+2]})
			i += 4
		case 2:
			r.Add2(Point{p[i+1], p[i+2]}, Point{p[i+3], p[i+4]})
			i += 6
		case 3:
			r.Add3(Point{p[i+1], p[i+2]}, Point{p[i+3], p[i+4]}, Point{p[i+5], p[i+6]})
			i += 8
		default:
			panic("freetype/raster: bad path")
		}
	}
}

// AddStroke adds a stroked Path.
// It delegates to Stroke with r as the destination.
func (r *Rasterizer) AddStroke(q Path, width Fix32, cr Capper, jr Joiner) {
	Stroke(r, q, width, cr, jr)
}

// Converts an area value to a uint32 alpha value. A completely filled pixel
// corresponds to an area of 256*256*2, and an alpha of 1<<32-1. The
// conversion of area values greater than this depends on the winding rule:
// even-odd or non-zero.
func (r *Rasterizer) areaToAlpha(area int) uint32 {
	// The C Freetype implementation (version 2.3.12) does "alpha := area>>1" without
	// the +1. Round-to-nearest gives a more symmetric result than round-down.
	// The C implementation also returns 8-bit alpha, not 32-bit alpha.
	a := (area + 1) >> 1
	if a < 0 {
		a = -a
	}
	alpha := uint32(a)
	if r.UseNonZeroWinding {
		// Non-zero winding: saturate at fully opaque.
		if alpha > 0xffff {
			alpha = 0xffff
		}
	} else {
		// Even-odd: coverage folds back down once it passes fully opaque.
		alpha &= 0x1ffff
		if alpha > 0x10000 {
			alpha = 0x20000 - alpha
		} else if alpha == 0x10000 {
			alpha = 0x0ffff
		}
	}
	// Replicate the 16-bit value into both halves of the 32-bit result.
	alpha |= alpha << 16
	return alpha
}

// Rasterize converts r's accumulated curves into Spans for p. The Spans
// passed to p are non-overlapping, and sorted by Y and then X. They all
// have non-zero width (and 0 <= X0 < X1 <= r.width) and non-zero A, except
// for the final Span, which has Y, X0, X1 and A all equal to zero.
//
// NOTE(review): no all-zero final Span is appended in the body below; the
// last batch is only marked by done == true. Confirm which is intended.
func (r *Rasterizer) Rasterize(p Painter) {
	r.saveCell()
	// s counts the Spans currently buffered in r.spanBuf.
	s := 0
	for yi := 0; yi < len(r.cellIndex); yi++ {
		xi, cover := 0, 0
		for c := r.cellIndex[yi]; c != -1; c = r.cell[c].next {
			// Emit the run of pixels between the previous cell and this one,
			// whose coverage is determined by the accumulated cover alone.
			if cover != 0 && r.cell[c].xi > xi {
				alpha := r.areaToAlpha(cover * 256 * 2)
				if alpha != 0 {
					xi0, xi1 := xi, r.cell[c].xi
					if xi0 < 0 {
						xi0 = 0
					}
					if xi1 >= r.width {
						xi1 = r.width
					}
					if xi0 < xi1 {
						r.spanBuf[s] = Span{yi + r.Dy, xi0 + r.Dx, xi1 + r.Dx, alpha}
						s++
					}
				}
			}
			// Emit the single pixel belonging to this cell.
			cover += r.cell[c].cover
			alpha := r.areaToAlpha(cover*256*2 - r.cell[c].area)
			xi = r.cell[c].xi + 1
			if alpha != 0 {
				xi0, xi1 := r.cell[c].xi, xi
				if xi0 < 0 {
					xi0 = 0
				}
				if xi1 >= r.width {
					xi1 = r.width
				}
				if xi0 < xi1 {
					r.spanBuf[s] = Span{yi + r.Dy, xi0 + r.Dx, xi1 + r.Dx, alpha}
					s++
				}
			}
			// Each iteration adds at most two Spans, so flush while at least
			// two slots are still guaranteed to be free.
			if s > len(r.spanBuf)-2 {
				p.Paint(r.spanBuf[0:s], false)
				s = 0
			}
		}
	}
	p.Paint(r.spanBuf[0:s], true)
}

// Clear cancels any previous calls to r.Start or r.AddXxx.
// The cell storage is kept for re-use and every row's list is emptied.
func (r *Rasterizer) Clear() {
	r.a = Point{0, 0}
	r.xi = 0
	r.yi = 0
	r.area = 0
	r.cover = 0
	r.cell = r.cell[0:0]
	for i := 0; i < len(r.cellIndex); i++ {
		r.cellIndex[i] = -1
	}
}

// SetBounds sets the maximum width and height of the rasterized image and
// calls Clear. The width and height are in pixels, not Fix32 units.
func (r *Rasterizer) SetBounds(width, height int) {
	if width < 0 {
		width = 0
	}
	if height < 0 {
		height = 0
	}
	// Use the same ssN heuristic as the C Freetype implementation.
// The C implementation uses the values 32, 16, but those are in // 26.6 fixed point units, and we use 24.8 fixed point everywhere. ss2, ss3 := 128, 64 if width > 24 || height > 24 { ss2, ss3 = 2*ss2, 2*ss3 if width > 120 || height > 120 { ss2, ss3 = 2*ss2, 2*ss3 } } r.width = width r.splitScale2 = ss2 r.splitScale3 = ss3 r.cell = r.cellBuf[0:0] if height > len(r.cellIndexBuf) { r.cellIndex = make([]int, height) } else { r.cellIndex = r.cellIndexBuf[0:height] } r.Clear() } // NewRasterizer creates a new Rasterizer with the given bounds. func NewRasterizer(width, height int) *Rasterizer { r := new(Rasterizer) r.SetBounds(width, height) return r } 3-3.11.1/freetype/raster/stroke.go000066400000000000000000000341711503346766200167420ustar00rootroot00000000000000// Copyright 2010 The Freetype-Go Authors. All rights reserved. // Use of this source code is governed by your choice of either the // FreeType License or the GNU General Public License version 2 (or // any later version), both of which can be found in the LICENSE file. package raster // Two points are considered practically equal if the square of the distance // between them is less than one quarter (i.e. 16384 / 65536 in Fix64). const epsilon = 16384 // A Capper signifies how to begin or end a stroked path. type Capper interface { // Cap adds a cap to p given a pivot point and the normal vector of a // terminal segment. The normal's length is half of the stroke width. Cap(p Adder, halfWidth Fix32, pivot, n1 Point) } // The CapperFunc type adapts an ordinary function to be a Capper. type CapperFunc func(Adder, Fix32, Point, Point) func (f CapperFunc) Cap(p Adder, halfWidth Fix32, pivot, n1 Point) { f(p, halfWidth, pivot, n1) } // A Joiner signifies how to join interior nodes of a stroked path. type Joiner interface { // Join adds a join to the two sides of a stroked path given a pivot // point and the normal vectors of the trailing and leading segments. 
// Both normals have length equal to half of the stroke width.
	Join(lhs, rhs Adder, halfWidth Fix32, pivot, n0, n1 Point)
}

// The JoinerFunc type adapts an ordinary function to be a Joiner.
type JoinerFunc func(lhs, rhs Adder, halfWidth Fix32, pivot, n0, n1 Point)

// Join calls f with the same arguments.
func (f JoinerFunc) Join(lhs, rhs Adder, halfWidth Fix32, pivot, n0, n1 Point) {
	f(lhs, rhs, halfWidth, pivot, n0, n1)
}

// RoundCapper adds round caps to a stroked path.
var RoundCapper Capper = CapperFunc(roundCapper)

// roundCapper adds a half-circle, built from two cubic segments, that runs
// from pivot-n1 via pivot+e to pivot+n1, where e is n1 rotated by 90 degrees
// counter-clockwise. halfWidth is not used.
func roundCapper(p Adder, halfWidth Fix32, pivot, n1 Point) {
	// The cubic Bézier approximation to a circle involves the magic number
	// (√2 - 1) * 4/3, which is approximately 141/256.
	const k = 141
	e := n1.Rot90CCW()
	side := pivot.Add(e)
	start, end := pivot.Sub(n1), pivot.Add(n1)
	// From here on d and e are the control point offsets: n1 and the rotated
	// normal, each scaled by k/256.
	d, e := n1.Mul(k), e.Mul(k)
	p.Add3(start.Add(e), side.Sub(d), side)
	p.Add3(side.Add(d), end.Add(e), end)
}

// ButtCapper adds butt caps to a stroked path.
var ButtCapper Capper = CapperFunc(buttCapper)

// buttCapper adds a single straight segment to pivot+n1. halfWidth is not
// used.
func buttCapper(p Adder, halfWidth Fix32, pivot, n1 Point) {
	p.Add1(pivot.Add(n1))
}

// SquareCapper adds square caps to a stroked path.
var SquareCapper Capper = CapperFunc(squareCapper)

// squareCapper adds three straight segments that pass through the two
// corners pivot+e-n1 and pivot+e+n1 and end at pivot+n1, where e is n1
// rotated by 90 degrees counter-clockwise. halfWidth is not used.
func squareCapper(p Adder, halfWidth Fix32, pivot, n1 Point) {
	e := n1.Rot90CCW()
	side := pivot.Add(e)
	p.Add1(side.Sub(n1))
	p.Add1(side.Add(n1))
	p.Add1(pivot.Add(n1))
}

// RoundJoiner adds round joins to a stroked path.
var RoundJoiner Joiner = JoinerFunc(roundJoiner)

// roundJoiner adds an arc around pivot to one side of the stroke and a
// straight segment to the other. The sign of the dot product between n0
// rotated 90 degrees clockwise and n1 selects which side receives the arc.
//
// NOTE(review): the parameter name haflWidth looks like a typo of halfWidth;
// the parameter is unused here.
func roundJoiner(lhs, rhs Adder, haflWidth Fix32, pivot, n0, n1 Point) {
	dot := n0.Rot90CW().Dot(n1)
	if dot >= 0 {
		addArc(lhs, pivot, n0, n1)
		rhs.Add1(pivot.Sub(n1))
	} else {
		lhs.Add1(pivot.Add(n1))
		addArc(rhs, pivot, n0.Neg(), n1.Neg())
	}
}

// BevelJoiner adds bevel joins to a stroked path.
var BevelJoiner Joiner = JoinerFunc(bevelJoiner)

// bevelJoiner adds a straight segment to each side of the stroke: to
// pivot+n1 on lhs and to pivot-n1 on rhs.
//
// NOTE(review): the parameter name haflWidth looks like a typo of halfWidth;
// the parameter is unused here.
func bevelJoiner(lhs, rhs Adder, haflWidth Fix32, pivot, n0, n1 Point) {
	lhs.Add1(pivot.Add(n1))
	rhs.Add1(pivot.Sub(n1))
}

// addArc adds a circular arc from pivot+n0 to pivot+n1 to p.
// The shorter of
// the two possible arcs is taken, i.e. the one spanning <= 180 degrees.
// The two vectors n0 and n1 must be of equal length.
func addArc(p Adder, pivot, n0, n1 Point) {
	// r2 is the square of the length of n0.
	r2 := n0.Dot(n0)
	if r2 < epsilon {
		// The arc radius is so small that we collapse to a straight line.
		p.Add1(pivot.Add(n1))
		return
	}
	// We approximate the arc by 0, 1, 2 or 3 45-degree quadratic segments plus
	// a final quadratic segment from s to n1. Each 45-degree segment has control
	// points {1, 0}, {1, tan(π/8)} and {1/√2, 1/√2} suitably scaled, rotated and
	// translated. tan(π/8) is approximately 106/256.
	const tpo8 = 106
	// s is the direction (relative to pivot) at which the final segment starts.
	var s Point
	// We determine which octant the angle between n0 and n1 is in via three dot products.
	// m0, m1 and m2 are n0 rotated clockwise by 45, 90 and 135 degrees.
	m0 := n0.Rot45CW()
	m1 := n0.Rot90CW()
	m2 := m0.Rot90CW()
	if m1.Dot(n1) >= 0 {
		// n1 is clockwise of n0.
		if n0.Dot(n1) >= 0 {
			if m2.Dot(n1) <= 0 {
				// n1 is between 0 and 45 degrees clockwise of n0.
				s = n0
			} else {
				// n1 is between 45 and 90 degrees clockwise of n0.
				p.Add2(pivot.Add(n0).Add(m1.Mul(tpo8)), pivot.Add(m0))
				s = m0
			}
		} else {
			// More than 90 degrees: the first two 45-degree segments are
			// always needed.
			pm1, n0t := pivot.Add(m1), n0.Mul(tpo8)
			p.Add2(pivot.Add(n0).Add(m1.Mul(tpo8)), pivot.Add(m0))
			p.Add2(pm1.Add(n0t), pm1)
			if m0.Dot(n1) >= 0 {
				// n1 is between 90 and 135 degrees clockwise of n0.
				s = m1
			} else {
				// n1 is between 135 and 180 degrees clockwise of n0.
				p.Add2(pm1.Sub(n0t), pivot.Add(m2))
				s = m2
			}
		}
	} else {
		// n1 is counter-clockwise of n0; this mirrors the branch above.
		if n0.Dot(n1) >= 0 {
			if m0.Dot(n1) >= 0 {
				// n1 is between 0 and 45 degrees counter-clockwise of n0.
				s = n0
			} else {
				// n1 is between 45 and 90 degrees counter-clockwise of n0.
				p.Add2(pivot.Add(n0).Sub(m1.Mul(tpo8)), pivot.Sub(m2))
				s = m2.Neg()
			}
		} else {
			pm1, n0t := pivot.Sub(m1), n0.Mul(tpo8)
			p.Add2(pivot.Add(n0).Sub(m1.Mul(tpo8)), pivot.Sub(m2))
			p.Add2(pm1.Add(n0t), pm1)
			if m2.Dot(n1) <= 0 {
				// n1 is between 90 and 135 degrees counter-clockwise of n0.
				s = m1.Neg()
			} else {
				// n1 is between 135 and 180 degrees counter-clockwise of n0.
				p.Add2(pm1.Sub(n0t), pivot.Sub(m0))
				s = m0.Neg()
			}
		}
	}
	// The final quadratic segment has two endpoints s and n1 and the middle
	// control point is a multiple of s.Add(n1), i.e. it is on the angle bisector
	// of those two points. The multiple ranges between 128/256 and 150/256 as
	// the angle between s and n1 ranges between 0 and 45 degrees.
	// When the angle is 0 degrees (i.e. s and n1 are coincident) then s.Add(n1)
	// is twice s and so the middle control point of the degenerate quadratic
	// segment should be half s.Add(n1), and half = 128/256.
	// When the angle is 45 degrees then 150/256 is the ratio of the lengths of
	// the two vectors {1, tan(π/8)} and {1 + 1/√2, 1/√2}.
	// d is the normalized dot product between s and n1. Since the angle ranges
	// between 0 and 45 degrees then d ranges between 256/256 and 181/256.
	d := 256 * s.Dot(n1) / r2
	// Linear interpolation: d == 256 gives 128 and d == 181 gives 150.
	multiple := Fix32(150 - 22*(d-181)/(256-181))
	p.Add2(pivot.Add(s.Add(n1).Mul(multiple)), pivot.Add(n1))
}

// midpoint returns the midpoint of two Points.
func midpoint(a, b Point) Point {
	return Point{(a.X + b.X) / 2, (a.Y + b.Y) / 2}
}

// angleGreaterThan45 returns whether the angle between two vectors is more
// than 45 degrees.
func angleGreaterThan45(v0, v1 Point) bool {
	// v is v0 rotated 45 degrees counter-clockwise and v.Rot90CW() is v0
	// rotated 45 degrees clockwise; v1 must have a non-negative dot product
	// with both for false to be returned.
	v := v0.Rot45CCW()
	return v.Dot(v1) < 0 || v.Rot90CW().Dot(v1) < 0
}

// interpolate returns the point (1-t)*a + t*b.
// t is a 48.16 fixed point number, so 65536 represents one.
func interpolate(a, b Point, t Fix64) Point {
	s := 65536 - t
	x := s*Fix64(a.X) + t*Fix64(b.X)
	y := s*Fix64(a.Y) + t*Fix64(b.Y)
	return Point{Fix32(x >> 16), Fix32(y >> 16)}
}

// curviest2 returns the value of t for which the quadratic parametric curve
// (1-t)²*a + 2*t*(1-t).b + t²*c has maximum curvature.
//
// The curvature of the parametric curve f(t) = (x(t), y(t)) is
// |x′y″-y′x″| / (x′²+y′²)^(3/2).
//
// Let d = b-a and e = c-2*b+a, so that f′(t) = 2*d+2*e*t and f″(t) = 2*e.
// The curvature's numerator is (2*dx+2*ex*t)*(2*ey)-(2*dy+2*ey*t)*(2*ex), // which simplifies to 4*dx*ey-4*dy*ex, which is constant with respect to t. // // Thus, curvature is extreme where the denominator is extreme, i.e. where // (x′²+y′²) is extreme. The first order condition is that // 2*x′*x″+2*y′*y″ = 0, or (dx+ex*t)*ex + (dy+ey*t)*ey = 0. // Solving for t gives t = -(dx*ex+dy*ey) / (ex*ex+ey*ey). func curviest2(a, b, c Point) Fix64 { dx := int64(b.X - a.X) dy := int64(b.Y - a.Y) ex := int64(c.X - 2*b.X + a.X) ey := int64(c.Y - 2*b.Y + a.Y) if ex == 0 && ey == 0 { return 32768 } return Fix64(-65536 * (dx*ex + dy*ey) / (ex*ex + ey*ey)) } // A stroker holds state for stroking a path. type stroker struct { // p is the destination that records the stroked path. p Adder // u is the half-width of the stroke. u Fix32 // cr and jr specify how to end and connect path segments. cr Capper jr Joiner // r is the reverse path. Stroking a path involves constructing two // parallel paths 2*u apart. The first path is added immediately to p, // the second path is accumulated in r and eventually added in reverse. r Path // a is the most recent segment point. anorm is the segment normal of // length u at that point. a, anorm Point } // addNonCurvy2 adds a quadratic segment to the stroker, where the segment // defined by (k.a, b, c) achieves maximum curvature at either k.a or c. func (k *stroker) addNonCurvy2(b, c Point) { // We repeatedly divide the segment at its middle until it is straight // enough to approximate the stroke by just translating the control points. // ds and ps are stacks of depths and points. t is the top of the stack. const maxDepth = 5 var ( ds [maxDepth + 1]int ps [2*maxDepth + 3]Point t int ) // Initially the ps stack has one quadratic segment of depth zero. 
ds[0] = 0 ps[2] = k.a ps[1] = b ps[0] = c anorm := k.anorm var cnorm Point for { depth := ds[t] a := ps[2*t+2] b := ps[2*t+1] c := ps[2*t+0] ab := b.Sub(a) bc := c.Sub(b) abIsSmall := ab.Dot(ab) < Fix64(1<<16) bcIsSmall := bc.Dot(bc) < Fix64(1<<16) if abIsSmall && bcIsSmall { // Approximate the segment by a circular arc. cnorm = bc.Norm(k.u).Rot90CCW() mac := midpoint(a, c) addArc(k.p, mac, anorm, cnorm) addArc(&k.r, mac, anorm.Neg(), cnorm.Neg()) } else if depth < maxDepth && angleGreaterThan45(ab, bc) { // Divide the segment in two and push both halves on the stack. mab := midpoint(a, b) mbc := midpoint(b, c) t++ ds[t+0] = depth + 1 ds[t-1] = depth + 1 ps[2*t+2] = a ps[2*t+1] = mab ps[2*t+0] = midpoint(mab, mbc) ps[2*t-1] = mbc continue } else { // Translate the control points. bnorm := c.Sub(a).Norm(k.u).Rot90CCW() cnorm = bc.Norm(k.u).Rot90CCW() k.p.Add2(b.Add(bnorm), c.Add(cnorm)) k.r.Add2(b.Sub(bnorm), c.Sub(cnorm)) } if t == 0 { k.a, k.anorm = c, cnorm return } t-- anorm = cnorm } panic("unreachable") } // Add1 adds a linear segment to the stroker. func (k *stroker) Add1(b Point) { bnorm := b.Sub(k.a).Norm(k.u).Rot90CCW() if len(k.r) == 0 { k.p.Start(k.a.Add(bnorm)) k.r.Start(k.a.Sub(bnorm)) } else { k.jr.Join(k.p, &k.r, k.u, k.a, k.anorm, bnorm) } k.p.Add1(b.Add(bnorm)) k.r.Add1(b.Sub(bnorm)) k.a, k.anorm = b, bnorm } // Add2 adds a quadratic segment to the stroker. func (k *stroker) Add2(b, c Point) { ab := b.Sub(k.a) bc := c.Sub(b) abnorm := ab.Norm(k.u).Rot90CCW() if len(k.r) == 0 { k.p.Start(k.a.Add(abnorm)) k.r.Start(k.a.Sub(abnorm)) } else { k.jr.Join(k.p, &k.r, k.u, k.a, k.anorm, abnorm) } // Approximate nearly-degenerate quadratics by linear segments. abIsSmall := ab.Dot(ab) < epsilon bcIsSmall := bc.Dot(bc) < epsilon if abIsSmall || bcIsSmall { acnorm := c.Sub(k.a).Norm(k.u).Rot90CCW() k.p.Add1(c.Add(acnorm)) k.r.Add1(c.Sub(acnorm)) k.a, k.anorm = c, acnorm return } // The quadratic segment (k.a, b, c) has a point of maximum curvature. 
// If this occurs at an end point, we process the segment as a whole. t := curviest2(k.a, b, c) if t <= 0 || t >= 65536 { k.addNonCurvy2(b, c) return } // Otherwise, we perform a de Casteljau decomposition at the point of // maximum curvature and process the two straighter parts. mab := interpolate(k.a, b, t) mbc := interpolate(b, c, t) mabc := interpolate(mab, mbc, t) // If the vectors ab and bc are close to being in opposite directions, // then the decomposition can become unstable, so we approximate the // quadratic segment by two linear segments joined by an arc. bcnorm := bc.Norm(k.u).Rot90CCW() if abnorm.Dot(bcnorm) < -Fix64(k.u)*Fix64(k.u)*2047/2048 { pArc := abnorm.Dot(bc) < 0 k.p.Add1(mabc.Add(abnorm)) if pArc { z := abnorm.Rot90CW() addArc(k.p, mabc, abnorm, z) addArc(k.p, mabc, z, bcnorm) } k.p.Add1(mabc.Add(bcnorm)) k.p.Add1(c.Add(bcnorm)) k.r.Add1(mabc.Sub(abnorm)) if !pArc { z := abnorm.Rot90CW() addArc(&k.r, mabc, abnorm.Neg(), z) addArc(&k.r, mabc, z, bcnorm.Neg()) } k.r.Add1(mabc.Sub(bcnorm)) k.r.Add1(c.Sub(bcnorm)) k.a, k.anorm = c, bcnorm return } // Process the decomposed parts. k.addNonCurvy2(mab, mabc) k.addNonCurvy2(mbc, c) } // Add3 adds a cubic segment to the stroker. func (k *stroker) Add3(b, c, d Point) { panic("freetype/raster: stroke unimplemented for cubic segments") } // stroke adds the stroked Path q to p, where q consists of exactly one curve. func (k *stroker) stroke(q Path) { // Stroking is implemented by deriving two paths each k.u apart from q. // The left-hand-side path is added immediately to k.p; the right-hand-side // path is accumulated in k.r. Once we've finished adding the LHS to k.p, // we add the RHS in reverse order. 
k.r = Path(make([]Fix32, 0, len(q))) k.a = Point{q[1], q[2]} for i := 4; i < len(q); { switch q[i] { case 1: k.Add1(Point{q[i+1], q[i+2]}) i += 4 case 2: k.Add2(Point{q[i+1], q[i+2]}, Point{q[i+3], q[i+4]}) i += 6 case 3: k.Add3(Point{q[i+1], q[i+2]}, Point{q[i+3], q[i+4]}, Point{q[i+5], q[i+6]}) i += 8 default: panic("freetype/raster: bad path") } } if len(k.r) == 0 { return } // TODO(nigeltao): if q is a closed curve then we should join the first and // last segments instead of capping them. k.cr.Cap(k.p, k.u, q.lastPoint(), k.anorm.Neg()) addPathReversed(k.p, k.r) pivot := q.firstPoint() k.cr.Cap(k.p, k.u, pivot, pivot.Sub(Point{k.r[1], k.r[2]})) } // Stroke adds q stroked with the given width to p. The result is typically // self-intersecting and should be rasterized with UseNonZeroWinding. // cr and jr may be nil, which defaults to a RoundCapper or RoundJoiner. func Stroke(p Adder, q Path, width Fix32, cr Capper, jr Joiner) { if len(q) == 0 { return } if cr == nil { cr = RoundCapper } if jr == nil { jr = RoundJoiner } if q[0] != 0 { panic("freetype/raster: bad path") } s := stroker{p: p, u: width / 2, cr: cr, jr: jr} i := 0 for j := 4; j < len(q); { switch q[j] { case 0: s.stroke(q[i:j]) i, j = j, j+4 case 1: j += 4 case 2: j += 6 case 3: j += 8 default: panic("freetype/raster: bad path") } } s.stroke(q[i:]) } 3-3.11.1/go.mod000066400000000000000000000000451503346766200130600ustar00rootroot00000000000000module github.com/mumax/3 go 1.22.4 3-3.11.1/gui/000077500000000000000000000000001503346766200125375ustar00rootroot000000000000003-3.11.1/gui/Makefile000066400000000000000000000000211503346766200141700ustar00rootroot00000000000000all: go install 3-3.11.1/gui/button.go000066400000000000000000000006511503346766200144030ustar00rootroot00000000000000package gui import "fmt" type button struct { data } func (e *button) update(id string) []jsCall { return []jsCall{{F: "setAttr", Args: []interface{}{id, "innerHTML", e.value()}}} } func (d *Page) Button(id string, 
value interface{}, extra ...string) string { e := &button{data: data{value}} d.addElem(id, e) return fmt.Sprintf(``, id, id) } 3-3.11.1/gui/checkbox.go000066400000000000000000000007031503346766200146540ustar00rootroot00000000000000package gui import "fmt" type checkbox struct { data } func (e *checkbox) update(id string) []jsCall { return []jsCall{{F: "setAttr", Args: []interface{}{id, "checked", e.value()}}} } func (d *Page) Checkbox(id, text string, value bool, extra ...string) string { e := &checkbox{data: data{value}} d.addElem(id, e) return fmt.Sprintf(`%v`, id, id, text) } 3-3.11.1/gui/clibox.go000066400000000000000000000007231503346766200143500ustar00rootroot00000000000000package gui import "fmt" type clibox struct { data } func (e *clibox) update(id string) []jsCall { return []jsCall{} // We never set the value of the CLI box, only the user does } // Command-line interface textbox where user types commands. func (d *Page) CliBox(id string, value interface{}, extra ...string) string { e := &clibox{data: data{value}} d.addElem(id, e) return fmt.Sprintf(``, "text", id, cat(extra)) } 3-3.11.1/gui/console.go000066400000000000000000000006711503346766200145340ustar00rootroot00000000000000package gui import "fmt" type console struct { data } func (e *console) update(id string) []jsCall { return []jsCall{{F: "setConsoleText", Args: []interface{}{e.value()}}} } func (d *Page) Console(id string, rows, cols int, value interface{}, extra ...string) string { e := &console{data: data{value}} d.addElem(id, e) return fmt.Sprintf(``, id, rows, cols, cat(extra)) } 3-3.11.1/gui/data.go000066400000000000000000000002311503346766200137730ustar00rootroot00000000000000package gui type data struct { val interface{} } func (d *data) set(v interface{}) { d.val = v } func (d *data) value() interface{} { return d.val } 3-3.11.1/gui/datamodels.go000066400000000000000000000020361503346766200152040ustar00rootroot00000000000000package gui import ( "fmt" "log" "strconv" ) type interfaceData 
struct { v interface{} } func (d *interfaceData) setValue(v interface{}) { d.v = v } func (d *interfaceData) value() interface{} { return d.v } type boolData struct{ interfaceData } func (d *boolData) setValue(v interface{}) { d.v = v.(bool) } func BoolData(v bool) *boolData { return &boolData{interfaceData{v}} } type intData struct{ interfaceData } func IntData(v int) *intData { return &intData{interfaceData{v}} } func (d *intData) setValue(v interface{}) { switch v := v.(type) { case int: d.v = v default: i, err := strconv.Atoi(fmt.Sprint(v)) if err == nil { d.v = i } else { log.Println(err) } } } type floatData struct{ interfaceData } func FloatData(v float64) *floatData { return &floatData{interfaceData{v}} } func (d *floatData) setValue(v interface{}) { switch v := v.(type) { case float64: d.v = v default: i, err := strconv.ParseFloat(fmt.Sprint(v), 64) if err == nil { d.v = i } else { log.Println(err) } } } 3-3.11.1/gui/el.go000066400000000000000000000044611503346766200134730ustar00rootroot00000000000000package gui import "sync" // wraps a GUI element (button, textbox, ...), // stores the dirty flag, extra attributes, lock event handler, ... type E struct { _m sync.Mutex _dirty bool // dirty means the value/attributes need updating in browser _attr map[string]interface{} // extra html attributes (e.g. style, onclick, ...) _elem El // the wrapped gui element onevent func() // called upon value change by user (not by Go code) } func newE(elem El) *E { return &E{_elem: elem, _dirty: true} } // atomically pass a new value to the underlying element and mark it dirty. 
func (e *E) set(v interface{}) { e._m.Lock() defer e._m.Unlock() old := e._elem.value() // carefully check if value changed, set/value may do things behind the screens e._elem.set(v) if e._elem.value() != old { e._dirty = true } } // atomically set an html attribute for the underlying element and mark it dirty func (e *E) attr(key string, v interface{}) { e._m.Lock() defer e._m.Unlock() if e._attr == nil { e._attr = make(map[string]interface{}) } old := e._attr[key] if v != old { e._dirty = true } e._attr[key] = v } // atomically produce a list of javascript calls needed to update the element in the browser, // and clear dirty flag func (e *E) update(id string) []jsCall { e._m.Lock() defer e._m.Unlock() if !e._dirty { return []jsCall{} } upd := e._elem.update(id) for k, v := range e._attr { upd = append(upd, jsCall{F: "setAttr", Args: []interface{}{id, k, v}}) } e._dirty = false return upd } // atomically returns the underlying element's value // depending its implementation (e.g. textBox's text, checkBox's checked value, etc.) func (e *E) value() interface{} { e._m.Lock() defer e._m.Unlock() return e._elem.value() } // atomically set the dirty flag w/o changing value. // called, e.g., when a second browser window opens func (e *E) setDirty() { e._m.Lock() defer e._m.Unlock() e._dirty = true } // Atomically set a new onevent function, which is called each time // the user changes the underlying elements value. func (e *E) OnEvent(f func()) { e._m.Lock() defer e._m.Unlock() e.onevent = f } // Underlying html element like Span, TextBox, etc. 
type El interface { update(id string) []jsCall set(v interface{}) value() interface{} } 3-3.11.1/gui/element.go000066400000000000000000000006151503346766200145210ustar00rootroot00000000000000package gui import "fmt" type element struct { data } func (e *element) update(id string) []jsCall { return []jsCall{{F: "setAttr", Args: []interface{}{id, "innerHTML", e.value()}}} } func (d *Page) Element(id, typ, attr string, value interface{}, extra ...string) string { e := &element{data: data{value}} d.addElem(id, e) return fmt.Sprintf(` `, id, cat(extra)) } 3-3.11.1/gui/img.go000066400000000000000000000005451503346766200136460ustar00rootroot00000000000000package gui import "fmt" type img struct { data } func (e *img) update(id string) []jsCall { return []jsCall{{F: "setAttr", Args: []interface{}{id, "src", e.value()}}} } func (d *Page) Img(id string, value interface{}, extra ...string) string { e := &img{data: data{value}} d.addElem(id, e) return fmt.Sprintf(` `, id, cat(extra)) } 3-3.11.1/gui/js.go000066400000000000000000000105731503346766200135100ustar00rootroot00000000000000package gui // Javascript for the GUI page. 
const JS = `` 3-3.11.1/gui/meter.go000066400000000000000000000006021503346766200142000ustar00rootroot00000000000000package gui import "fmt" type meter struct { data } func (e *meter) update(id string) []jsCall { return []jsCall{{F: "setAttr", Args: []interface{}{id, "value", e.value()}}} } func (d *Page) Meter(id string, min, max, value int, extra ...string) string { e := &meter{data: data{value}} d.addElem(id, e) return fmt.Sprintf(``, id, min, max) } 3-3.11.1/gui/number.go000066400000000000000000000010241503346766200143530ustar00rootroot00000000000000package gui import "fmt" type number struct { data } func (e *number) update(id string) []jsCall { return []jsCall{{F: "setAttr", Args: []interface{}{id, "value", e.value()}}} } func (d *Page) Number(id string, min, max, value int, extra ...string) string { e := &number{data: data{value}} d.addElem(id, e) return fmt.Sprintf(``, "number", id, id, id, id, min, max, cat(extra)) } 3-3.11.1/gui/page.go000066400000000000000000000124431503346766200140060ustar00rootroot00000000000000package gui import ( "bytes" "encoding/json" "fmt" "log" "net/http" "sync" "text/template" ) var Debug = false // Page holds the state to serve a single GUI page to the browser type Page struct { elems map[string]*E htmlCache []byte // static html content, rendered only once haveJS bool // have called JS()? data interface{} // any additional data to be passed to template onUpdate func() onAnyEvent func() httpLock sync.Mutex lastPageID string } // NewPage constructs a Page based on an HTML template containing // element tags like {{.Button}}, {{.Textbox}}, etc. data is fed // to the template as additional arbitrary data, available as {{.Data}}. 
func NewPage(htmlTemplate string, data interface{}) *Page { d := &Page{elems: make(map[string]*E), data: data} // exec template (once) t := template.Must(template.New("").Parse(htmlTemplate)) cache := bytes.NewBuffer(nil) check(t.Execute(cache, d)) d.htmlCache = cache.Bytes() // check if template contains {{.JS}} if !d.haveJS { log.Panic("template should call {{.JS}}") } return d } // Value returns the value of the HTML element with given id. // E.g.: the text in a textbox, the checked value of a checkbox, etc. func (d *Page) Value(id string) interface{} { return d.elem(id).value() } // StringValue is like Value but returns the value as string, // converting if necessary. func (d *Page) StringValue(id string) string { v := d.Value(id) if s, ok := v.(string); ok { return s } else { return fmt.Sprint(v) } } func (d *Page) Set(id string, v interface{}) { d.elem(id).set(v) } func (d *Page) Attr(id string, k string, v interface{}) { d.elem(id).attr(k, v) } // OnEvent sets a handler to be called when an event happens // to the HTML element with given id. The event depends on the // element type: click for Button, change for TextBox, etc... func (d *Page) OnEvent(id string, handler func()) { d.elem(id).onevent = handler } // OnEvent sets a handler to be called when an event happens // to any of the page's HTML elements. func (d *Page) OnAnyEvent(handler func()) { d.onAnyEvent = handler } // Set func to be executed each time javascript polls for updates func (d *Page) OnUpdate(f func()) { d.onUpdate = f } // {{.JS}} should always be embedded in the template . // Expands to needed JavaScript code. func (d *Page) JS() string { d.haveJS = true return JS } // {{.ErrorBox}} should be embedded in the template where errors are to be shown. // CSS rules for class ErrorBox may be set, e.g., to render errors in red. 
func (d *Page) ErrorBox() string {
	return ` `
}

// UpdateButton is the template hook {{.UpdateButton}} for a page Update
// button. An empty text is replaced by a default symbol.
func (d *Page) UpdateButton(text string) string {
	if text == "" {
		text = `↻`
	}
	return ``
}

// UpdateBox is the template hook {{.UpdateBox}} for an auto update
// checkbox. An empty text is replaced by "auto update".
func (d *Page) UpdateBox(text string) string {
	if text == "" {
		text = "auto update"
	}
	return `` + text + ``
}

// Data is the template hook {{.Data}}: it hands back the extra data that
// was given to NewPage.
func (d *Page) Data() interface{} {
	return d.data
}

// elem looks up the element registered under id and panics when there
// is none.
func (d *Page) elem(id string) *E {
	e, found := d.elems[id]
	if !found {
		panic("no element with id: " + id)
	}
	return e
}

// addElem registers e under id, wrapped in a new E. It panics when id is
// already taken.
func (d *Page) addElem(id string, e El) {
	if _, taken := d.elems[id]; taken {
		panic("addElem: already defined: " + id)
	}
	d.elems[id] = newE(e)
}

// ServeHTTP implements http.Handler. Requests are handled one at a time
// under httpLock: GET serves the page content, POST serves element
// updates and PUT delivers an event. Any other method is answered with
// a 403 error.
func (d *Page) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	d.httpLock.Lock()
	defer d.httpLock.Unlock()

	switch r.Method {
	case http.MethodGet:
		d.serveContent(w, r)
	case http.MethodPost:
		d.serveUpdate(w, r)
	case http.MethodPut:
		d.serveEvent(w, r)
	default:
		http.Error(w, "not allowed: "+r.Method+" "+r.URL.Path, http.StatusForbidden)
	}
}

// serves the html content.
func (d *Page) serveContent(w http.ResponseWriter, r *http.Request) { w.Write(d.htmlCache) } // HTTP handler for event notifications by button clicks etc func (d *Page) serveEvent(w http.ResponseWriter, r *http.Request) { var ev event check(json.NewDecoder(r.Body).Decode(&ev)) if Debug { fmt.Println(ev) } if d.onAnyEvent != nil { d.onAnyEvent() } el := d.elem(ev.ID) el.set(ev.Arg) if el.onevent != nil { el.onevent() } } type event struct { ID string Arg interface{} } // HTTP handler for updating the dynamic elements func (d *Page) serveUpdate(w http.ResponseWriter, r *http.Request) { if d.onUpdate != nil { d.onUpdate() } // read page ID from body buf := make([]byte, 100) r.Body.Read(buf) pageID := string(buf) if pageID != d.lastPageID { for _, e := range d.elems { e.setDirty() } d.lastPageID = pageID } calls := make([]jsCall, 0, len(d.elems)) for id, e := range d.elems { calls = append(calls, e.update(id)...) // update atomically checks dirty and clears it } if Debug && len(calls) != 0 { fmt.Println(calls) // debug } check(json.NewEncoder(w).Encode(calls)) } // javascript call type jsCall struct { F string // function to call Args []interface{} // function arguments } func check(err error) { if err != nil { log.Panic(err) } } 3-3.11.1/gui/page_test.go000066400000000000000000000001031503346766200150330ustar00rootroot00000000000000package gui import "testing" func TestNewPage(t *testing.T) { } 3-3.11.1/gui/progress.go000066400000000000000000000006031503346766200147310ustar00rootroot00000000000000package gui import "fmt" type progress struct { data } func (e *progress) update(id string) []jsCall { return []jsCall{{F: "setAttr", Args: []interface{}{id, "value", e.value()}}} } func (d *Page) Progress(id string, max, value int, extra ...string) string { e := &progress{data: data{value}} d.addElem(id, e) return fmt.Sprintf(``, id, max) } 3-3.11.1/gui/range.go000066400000000000000000000007221503346766200141630ustar00rootroot00000000000000package gui import "fmt" type 
slider struct { data } func (e *slider) update(id string) []jsCall { return []jsCall{{F: "setAttr", Args: []interface{}{id, "value", e.value()}}} } func (d *Page) Range(id string, min, max, value int, extra ...string) string { e := &slider{data: data{value}} d.addElem(id, e) return fmt.Sprintf(``, id, min, max, id) } 3-3.11.1/gui/select.go000066400000000000000000000012751503346766200143520ustar00rootroot00000000000000package gui import "fmt" type sel struct { data } func (e *sel) update(id string) []jsCall { return []jsCall{{F: "setSelect", Args: []interface{}{id, e.value()}}} } func (d *Page) SelectArray(id string, value string, options []string) string { return d.Select(id, value, options...) } func (d *Page) Select(id string, value string, options ...string) string { e := &sel{data: data{value}} d.addElem(id, e) html := fmt.Sprintf(`` return html } 3-3.11.1/gui/span.go000066400000000000000000000006741503346766200140360ustar00rootroot00000000000000package gui import "fmt" type span struct { data } func (e *span) update(id string) []jsCall { return []jsCall{{F: "setAttr", Args: []interface{}{id, "innerHTML", e.value()}}} } // {{.Span id value}} adds a piece of text ("label") to the document. 
func (d *Page) Span(id string, value interface{}, extra ...string) string { e := &span{data: data{value}} d.addElem(id, e) return fmt.Sprintf(` `, id, cat(extra)) } 3-3.11.1/gui/textbox.go000066400000000000000000000010341503346766200145610ustar00rootroot00000000000000package gui import "fmt" type textbox struct { data } func (e *textbox) update(id string) []jsCall { return []jsCall{{F: "setTextbox", Args: []interface{}{id, e.value()}}} } func (d *Page) TextBox(id string, value interface{}, extra ...string) string { e := &textbox{data: data{value}} d.addElem(id, e) return fmt.Sprintf(``, "text", id, id, id, id, id, cat(extra)) } 3-3.11.1/gui/util.go000066400000000000000000000002071503346766200140420ustar00rootroot00000000000000package gui // concatenate elements func cat(s []string) string { str := "" for _, s := range s { str += s + " " } return str } 3-3.11.1/httpfs/000077500000000000000000000000001503346766200132635ustar00rootroot000000000000003-3.11.1/httpfs/Makefile000066400000000000000000000000211503346766200147140ustar00rootroot00000000000000all: go install 3-3.11.1/httpfs/client.go000066400000000000000000000101311503346766200150640ustar00rootroot00000000000000package httpfs // client-side API import ( "bytes" "encoding/json" "errors" "fmt" "io" "net/http" "net/url" "path" "strings" ) var wd = "" // working directory, see SetWD // SetWD sets a "working directory" for the client side, // prefixed to all relative local paths passed to client functions (Mkdir, Touch, Remove, ...). // dir may start with "http://", turning local relative client paths into remote paths. // E.g.: // // http://path -> http://path // path/file -> wd/path/file // /path/file -> /path/file func SetWD(dir string) { if dir != "" && !strings.HasSuffix(dir, "/") { dir = dir + "/" } wd = dir } // Mkdir creates a directory at specified URL. 
func Mkdir(URL string) error { URL = addWorkDir(URL) if isRemote(URL) { return httpMkdir(URL) } else { return localMkdir(URL) } } // Touch creates an empty file at the specified URL. func Touch(URL string) error { URL = addWorkDir(URL) if isRemote(URL) { return httpTouch(URL) } else { return localTouch(URL) } } // ReadDir reads and returns all file names in the directory at URL. func ReadDir(URL string) ([]string, error) { URL = addWorkDir(URL) if isRemote(URL) { return httpLs(URL) } else { return localLs(URL) } } // Remove removes the file or directory at URL, and all children it may contain. // Similar to os.RemoveAll. func Remove(URL string) error { URL = addWorkDir(URL) if isRemote(URL) { return httpRemove(URL) } else { return localRemove(URL) } } // Read the entire file and return its contents. func Read(URL string) ([]byte, error) { URL = addWorkDir(URL) if isRemote(URL) { return httpRead(URL) } else { return localRead(URL) } } // Append p to the file given by URL, // but first assure that the file had the expected size. // Used to avoid accidental concurrent writes by two processes to the same file. // Size < 0 disables size check. func AppendSize(URL string, p []byte, size int64) error { URL = addWorkDir(URL) if isRemote(URL) { return httpAppend(URL, p, size) } else { return localAppend(URL, p, size) } } // Append p to the file given by URL. func Append(URL string, p []byte) error { return AppendSize(URL, p, -1) } // Create file given by URL and put data from p there. 
func Put(URL string, p []byte) error { URL = addWorkDir(URL) if isRemote(URL) { return httpPut(URL, p) } else { return localPut(URL, p) } } func isRemote(URL string) bool { return strings.HasPrefix(URL, "http://") } // prefix wd to URL if URL is a relative file path // does not start with "/", "http://" func addWorkDir(URL string) string { if isRemote(URL) { return URL } if !path.IsAbs(URL) { return wd + URL } return URL } func httpMkdir(URL string) error { _, err := do(MKDIR, URL, nil, nil) return err } func httpTouch(URL string) error { _, err := do(TOUCH, URL, nil, nil) return err } func httpLs(URL string) (ls []string, err error) { r, errHTTP := do(LS, URL, nil, nil) if errHTTP != nil { return nil, errHTTP } errJSON := json.Unmarshal(r, &ls) if errJSON != nil { return nil, mkErr(LS, URL, errJSON) } return ls, nil } func httpAppend(URL string, data []byte, size int64) error { var query map[string][]string if size >= 0 { query = map[string][]string{"size": {fmt.Sprint(size)}} } _, err := do(APPEND, URL, data, query) return err } func httpPut(URL string, data []byte) error { _, err := do(PUT, URL, data, nil) return err } func httpRead(URL string) ([]byte, error) { return do(READ, URL, nil, nil) } func httpRemove(URL string) error { _, err := do(RM, URL, nil, nil) return err } // do a http request. 
func do(a action, URL string, body []byte, query url.Values) (resp []byte, err error) { u, err := url.Parse(URL) u.Path = string(a) + path.Clean("/"+u.Path) u.RawQuery = query.Encode() response, errR := http.Post(u.String(), "data", bytes.NewReader(body)) if errR != nil { return nil, mkErr(a, URL, errR) } defer response.Body.Close() if response.StatusCode != http.StatusOK { return nil, errors.New("do " + u.String() + ":" + response.Status + ":" + readBody(response.Body)) } resp, err = io.ReadAll(response.Body) err = mkErr(a, URL, err) return } 3-3.11.1/httpfs/httpfs.go000066400000000000000000000044241503346766200151260ustar00rootroot00000000000000/* Package httpfs provides a (userspace) file system API over http. httpfs is used by mumax3-server to proved file system access to the compute nodes. The API is similar to go's os package, but both local file names and URLs may be passed. When the file "name" starts with "http://", it is treated as a remote file, otherwise it is local. Hence, the same API is used for local and remote file access. 
*/ package httpfs import ( "fmt" "io" "log" "os" "path" ) var Logging = false // enables logging const ( DirPerm = 0777 // permissions for new directory FilePerm = 0666 // permissions for new files ) func readBody(r io.ReadCloser) string { defer r.Close() b, err := io.ReadAll(r) if err != nil { log.Println("readbody:", err) return "" } return string(b) } func mkErr(a action, URL string, err error) error { if err == nil { return nil } else { return fmt.Errorf("httpfs %v %v: %v", a, URL, err) } } func localMkdir(fname string) error { return os.Mkdir(fname, DirPerm) } func localTouch(fname string) error { f, err := os.OpenFile(fname, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0666) if err == nil { f.Close() } return err } func localLs(fname string) ([]string, error) { f, err := os.Open(fname) if err != nil { return nil, err } defer f.Close() ls, err2 := f.Readdirnames(-1) if err2 != nil { return nil, err2 } return ls, nil } func localAppend(fname string, data []byte, size int64) error { f, err := os.OpenFile(fname, os.O_APPEND|os.O_WRONLY, FilePerm) if err != nil { return err } defer f.Close() if size >= 0 { fi, errFi := f.Stat() if errFi != nil { return errFi } if size != fi.Size() { return fmt.Errorf(`httpfs: file size mismatch, possible concurrent access. size=%v B, expected=%v B`, fi.Size(), size) } } _, err2 := f.Write(data) return err2 } func localPut(fname string, data []byte) error { _ = os.MkdirAll(path.Dir(fname), DirPerm) f, err := os.OpenFile(fname, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, FilePerm) if err != nil { return err } defer f.Close() _, err2 := f.Write(data) return err2 } func localRead(fname string) ([]byte, error) { return os.ReadFile(fname) } func localRemove(fname string) error { return os.RemoveAll(fname) } func Log(msg ...interface{}) { if Logging { log.Println(msg...) 
} } 3-3.11.1/httpfs/httpfs_test.go000066400000000000000000000130321503346766200161600ustar00rootroot00000000000000package httpfs import ( "fmt" "net" "net/http" "testing" ) // leaving this many files open is supposed to trigger os error. const MANYFILES = 1025 // start local httpfs server, and use http://address/ as WD func init() { l, err := net.Listen("tcp", ":12345") if err != nil { panic(err) } addr := "http://" + l.Addr().String() SetWD(addr) RegisterHandlers() fmt.Println("serving httpfs:", addr) go func() { if err := http.Serve(l, nil); err != nil { panic(err) } }() } func TestMkdirRemove(t *testing.T) { Remove("testdata") defer Remove("testdata") mustPass(t, Mkdir("testdata")) mustFail(t, Mkdir("testdata")) // test for closing files (internally) for i := 0; i < MANYFILES; i++ { mustPass(t, Remove("testdata")) mustPass(t, Mkdir("testdata")) } } func TestMkdir(t *testing.T) { Remove("testdata") defer Remove("testdata") mustFail(t, Mkdir("testdata/bla/bla")) mustPass(t, Mkdir("testdata/")) mustPass(t, Mkdir("testdata/bla")) mustPass(t, Mkdir("testdata/bla/bla")) } func TestTouch(t *testing.T) { Remove("testdata") defer Remove("testdata") mustFail(t, Touch("testdata/file")) mustPass(t, Mkdir("testdata/")) mustPass(t, Touch("testdata/file")) // test for closing files (internally) for i := 0; i < MANYFILES; i++ { mustPass(t, Touch("testdata/file")) } } func TestReaddir(t *testing.T) { Remove("testdata") defer Remove("testdata") s := func(s []string, e error) error { return e } mustFail(t, s(ReadDir("testdata"))) // test for closing files (internally) for i := 0; i < MANYFILES; i++ { mustFail(t, s(ReadDir("testdata"))) } mustPass(t, Mkdir("testdata/")) mustPass(t, Touch("testdata/file1")) mustPass(t, Touch("testdata/file2")) mustPass(t, Touch("testdata/file3")) ls, err := ReadDir("testdata") if err != nil { t.Error(err) } if len(ls) != 3 { t.Fail() } // test for closing files (internally) for i := 0; i < MANYFILES; i++ { mustPass(t, s(ReadDir("testdata"))) } } 
func TestRemove(t *testing.T) { Remove("testdata") defer Remove("testdata") mustPass(t, Remove("testdata")) // test for closing files (internally) for i := 0; i < MANYFILES; i++ { mustPass(t, Remove("testdata")) } } func TestAppendRead(t *testing.T) { Remove("testdata") defer Remove("testdata") mustPass(t, Mkdir("testdata")) data := []byte("hello httpfs\n") mustFail(t, Append("testdata/file", data)) // file does not exist yet mustPass(t, Touch("testdata/file")) for i := 0; i < MANYFILES; i++ { mustPass(t, Append("testdata/file", data)) } b, errR := Read("testdata/file") if errR != nil { t.Error(errR) } if len(b) != (MANYFILES)*len(data) { t.Error(len(b), (MANYFILES+1)*len(data)) } } func TestConcurrentWrite(t *testing.T) { Remove("testdata") defer Remove("testdata") mustPass(t, Mkdir("testdata")) mustPass(t, Touch("testdata/file")) f1 := MustCreate("testdata/file") f2 := MustCreate("testdata/file") fmt.Fprintln(f1, "a") mustPass(t, f1.Flush()) fmt.Fprintln(f2, "a") mustFail(t, f2.Flush()) for i := 0; i < MANYFILES; i++ { fmt.Fprintln(f1, "a") mustPass(t, f1.Flush()) fmt.Fprintln(f2, "a") mustFail(t, f2.Flush()) } } func TestAppendSize(t *testing.T) { Remove("testdata") defer Remove("testdata") mustPass(t, Mkdir("testdata")) data := []byte("hello httpfs\n") mustFail(t, AppendSize("testdata/file", data, 0)) // file does not exist yet mustFail(t, AppendSize("testdata/file", data, 1)) // file does not exist yet mustPass(t, Touch("testdata/file")) for i := 0; i < MANYFILES; i++ { mustPass(t, AppendSize("testdata/file", data, int64(i)*int64(len(data)))) } b, errR := Read("testdata/file") if errR != nil { t.Error(errR) } if len(b) != (MANYFILES)*len(data) { t.Error(len(b), (MANYFILES+1)*len(data)) } } func TestAppendSizeBad(t *testing.T) { Remove("testdata") defer Remove("testdata") mustPass(t, Mkdir("testdata")) mustPass(t, Touch("testdata/file")) data := []byte("hello httpfs\n") for i := 0; i < MANYFILES; i++ { mustFail(t, AppendSize("testdata/file", data, 3)) // bad 
// mustPass aborts the running test if err is not nil.
func mustPass(t *testing.T, err error) {
	t.Helper() // report the caller's line, not this helper's
	if err != nil {
		t.Fatal(err)
	}
}

// mustFail aborts the running test if err is nil,
// i.e. if an operation that had to fail succeeded.
func mustFail(t *testing.T, err error) {
	t.Helper() // report the caller's line, not this helper's
	if err == nil {
		t.Fatal("did not get error")
	}
}
func Create(URL string) (WriteCloseFlusher, error) { _ = Remove(URL) err := Touch(URL) if err != nil { return nil, err } return &bufWriter{bufio.NewWriterSize(&appendWriter{URL, 0}, BUFSIZE)}, nil } func MustCreate(URL string) WriteCloseFlusher { f, err := Create(URL) if err != nil { panic(err) } return f } type WriteCloseFlusher interface { io.WriteCloser Flush() error } // open a file for reading func Open(URL string) (io.ReadCloser, error) { data, err := Read(URL) if err != nil { return nil, err } return io.NopCloser(bytes.NewReader(data)), nil } func MustOpen(URL string) io.ReadCloser { f, err := Open(URL) if err != nil { panic(err) } return f } type bufWriter struct { buf *bufio.Writer } func (w *bufWriter) Write(p []byte) (int, error) { return w.buf.Write(p) } func (w *bufWriter) Close() error { err := w.buf.Flush() w.buf = nil // Dangling pointer somewhere? if err != nil { return err } return nil } func (w *bufWriter) Flush() error { return w.buf.Flush() } type appendWriter struct { URL string byteCount int64 } func (w *appendWriter) Write(p []byte) (int, error) { err := AppendSize(w.URL, p, w.byteCount) if err != nil { return 0, err // don't know how many bytes written } w.byteCount += int64(len(p)) return len(p), nil } 3-3.11.1/httpfs/server.go000066400000000000000000000054421503346766200151250ustar00rootroot00000000000000package httpfs // server-side httpfs code import ( "encoding/json" "io" "net/http" "net/url" "strconv" ) // file action gets its own type to avoid mixing up with other strings type action string // httpfs actions, handled at /actionName/ (e.g. /ls/, /mkdir/, ...) const ( APPEND action = "append" LS action = "ls" MKDIR action = "mkdir" PUT action = "put" READ action = "read" RM action = "rm" TOUCH action = "touch" ) // RegisterHandlers sets up the http handlers needed for the httpfs protocol (calling go's http.Handle). // After RegisterHandlers, http.ListenAndServe may be called. 
func RegisterHandlers() { m := map[action]handlerFunc{ APPEND: handleAppend, LS: handleLs, MKDIR: handleMkdir, PUT: handlePut, READ: handleRead, RM: handleRemove, TOUCH: handleTouch, } for k, v := range m { http.HandleFunc("/"+string(k)+"/", newHandler(k, v)) } http.Handle("/fs/", http.StripPrefix("/fs/", http.FileServer(http.Dir(".")))) } // general handler func for file name, optional URL query, input data and response writer. type handlerFunc func(fname string, data []byte, w io.Writer, query url.Values) error func newHandler(prefix action, f handlerFunc) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { fname := r.URL.Path[len(prefix)+2:] // strip "/prefix/" query := r.URL.Query() data, err := io.ReadAll(r.Body) Log("httpfs req:", prefix, fname, query.Encode(), len(data), "B payload") if err != nil { Log("httpfs err:", prefix, fname, ":", err) http.Error(w, err.Error(), http.StatusBadRequest) } err2 := f(fname, data, w, query) if err2 != nil { Log("httpfs err:", prefix, fname, ":", err2) http.Error(w, err2.Error(), http.StatusInternalServerError) } } } func handleAppend(fname string, data []byte, w io.Writer, q url.Values) error { size := int64(-1) s := q.Get("size") if s != "" { var err error size, err = strconv.ParseInt(s, 0, 64) if err != nil { return err } } return localAppend(fname, data, size) } func handlePut(fname string, data []byte, w io.Writer, q url.Values) error { return localPut(fname, data) } func handleLs(fname string, data []byte, w io.Writer, q url.Values) error { ls, err := localLs(fname) if err != nil { return err } return json.NewEncoder(w).Encode(ls) } func handleMkdir(fname string, data []byte, w io.Writer, q url.Values) error { return localMkdir(fname) } func handleTouch(fname string, data []byte, w io.Writer, q url.Values) error { return localTouch(fname) } func handleRead(fname string, data []byte, w io.Writer, q url.Values) error { b, err := localRead(fname) if err != nil { return err } _, err2 := w.Write(b) 
// Physical constants in SI units.
//
// MuB and Kb are the CODATA 2006 recommended values, 927.400915(23)e-26 J/T
// and 1.3806504(24)e-23 J/K. The digits in parentheses are the standard
// uncertainty of the last digits and are not part of the value.
const (
	Mu0 = 4 * math.Pi * 1e-7 // Permeability of vacuum in Tm/A
	MuB = 9.27400915e-24     // Bohr magneton in J/T
	Kb  = 1.3806504e-23      // Boltzmann's constant in J/K
	Qe  = 1.60217646e-19     // Electron charge in C
)
defer func() { if err := recover(); err != nil { util.Log("//Unable to use kernel cache:", err) kernel = CalcDemagKernel(inputSize, pbc, cellsize, accuracy) } }() // Try to load kernel basename := fmt.Sprint(cacheDir, "/", "mumax3kernel_", inputSize, "_", pbc, "_", cellsize, "_", accuracy, "_") var errLoad error for i := 0; i < 3; i++ { for j := i; j < 3; j++ { if inputSize[Z] == 1 && ((i == X && j == Z) || (i == Y && j == Z)) { continue // element not needed in 2D } kernel[i][j], errLoad = LoadKernel(fmt.Sprint(basename, i, j, ".ovf")) if errLoad != nil { break } } if errLoad != nil { break } } // make result symmetric for tools that expect it so. kernel[Y][X] = kernel[X][Y] kernel[Z][X] = kernel[X][Z] kernel[Z][Y] = kernel[Y][Z] if errLoad != nil { util.Log("//Did not use cached kernel:", errLoad) } else { util.Log("//Using cached kernel:", basename) return kernel } // Could not load kernel: calculate it and save var errSave error kernel = CalcDemagKernel(inputSize, pbc, cellsize, accuracy) for i := 0; i < 3; i++ { for j := i; j < 3; j++ { if inputSize[Z] == 1 && ((i == X && j == Z) || (i == Y && j == Z)) { continue // element not needed in 2D } compName := fmt.Sprint("N_", i, j) info := data.Meta{Time: float64(0.0), Name: compName, Unit: "1", CellSize: cellsize, MeshUnit: "m"} errSave = SaveKernel(fmt.Sprint(basename, i, j, ".ovf"), kernel[i][j], info) if errSave != nil { break } } if errSave != nil { break } } if errSave != nil { util.Log("//Failed to cache kernel:", errSave) } else { util.Log("//Cached kernel:", basename) } return kernel } func LoadKernel(fname string) (kernel *data.Slice, err error) { kernel, _, err = oommf.ReadFile(fname) return } func SaveKernel(fname string, kernel *data.Slice, info data.Meta) error { f, err := os.OpenFile(fname, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666) if err != nil { return err } out := bufio.NewWriter(f) defer out.Flush() oommf.WriteOVF2(out, kernel, info, "binary 4") return nil } // Calculates the magnetostatic kernel 
by brute-force integration // of magnetic charges over the faces and averages over cell volumes. func CalcDemagKernel(inputSize, pbc [3]int, cellsize [3]float64, accuracy float64) (kernel [3][3]*data.Slice) { // Add zero-padding in non-PBC directions size := padSize(inputSize, pbc) // Sanity check { util.Assert(size[Z] > 0 && size[Y] > 0 && size[X] > 0) util.Assert(cellsize[X] > 0 && cellsize[Y] > 0 && cellsize[Z] > 0) util.Assert(pbc[X] >= 0 && pbc[Y] >= 0 && pbc[Z] >= 0) util.Assert(accuracy > 0) } // Allocate only upper diagonal part. The rest is symmetric due to reciprocity. var array [3][3][][][]float32 for i := 0; i < 3; i++ { for j := i; j < 3; j++ { kernel[i][j] = data.NewSlice(1, size) array[i][j] = kernel[i][j].Scalars() } } // Field (destination) loop ranges r1, r2 := kernelRanges(size, pbc) // smallest cell dimension is our typical length scale L := cellsize[X] { if cellsize[Y] < L { L = cellsize[Y] } if cellsize[Z] < L { L = cellsize[Z] } } progress, progmax := 0, (1+(r2[Y]-r1[Y]))*(1+(r2[Z]-r1[Z])) // progress bar util.Progress(progress, progmax, "Calculating demag kernel") // To make sure 0% is printed done := make(chan struct{}, 3) // parallel calculation of one component done? // Start brute integration // 9 nested loops, does that stress you out? // Fortunately, the 5 inner ones usually loop over just one element. 
for s := 0; s < 3; s++ { // source index Ksdxyz (parallelized over) go func(s int) { u, v, w := s, (s+1)%3, (s+2)%3 // u = direction of source (s), v & w are the orthogonal directions var ( R, R2 [3]float64 // field and source cell center positions pole [3]float64 // position of point charge on the surface points int // counts used integration points ) for z := r1[Z]; z <= r2[Z]; z++ { zw := wrap(z, size[Z]) // skip one half, reconstruct from symmetry later // check on wrapped index instead of loop range so it also works for PBC if zw > size[Z]/2 { if s == 2 { // Choose s == 2: most commonly, dz is smallest cell dimension, resulting in largest nv, nw etc. for s=2 progmax -= (1 + (r2[Y] - r1[Y])) util.Progress(progress, progmax, "Calculating demag kernel") } continue } R[Z] = float64(z) * cellsize[Z] for y := r1[Y]; y <= r2[Y]; y++ { yw := wrap(y, size[Y]) if yw > size[Y]/2 { if s == 2 { progmax-- util.Progress(progress, progmax, "Calculating demag kernel") } continue } R[Y] = float64(y) * cellsize[Y] if s == 2 { // show progress of only one component progress++ util.Progress(progress, progmax, "Calculating demag kernel") } for x := r1[X]; x <= r2[X]; x++ { xw := wrap(x, size[X]) if xw > size[X]/2 { continue } R[X] = float64(x) * cellsize[X] // choose number of integration points depending on how far we are from source. dx, dy, dz := delta(x)*cellsize[X], delta(y)*cellsize[Y], delta(z)*cellsize[Z] d := math.Sqrt(dx*dx + dy*dy + dz*dz) if d == 0 { d = L } maxSize := d / accuracy // maximum acceptable integration size nv := int(math.Max(cellsize[v]/maxSize, 1) + 0.5) nw := int(math.Max(cellsize[w]/maxSize, 1) + 0.5) nx := int(math.Max(cellsize[X]/maxSize, 1) + 0.5) ny := int(math.Max(cellsize[Y]/maxSize, 1) + 0.5) nz := int(math.Max(cellsize[Z]/maxSize, 1) + 0.5) // Stagger source and destination grids. // Massively improves accuracy, see note. 
nv *= 2 nw *= 2 util.Assert(nv > 0 && nw > 0 && nx > 0 && ny > 0 && nz > 0) scale := 1 / float64(nv*nw*nx*ny*nz) surface := cellsize[v] * cellsize[w] // the two directions perpendicular to direction s charge := surface * scale pu1 := cellsize[u] / 2. // positive pole center pu2 := -pu1 // negative pole center // Do surface integral over source cell, accumulate in B var B [3]float64 for i := 0; i < nv; i++ { pv := -(cellsize[v] / 2.) + cellsize[v]/float64(2*nv) + float64(i)*(cellsize[v]/float64(nv)) pole[v] = pv for j := 0; j < nw; j++ { pw := -(cellsize[w] / 2.) + cellsize[w]/float64(2*nw) + float64(j)*(cellsize[w]/float64(nw)) pole[w] = pw // Do volume integral over destination cell for α := 0; α < nx; α++ { rx := R[X] - cellsize[X]/2 + cellsize[X]/float64(2*nx) + (cellsize[X]/float64(nx))*float64(α) for β := 0; β < ny; β++ { ry := R[Y] - cellsize[Y]/2 + cellsize[Y]/float64(2*ny) + (cellsize[Y]/float64(ny))*float64(β) for γ := 0; γ < nz; γ++ { rz := R[Z] - cellsize[Z]/2 + cellsize[Z]/float64(2*nz) + (cellsize[Z]/float64(nz))*float64(γ) points++ pole[u] = pu1 R2[X], R2[Y], R2[Z] = rx-pole[X], ry-pole[Y], rz-pole[Z] r := math.Sqrt(R2[X]*R2[X] + R2[Y]*R2[Y] + R2[Z]*R2[Z]) qr := charge / (4 * math.Pi * r * r * r) bx := R2[X] * qr by := R2[Y] * qr bz := R2[Z] * qr pole[u] = pu2 R2[X], R2[Y], R2[Z] = rx-pole[X], ry-pole[Y], rz-pole[Z] r = math.Sqrt(R2[X]*R2[X] + R2[Y]*R2[Y] + R2[Z]*R2[Z]) qr = -charge / (4 * math.Pi * r * r * r) B[X] += (bx + R2[X]*qr) // addition ordered for accuracy B[Y] += (by + R2[Y]*qr) B[Z] += (bz + R2[Z]*qr) } } } } } for d := s; d < 3; d++ { // destination index Ksdxyz array[s][d][zw][yw][xw] += float32(B[d]) // += needed in case of PBC } } } } done <- struct{}{} // notify parallel computation of this component is done }(s) } // wait for all 3 components to finish <-done <-done <-done // Reconstruct skipped parts from symmetry (X) for z := 0; z < size[Z]; z++ { for y := 0; y < size[Y]; y++ { for x := size[X]/2 + 1; x < size[X]; x++ { x2 := 
// kernelRanges returns, per direction, the first (r1) and last (r2) cell
// offset to integrate the kernel over. size is the kernel size: padded in
// directions without PBC, not padded in directions with PBC.
func kernelRanges(size, pbc [3]int) (r1, r2 [3]int) {
	for axis, n := range size {
		// without PBC: half of the kernel size on either side of the origin
		reach := (n - 1) / 2
		if images := pbc[axis]; images != 0 {
			// with PBC: all images on either side. No /2 here, or we would
			// take half right and half left image.
			reach = n*images - 1
		}
		r1[axis], r2[axis] = -reach, reach
	}

	// support for 2D simulations (thickness 1)
	if size[Z] == 1 && pbc[Z] == 0 {
		r2[Z] = 0
	}
	return r1, r2
}

// Indices of the vector components and mesh directions.
const (
	X = 0
	Y = 1
	Z = 2
)
// if cells touch by just even a corner, the distance is zero.
func delta(d int) float64 {
	if d < 0 {
		d = -d
	}
	if d > 0 {
		d -= 1
	}
	return float64(d)
}

// wrap maps an index onto the half-open interval [0, max) by adding or
// subtracting a multiple of max. max must be positive; max == 0 panics
// (division by zero) instead of looping forever.
func wrap(number, max int) int {
	number %= max // Go's remainder has the sign of number: now in (-max, max)
	if number < 0 {
		number += max
	}
	return number
}
const SMALL_N = 5 // "If brute force doesn't solve your problem, // you're not using enough of it." /* Note: error for cubic self-kernel for different stagger decisions: 1 ++--+----+-++---+----+--++---+----+-++---+----+--++---+----+-++--++ + + + + + + + | | + A + 0.1 ++ A A ++ + A A A A + + C A A A + 0.01 ++ B D E C ++ e + B D E C + r | F B D BE C | r + F D BE DC B+ o 0.001 ++ ++ r + F + + F + 0.0001 ++ F +F + F + | F | + + + + + + + 1e-05 ++--+----+-++---+----+--++---+----+-++---+----+--++---+----+-++--++ 100 1000 10000 100000 1e+06 1e+07 evaluation points A: no staggering B: nv = ((nv + 1) / 2) * 2 nw = ((nw + 1) / 2) * 2 nx = ((nx+1)/2)*2 - 1 ny = ((ny+1)/2)*2 - 1 nz = ((nz+1)/2)*2 - 1 C: nv = ((nv + 1) / 2) * 2 nw = ((nw + 1) / 2) * 2 nx = ((nx+1)/2)*2 + 1 ny = ((ny+1)/2)*2 + 1 nz = ((nz+1)/2)*2 + 1 D: nv += 1 nw += 1 E: nx += 1 ny += 1 nz += 1 F: best with accuracy 6 nv *= 2 nw *= 2 */ 3-3.11.1/mag/mfmkernel.go000066400000000000000000000107271503346766200150350ustar00rootroot00000000000000package mag import ( "bufio" "fmt" "math" "os" d "github.com/mumax/3/data" "github.com/mumax/3/oommf" "github.com/mumax/3/util" ) func MFMKernel(mesh *d.Mesh, lift, tipsize float64, cacheDir string) (kernel [3]*d.Slice) { // Cache disabled if cacheDir == "" { util.Log(`//Not using kernel cache (-cache="")`) return CalcMFMKernel(mesh, lift, tipsize) } // Error-resilient kernel cache: if anything goes wrong, return calculated kernel. 
defer func() { if err := recover(); err != nil { util.Log("//Unable to use kernel cache:", err) kernel = CalcMFMKernel(mesh, lift, tipsize) } }() // Try to load kernel basename := fmt.Sprint(cacheDir, "/", "mumax3MFMkernel_", mesh.Size(), "_", mesh.PBC(), "_", mesh.CellSize(), "_", lift, "_", tipsize, "_") var errLoad error for i := 0; i < 3; i++ { kernel[i], errLoad = LoadKernel(fmt.Sprint(basename, i, ".ovf")) if errLoad != nil { break } } if errLoad != nil { util.Log("//Did not use cached kernel:", errLoad) } else { util.Log("//Using cached kernel:", basename) return kernel } // Could not load kernel: calculate it and save var errSave error kernel = CalcMFMKernel(mesh, lift, tipsize) for i := 0; i < 3; i++ { compName := fmt.Sprint("Nmfm_", i) info := d.Meta{Time: float64(0.0), Name: compName, Unit: "1", CellSize: mesh.CellSize(), MeshUnit: "m"} errSave = SaveKernel(fmt.Sprint(basename, i, ".ovf"), kernel[i], info) if errSave != nil { break } } if errSave != nil { util.Log("//Failed to cache kernel:", errSave) } else { util.Log("//Cached kernel:", basename) } return kernel } func LoadMFMKernel(fname string) (kernel *d.Slice, err error) { kernel, _, err = oommf.ReadFile(fname) return } func SaveMFMKernel(fname string, kernel *d.Slice) error { f, err := os.OpenFile(fname, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666) if err != nil { return err } out := bufio.NewWriter(f) defer out.Flush() oommf.WriteOVF2(out, kernel, d.Meta{}, "binary 4") return nil } // Kernel for the vertical derivative of the force on an MFM tip due to mx, my, mz. // This is the 2nd derivative of the energy w.r.t. z. 
// CalcMFMKernel calculates the kernel without using a cache.
//
// The tip is modelled as two opposite point charges: +TipCharge at height
// lift and -TipCharge at height lift+tipsize. For every cell offset and every
// source component s, the energy of a unit source is evaluated with the tip
// moved by -Δ, 0 and +Δ, and the second difference of these three energies
// is added to component s of the kernel.
//
// The returned slices have the size given by padSize for the input mesh.
// lift must be positive (checked with util.AssertMsg).
func CalcMFMKernel(mesh *d.Mesh, lift, tipsize float64) (kernel [3]*d.Slice) {

	const TipCharge = 1 / Mu0 // tip charge
	const Δ = 1e-9            // tip oscillation, take 2nd derivative over this distance
	util.AssertMsg(lift > 0, "MFM tip crashed into sample, please lift the new one higher")

	{
		// Replace mesh by the kernel mesh: its size comes from padSize,
		// which pads the directions without PBC and leaves the others as is.
		pbc := mesh.PBC()
		sz := padSize(mesh.Size(), pbc)
		cs := mesh.CellSize()
		mesh = d.NewMesh(sz[X], sz[Y], sz[Z], cs[X], cs[Y], cs[Z], pbc[:]...)
	}

	// Shorthand
	size := mesh.Size()
	pbc := mesh.PBC()
	cellsize := mesh.CellSize()
	volume := cellsize[X] * cellsize[Y] * cellsize[Z]
	fmt.Println("calculating MFM kernel")

	// Sanity check
	{
		util.Assert(size[Z] >= 1 && size[Y] >= 2 && size[X] >= 2)
		util.Assert(cellsize[X] > 0 && cellsize[Y] > 0 && cellsize[Z] > 0)
		// util.AssertMsg(size[X]%2 == 0 && size[Y]%2 == 0, "Even kernel size needed")
		// if size[Z] > 1 {
		// 	util.AssertMsg(size[Z]%2 == 0, "Even kernel size needed")
		// }
	}

	// Allocate one single-component slice per source component.
	// K[i] gives access to kernel[i] by [z][y][x] index
	// (presumably sharing its storage, as the values written to K are returned through kernel).
	var K [3][][][]float32
	for i := 0; i < 3; i++ {
		kernel[i] = d.NewSlice(1, mesh.Size())
		K[i] = kernel[i].Scalars()
	}

	r1, r2 := kernelRanges(size, pbc)
	progress, progmax := 0, (1+r2[Y]-r1[Y])*(1+r2[Z]-r1[Z]) // one progress step per (y, z) row

	for iz := r1[Z]; iz <= r2[Z]; iz++ {
		zw := wrap(iz, size[Z]) // index to store at
		z := float64(iz) * cellsize[Z]

		for iy := r1[Y]; iy <= r2[Y]; iy++ {
			yw := wrap(iy, size[Y])
			y := float64(iy) * cellsize[Y]
			progress++
			util.Progress(progress, progmax, "Calculating MFM kernel")

			for ix := r1[X]; ix <= r2[X]; ix++ {
				x := float64(ix) * cellsize[X]
				xw := wrap(ix, size[X])

				for s := 0; s < 3; s++ { // source index Ksxyz
					m := d.Vector{0, 0, 0} // unit source along direction s
					m[s] = 1

					var E [3]float64 // 3 energies for 2nd derivative

					for i := -1; i <= 1; i++ {
						I := float64(i)
						// contribution of the charge +TipCharge at height lift + I*Δ
						R := d.Vector{-x, -y, z - (lift + (I * Δ))}
						r := R.Len()
						B := R.Mul(TipCharge / (4 * math.Pi * r * r * r))

						// contribution of the charge -TipCharge at height lift + tipsize + I*Δ
						R = d.Vector{-x, -y, z - (lift + tipsize + (I * Δ))}
						r = R.Len()
						B = B.Add(R.Mul(-TipCharge / (4 * math.Pi * r * r * r)))

						E[i+1] = B.Dot(m) * volume // i=-1 stored in E[0]
					}

					// second difference (E[0] - 2*E[1] + E[2]) / Δ²
					dFdz_tip := ((E[0] - E[1]) + (E[2] - E[1])) / (Δ * Δ) // dFz/dz = d2E/dz2

					K[s][zw][yw][xw] += float32(dFdz_tip) // += needed in case of PBC
				}
			}
		}
	}

	return kernel
}
package oommf import ( "bufio" "fmt" "io" "os" "strconv" "strings" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Read any OOMMF file, autodetect OVF1/OVF2 format func Read(in io.Reader) (s *data.Slice, meta data.Meta, err error) { //in := fullReader{bufio.NewReader(in_)} info := readHeader(in) n := info.Size c := info.StepSize if c == [3]float64{0, 0, 0} { c = [3]float64{1, 1, 1} // default (presumably unitless) cell size } data_ := data.NewSlice(info.NComp, n) format := strings.ToLower(info.Format) ovf := info.OVF switch { default: panic(fmt.Sprint("unknown format: OVF", ovf, " ", format)) case format == "text": readOVFDataText(in, data_) case format == "binary 4" && ovf == 1: readOVF1DataBinary4(in, data_) case format == "binary 8" && ovf == 1: readOVF1DataBinary8(in, data_) case format == "binary 4" && ovf == 2: readOVF2DataBinary4(in, data_) case format == "binary 8" && ovf == 2: readOVF2DataBinary8(in, data_) } return data_, data.Meta{Name: info.Title, Time: info.TotalTime, Unit: info.ValueUnit, CellSize: info.StepSize}, nil } func ReadFile(fname string) (*data.Slice, data.Meta, error) { f, err := os.Open(fname) if err != nil { return nil, data.Meta{}, err } defer f.Close() return Read(bufio.NewReader(f)) } func MustReadFile(fname string) (*data.Slice, data.Meta) { s, t, err := ReadFile(fname) util.FatalErr(err) return s, t } // oommf.Info represents the header part of an ovf file. 
// TODO: add Err to return error status // Perhaps CheckErr() func type Info struct { Desc map[string]interface{} Title string NComp int Size [3]int ValueMultiplier float32 ValueUnit string Format string // binary or text OVF int TotalTime float64 StageTime float64 SizeofFloat int // 4/8 StepSize [3]float64 MeshUnit string } // Parses the header part of the OVF1/OVF2 file func readHeader(in io.Reader) *Info { desc := make(map[string]interface{}) info := new(Info) info.Desc = desc line, eof := readLine(in) switch strings.ToLower(line) { default: panic("unknown header: " + line) case "# oommf ovf 2.0": info.OVF = 2 case "# oommf: rectangular mesh v1.0": info.OVF = 1 info.NComp = 3 // OVF1 only supports vector } line, eof = readLine(in) for !eof && !isHeaderEnd(line) { key, value := parseHeaderLine(line) switch strings.ToLower(key) { default: panic("Unknown key: " + key) // ignored case "oommf", "segment count", "begin", "meshtype", "xbase", "ybase", "zbase", "xmin", "ymin", "zmin", "xmax", "ymax", "zmax", "valuerangeminmag", "valuerangemaxmag", "end": // ignored (OVF1) case "", "valuelabels": // ignored (OVF2) case "title": info.Title = value case "valueunits": info.ValueUnit = strings.Split(value, " ")[0] // take unit of first component, we don't support per-component units case "valuedim": info.NComp = atoi(value) case "xnodes": info.Size[X] = atoi(value) case "ynodes": info.Size[Y] = atoi(value) case "znodes": info.Size[Z] = atoi(value) case "xstepsize": info.StepSize[X] = atof(value) case "ystepsize": info.StepSize[Y] = atof(value) case "zstepsize": info.StepSize[Z] = atof(value) case "valuemultiplier": case "valueunit": case "meshunit": // desc tags: parse further and add to metadata table case "desc": strs := strings.SplitN(value, ":", 2) desc_key := strings.Trim(strs[0], "# ") // Desc tag does not neccesarily have a key:value layout. // If not, we use an empty value string. 
desc_value := "" if len(strs) > 1 { desc_value = strings.Trim(strs[1], "# ") } desc[desc_key] = desc_value } line, eof = readLine(in) } // the remaining line should now be the begin:data clause key, value := parseHeaderLine(line) value = strings.TrimSpace(value) strs := strings.SplitN(value, " ", 3) if strings.ToLower(key) != "begin" || strings.ToLower(strs[0]) != "data" { panic("Expected: Begin: Data") } info.Format = strings.ToLower(strs[1]) if len(strs) >= 3 { // dataformat for text is empty info.Format = "binary " + strs[2] // binary + 4 or 8 } else { info.Format = "text" } // OVF1-style time info if t1, ok := info.Desc["Time (s)"]; ok { timestr := fmt.Sprint(t1) t, _ := strconv.ParseFloat(timestr, 64) info.TotalTime = t } // OVF2-style time info if t2, ok := info.Desc["Total simulation time"]; ok { timestr := fmt.Sprint(t2) words := strings.Split(timestr, " ") t, _ := strconv.ParseFloat(words[0], 64) info.TotalTime = t } return info } // INTERNAL: Splits "# key: value" into "key", "value". 
// Both may be empty func parseHeaderLine(str string) (key, value string) { //remove the comment first, I *hate* go for having slices like python //AND not allowing negative indexes comPos := strings.Index(str, "##") if comPos != -1 { str = str[:comPos] } //if line doesn't begin with # just propagate it to generate proper error messages if !strings.HasPrefix(str, "#") { return str, "" } //otherwise proceed to crunch line as normal //TODO: check about implementing proper white space character culling instead of just looking for spaces strs := strings.SplitN(str, ":", 2) key = strings.Trim(strs[0], "# ") if len(strs) != 2 { return key, "" } value = strings.Trim(strs[1], "# ") return key, value } // INTERNAL: true if line starts with "# begin:data" func isHeaderEnd(str string) bool { str = strings.ToLower(strings.Trim(str, "# ")) str = strings.Replace(str, " ", "", -1) return strings.HasPrefix(str, "begin:data") } const OVF_CONTROL_NUMBER_4 = 1234567.0 // The ovf format requires the first encoded number in the binary data section to be this control number const OVF_CONTROL_NUMBER_8 = 123456789012345.0 // read data block in text format, for OVF1 and OVF2 func readOVFDataText(in io.Reader, t *data.Slice) { size := t.Size() data := t.Tensors() for iz := 0; iz < size[Z]; iz++ { for iy := 0; iy < size[Y]; iy++ { for ix := 0; ix < size[X]; ix++ { for c := 0; c < t.NComp(); c++ { _, err := fmt.Fscan(in, &data[c][iz][iy][ix]) if err != nil { panic(err) } } } } } } // write data block in text format, for OVF1 and OVF2 func writeOVFText(out io.Writer, tens *data.Slice) (err error) { data := tens.Tensors() gridsize := tens.Size() ncomp := tens.NComp() // Here we loop over X,Y,Z, not Z,Y,X, because // internal in C-order == external in Fortran-order for iz := 0; iz < gridsize[Z]; iz++ { for iy := 0; iy < gridsize[Y]; iy++ { for ix := 0; ix < gridsize[X]; ix++ { for c := 0; c < ncomp; c++ { _, err = fmt.Fprint(out, data[c][iz][iy][ix], " ") } _, err = fmt.Fprint(out, "\n") } } } 
return } // Writes a header key/value pair to out: // # Key: Value func hdr(out io.Writer, key string, value ...interface{}) { _, err := fmt.Fprint(out, "# ", key, ": ") util.FatalErr(err) _, err = fmt.Fprintln(out, value...) util.FatalErr(err) } func dsc(out io.Writer, k, v interface{}) { hdr(out, "Desc", k, ": ", v) } 3-3.11.1/oommf/ovf1.go000066400000000000000000000106571503346766200143030ustar00rootroot00000000000000package oommf import ( "encoding/binary" "fmt" "io" "log" "strings" "unsafe" "github.com/mumax/3/data" ) func WriteOVF1(out io.Writer, q *data.Slice, meta data.Meta, dataformat string) { if q.NComp() != 3 { log.Fatal("Cannot save the quantity: the OVF1 format only supports 3D-vector fields. Use OVF2 instead.") } writeOVF1Header(out, q, meta) writeOVF1Data(out, q, dataformat) hdr(out, "End", "Segment") } func writeOVF1Data(out io.Writer, q *data.Slice, dataformat string) { canonicalFormat := "" switch strings.ToLower(dataformat) { case "text": canonicalFormat = "Text" hdr(out, "Begin", "Data "+canonicalFormat) writeOVFText(out, q) case "binary", "binary 4": canonicalFormat = "Binary 4" hdr(out, "Begin", "Data "+canonicalFormat) writeOVF1Binary4(out, q) default: log.Fatalf("Illegal OVF data format: %v. 
Options are: Text, Binary 4", dataformat) } hdr(out, "End", "Data "+canonicalFormat) } // Writes the OVF header func writeOVF1Header(out io.Writer, q *data.Slice, meta data.Meta) { gridsize := q.Size() cellsize := meta.CellSize hdr(out, "OOMMF", "rectangular mesh v1.0") hdr(out, "Segment count", "1") hdr(out, "Begin", "Segment") hdr(out, "Begin", "Header") dsc(out, "Time (s)", meta.Time) hdr(out, "Title", meta.Name) hdr(out, "meshtype", "rectangular") hdr(out, "meshunit", "m") hdr(out, "xbase", cellsize[X]/2) hdr(out, "ybase", cellsize[Y]/2) hdr(out, "zbase", cellsize[Z]/2) hdr(out, "xstepsize", cellsize[X]) hdr(out, "ystepsize", cellsize[Y]) hdr(out, "zstepsize", cellsize[Z]) hdr(out, "xmin", 0) hdr(out, "ymin", 0) hdr(out, "zmin", 0) hdr(out, "xmax", cellsize[X]*float64(gridsize[X])) hdr(out, "ymax", cellsize[Y]*float64(gridsize[Y])) hdr(out, "zmax", cellsize[Z]*float64(gridsize[Z])) hdr(out, "xnodes", gridsize[X]) hdr(out, "ynodes", gridsize[Y]) hdr(out, "znodes", gridsize[Z]) hdr(out, "ValueRangeMinMag", 1e-08) // not so "optional" as the OOMMF manual suggests... hdr(out, "ValueRangeMaxMag", 1) // TODO hdr(out, "valueunit", meta.Unit) hdr(out, "valuemultiplier", 1) hdr(out, "End", "Header") } // Writes data in OVF Binary 4 format func writeOVF1Binary4(out io.Writer, array *data.Slice) (err error) { data := array.Tensors() gridsize := array.Size() var bytes []byte // OOMMF requires this number to be first to check the format var controlnumber float32 = OVF_CONTROL_NUMBER_4 // Conversion from float32 [4]byte in big-endian // Inlined for performance, terabytes of data will pass here... 
bytes = (*[4]byte)(unsafe.Pointer(&controlnumber))[:] bytes[0], bytes[1], bytes[2], bytes[3] = bytes[3], bytes[2], bytes[1], bytes[0] // swap endianness _, err = out.Write(bytes) ncomp := array.NComp() for iz := 0; iz < gridsize[Z]; iz++ { for iy := 0; iy < gridsize[Y]; iy++ { for ix := 0; ix < gridsize[X]; ix++ { for c := 0; c < ncomp; c++ { // dirty conversion from float32 to [4]byte bytes = (*[4]byte)(unsafe.Pointer(&data[c][iz][iy][ix]))[:] bytes[0], bytes[1], bytes[2], bytes[3] = bytes[3], bytes[2], bytes[1], bytes[0] out.Write(bytes) } } } } return } func readOVF1DataBinary4(in io.Reader, t *data.Slice) { size := t.Size() data := t.Tensors() // OOMMF requires this number to be first to check the format var controlnumber float32 // OVF 1.0 is network byte order (MSB) binary.Read(in, binary.BigEndian, &controlnumber) if controlnumber != OVF_CONTROL_NUMBER_4 { panic("invalid OVF1 control number: " + fmt.Sprint(controlnumber)) } var tmp float32 for iz := 0; iz < size[Z]; iz++ { for iy := 0; iy < size[Y]; iy++ { for ix := 0; ix < size[X]; ix++ { for c := 0; c < 3; c++ { err := binary.Read(in, binary.BigEndian, &tmp) if err != nil { panic(err) } data[c][iz][iy][ix] = tmp } } } } } func readOVF1DataBinary8(in io.Reader, t *data.Slice) { size := t.Size() data := t.Tensors() // OOMMF requires this number to be first to check the format var controlnumber float64 // OVF 1.0 is network byte order (MSB) binary.Read(in, binary.BigEndian, &controlnumber) if controlnumber != OVF_CONTROL_NUMBER_8 { panic("invalid OVF1 control number: " + fmt.Sprint(controlnumber)) } var tmp float64 for iz := 0; iz < size[Z]; iz++ { for iy := 0; iy < size[Y]; iy++ { for ix := 0; ix < size[X]; ix++ { for c := 0; c < 3; c++ { err := binary.Read(in, binary.BigEndian, &tmp) if err != nil { panic(err) } data[c][iz][iy][ix] = float32(tmp) } } } } } 3-3.11.1/oommf/ovf2.go000066400000000000000000000111141503346766200142710ustar00rootroot00000000000000package oommf import ( "fmt" "io" "log" "strings" 
"unsafe" "github.com/mumax/3/data" ) func WriteOVF2(out io.Writer, q *data.Slice, meta data.Meta, dataformat string) { writeOVF2Header(out, q, meta) writeOVF2Data(out, q, dataformat) hdr(out, "End", "Segment") } func writeOVF2Header(out io.Writer, q *data.Slice, meta data.Meta) { gridsize := q.Size() cellsize := meta.CellSize fmt.Fprintln(out, "# OOMMF OVF 2.0") hdr(out, "Segment count", "1") hdr(out, "Begin", "Segment") hdr(out, "Begin", "Header") hdr(out, "Title", meta.Name) hdr(out, "meshtype", "rectangular") hdr(out, "meshunit", "m") hdr(out, "xmin", 0) hdr(out, "ymin", 0) hdr(out, "zmin", 0) hdr(out, "xmax", cellsize[X]*float64(gridsize[X])) hdr(out, "ymax", cellsize[Y]*float64(gridsize[Y])) hdr(out, "zmax", cellsize[Z]*float64(gridsize[Z])) name := meta.Name var labels []interface{} if q.NComp() == 1 { labels = []interface{}{name} } else { for i := 0; i < q.NComp(); i++ { labels = append(labels, name+"_"+string('x'+i)) } } hdr(out, "valuedim", q.NComp()) hdr(out, "valuelabels", labels...) 
// TODO unit := meta.Unit if unit == "" { unit = "1" } if q.NComp() == 1 { hdr(out, "valueunits", unit) } else { hdr(out, "valueunits", unit, unit, unit) } // We don't really have stages //fmt.Fprintln(out, "# Desc: Stage simulation time: ", meta.TimeStep, " s") // TODO hdr(out, "Desc", "Total simulation time: ", meta.Time, " s") hdr(out, "xbase", cellsize[X]/2) hdr(out, "ybase", cellsize[Y]/2) hdr(out, "zbase", cellsize[Z]/2) hdr(out, "xnodes", gridsize[X]) hdr(out, "ynodes", gridsize[Y]) hdr(out, "znodes", gridsize[Z]) hdr(out, "xstepsize", cellsize[X]) hdr(out, "ystepsize", cellsize[Y]) hdr(out, "zstepsize", cellsize[Z]) hdr(out, "End", "Header") } func writeOVF2Data(out io.Writer, q *data.Slice, dataformat string) { canonicalFormat := "" switch strings.ToLower(dataformat) { case "text": canonicalFormat = "Text" hdr(out, "Begin", "Data "+canonicalFormat) writeOVFText(out, q) case "binary", "binary 4": canonicalFormat = "Binary 4" hdr(out, "Begin", "Data "+canonicalFormat) writeOVF2DataBinary4(out, q) default: log.Fatalf("Illegal OMF data format: %v. 
Options are: Text, Binary 4", dataformat) } hdr(out, "End", "Data "+canonicalFormat) } func writeOVF2DataBinary4(out io.Writer, array *data.Slice) { //w.count(w.out.Write((*(*[1<<31 - 1]byte)(unsafe.Pointer(&list[0])))[0 : 4*len(list)])) // (shortcut) data := array.Tensors() size := array.Size() var bytes []byte // OOMMF requires this number to be first to check the format var controlnumber float32 = OVF_CONTROL_NUMBER_4 bytes = (*[4]byte)(unsafe.Pointer(&controlnumber))[:] out.Write(bytes) ncomp := array.NComp() for iz := 0; iz < size[Z]; iz++ { for iy := 0; iy < size[Y]; iy++ { for ix := 0; ix < size[X]; ix++ { for c := 0; c < ncomp; c++ { bytes = (*[4]byte)(unsafe.Pointer(&data[c][iz][iy][ix]))[:] out.Write(bytes) } } } } } func readOVF2DataBinary4(in io.Reader, array *data.Slice) { size := array.Size() data := array.Tensors() // OOMMF requires this number to be first to check the format controlnumber := readFloat32(in) if controlnumber != OVF_CONTROL_NUMBER_4 { panic("invalid OVF2 control number: " + fmt.Sprint(controlnumber)) } ncomp := array.NComp() for iz := 0; iz < size[Z]; iz++ { for iy := 0; iy < size[Y]; iy++ { for ix := 0; ix < size[X]; ix++ { for c := 0; c < ncomp; c++ { data[c][iz][iy][ix] = readFloat32(in) } } } } } // fully read buf, panic on error func readFull(in io.Reader, buf []byte) { _, err := io.ReadFull(in, buf) if err != nil { panic(err) } return } // read float32 in machine endianness, panic on error func readFloat32(in io.Reader) float32 { var bytes4 [4]byte bytes := bytes4[:] readFull(in, bytes) return *((*float32)(unsafe.Pointer(&bytes4))) } // read float64 in machine endianness, panic on error func readFloat64(in io.Reader) float64 { var bytes8 [8]byte bytes := bytes8[:] readFull(in, bytes) return *((*float64)(unsafe.Pointer(&bytes8))) } func readOVF2DataBinary8(in io.Reader, array *data.Slice) { size := array.Size() data := array.Tensors() // OOMMF requires this number to be first to check the format controlnumber := readFloat64(in) 
if controlnumber != OVF_CONTROL_NUMBER_8 { panic("invalid OVF2 control number: " + fmt.Sprint(controlnumber)) } ncomp := array.NComp() for iz := 0; iz < size[Z]; iz++ { for iy := 0; iy < size[Y]; iy++ { for ix := 0; ix < size[X]; ix++ { for c := 0; c < ncomp; c++ { data[c][iz][iy][ix] = float32(readFloat64(in)) } } } } } 3-3.11.1/oommf/util.go000066400000000000000000000021551503346766200143770ustar00rootroot00000000000000package oommf import ( "io" "strconv" ) func readLine(in io.Reader) (line string, eof bool) { char := readChar(in) eof = isEOF(char) for !isEndline(char) { line += string(byte(char)) char = readChar(in) } return line, eof } func isEOF(char int) bool { return char == -1 } func isEndline(char int) bool { return isEOF(char) || char == int('\n') } //// Blocks until all requested bytes are read. //type fullReader struct{ io.Reader } // //func (r fullReader) Read(p []byte) (n int, err error) { // return io.ReadFull(r.Reader, p) //} // Reads one character from the Reader. // -1 means EOF. // Errors are cought and cause panic func readChar(in io.Reader) int { buffer := [1]byte{} switch nr, err := in.Read(buffer[0:]); true { case nr < 0: // error panic(err) case nr == 0: // eof return -1 case nr > 0: // ok return int(buffer[0]) } panic("unreachable") } func atoi(a string) int { i, err := strconv.Atoi(a) if err != nil { panic(err) } return i } func atof(a string) float64 { i, err := strconv.ParseFloat(a, 64) if err != nil { panic(err) } return i } const ( X = 0 Y = 1 Z = 2 ) 3-3.11.1/post-commit000077500000000000000000000010251503346766200141520ustar00rootroot00000000000000#!/bin/sh # # A hook script to verify what is about to be committed. # Called by git-commit with no arguments. The hook should # exit with non-zero status after issuing an appropriate message if # it wants to stop the commit. 
# # Add this file to .git/hooks # Run all unit tests echo Running unit tests in background rm test.log -rf (if (make test >> test.log 2>> test.log); then notify-send "Unit tests passed" 2> /dev/null exit 0; else notify-send "Unit tests failed" 2> /dev/null cat test.log; rm test.log; exit 2; fi;)& 3-3.11.1/pre-commit000077500000000000000000000007611503346766200137610ustar00rootroot00000000000000#!/bin/sh # # A hook script to verify what is about to be committed. # Called by git-commit with no arguments. The hook should # exit with non-zero status after issuing an appropriate message if # it wants to stop the commit. # # Add this file to .git/hooks # # Runs gofmt on the code and stops commit if files were affected. # fail=0; if (gofmt -w -l */*.go */*/*.go | grep \.go); then exit 1; fi; make || exit 1 #if astyle --indent=tab cuda/*.cu | grep Formatted; then exit 1; fi; exit 0 3-3.11.1/script/000077500000000000000000000000001503346766200132575ustar00rootroot000000000000003-3.11.1/script/Makefile000066400000000000000000000000241503346766200147130ustar00rootroot00000000000000all: go install -v 3-3.11.1/script/assignstmt.go000066400000000000000000000037711503346766200160120ustar00rootroot00000000000000package script import ( "go/ast" "go/token" "reflect" ) // compiles a (single) assign statement lhs = rhs func (w *World) compileAssignStmt(a *ast.AssignStmt) Expr { if len(a.Lhs) != 1 || len(a.Rhs) != 1 { panic(err(a.Pos(), "multiple assignment not allowed")) } lhs, rhs := a.Lhs[0], a.Rhs[0] r := w.compileExpr(rhs) switch a.Tok { default: panic(err(a.Pos(), a.Tok, "not allowed")) case token.ASSIGN: // = return w.compileAssign(a, lhs, r) case token.DEFINE: // := return w.compileDefine(a, lhs, r) case token.ADD_ASSIGN: // += return w.compileAddAssign(a, lhs, r) case token.SUB_ASSIGN: // -= return w.compileSubAssign(a, lhs, r) } } // compile a = b func (w *World) compileAssign(a *ast.AssignStmt, lhs ast.Expr, r Expr) Expr { l := w.compileLvalue(lhs) return &assignStmt{lhs: l, 
rhs: typeConv(a.Pos(), r, inputType(l))} } // compile a := b func (w *World) compileDefine(a *ast.AssignStmt, lhs ast.Expr, r Expr) Expr { ident, ok := lhs.(*ast.Ident) if !ok { panic(err(a.Pos(), "non-name on left side of :=")) } addr := reflect.New(r.Type()) ok = w.safeDeclare(ident.Name, &reflectLvalue{addr.Elem()}) if !ok { panic(err(a.Pos(), "already defined: "+ident.Name)) } return w.compileAssign(a, lhs, r) } type assignStmt struct { lhs LValue rhs Expr void } func (a *assignStmt) Eval() interface{} { a.lhs.SetValue(a.rhs.Eval()) return nil } func (a *assignStmt) Child() []Expr { return []Expr{a.lhs, a.rhs} } func (w *World) compileAddAssign(a *ast.AssignStmt, lhs ast.Expr, r Expr) Expr { l := w.compileLvalue(lhs) x := typeConv(a.Pos(), l, float64_t) y := typeConv(a.Pos(), r, float64_t) sum := &add{binaryExpr{x, y}} return &assignStmt{lhs: l, rhs: typeConv(a.Pos(), sum, inputType(l))} } func (w *World) compileSubAssign(a *ast.AssignStmt, lhs ast.Expr, r Expr) Expr { l := w.compileLvalue(lhs) x := typeConv(a.Pos(), l, float64_t) y := typeConv(a.Pos(), r, float64_t) sub := &sub{binaryExpr{x, y}} return &assignStmt{lhs: l, rhs: typeConv(a.Pos(), sub, inputType(l))} } 3-3.11.1/script/binaryexpr.go000066400000000000000000000077431503346766200160040ustar00rootroot00000000000000package script import ( "go/ast" "go/token" "reflect" ) // compiles a binary expression x 'op' y func (w *World) compileBinaryExpr(n *ast.BinaryExpr) Expr { switch n.Op { default: panic(err(n.Pos(), "not allowed:", n.Op)) case token.ADD: return &add{w.newBinExpr(n)} case token.SUB: return &sub{w.newBinExpr(n)} case token.MUL: return &mul{w.newBinExpr(n)} case token.QUO: return &quo{w.newBinExpr(n)} case token.LSS: return &lss{w.newComp(n)} case token.GTR: return >r{w.newComp(n)} case token.LEQ: return &leq{w.newComp(n)} case token.GEQ: return &geq{w.newComp(n)} case token.EQL: return &eql{w.newComp(n)} case token.NEQ: return &neq{w.newComp(n)} case token.LAND: return &and{w.newBoolOp(n)} 
case token.LOR: return &or{w.newBoolOp(n)} } } // abstract superclass for all binary expressions type binaryExpr struct{ x, y Expr } func (w *World) newBinExpr(n *ast.BinaryExpr) binaryExpr { x := typeConv(n.Pos(), w.compileExpr(n.X), float64_t) y := typeConv(n.Pos(), w.compileExpr(n.Y), float64_t) return binaryExpr{x, y} } func (b *binaryExpr) Type() reflect.Type { return float64_t } func (b *binaryExpr) Child() []Expr { return []Expr{b.x, b.y} } type add struct{ binaryExpr } type sub struct{ binaryExpr } type mul struct{ binaryExpr } type quo struct{ binaryExpr } func (b *add) Eval() interface{} { return b.x.Eval().(float64) + b.y.Eval().(float64) } func (b *sub) Eval() interface{} { return b.x.Eval().(float64) - b.y.Eval().(float64) } func (b *mul) Eval() interface{} { return b.x.Eval().(float64) * b.y.Eval().(float64) } func (b *quo) Eval() interface{} { return b.x.Eval().(float64) / b.y.Eval().(float64) } func (b *add) Fix() Expr { return &add{binaryExpr{x: b.x.Fix(), y: b.y.Fix()}} } func (b *sub) Fix() Expr { return &sub{binaryExpr{x: b.x.Fix(), y: b.y.Fix()}} } func (b *mul) Fix() Expr { return &mul{binaryExpr{x: b.x.Fix(), y: b.y.Fix()}} } func (b *quo) Fix() Expr { return &quo{binaryExpr{x: b.x.Fix(), y: b.y.Fix()}} } type comp binaryExpr func (w *World) newComp(n *ast.BinaryExpr) comp { return comp(w.newBinExpr(n)) } func (b *comp) Type() reflect.Type { return bool_t } func (b *comp) Child() []Expr { return []Expr{b.x, b.y} } type lss struct{ comp } type gtr struct{ comp } type leq struct{ comp } type geq struct{ comp } type eql struct{ comp } type neq struct{ comp } func (b *lss) Eval() interface{} { return b.x.Eval().(float64) < b.y.Eval().(float64) } func (b *gtr) Eval() interface{} { return b.x.Eval().(float64) > b.y.Eval().(float64) } func (b *leq) Eval() interface{} { return b.x.Eval().(float64) <= b.y.Eval().(float64) } func (b *geq) Eval() interface{} { return b.x.Eval().(float64) >= b.y.Eval().(float64) } func (b *eql) Eval() interface{} { 
return b.x.Eval().(float64) == b.y.Eval().(float64) } func (b *neq) Eval() interface{} { return b.x.Eval().(float64) != b.y.Eval().(float64) } func (b *lss) Fix() Expr { return &lss{comp{x: b.x.Fix(), y: b.y.Fix()}} } func (b *gtr) Fix() Expr { return >r{comp{x: b.x.Fix(), y: b.y.Fix()}} } func (b *leq) Fix() Expr { return &leq{comp{x: b.x.Fix(), y: b.y.Fix()}} } func (b *geq) Fix() Expr { return &geq{comp{x: b.x.Fix(), y: b.y.Fix()}} } func (b *eql) Fix() Expr { return &eql{comp{x: b.x.Fix(), y: b.y.Fix()}} } func (b *neq) Fix() Expr { return &neq{comp{x: b.x.Fix(), y: b.y.Fix()}} } type boolOp struct{ x, y Expr } func (w *World) newBoolOp(n *ast.BinaryExpr) boolOp { x := typeConv(n.Pos(), w.compileExpr(n.X), bool_t) y := typeConv(n.Pos(), w.compileExpr(n.Y), bool_t) return boolOp{x, y} } func (b *boolOp) Child() []Expr { return []Expr{b.x, b.y} } func (b *boolOp) Type() reflect.Type { return bool_t } type and struct{ boolOp } type or struct{ boolOp } func (b *and) Eval() interface{} { return b.x.Eval().(bool) && b.y.Eval().(bool) } func (b *or) Eval() interface{} { return b.x.Eval().(bool) || b.y.Eval().(bool) } func (b *and) Fix() Expr { return &and{boolOp{x: b.x.Fix(), y: b.y.Fix()}} } func (b *or) Fix() Expr { return &or{boolOp{x: b.x.Fix(), y: b.y.Fix()}} } 3-3.11.1/script/blockstmt.go000066400000000000000000000024271503346766200156150ustar00rootroot00000000000000package script import ( "bytes" "fmt" "go/ast" "go/format" "go/token" "reflect" "strings" ) // block statement is a list of statements. type BlockStmt struct { Children []Expr Node []ast.Node } // does not enter scope because it does not necessarily need to (e.g. for, if). 
func (w *World) compileBlockStmt_noScope(n *ast.BlockStmt) *BlockStmt {
	b := &BlockStmt{}
	for _, s := range n.List {
		b.append(w.compileStmt(s), s)
	}
	return b
}

// append adds compiled statement s together with its syntax node n,
// keeping Children and Node index-aligned.
func (b *BlockStmt) append(s Expr, n ast.Node) {
	b.Children = append(b.Children, s)
	b.Node = append(b.Node, n)
}

// Eval evaluates all children in order and returns nil.
func (b *BlockStmt) Eval() interface{} {
	for _, s := range b.Children {
		s.Eval()
	}
	return nil
}

func (b *BlockStmt) Type() reflect.Type {
	return nil
}

func (b *BlockStmt) Child() []Expr {
	return b.Children
}

// Format returns the source text of n as printed by go/format,
// without a trailing newline. A formatting error is ignored.
func Format(n ast.Node) string {
	var buf bytes.Buffer
	fset := token.NewFileSet()
	format.Node(&buf, fset, n)
	str := buf.String()
	if strings.HasSuffix(str, "\n") {
		str = str[:len(str)-1]
	}
	return str
}

// Format returns the source text of every statement in the block,
// one per line. Formatting errors are ignored.
func (b *BlockStmt) Format() string {
	var buf bytes.Buffer
	fset := token.NewFileSet()
	for i := range b.Children {
		format.Node(&buf, fset, b.Node[i])
		fmt.Fprintln(&buf)
	}
	return buf.String()
}

// Fix returns a new block with Fix applied to every child; the Node slice
// is shared with the receiver.
func (b *BlockStmt) Fix() Expr {
	return &BlockStmt{Children: fixExprs(b.Children), Node: b.Node}
}
3-3.11.1/script/call.go000066400000000000000000000040161503346766200145220ustar00rootroot00000000000000package script

import (
	"go/ast"
	"reflect"
)

// call is a compiled function or method call: f applied to args.
type call struct {
	f    Expr
	args []Expr
}

// compiles a call expression. A call to the identifier "source" is handed to
// compileSource. For non-variadic functions the argument count is checked
// and every argument is converted to the parameter type; variadic calls get
// neither check. Panics with a compile error if the callee is not a function.
func (w *World) compileCallExpr(n *ast.CallExpr) Expr {
	// compile function or method to be called
	var f Expr
	var fname string
	switch Fun := n.Fun.(type) {
	default:
		panic(err(n.Pos(), "not allowed:", typ(n.Fun)))
	case *ast.Ident: // function call
		fname = Fun.Name
		if fname == "source" {
			return w.compileSource(n)
		}
		f = w.compileExpr(Fun)
	case *ast.SelectorExpr: // method call
		f = w.compileSelectorStmt(Fun)
		fname = Fun.Sel.Name
	}
	if f.Type().Kind() != reflect.Func {
		panic(err(n.Pos(), "can not call", Format(n)))
	}

	// compile and check args
	args := make([]Expr, len(n.Args))
	variadic := f.Type().IsVariadic()
	if !variadic && len(n.Args) != f.Type().NumIn() {
		panic(err(n.Pos(), fname, "needs", f.Type().NumIn(), "arguments, got", len(n.Args))) // TODO: varargs
	}
	for i := range args {
		if variadic {
			args[i] = w.compileExpr(n.Args[i]) // no type check or conversion
		} else {
			args[i] = typeConv(n.Args[i].Pos(), w.compileExpr(n.Args[i]), f.Type().In(i))
		}
	}
	return &call{f, args}
}

// Eval evaluates the arguments and the function value, performs the call via
// reflection and returns the single result, or nil when there is none.
func (c *call) Eval() interface{} {
	// evaluate and pack arguments
	argv := make([]reflect.Value, len(c.args))
	for i := range c.args {
		argv[i] = reflect.ValueOf(c.args[i].Eval())
	}

	// evaluate function
	f := reflect.ValueOf(c.f.Eval())

	// call
	ret := f.Call(argv)

	// at most 1 return value allowed
	assert(len(ret) <= 1)
	if len(ret) == 0 {
		return nil
	} else {
		return ret[0].Interface()
	}
}

// Child returns the callee followed by the arguments.
func (c *call) Child() []Expr {
	return append([]Expr{c.f}, c.args...)
}

// return type of call
func (c *call) Type() reflect.Type {
	switch c.f.Type().NumOut() {
	case 0:
		return nil // "void"
	case 1:
		return c.f.Type().Out(0)
	default:
		panic("bug: multiple return values not allowed")
	}
}

// Fix returns a call with the same callee and Fix applied to every argument.
func (c *call) Fix() Expr {
	return &call{f: c.f, args: fixExprs(c.args)}
}

// apply .Fix() to all elements
func fixExprs(e []Expr) []Expr {
	f := make([]Expr, len(e))
	for i := range f {
		f[i] = e[i].Fix()
	}
	return f
}
3-3.11.1/script/child.go000066400000000000000000000003471503346766200146750ustar00rootroot00000000000000package script

// Contains reports whether search is tree itself or is reachable from tree
// through Child(), compared with ==.
func Contains(tree, search Expr) bool {
	if tree == search {
		return true
	} else {
		children := tree.Child()
		for _, e := range children {
			if Contains(e, search) {
				return true
			}
		}
	}
	return false
}
3-3.11.1/script/compile.go000066400000000000000000000036651503346766200152500ustar00rootroot00000000000000package script

import (
	"fmt"
	"go/ast"
	"go/parser"
)

// Compiles an expression, which can then be evaluated.
// E.g.:
//
//	expr, err := world.CompileExpr("1+1")
//	expr.Eval()   // returns 2
//
// A parse error, or a *compileErr panic raised while compiling, is returned
// as e. Other panics propagate. When Debug is set, compile panics are not
// recovered and the parse tree is printed.
func (w *World) CompileExpr(src string) (code Expr, e error) {
	// parse
	tree, err := parser.ParseExpr(src)
	if err != nil {
		return nil, fmt.Errorf(`parse "%s": %v`, src, err)
	}
	if Debug {
		ast.Print(nil, tree)
	}

	// catch compile errors
	if !Debug {
		defer func() {
			err := recover()
			if err == nil {
				return
			}
			if er, ok := err.(*compileErr); ok {
				code = nil
				e = fmt.Errorf(`parse "%s": %v`, src, er)
			} else {
				panic(err)
			}
		}()
	}
	return w.compile(tree), nil
}

// CompileExpr with panic on error.
func (w *World) MustCompileExpr(src string) Expr {
	code, err := w.CompileExpr(src)
	if err != nil {
		panic(err)
	}
	return code
}

// compiles source consisting of a number of statements. E.g.:
//
//	src = "a = 1; b = sin(x)"
//	code, err := world.Compile(src)
//	code.Eval()
//
// A parse error, or a *compileErr panic raised while compiling, is returned
// as e (the latter annotated via pos2line). Other panics propagate. When
// Debug is set, compile panics are not recovered.
func (w *World) Compile(src string) (code *BlockStmt, e error) {
	// parse
	exprSrc := "func(){\n" + src + "\n}" // wrap in func to turn into expression
	tree, err := parser.ParseExpr(exprSrc)
	if err != nil {
		return nil, fmt.Errorf("script line %v: ", err)
	}

	// catch compile errors and decode line number
	if !Debug {
		defer func() {
			err := recover()
			if err == nil {
				return
			}
			if compErr, ok := err.(*compileErr); ok {
				code = nil
				e = fmt.Errorf("script %v: %v", pos2line(compErr.pos, exprSrc), compErr.msg)
			} else {
				panic(err)
			}
		}()
	}

	// compile
	stmts := tree.(*ast.FuncLit).Body.List // strip func again
	if Debug {
		ast.Print(nil, stmts)
	}
	block := new(BlockStmt)
	for _, s := range stmts {
		block.append(w.compile(s), s)
	}
	return block, nil
}

// Like Compile but panics on error
func (w *World) MustCompile(src string) Expr {
	code, err := w.Compile(src)
	if err != nil {
		panic(err)
	}
	return code
}
3-3.11.1/script/const.go000066400000000000000000000006011503346766200147310ustar00rootroot00000000000000package script

import "reflect"

// Const is an expression with a fixed value and type.
type Const struct {
	value interface{}
	typ   reflect.Type
}

// NewConst evaluates e once and captures its current value and type.
func NewConst(e Expr) *Const {
	return &Const{value: e.Eval(), typ: e.Type()}
}
func (c *Const) Eval() interface{}  { return c.value }
func (c *Const) Type() reflect.Type { return c.typ }
func (c *Const) Child() []Expr      { return nil }
func (c *Const) Fix() Expr          { return c } // already constant
3-3.11.1/script/error.go000066400000000000000000000024611503346766200147420ustar00rootroot00000000000000package script

import (
	"fmt"
	"go/token"
	"reflect"
	"strings"
)

var Debug = false // print debug info?

// compileErr, and only compileErr will be caught by Compile and returned as an error.
type compileErr struct {
	pos token.Pos
	msg string
}

// implements error
func (c *compileErr) Error() string {
	return c.msg
}

// constructs a compileErr
// The message parts are joined with single spaces.
func err(pos token.Pos, msg ...interface{}) *compileErr {
	str := fmt.Sprintln(msg...) // use Sprintln to insert spaces
	str = str[:len(str)-1]      // strip final \n
	return &compileErr{pos, str}
}

// type string for value i
// A leading "*ast." is stripped, so an *ast.Ident is reported as "Ident".
func typ(i interface{}) string {
	typ := reflect.TypeOf(reflect.ValueOf(i).Interface()).String()
	if strings.HasPrefix(typ, "*ast.") {
		typ = typ[len("*ast."):]
	}
	return typ
}

// assert panics with "assertion failed" when test is false.
func assert(test bool) {
	if !test {
		panic("assertion failed")
	}
}

// decodes a token position in source to a line number
// and returns the line number + line code.
// Returns "" for pos 0, and "position<pos>" if pos matches no offset in src.
func pos2line(pos token.Pos, src string) string {
	if pos == 0 {
		return ""
	}
	lines := strings.Split(src, "\n")
	line := 0
	for i, b := range src {
		if token.Pos(i) == pos {
			return fmt.Sprint("line ", line, ": ", strings.Trim(lines[line], " \t")) // func{ prefix makes lines count from 1
		}
		if b == '\n' {
			line++
		}
	}
	return fmt.Sprint("position", pos) // we should not reach this
}
3-3.11.1/script/exec.go000066400000000000000000000013621503346766200145340ustar00rootroot00000000000000package script

// Exec compiles and executes the source statements.
// Only a compile error is returned; the result of evaluation is discarded.
func (w *World) Exec(src string) error {
	code, err := w.Compile(src)
	if err != nil {
		return err
	}
	code.Eval()
	return nil
}

// Exec with panic on error.
func (w *World) MustExec(src string) {
	code := w.MustCompile(src)
	code.Eval()
}

// Eval with panic on error.
func (w *World) MustEval(src string) interface{} {
	Expr := w.MustCompileExpr(src)
	return Expr.Eval()
}

// Eval compiles and evaluates src, which must be an expression, and returns the result(s). E.g.:
//
//	world.Eval("1+1") // returns 2, nil
func (w *World) Eval(src string) (ret interface{}, err error) {
	Expr, err := w.CompileExpr(src)
	if err != nil {
		return nil, err
	}
	return Expr.Eval(), nil
}
3-3.11.1/script/expr.go000066400000000000000000000015231503346766200145650ustar00rootroot00000000000000package script

import (
	"go/ast"
	"reflect"
)

// an expression can be evaluated
type Expr interface {
	Eval() interface{}  // evaluate and return result (nil for void)
	Type() reflect.Type // return type, nil for void
	Child() []Expr
	Fix() Expr // replace all variables by their current value, except for the time "t".
}

// compiles an expression
// Dispatches on the syntax node type; unsupported node types panic with a
// compile error. Parentheses are transparent.
func (w *World) compileExpr(e ast.Expr) Expr {
	switch e := e.(type) {
	default:
		panic(err(e.Pos(), "not allowed:", typ(e)))
	case *ast.Ident:
		return w.resolve(e.Pos(), e.Name)
	case *ast.BasicLit:
		return w.compileBasicLit(e)
	case *ast.BinaryExpr:
		return w.compileBinaryExpr(e)
	case *ast.UnaryExpr:
		return w.compileUnaryExpr(e)
	case *ast.CallExpr:
		return w.compileCallExpr(e)
	case *ast.ParenExpr:
		return w.compileExpr(e.X)
	case *ast.IndexExpr:
		return w.compileIndexExpr(e)
	}
}
3-3.11.1/script/for.go000066400000000000000000000017631503346766200144030ustar00rootroot00000000000000package script

import (
	"go/ast"
)

// for statement
type forStmt struct {
	init, cond, post, body Expr
	void
}

// Eval runs init once, then body and post for as long as cond yields true.
func (b *forStmt) Eval() interface{} {
	for b.init.Eval(); b.cond.Eval().(bool); b.post.Eval() {
		b.body.Eval()
	}
	return nil // void
}

// compiles a for statement inside its own scope. Missing init/post/body
// clauses become no-ops; a missing condition becomes the literal true.
func (w *World) compileForStmt(n *ast.ForStmt) *forStmt {
	w.EnterScope()
	defer w.ExitScope()

	stmt := &forStmt{init: &nop{}, cond: &nop{}, post: &nop{}, body: &nop{}}
	if n.Init != nil {
		stmt.init = w.compileStmt(n.Init)
	}
	if n.Cond != nil {
		stmt.cond = typeConv(n.Cond.Pos(), w.compileExpr(n.Cond), bool_t)
	} else {
		stmt.cond = boolLit(true)
	}
	if n.Post != nil {
		stmt.post = w.compileStmt(n.Post)
	}
	if n.Body != nil {
		stmt.body = w.compileBlockStmt_noScope(n.Body)
	}
	return stmt
}

// nop is a statement that does nothing.
type nop struct{ void }

func (e *nop) Child() []Expr     { return nil }
func (e *nop) Eval() interface{} { return nil }
func (e *nop) Fix() Expr         { return e }

func (e *forStmt) Child() []Expr {
	return []Expr{e.init, e.cond, e.post, e.body}
}
3-3.11.1/script/funcif.go000066400000000000000000000017341503346766200150650ustar00rootroot00000000000000package script

// Here be dragons

import (
	"github.com/mumax/3/data"
	"reflect"
)

// ScalarFunction is an expression that can also be sampled as a float64.
type ScalarFunction interface {
	Expr
	Float() float64
}

// converts float64 to ScalarFunction
// Eval returns the wrapper itself; Float evaluates the wrapped expression.
type scalFn struct{ in Expr }

func (c *scalFn) Eval() interface{}  { return c }
func (c *scalFn) Type() reflect.Type { return ScalarFunction_t }
func (c *scalFn) Float() float64     { return c.in.Eval().(float64) }
func (c *scalFn) Child() []Expr      { return []Expr{c.in} }
func (c *scalFn) Fix() Expr          { return &scalFn{in: c.in.Fix()} }

// VectorFunction is an expression that can also be sampled as a data.Vector.
type VectorFunction interface {
	Expr
	Float3() data.Vector
}

// converts data.Vector to VectorFunction
// Eval returns the wrapper itself; Float3 evaluates the wrapped expression.
type vecFn struct{ in Expr }

func (c *vecFn) Eval() interface{}   { return c }
func (c *vecFn) Type() reflect.Type  { return VectorFunction_t }
func (c *vecFn) Float3() data.Vector { return c.in.Eval().(data.Vector) }
func (c *vecFn) Child() []Expr       { return []Expr{c.in} }
func (c *vecFn) Fix() Expr           { return &vecFn{in: c.in.Fix()} }
3-3.11.1/script/function.go000066400000000000000000000014761503346766200154430ustar00rootroot00000000000000package script

import (
	"fmt"
	"reflect"
)

// function wraps a Go function value as an expression.
type function struct {
	reflect.Value
}

// newFunction wraps fn. Panics if fn is not a function or has more than
// one return value.
func newFunction(fn interface{}) *function {
	val := reflect.ValueOf(fn)
	if val.Type().Kind() != reflect.Func {
		panic(fmt.Errorf("not a function: %v", val.Type()))
	}
	if val.Type().NumOut() > 1 {
		panic(fmt.Errorf("multiple return values not allowed: %v", val.Type()))
	}
	return &function{val}
}

// type of the function itself (when not called)
func (f *function) Type() reflect.Type    { return f.Value.Type() }
func (f *function) NumIn() int            { return f.Type().NumIn() }
func (f *function) In(i int) reflect.Type { return f.Type().In(i) }
func (f *function) Eval() interface{}     { return f.Value.Interface() }
func (f *function) Child() []Expr         { return nil }
func (f *function) Fix() Expr             { return f }
3-3.11.1/script/if.go000066400000000000000000000012741503346766200142100ustar00rootroot00000000000000package script

import (
	"go/ast"
)

// if statement
// else_ is nil when the statement has no else branch.
type ifStmt struct {
	cond, body, else_ Expr
	void
}

func (b *ifStmt) Eval() interface{} {
	if b.cond.Eval().(bool) {
		b.body.Eval()
	} else {
		if b.else_ != nil {
			b.else_.Eval()
		}
	}
	return nil // void
}

// compiles an if statement inside its own scope; the condition is
// converted to bool.
func (w *World) compileIfStmt(n *ast.IfStmt) *ifStmt {
	w.EnterScope()
	defer w.ExitScope()

	stmt := &ifStmt{
		cond: typeConv(n.Cond.Pos(), w.compileExpr(n.Cond), bool_t),
		body: w.compileBlockStmt_noScope(n.Body)}
	if n.Else != nil {
		stmt.else_ = w.compileStmt(n.Else)
	}

	return stmt
}

// Child returns cond and body, plus the else branch when present.
func (e *ifStmt) Child() []Expr {
	child := []Expr{e.cond, e.body, e.else_}
	if e.else_ == nil {
		child = child[:2]
	}
	return child
}
3-3.11.1/script/incdecstmt.go000066400000000000000000000016711503346766200157500ustar00rootroot00000000000000package script

import (
	"go/ast"
	"go/token"
	"reflect"
)

// compiles x++ / x-- into an assignment of x±1 back to x. The operand is
// converted to float64 for the arithmetic and the result converted back to
// the operand's type. Other tokens panic with a compile error.
func (w *World) compileIncDecStmt(n *ast.IncDecStmt) Expr {
	l := w.compileLvalue(n.X)
	switch n.Tok {
	case token.INC:
		rhs_plus1 := &addone{incdec{typeConv(n.Pos(), l, float64_t)}}
		return &assignStmt{lhs: l, rhs: typeConv(n.Pos(), rhs_plus1, l.Type())}
	case token.DEC:
		rhs_minus1 := &subone{incdec{typeConv(n.Pos(), l, float64_t)}}
		return &assignStmt{lhs: l, rhs: typeConv(n.Pos(), rhs_minus1, l.Type())}
	default:
		panic(err(n.Pos(), "not allowed:", n.Tok))
	}
}

// incdec is the shared base of addone and subone: a float64-valued
// expression over one operand.
type incdec struct{ x Expr }

func (e *incdec) Type() reflect.Type { return float64_t }
func (e *incdec) Child() []Expr      { return []Expr{e.x} }
func (e *incdec) Fix() Expr          { panic(invalid_closure) } // cannot be fixed: always panics
type addone struct{ incdec } type subone struct{ incdec } func (s *addone) Eval() interface{} { return s.x.Eval().(float64) + 1 } func (s *subone) Eval() interface{} { return s.x.Eval().(float64) - 1 } 3-3.11.1/script/index.go000066400000000000000000000013351503346766200147170ustar00rootroot00000000000000package script import ( "go/ast" "reflect" ) func (w *World) compileIndexExpr(n *ast.IndexExpr) Expr { x := w.compileExpr(n.X) kind := x.Type().Kind() if !(kind == reflect.Array || kind == reflect.Slice) { panic(err(n.Pos(), "can not index", x.Type())) } i := typeConv(n.Index.Pos(), w.compileExpr(n.Index), int_t) return &index{x, i} } type index struct { x, index Expr } func (e *index) Type() reflect.Type { return e.x.Type().Elem() } func (e *index) Eval() interface{} { x := reflect.ValueOf(e.x.Eval()) i := e.index.Eval().(int) return x.Index(i).Interface() } func (e *index) Child() []Expr { return []Expr{e.x, e.index} } func (e *index) Fix() Expr { return &index{x: e.x.Fix(), index: e.index.Fix()} } 3-3.11.1/script/lit.go000066400000000000000000000033321503346766200143770ustar00rootroot00000000000000package script import ( "go/ast" "go/token" "reflect" "strconv" ) // compiles a basic literal, like numbers and strings func (w *World) compileBasicLit(n *ast.BasicLit) Expr { switch n.Kind { default: panic(err(n.Pos(), "not allowed:", n.Value, "(", typ(n), ")")) case token.FLOAT: return floatLit(parseFloat(n.Value)) case token.INT: return intLit(parseInt(n.Value)) case token.STRING: return stringLit(n.Value[1 : len(n.Value)-1]) // remove quotes } } type floatLit float64 func (l floatLit) Eval() interface{} { return float64(l) } func (l floatLit) Type() reflect.Type { return float64_t } func (l floatLit) Child() []Expr { return nil } func (l floatLit) Fix() Expr { return l } type intLit int func (l intLit) Eval() interface{} { return int(l) } func (l intLit) Type() reflect.Type { return int_t } func (l intLit) Child() []Expr { return nil } func (l intLit) Fix() Expr { 
return l } type stringLit string func (l stringLit) Eval() interface{} { return string(l) } func (l stringLit) Type() reflect.Type { return string_t } func (l stringLit) Child() []Expr { return nil } func (l stringLit) Fix() Expr { return l } type boolLit bool func (l boolLit) Eval() interface{} { return bool(l) } func (l boolLit) Type() reflect.Type { return bool_t } func (l boolLit) Child() []Expr { return nil } func (l boolLit) Fix() Expr { return l } func parseFloat(str string) float64 { v, err := strconv.ParseFloat(str, 64) if err != nil { panic("internal error") // we were sure it was a number... } return v } func parseInt(str string) int { v, err := strconv.Atoi(str) if err != nil { panic("internal error") // we were sure it was a number... } return v } 3-3.11.1/script/lvalue.go000066400000000000000000000025301503346766200150760ustar00rootroot00000000000000package script import ( "go/ast" "reflect" ) // left-hand value in (single) assign statement type LValue interface { Expr SetValue(interface{}) // assigns a new value } func (w *World) compileLvalue(lhs ast.Node) LValue { switch lhs := lhs.(type) { default: panic(err(lhs.Pos(), "cannot assign to", typ(lhs))) case *ast.Ident: if l, ok := w.resolve(lhs.Pos(), lhs.Name).(LValue); ok { return l } else { panic(err(lhs.Pos(), "cannot assign to", lhs.Name)) } } } type reflectLvalue struct { elem reflect.Value } // general lvalue implementation using reflect. // lhs must be settable, e.g. 
// address of something:
//
//	var x float64
//	newReflectLvalue(&x)
//
// The stored reflect.Value is the element addr refers to; if that element
// is the zero Value (Kind() == 0) the constructor panics.
func newReflectLvalue(addr interface{}) LValue {
	elem := reflect.ValueOf(addr).Elem()
	if elem.Kind() == 0 {
		panic("variable/constant needs to be passed as pointer to addressable value")
	}
	return &reflectLvalue{elem}
}

// Eval returns the current value of the referenced element.
func (l *reflectLvalue) Eval() interface{} { return l.elem.Interface() }

// Type returns the type of the referenced element.
func (l *reflectLvalue) Type() reflect.Type { return l.elem.Type() }

// SetValue stores rvalue into the referenced element through reflection.
func (l *reflectLvalue) SetValue(rvalue interface{}) {
	l.elem.Set(reflect.ValueOf(rvalue))
}

// Child returns nil: an lvalue has no sub-expressions.
func (l *reflectLvalue) Child() []Expr { return nil }

// Fix returns NewConst(l).
func (l *reflectLvalue) Fix() Expr { return NewConst(l) }

// TVar wraps an LValue and overrides Fix so that it returns itself.
type TVar struct {
	LValue
}

func (t *TVar) Fix() Expr {
	return t // only variable that's not fixed
}
3-3.11.1/script/ronly.go000066400000000000000000000011671503346766200147560ustar00rootroot00000000000000
package script

import "reflect"

// read-only value (from script, but mutable from outside)
type reflectROnly struct {
	elem reflect.Value
}

// newReflectROnly wraps addr as a read-only expression. It panics when
// reflect.ValueOf(addr) is the zero Value (Kind() == 0).
// NOTE(review): unlike newReflectLvalue, the value is stored exactly as
// passed (no Elem() call), so Eval and Type report addr itself rather than
// what it points to — confirm this is what callers rely on.
func newReflectROnly(addr interface{}) *reflectROnly {
	elem := reflect.ValueOf(addr)
	if elem.Kind() == 0 {
		panic("variable/constant needs to be passed as pointer to addressable value")
	}
	return &reflectROnly{elem}
}

// Eval returns the wrapped value.
func (l *reflectROnly) Eval() interface{} { return l.elem.Interface() }

// Type returns the type of the wrapped value.
func (l *reflectROnly) Type() reflect.Type { return l.elem.Type() }

// Child returns nil: there are no sub-expressions.
func (l *reflectROnly) Child() []Expr { return nil }

// Fix returns NewConst(l).
func (l *reflectROnly) Fix() Expr { return NewConst(l) }
3-3.11.1/script/script_test.go000066400000000000000000000067201503346766200161560ustar00rootroot00000000000000
package script

import (
	"log"
	"math"
	"reflect"
	"testing"
)

// Drop the timestamp prefix from log output.
func init() {
	log.SetFlags(0)
}

// TestEval checks variables bound with Var, short variable declarations,
// arithmetic operators and a stdlib function call.
func TestEval(t *testing.T) {
	w := NewWorld()

	// Test Variables
	x := 1.0
	w.Var("x", &x)
	if w.MustEval("x") != 1.0 {
		t.Fail()
	}
	// a change made from Go must be visible from the script
	x = 2.0
	if w.MustEval("x") != 2.0 {
		t.Fail()
	}
	w.MustExec("x=3")
	if w.MustEval("x") != 3.0 {
		t.Fail()
	}

	w.MustExec("y:=8")
	if w.MustEval("y") != 8 {
		t.Error("got", w.MustEval("y"))
	}

	// Test Ops
	if w.MustEval("1+2*3/4-5-6") !=
1.+2.*3./4.-5.-6 {
		t.Fail()
	}

	// Test func
	if w.MustEval("sqrt(3*3)").(float64) != 3 {
		t.Fail()
	}
}

// TestContains checks that Contains finds the expression bound to x in a
// compiled tree that uses x, and not in one that does not.
func TestContains(t *testing.T) {
	w := NewWorld()

	var x float64
	w.Var("x", &x)

	X := w.Resolve("x")
	if X == nil {
		t.Fail()
	}

	if !Contains(w.MustCompile("x+1"), X) {
		t.Fail()
	}

	if Contains(w.MustCompile("1+1"), X) {
		t.Fail()
	}
}

// TestTypes exercises the implicit conversions: an int literal assigned to
// a float64 variable and an int literal passed to a func(int).
func TestTypes(t *testing.T) {
	w := NewWorld()

	x := 3.14
	w.Var("x", &x)
	w.MustExec("x=7")

	w.Func("printInt", func(x int) { log.Println(x) })
	w.MustExec("printInt(7)")
}

// TestLoop runs an ascending and a descending for loop that accumulate into
// the Go variable sum: 0+…+99 = 4950, then another 0+…+100 = 5050 on top.
func TestLoop(t *testing.T) {
	w := NewWorld()
	sum := 0.0
	w.Var("sum", &sum)
	src := ` for i:=0; i<100; i++{ sum = sum + i } `
	w.MustExec(src)
	if sum != 4950 {
		t.Error("got", sum)
	}
	src = ` for i:=100; i>=0; i--{ sum = sum + i } `
	w.MustExec(src)
	if sum != 10000 {
		t.Error("got", sum)
	}
}

// test is a fixture type whose methods are called from script in TestMethod.
type test struct {
	a, b, c int
}

func (t *test) A() int { return 41 }
func (t *test) B() int { return 42 }
func (t *test) C() int { return 43 }

// TestMethod calls a method on a Go variable from script. s is a nil *test;
// the methods do not touch the receiver.
func TestMethod(t *testing.T) {
	w := NewWorld()
	var s *test
	w.Var("s", &s)
	if w.MustEval("s.B()") != 42 {
		t.Fail()
	}
}

// TestScope checks that a stdlib identifier resolves before, inside and
// after a nested scope.
func TestScope(t *testing.T) {
	w := NewWorld()
	w.MustEval("sin(0)")
	w.EnterScope()
	w.MustEval("sin(0)")
	w.ExitScope()
	w.MustEval("sin(0)")
}

// BenchmarkEval1 times evaluation of a pre-compiled arithmetic expression;
// compilation happens while the timer is stopped.
func BenchmarkEval1(b *testing.B) {
	b.StopTimer()
	w := NewWorld()
	code := w.MustCompileExpr("1+(2-3)*(4+5)/6")
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		code.Eval()
	}
}

// BenchmarkEval1_native is the plain Go counterpart of BenchmarkEval1.
func BenchmarkEval1_native(bench *testing.B) {
	var a, b, c, d, e, f float64
	for i := 0; i < bench.N; i++ {
		a += (b - c) * (d + e) / f
	}
	if a == 1 {
		panic("make sure result is used")
	}
}

// BenchmarkEval2 times evaluation of a pre-compiled chain of function calls.
func BenchmarkEval2(b *testing.B) {
	b.StopTimer()
	w := NewWorld()
	code := w.MustCompileExpr("sin(cos(tan(log(sqrt(exp(1))))))")
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		code.Eval()
	}
}

// BenchmarkEval2_native is the plain Go counterpart of BenchmarkEval2.
func BenchmarkEval2_native(bench *testing.B) {
	var a float64
	b := 1.
for i := 0; i < bench.N; i++ {
		a += math.Sin(math.Cos(math.Tan(math.Log(math.Sqrt(math.Exp(b))))))
	}
	if a == 1.23456 {
		panic("make sure result is used")
	}
}

// T is one table entry for TestMany: script source and expected result.
type T struct {
	in  string
	out interface{}
}

// TestMany evaluates a table of arithmetic and comparison expressions and
// compares each result with reflect.DeepEqual, so the dynamic type
// (float64 / bool) must match as well as the value.
func TestMany(test *testing.T) {
	tests := []T{
		{"1+1", 2.},
		{"7-5", 2.},
		{"2*3", 6.},
		{"10/5", 2.},
		{"1+10/5", 3.},
		{"10/5+1", 3.},
		{"(1+14)/5", 3.},
		{"1<1", false},
		{"1<2", true},
		{"2<1", false},
		{"1>1", false},
		{"2>1", true},
		{"1>2", false},
		{"1<=1", true},
		{"1<=2", true},
		{"2<=1", false},
		{"1>=1", true},
		{"2>=1", true},
		{"1>=2", false}}
	w := NewWorld()
	for _, t := range tests {
		out := w.MustEval(t.in)
		if !reflect.DeepEqual(out, t.out) {
			test.Error(t.in, "returned", out, "expected:", t.out)
		}
	}
}

// Test a few cases that should not compile
func TestFail(test *testing.T) {
	w := NewWorld()
	w.Const("c", 3e8)
	a := 1.
	w.Var("a", &a)
	tests := []string{"c=1", "undefined", "1++", "a=true", "x:=a++"}
	for _, t := range tests {
		_, err := w.Compile(t)
		if err == nil {
			test.Error(t, "should not compile")
		} else {
			log.Println(t, ":", err, ":OK")
		}
	}
}
3-3.11.1/script/selector.go000066400000000000000000000023401503346766200154250ustar00rootroot00000000000000
package script

import (
	"fmt"
	"go/ast"
	"reflect"
	"strings"
	"unicode"
)

// Methods whose name ends in this suffix are skipped by
// compileSelectorStmt, i.e. they cannot be selected from script.
const GoExclusiveMethodSuffix = "Go"

// selector is a compiled method selection x.method.
type selector struct {
	x      Expr
	method string
}

// compiles a selector statement like x.sel
// The method name is matched case-insensitively against the methods of
// x's type. A candidate must start with an upper-case letter and must not
// end in GoExclusiveMethodSuffix; the first match wins. Panics with a
// compile error when x is void or no method matches.
func (w *World) compileSelectorStmt(n *ast.SelectorExpr) Expr {
	x := w.compileExpr(n.X)
	t := x.Type()

	if t == nil {
		panic(err(n.Pos(), "void does not have member", n.Sel.Name))
	}

	sel := strings.ToLower(n.Sel.Name)
	N := ""
	for i := 0; i < t.NumMethod(); i++ {
		name := t.Method(i).Name
		if strings.ToLower(name) == sel && unicode.IsUpper(rune(name[0])) && !strings.HasSuffix(name, GoExclusiveMethodSuffix) {
			N = t.Method(i).Name
			break
		}
	}
	if N == "" {
		panic(err(n.Pos(), t, "has no method", n.Sel.Name))
	}
	return &selector{x, N}
}

// Eval evaluates x and returns the bound method value (it is not called here).
func (e *selector) Eval() interface{} {
	obj := reflect.ValueOf(e.x.Eval())
	meth := obj.MethodByName(e.method)
	if
meth.Kind() == 0 {
		// zero Value: the method was not found on the evaluated object
		panic(fmt.Sprint(e.x, " has no method ", e.method))
	}
	return meth.Interface()
}

// Type returns the type of the bound method, looked up on a freshly
// allocated zero value of x's type.
func (e *selector) Type() reflect.Type {
	return reflect.New(e.x.Type()).Elem().MethodByName(e.method).Type()
}

// Child returns the receiver expression.
func (e *selector) Child() []Expr {
	return []Expr{e.x}
}

// Fix returns a new selector for the same method on the fixed receiver.
func (e *selector) Fix() Expr {
	return &selector{x: e.x.Fix(), method: e.method}
}
3-3.11.1/script/source.go000066400000000000000000000011301503346766200151010ustar00rootroot00000000000000
package script

import (
	"go/ast"
	"go/token"
	"os"
)

// compileSource compiles a source("file") call: the named file is read at
// compile time and its contents are compiled in this world.
// The call must have exactly one argument and it must be a string literal;
// the surrounding quote characters are stripped to obtain the file name.
// Read and compile failures are reported as compile errors at the call position.
func (w *World) compileSource(n *ast.CallExpr) Expr {
	if len(n.Args) != 1 {
		panic(err(n.Pos(), "source() needs 1 string argument, got", len(n.Args)))
	}
	arg := n.Args[0]
	if lit, ok := arg.(*ast.BasicLit); ok && lit.Kind == token.STRING {
		code, err1 := os.ReadFile(lit.Value[1 : len(lit.Value)-1])
		if err1 != nil {
			panic(err(n.Pos(), err1))
		}
		block, err2 := w.Compile(string(code))
		if err2 != nil {
			panic(err(n.Pos(), err2))
		}
		return block
	} else {
		panic(err(n.Pos(), "source() needs literal string argument"))
	}
}
3-3.11.1/script/stdlib.go000066400000000000000000000056641503346766200151020ustar00rootroot00000000000000
package script

import (
	"fmt"
	"math"
	"math/rand"
	"time"
)

// Loads standard functions into the world.
// It declares the literals true and false, the math functions, the random
// number helpers backed by the package-level rng, the constants pi and inf,
// and string and time helpers.
func (w *World) LoadStdlib() {

	// literals
	w.declare("true", boolLit(true))
	w.declare("false", boolLit(false))

	// math
	w.Func("abs", math.Abs)
	w.Func("acos", math.Acos)
	w.Func("acosh", math.Acosh)
	w.Func("asin", math.Asin)
	w.Func("asinh", math.Asinh)
	w.Func("atan", math.Atan)
	w.Func("atanh", math.Atanh)
	w.Func("cbrt", math.Cbrt)
	w.Func("ceil", math.Ceil)
	w.Func("cos", math.Cos)
	w.Func("cosh", math.Cosh)
	w.Func("erf", math.Erf)
	w.Func("erfc", math.Erfc)
	w.Func("exp", math.Exp)
	w.Func("exp2", math.Exp2)
	w.Func("expm1", math.Expm1)
	w.Func("floor", math.Floor)
	w.Func("gamma", math.Gamma)
	w.Func("j0", math.J0)
	w.Func("j1", math.J1)
	w.Func("log", math.Log)
	w.Func("log10", math.Log10)
	w.Func("log1p", math.Log1p)
	w.Func("log2", math.Log2)
	w.Func("logb", math.Logb)
	w.Func("sin", math.Sin)
	w.Func("sinh", math.Sinh)
	w.Func("sqrt", math.Sqrt)
	w.Func("tan", math.Tan)
	w.Func("tanh", math.Tanh)
	w.Func("trunc", math.Trunc)
	w.Func("y0", math.Y0)
	w.Func("y1", math.Y1)
	w.Func("ilogb", math.Ilogb)
	w.Func("pow10", math.Pow10)
	w.Func("atan2", math.Atan2)
	w.Func("hypot", math.Hypot)
	w.Func("remainder", math.Remainder)
	w.Func("max", math.Max)
	w.Func("min", math.Min)
	w.Func("mod", math.Mod)
	w.Func("pow", math.Pow)
	w.Func("yn", math.Yn)
	w.Func("jn", math.Jn)
	w.Func("ldexp", math.Ldexp)
	w.Func("isInf", math.IsInf)
	w.Func("isNaN", math.IsNaN)
	w.Func("norm", norm, "Standard normal distribution")
	w.Func("heaviside", heaviside, "Returns 1 if x>0, 0 if x<0, and 0.5 if x==0")
	w.Func("sinc", sinc, "Sinc returns sin(x)/x. If x=0, then Sinc(x) returns 1.")
	w.Func("randSeed", intseed, "Sets the random number seed for rand(), randExp(), randNorm() and randInt().")
	w.Func("rand", rng.Float64, "Random number between 0 and 1")
	w.Func("randExp", rng.ExpFloat64, "Exponentially distributed random number between 0 and +inf, mean=1")
	w.Func("randNorm", rng.NormFloat64, "Standard normal random number")
	w.Func("randInt", randInt, "Random non-negative integer")

	w.declare("pi", floatLit(math.Pi))
	w.declare("inf", floatLit(math.Inf(1)))

	//string
	w.Func("sprint", fmt.Sprint, "Print all arguments to string with automatic formatting")
	w.Func("sprintf", fmt.Sprintf, "Print to string with C-style formatting.")

	//time
	w.Func("now", time.Now, "Returns the current time")
	w.Func("since", time.Since, "Returns the time elapsed since argument")
}

// rng is the generator behind rand, randExp, randNorm and randInt.
// It starts from the fixed seed 0 until randSeed is called.
var rng = rand.New(rand.NewSource(0))

// script does not know int64
func intseed(seed int) {
	rng.Seed(int64(seed))
}

// randInt returns rng.Int() modulo upper. Because rng.Int() is never
// negative the result is never negative either. upper == 0 panics with an
// integer divide by zero.
func randInt(upper int) int {
	return rng.Int() % upper
}

// heaviside returns 0 for x < 0, 0.5 for x == 0 and 1 otherwise.
// NaN matches neither comparison, so it ends up in the default case (1).
func heaviside(x float64) float64 {
	switch {
	default:
		return 1
	case x == 0:
		return 0.5
	case x < 0:
		return 0
	}
}

// norm is the standard normal probability density exp(-x²/2)/sqrt(2π).
func norm(x float64) float64 {
	return (1 / math.Sqrt(2*math.Pi)) * math.Exp(-0.5*x*x)
}

// sinc returns sin(x)/x, with the limit value 1 at x == 0.
func sinc(x float64) float64 {
	if x == 0 {
		return 1
	} else {
		return math.Sin(x) / x
	}
}
3-3.11.1/script/stmt.go000066400000000000000000000023041503346766200145740ustar00rootroot00000000000000
package script

import (
	"go/ast"
	"reflect"
)

// compiles expression or statement
// Statements are checked first; a node that is neither a statement nor an
// expression panics with a "not allowed" error.
func (w *World) compile(n ast.Node) Expr {
	switch n := n.(type) {
	case ast.Stmt:
		return w.compileStmt(n)
	case ast.Expr:
		return w.compileExpr(n)
	default:
		panic(err(n.Pos(), "not allowed"))
	}
}

// compiles a statement
// Unsupported statement kinds panic with a "not allowed:" error naming the kind.
func (w *World) compileStmt(st ast.Stmt) Expr {
	switch st := st.(type) {
	default:
		panic(err(st.Pos(), "not allowed:", typ(st)))
	case *ast.EmptyStmt:
		return &emptyStmt{}
	case *ast.AssignStmt:
		return w.compileAssignStmt(st)
	case *ast.ExprStmt:
		return w.compileExpr(st.X)
	case *ast.IfStmt:
		return w.compileIfStmt(st)
	case
*ast.ForStmt:
		return w.compileForStmt(st)
	case *ast.IncDecStmt:
		return w.compileIncDecStmt(st)
	case *ast.BlockStmt:
		// a bare block gets its own scope, exited when compilation returns
		w.EnterScope()
		defer w.ExitScope()
		return w.compileBlockStmt_noScope(st)
	}
}

// embed to get Type() that returns nil
// (a nil Type is how void expressions are recognised, see typeConv)
type void struct{}

func (v *void) Type() reflect.Type { return nil }

// Fix always panics with invalid_closure.
func (v *void) Fix() Expr { panic(invalid_closure) }

// emptyStmt is the compiled form of an empty statement; evaluating it does nothing.
type emptyStmt struct{ void }

func (*emptyStmt) Child() []Expr     { return nil }
func (*emptyStmt) Eval() interface{} { return nil }

// Panic message used by Fix implementations of expressions that cannot be fixed.
const invalid_closure = "illegal statement in closure"
3-3.11.1/script/test.txt000066400000000000000000000000051503346766200147720ustar00rootroot00000000000000a:=1
3-3.11.1/script/typeconv.go000066400000000000000000000106771503346766200154700ustar00rootroot00000000000000
package script

import (
	"fmt"
	"github.com/mumax/3/data"
	"go/token"
	"reflect"
)

// converts in to an expression of type OutT.
// also serves as type check (not convertible == type error)
// pos is used for error message on impossible conversion.
//
// The cases are tried in source order and the first match wins: void
// handling, then identical or assignable types, then the ease-of-use
// conversions. When nothing matches, the default case panics with a
// type mismatch error.
func typeConv(pos token.Pos, in Expr, outT reflect.Type) Expr {
	inT := in.Type()
	switch {
	default:
		panic(err(pos, "type mismatch: can not use type", inT, "as", outT))

	// treat 'void' (type nil) separately:
	case inT == nil && outT != nil:
		panic(err(pos, "void used as value"))
	case inT != nil && outT == nil:
		panic("script internal bug: void input type")

	// strict go conversions:
	case inT == outT:
		return in
	case inT.AssignableTo(outT):
		return in

	// extra conversions for ease-of-use:
	// int -> float64
	case outT == float64_t && inT == int_t:
		return &intToFloat64{in}

	// float64 -> int
	case outT == int_t && inT == float64_t:
		return &float64ToInt{in}

	// ScalarIf / VectorIf -> value read through Get().
	// Note that in is evaluated once, here at conversion time.
	case outT == float64_t && inT.AssignableTo(ScalarIf_t):
		return &getScalar{in.Eval().(ScalarIf)}
	case outT == float64_t && inT.AssignableTo(VectorIf_t):
		return &getVector{in.Eval().(VectorIf)}

	// magical expression -> function conversions
	case inT == float64_t && outT.AssignableTo(ScalarFunction_t):
		return &scalFn{in}
	case inT == int_t &&
outT.AssignableTo(ScalarFunction_t):
		// int is first widened to float64, then wrapped like a float64 expression
		return &scalFn{&intToFloat64{in}}
	case inT == vector_t && outT.AssignableTo(VectorFunction_t):
		return &vecFn{in}
	case inT == bool_t && outT == func_bool_t:
		return &boolToFunc{in}
	}
}

// returns input type for expression. Usually this is the same as the return type,
// unless the expression has a method InputType()reflect.Type.
func inputType(e Expr) reflect.Type {
	if in, ok := e.(interface {
		InputType() reflect.Type
	}); ok {
		return in.InputType()
	}
	return e.Type()
}

// common type definitions
// The function literals are never called; they only provide a value for
// reflect.TypeOf. The interface types cannot be obtained from a value
// directly, so they are read off the parameter lists of the dummy_*
// functions.
var (
	float64_t        = reflect.TypeOf(float64(0))
	bool_t           = reflect.TypeOf(false)
	func_float64_t   = reflect.TypeOf(func() float64 { panic(0) })
	func_bool_t      = reflect.TypeOf(func() bool { panic(0) })
	int_t            = reflect.TypeOf(int(0))
	string_t         = reflect.TypeOf("")
	vector_t         = reflect.TypeOf(data.Vector{})
	func_vector_t    = reflect.TypeOf(func() data.Vector { panic(0) })
	ScalarFunction_t = reflect.TypeOf(dummy_f).In(0)
	VectorFunction_t = reflect.TypeOf(dummy_f3).In(0)
	ScalarIf_t       = reflect.TypeOf(dummy_scalarif).In(0)
	VectorIf_t       = reflect.TypeOf(dummy_vectorif).In(0)
)

// maneuvers to get interface type of Func (simpler way?)
func dummy_f(ScalarFunction) {} func dummy_f3(VectorFunction) {} func dummy_scalarif(ScalarIf) {} func dummy_vectorif(VectorIf) {} // converts int to float64 type intToFloat64 struct{ in Expr } func (c *intToFloat64) Eval() interface{} { return float64(c.in.Eval().(int)) } func (c *intToFloat64) Type() reflect.Type { return float64_t } func (c *intToFloat64) Child() []Expr { return []Expr{c.in} } func (c *intToFloat64) Fix() Expr { return &intToFloat64{in: c.in.Fix()} } // converts float64 to int type float64ToInt struct{ in Expr } func (c *float64ToInt) Eval() interface{} { return safe_int(c.in.Eval().(float64)) } func (c *float64ToInt) Type() reflect.Type { return int_t } func (c *float64ToInt) Child() []Expr { return []Expr{c.in} } func (c *float64ToInt) Fix() Expr { return &float64ToInt{in: c.in.Fix()} } type boolToFunc struct{ in Expr } func (c *boolToFunc) Eval() interface{} { return func() bool { return c.in.Eval().(bool) } } func (c *boolToFunc) Type() reflect.Type { return func_bool_t } func (c *boolToFunc) Child() []Expr { return []Expr{c.in} } func (c *boolToFunc) Fix() Expr { return &boolToFunc{in: c.in.Fix()} } type getScalar struct{ in ScalarIf } type getVector struct{ in VectorIf } func (c *getScalar) Eval() interface{} { return c.in.Get() } func (c *getScalar) Type() reflect.Type { return float64_t } func (c *getScalar) Child() []Expr { return nil } func (c *getScalar) Fix() Expr { return NewConst(c) } func (c *getVector) Eval() interface{} { return c.in.Get() } func (c *getVector) Type() reflect.Type { return vector_t } func (c *getVector) Child() []Expr { return nil } func (c *getVector) Fix() Expr { return NewConst(c) } func safe_int(x float64) int { i := int(x) if float64(i) != x { panic(fmt.Errorf("can not use %v as int", x)) } return i } type ScalarIf interface { Get() float64 } // TODO: Scalar type VectorIf interface { Get() data.Vector } // TODO: Vector 
3-3.11.1/script/unaryexpr.go000066400000000000000000000015731503346766200156510ustar00rootroot00000000000000package script import ( "go/ast" "go/token" "reflect" ) func (w *World) compileUnaryExpr(n *ast.UnaryExpr) Expr { x := w.compileExpr(n.X) switch n.Op { default: panic(err(n.Pos(), "not allowed:", n.Op)) case token.SUB: return &minus{typeConv(n.X.Pos(), x, float64_t)} case token.NOT: return ¬{typeConv(n.X.Pos(), x, bool_t)} } } type minus struct{ x Expr } func (m *minus) Type() reflect.Type { return float64_t } func (m *minus) Eval() interface{} { return -m.x.Eval().(float64) } func (m *minus) Child() []Expr { return []Expr{m.x} } func (m *minus) Fix() Expr { return &minus{m.x.Fix()} } type not struct{ x Expr } func (m *not) Type() reflect.Type { return bool_t } func (m *not) Eval() interface{} { return !m.x.Eval().(bool) } func (m *not) Child() []Expr { return []Expr{m.x} } func (m *not) Fix() Expr { return ¬{m.x.Fix()} } 3-3.11.1/script/world.go000066400000000000000000000074301503346766200147410ustar00rootroot00000000000000// package script provides a script interpreter for input files and GUI commands. package script import ( "fmt" "go/token" "strings" ) // World stores an interpreted program's state // like declared variables and functions. type World struct { *scope toplevel *scope } // scope stores identifiers type scope struct { Identifiers map[string]Expr // set of defined identifiers parent *scope // parent scope, if any Doc map[string]string // documentation for identifiers } func NewWorld() *World { w := new(World) w.scope = new(scope) w.toplevel = w.scope w.toplevel.Doc = make(map[string]string) w.LoadStdlib() // loads into toplevel return w } func (w *scope) init() { if w.Identifiers == nil { w.Identifiers = make(map[string]Expr) } } // adds a native variable to the world. 
// E.g.:
//
//	var x = 3.14
//	world.Var("x", &x)
//	world.MustEval("x") // returns 3.14
func (w *scope) Var(name string, addr interface{}, doc ...string) {
	w.declare(name, newReflectLvalue(addr), doc...)
}

// Hack for fixing the closure caveat:
// Declare the time variable, the only variable closures close over.
func (w *scope) TVar(name string, addr interface{}, doc ...string) {
	w.declare(name, &TVar{newReflectLvalue(addr)}, doc...)
}

// adds a native variable to the world. It cannot be changed from script.
//
//	var x = 3.14
//	world.ROnly("x", &x)
//	world.MustEval("x")   // returns 3.14
//	world.MustExec("x=2") // fails: cannot assign to x
//
// NOTE(review): newReflectROnly stores addr without dereferencing it, so
// the value seen by the script may be the pointer rather than 3.14 —
// confirm against newReflectROnly before relying on this example.
func (w *scope) ROnly(name string, addr interface{}, doc ...string) {
	w.declare(name, newReflectROnly(addr), doc...)
}

// adds a constant. Cannot be changed in any way.
// Only float64 and int values are supported; any other type panics.
func (w *scope) Const(name string, val interface{}, doc ...string) {
	switch v := val.(type) {
	default:
		panic(fmt.Errorf("const of type %v not handled", typ(v))) // todo: const using reflection
	case float64:
		w.declare(name, floatLit(v), doc...)
	case int:
		w.declare(name, intLit(v), doc...)
	}
}

// adds a special variable to the world. Upon assignment,
// v's Set() will be called.
func (w *scope) LValue(name string, v LValue, doc ...string) {
	w.declare(name, v, doc...)
}

// adds a native function to the world. E.g.:
//
//	world.Func("sin", math.Sin)
//	world.MustEval("sin(0)") // returns 0
func (w *scope) Func(name string, f interface{}, doc ...string) {
	w.declare(name, newFunction(f), doc...)
}

// add identifier but check that it's not declared yet.
// Panics when the (case-insensitive) name is already taken in this scope;
// otherwise the optional doc string is recorded as well.
func (w *scope) declare(key string, value Expr, doc ...string) {
	if ok := w.safeDeclare(key, value); !ok {
		panic("identifier " + key + " already defined")
	}
	w.document(key, doc...)
}

// safeDeclare adds key to this scope unless it is already present and
// reports whether it was added. Identifiers are stored lower-cased, so
// names are case-insensitive.
func (w *scope) safeDeclare(key string, value Expr) (ok bool) {
	w.init()
	lname := strings.ToLower(key)
	if _, ok := w.Identifiers[lname]; ok {
		return false
	}
	w.Identifiers[lname] = value
	return true
}

// resolve identifier in this scope or its parents
// The lookup is case-insensitive. Panics with an "undefined:" compile
// error when no enclosing scope knows the name.
func (w *scope) resolve(pos token.Pos, name string) Expr {
	w.init()
	lname := strings.ToLower(name)
	if v, ok := w.Identifiers[lname]; ok {
		return v
	} else {
		if w.parent != nil {
			return w.parent.resolve(pos, name)
		}
		panic(err(pos, "undefined:", name))
	}
}

// Resolve looks identifier up in the top-level scope and returns nil,
// instead of panicking, when it is not defined there.
func (w *World) Resolve(identifier string) (e Expr) {
	defer func() {
		err := recover()
		if err != nil {
			e = nil // not found
		}
	}()
	e = w.toplevel.resolve(0, identifier)
	return
}

// add documentation for identifier
// At most one doc string is accepted; more than one panics. Scopes whose
// Doc map is nil record nothing.
func (w *scope) document(ident string, doc ...string) {
	if w.Doc != nil { // means we want doc for this scope (toplevel only)
		switch len(doc) {
		default:
			panic("too many doc strings for " + ident)
		case 0:
			w.Doc[ident] = ""
		case 1:
			w.Doc[ident] = doc[0]
		}
	}
}

// EnterScope makes a new, empty scope current, with the previous one as its parent.
func (w *World) EnterScope() {
	par := w.scope
	w.scope = new(scope)
	w.scope.parent = par
}

// ExitScope returns to the parent of the current scope.
// Leaving the top-level scope is a programming error and panics.
func (w *World) ExitScope() {
	w.scope = w.scope.parent
	if w.scope == nil { // went above toplevel
		panic("bug")
	}
}
3-3.11.1/svgo/000077500000000000000000000000001503346766200127315ustar00rootroot00000000000000
3-3.11.1/svgo/LICENSE000066400000000000000000000002401503346766200137320ustar00rootroot00000000000000
The contents of this repository are Licensed under the Creative Commons Attribution 3.0 license as described in http://creativecommons.org/licenses/by/3.0/us/
3-3.11.1/svgo/Makefile000066400000000000000000000000211503346766200143620ustar00rootroot00000000000000
all: go install
3-3.11.1/svgo/doc.go000066400000000000000000000054341503346766200140330ustar00rootroot00000000000000
/* Package svg generates SVG as defined by the Scalable Vector Graphics 1.1 Specification (). Output goes to the specified io.Writer.
# Supported SVG elements and functions Shapes, lines, text circle, ellipse, polygon, polyline, rect (including roundrects), line, text Paths general, arc, cubic and quadratic bezier paths, Image and Gradients image, linearGradient, radialGradient, Transforms translate, rotate, scale, skewX, skewY Filter Effects filter, feBlend, feColorMatrix, feColorMatrix, feComponentTransfer, feComposite, feConvolveMatrix, feDiffuseLighting, feDisplacementMap, feDistantLight, feFlood, feGaussianBlur, feImage, feMerge, feMorphology, feOffset, fePointLight, feSpecularLighting, feSpotLight,feTile, feTurbulence Metadata elements desc, defs, g (style, transform, id), mask, marker, pattern, title, (a)ddress, link, script, use Usage: (assuming GOPATH is set) go get github.com/ajstarks/svgo go install github.com/ajstarks/svgo/... You can use godoc to browse the documentation from the command line: $ godoc github.com/ajstarks/svgo a minimal program, to generate SVG to standard output. package main import ( "github.com/ajstarks/svgo" "os" ) func main() { width := 500 height := 500 canvas := svg.New(os.Stdout) canvas.Start(width, height) canvas.Circle(width/2, height/2, 100) canvas.Text(width/2, height/2, "Hello, SVG", "text-anchor:middle;font-size:30px;fill:white") canvas.End() } Drawing in a web server: (http://localhost:2003/circle) package main import ( "log" "github.com/ajstarks/svgo" "net/http" ) func main() { http.Handle("/circle", http.HandlerFunc(circle)) err := http.ListenAndServe(":2003", nil) if err != nil { log.Fatal("ListenAndServe:", err) } } func circle(w http.ResponseWriter, req *http.Request) { w.Header().Set("Content-Type", "image/svg+xml") s := svg.New(w) s.Start(500, 500) s.Circle(250, 250, 125, "fill:none;stroke:black") s.End() } # Functions and types Many functions use x, y to specify an object's location, and w, h to specify the object's width and height. Where applicable, a final optional argument specifies the style to be applied to the object. 
The style strings follow the SVG standard; name:value pairs delimited by semicolons, or a series of name="value" pairs. For example: `"fill:none; opacity:0.3"` or `fill="none" opacity="0.3"` (see: ) The Offcolor type: type Offcolor struct { Offset uint8 Color string Opacity float } is used to specify the offset, color, and opacity of stop colors in linear and radial gradients The Filterspec type: type Filterspec struct { In string In2 string Result string } is used to specify inputs and results for filter effects */ package svg 3-3.11.1/svgo/svg.go000066400000000000000000001037371503346766200140720ustar00rootroot00000000000000// Package svg provides an API for generating Scalable Vector Graphics (SVG) // Edited by Arne Vansteenkiste, 2014: // allow non-integer coordinates package svg // package main // // import ( // "github.com/ajstarks/svgo" // "os" // ) // // var ( // width = 500 // height = 500 // canvas = svg.New(os.Stdout) // ) // // func main() { // canvas.Start(width, height) // canvas.Circle(width/2, height/2, 100) // canvas.Text(width/2, height/2, "Hello, SVG", // "text-anchor:middle;font-size:30px;fill:white") // canvas.End() // } // import ( "fmt" "io" "encoding/xml" "strings" ) // SVG defines the location of the generated SVG type SVG struct { Writer io.Writer } // Offcolor defines the offset and color for gradients type Offcolor struct { Offset uint8 Color string Opacity float64 } // Filterspec defines the specification of SVG filters type Filterspec struct { In, In2, Result string } const ( svginit = ` ` vbfmt = `viewBox="%d %d %d %d"` emptyclose = "/>\n" ) // New is the SVG constructor, specifying the io.Writer where the generated SVG is written. func New(w io.Writer) *SVG { return &SVG{w} } func (svg *SVG) print(a ...interface{}) (n int, errno error) { return fmt.Fprint(svg.Writer, a...) } func (svg *SVG) println(a ...interface{}) (n int, error error) { return fmt.Fprintln(svg.Writer, a...) 
} func (svg *SVG) printf(format string, a ...interface{}) (n int, errno error) { return fmt.Fprintf(svg.Writer, format, a...) } // Structure, Metadata, Scripting, Transformation, and Links // Start begins the SVG document with the width w and height h. // Other attributes may be optionally added, for example viewbox or additional namespaces // Standard Reference: http://www.w3.org/TR/SVG11/struct.html#SVGElement func (svg *SVG) Start(w int, h int, ns ...string) { svg.printf(svginit, w, h) for _, v := range ns { svg.printf("\n %s", v) } svg.println(svgns) } // Startview begins the SVG document, with the specified width, height, and viewbox func (svg *SVG) Startview(w, h, minx, miny, vw, vh int) { svg.Start(w, h, fmt.Sprintf(vbfmt, minx, miny, vw, vh)) } // End the SVG document func (svg *SVG) End() { svg.println("") } // Script defines a script with a specified type, (for example "application/javascript"). // if the first variadic argument is a link, use only the link reference. // Otherwise, treat those arguments as the text of the script (marked up as CDATA). // if no data is specified, just close the script element func (svg *SVG) Script(scriptype string, data ...string) { svg.printf(`\n") default: svg.println(`/>`) } } // Gstyle begins a group, with the specified style. 
// Standard Reference: http://www.w3.org/TR/SVG11/struct.html#GElement func (svg *SVG) Gstyle(s string) { svg.println(group("style", s)) } // Gtransform begins a group, with the specified transform // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) Gtransform(s string) { svg.println(group("transform", s)) } // Translate begins coordinate translation, end with Gend() // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) Translate(x, y int) { svg.Gtransform(translate(x, y)) } // Scale scales the coordinate system by n, end with Gend() // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) Scale(n float64) { svg.Gtransform(scale(n)) } // ScaleXY scales the coordinate system by dx and dy, end with Gend() // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) ScaleXY(dx, dy float64) { svg.Gtransform(scaleXY(dx, dy)) } // SkewX skews the x coordinate system by angle a, end with Gend() // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) SkewX(a float64) { svg.Gtransform(skewX(a)) } // SkewY skews the y coordinate system by angle a, end with Gend() // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) SkewY(a float64) { svg.Gtransform(skewY(a)) } // SkewXY skews x and y coordinates by ax, ay respectively, end with Gend() // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) SkewXY(ax, ay float64) { svg.Gtransform(skewX(ax) + " " + skewY(ay)) } // Rotate rotates the coordinate system by r degrees, end with Gend() // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) Rotate(r float64) { svg.Gtransform(rotate(r)) } // TranslateRotate translates the coordinate system to (x,y), then rotates to r degrees, end with Gend() func (svg *SVG) 
TranslateRotate(x, y int, r float64) { svg.Gtransform(translate(x, y) + " " + rotate(r)) } // RotateTranslate rotates the coordinate system r degrees, then translates to (x,y), end with Gend() func (svg *SVG) RotateTranslate(x, y int, r float64) { svg.Gtransform(rotate(r) + " " + translate(x, y)) } // Group begins a group with arbitrary attributes func (svg *SVG) Group(s ...string) { svg.printf("`)) } // Gid begins a group, with the specified id func (svg *SVG) Gid(s string) { svg.print(``) } // Gend ends a group (must be paired with Gsttyle, Gtransform, Gid). func (svg *SVG) Gend() { svg.println(``) } // ClipPath defines a clip path func (svg *SVG) ClipPath(s ...string) { svg.printf(``)) } // ClipEnd ends a ClipPath func (svg *SVG) ClipEnd() { svg.println(``) } // Def begins a defintion block. // Standard Reference: http://www.w3.org/TR/SVG11/struct.html#DefsElement func (svg *SVG) Def() { svg.println(``) } // DefEnd ends a defintion block. func (svg *SVG) DefEnd() { svg.println(``) } // Marker defines a marker // Standard reference: http://www.w3.org/TR/SVG11/painting.html#MarkerElement func (svg *SVG) Marker(id string, x, y, width, height int, s ...string) { svg.printf(`\n")) } // MarkEnd ends a marker func (svg *SVG) MarkerEnd() { svg.println(``) } // Pattern defines a pattern with the specified dimensions. // The putype can be either "user" or "obj", which sets the patternUnits // attribute to be either userSpaceOnUse or objectBoundingBox // Standard reference: http://www.w3.org/TR/SVG11/pservers.html#Patterns func (svg *SVG) Pattern(id string, x, y, width, height int, putype string, s ...string) { puattr := "userSpaceOnUse" if putype != "user" { puattr = "objectBoundingBox" } svg.printf(`\n")) } // PatternEnd ends a marker func (svg *SVG) PatternEnd() { svg.println(``) } // Desc specified the text of the description tag. 
// Standard Reference: http://www.w3.org/TR/SVG11/struct.html#DescElement func (svg *SVG) Desc(s string) { svg.tt("desc", s) } // Title specified the text of the title tag. // Standard Reference: http://www.w3.org/TR/SVG11/struct.html#TitleElement func (svg *SVG) Title(s string) { svg.tt("title", s) } // Link begins a link named "name", with the specified title. // Standard Reference: http://www.w3.org/TR/SVG11/linking.html#Links func (svg *SVG) Link(href string, title string) { svg.printf("") } // LinkEnd ends a link. func (svg *SVG) LinkEnd() { svg.println(``) } // Use places the object referenced at link at the location x, y, with optional style. // Standard Reference: http://www.w3.org/TR/SVG11/struct.html#UseElement func (svg *SVG) Use(x int, y int, link string, s ...string) { svg.printf(``)) } // MaskEnd ends a Mask. func (svg *SVG) MaskEnd() { svg.println(``) } // Shapes // Circle centered at x,y, with radius r, with optional style. // Standard Reference: http://www.w3.org/TR/SVG11/shapes.html#CircleElement func (svg *SVG) Circle(x int, y int, r int, s ...string) { svg.printf(`")) xml.Escape(svg.Writer, []byte(t)) svg.println(``) } // Textpath places text optionally styled text along a previously defined path // Standard Reference: http://www.w3.org/TR/SVG11/text.html#TextPathElement func (svg *SVG) Textpath(t string, pathid string, s ...string) { svg.printf("", endstyle(s, ">"), pathid) xml.Escape(svg.Writer, []byte(t)) svg.println(``) } // Textlines places a series of lines of text starting at x,y, at the specified size, fill, and alignment. // Each line is spaced according to the spacing argument func (svg *SVG) Textlines(x, y int, s []string, size, spacing int, fill, align string) { svg.Gstyle(fmt.Sprintf("font-size:%dpx;fill:%s;text-anchor:%s", size, fill, align)) for _, t := range s { svg.Text(x, y, t) y += spacing } svg.Gend() } // Colors // RGB specifies a fill color in terms of a (r)ed, (g)reen, (b)lue triple. 
// Standard reference: http://www.w3.org/TR/css3-color/ func (svg *SVG) RGB(r int, g int, b int) string { return fmt.Sprintf(`fill:rgb(%d,%d,%d)`, r, g, b) } // RGBA specifies a fill color in terms of a (r)ed, (g)reen, (b)lue triple and opacity. func (svg *SVG) RGBA(r int, g int, b int, a float64) string { return fmt.Sprintf(`fill-opacity:%.2f; %s`, a, svg.RGB(r, g, b)) } // Gradients // LinearGradient constructs a linear color gradient identified by id, // along the vector defined by (x1,y1), and (x2,y2). // The stop color sequence defined in sc. Coordinates are expressed as percentages. func (svg *SVG) LinearGradient(id string, x1, y1, x2, y2 uint8, sc []Offcolor) { svg.printf("\n", id, pct(x1), pct(y1), pct(x2), pct(y2)) svg.stopcolor(sc) svg.println("") } // RadialGradient constructs a radial color gradient identified by id, // centered at (cx,cy), with a radius of r. // (fx, fy) define the location of the focal point of the light source. // The stop color sequence defined in sc. // Coordinates are expressed as percentages. func (svg *SVG) RadialGradient(id string, cx, cy, r, fx, fy uint8, sc []Offcolor) { svg.printf("\n", id, pct(cx), pct(cy), pct(r), pct(fx), pct(fy)) svg.stopcolor(sc) svg.println("") } // stopcolor is a utility function used by the gradient functions // to define a sequence of offsets (expressed as percentages) and colors func (svg *SVG) stopcolor(oc []Offcolor) { for _, v := range oc { svg.printf("\n", pct(v.Offset), v.Color, v.Opacity) } } // Filter Effects: // Most functions have common attributes (in, in2, result) defined in type Filterspec // used as a common first argument. 
// Filter begins a filter set // Standard reference: http://www.w3.org/TR/SVG11/filters.html#FilterElement func (svg *SVG) Filter(id string, s ...string) { svg.printf(`\n")) } // Fend ends a filter set // Standard reference: http://www.w3.org/TR/SVG11/filters.html#FilterElement func (svg *SVG) Fend() { svg.println(``) } // FeBlend specifies a Blend filter primitive // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feBlendElement func (svg *SVG) FeBlend(fs Filterspec, mode string, s ...string) { switch mode { case "normal", "multiply", "screen", "darken", "lighten": break default: mode = "normal" } svg.printf(` 360 { value = 0 } svg.printf(` 1 { value = 1 } svg.printf(``) } // FeCompEnd ends a feComponent filter element // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feComponentTransferElement func (svg *SVG) FeCompEnd() { svg.println(``) } // FeComposite specifies a feComposite filter primitive // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feCompositeElement func (svg *SVG) FeComposite(fs Filterspec, operator string, k1, k2, k3, k4 int, s ...string) { switch operator { case "over", "in", "out", "atop", "xor", "arithmetic": break default: operator = "over" } svg.printf(``)) } // FeDiffEnd ends a diffuse lighting filter primitive container // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feDiffuseLightingElement func (svg *SVG) FeDiffEnd() { svg.println(``) } // FeDisplacementMap specifies a feDisplacementMap filter primitive // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feDisplacementMapElement func (svg *SVG) FeDisplacementMap(fs Filterspec, scale float64, xchannel, ychannel string, s ...string) { svg.printf(``) for _, n := range nodes { svg.printf("\n", n) } svg.println(``) } // FeMorphology specifies a feMorphologyLight filter primitive // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feMorphologyElement func (svg *SVG) FeMorphology(fs Filterspec, operator string, xradius, 
yradius float64, s ...string) { switch operator { case "erode", "dilate": break default: operator = "erode" } svg.printf(`\n")) } // FeSpecEnd ends a specular lighting filter primitive container // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feSpecularLightingElement func (svg *SVG) FeSpecEnd() { svg.println(``) } // FeSpotLight specifies a feSpotLight filter primitive // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feSpotLightElement func (svg *SVG) FeSpotLight(fs Filterspec, x, y, z, px, py, pz float64, s ...string) { svg.printf(` 1 { bfx = 0 } if bfy < 0 || bfy > 1 { bfy = 0 } switch ftype[0:1] { case "f", "F": ftype = "fractalNoise" case "t", "T": ftype = "turbulence" default: ftype = "turbulence" } var ss string if stitch { ss = "stitch" } else { ss = "noStitch" } svg.printf(` 0 { svg.Gstyle(s[0]) } for ix := x; ix <= x+w; ix += n { svg.Line(ix, y, ix, y+h) } for iy := y; iy <= y+h; iy += n { svg.Line(x, iy, x+w, iy) } if len(s) > 0 { svg.Gend() } } // Support functions // style returns a style name,attribute string func style(s string) string { if len(s) > 0 { return fmt.Sprintf(`style="%s"`, s) } return s } // pp returns a series of polygon points func (svg *SVG) pp(x []float64, y []float64, tag string) { svg.print(tag) if len(x) != len(y) { svg.print(" ") return } lx := len(x) - 1 for i := 0; i < lx; i++ { svg.print(coord(x[i], y[i]) + " ") } svg.print(coord(x[lx], y[lx])) } // endstyle modifies an SVG object, with either a series of name="value" pairs, // or a single string containing a style func endstyle(s []string, endtag string) string { if len(s) > 0 { nv := "" for i := 0; i < len(s); i++ { if strings.Index(s[i], "=") > 0 { nv += (s[i]) + " " } else { nv += style(s[i]) } } return nv + endtag } return endtag } // tt creates a xml element, tag containing s func (svg *SVG) tt(tag string, s string) { svg.print("<" + tag + ">") xml.Escape(svg.Writer, []byte(s)) svg.println("") } // poly compiles the polygon element func (svg 
*SVG) poly(x []float64, y []float64, tag string, s ...string) { svg.pp(x, y, "<"+tag+" points=\"") svg.print(`" ` + endstyle(s, "/>\n")) } // onezero returns "0" or "1" func onezero(flag bool) string { if flag { return "1" } return "0" } // pct returns a percetage, capped at 100 func pct(n uint8) uint8 { if n > 100 { return 100 } return n } // islink determines if a string is a script reference func islink(link string) bool { return strings.HasPrefix(link, "http://") || strings.HasPrefix(link, "#") || strings.HasPrefix(link, "../") || strings.HasPrefix(link, "./") } // group returns a group element func group(tag string, value string) string { return fmt.Sprintf(``, tag, value) } // scale return the scale string for the transform func scale(n float64) string { return fmt.Sprintf(`scale(%g)`, n) } // scaleXY return the scale string for the transform func scaleXY(dx, dy float64) string { return fmt.Sprintf(`scale(%g,%g)`, dx, dy) } // skewx returns the skewX string for the transform func skewX(angle float64) string { return fmt.Sprintf(`skewX(%g)`, angle) } // skewx returns the skewX string for the transform func skewY(angle float64) string { return fmt.Sprintf(`skewY(%g)`, angle) } // rotate returns the rotate string for the transform func rotate(r float64) string { return fmt.Sprintf(`rotate(%g)`, r) } // translate returns the translate string for the transform func translate(x, y int) string { return fmt.Sprintf(`translate(%d,%d)`, x, y) } // coord returns a coordinate string func coord(x interface{}, y interface{}) string { return fmt.Sprintf(`%v,%v`, x, y) } // ptag returns the beginning of the path element func ptag(x int, y int) string { return fmt.Sprintf(` 0 { attrs += fmt.Sprintf(`in="%s" `, s.In) } if len(s.In2) > 0 { attrs += fmt.Sprintf(`in2="%s" `, s.In2) } if len(s.Result) > 0 { attrs += fmt.Sprintf(`result="%s" `, s.Result) } return attrs } // tablevalues outputs a series of values as a XML attribute func (svg *SVG) tablevalues(s string, t []float64) 
{ svg.printf(` %s="`, s) for i := 0; i < len(t)-1; i++ { svg.printf("%g ", t[i]) } svg.printf(`%g"%s`, t[len(t)-1], emptyclose) } // imgchannel validates the image channel indicator func imgchannel(c string) string { switch c { case "R", "G", "B", "A": return c case "r", "g", "b", "a": return strings.ToUpper(c) case "red", "green", "blue", "alpha": return strings.ToUpper(c[0:1]) case "Red", "Green", "Blue", "Alpha": return c[0:1] } return "R" } 3-3.11.1/test/000077500000000000000000000000001503346766200127325ustar00rootroot000000000000003-3.11.1/test/.gitignore000066400000000000000000000000231503346766200147150ustar00rootroot00000000000000*.out *.ovf *.todo 3-3.11.1/test/anisenergy.mx3000066400000000000000000000012721503346766200155310ustar00rootroot00000000000000/* Test conservation of energy with anisotropy. */ SetGridSize(32, 10, 2) c := 1e-9 SetCellSize(c, 2*c, 3*c) EnableDemag = false Aex = 10e-12 Msat = 1000e3 AnisU = vector(0, 0, 1) Ku1 = 1e6 m = uniform(1, 0, 0.1) tableadd(E_total) tableautosave(1e-12) // Get idea of energy scale E0 := E_total.get() alpha = 1 run(1e-9) E1 := E_total.get() Delta1 := E1-E0 print("DeltaE, damped:", Delta1) m = uniform(1, 0, 0.1) E0 = E_total.get() alpha = 0 run(1e-9) E1 = E_total.get() Delta2 := E1-E0 print("DeltaE, undamped:", Delta2) ratio := abs(Delta2/Delta1) print("ratio:", ratio) // test relative energy non-conservation up to 1ppm. expect("Relative energy non-conservation:", ratio, 0, 1e-6) 3-3.11.1/test/anisenergy2.mx3000066400000000000000000000012721503346766200156130ustar00rootroot00000000000000/* Test conservation of energy with anisotropy. 
*/ SetGridSize(32, 10, 2) c := 1e-9 SetCellSize(c, 2*c, 3*c) EnableDemag = false Aex = 10e-12 AnisU = vector(0, 0, 1) Ku2 = 1e6 Msat = 1000e3 m = uniform(1, 0, 0.1) tableadd(E_total) tableautosave(1e-12) // Get idea of energy scale E0 := E_total.get() alpha = 1 run(1e-9) E1 := E_total.get() Delta1 := E1-E0 print("DeltaE, damped:", Delta1) m = uniform(1, 0, 0.1) E0 = E_total.get() alpha = 0 run(1e-9) E1 = E_total.get() Delta2 := E1-E0 print("DeltaE, undamped:", Delta2) ratio := abs(Delta2/Delta1) print("ratio:", ratio) // test relative energy non-conservation up to 1ppm. expect("Relative energy non-conservation:", ratio, 0, 1e-6) 3-3.11.1/test/anisenergyconservation.mx3000066400000000000000000000010321503346766200201560ustar00rootroot00000000000000/* Test anisotropy energy conservation. Add cubic and uniaxial anisotropy, so that an off-by-a-factor error in one of them would give an total energy oscillation. */ setgridsize(1, 1, 1) setcellsize(1e-9, 1e-9, 1e-9) Msat = 1000e3 alpha = 1e-6 Kc1 = 1e3 maxDt = 1e-13 Ku1 = 1e5 enabledemag = false AnisC1 = vector(1, 0, 0) AnisC2 = vector(0, 1, 0) AnisU = vector(1, 1, 0) M = uniform(0.3, 0.7, 0.1) E0 := E_total.Get() TOL := 1e-5 for i:=0; i<10; i++{ run(10e-12) E := E_total.Get() expect("deltaE", (E0-E)/E0, 0, TOL) } 3-3.11.1/test/anisenergyconservation2.mx3000066400000000000000000000010611503346766200202420ustar00rootroot00000000000000/* Test higher-order anisotropy energy conservation. Add cubic and uniaxial anisotropy, so that an off-by-a-factor error in one of them would give an total energy oscillation. 
*/ setgridsize(1, 1, 1) setcellsize(1e-9, 1e-9, 1e-9) Msat = 1000e3 alpha = 1e-6 Kc1 = 1e3 maxDt = 1e-13 Ku1 = 1e5 Ku2 = 2e5 enabledemag = false AnisC1 = vector(1, 0, 0) AnisC2 = vector(0, 1, 0) AnisU = vector(1, 1, 0) M = uniform(0.3, 0.7, 0.1) E0 := E_total.Get() TOL := 1e-5 for i:=0; i<10; i++{ run(10e-12) E := E_total.Get() expect("deltaE", (E0-E)/E0, 0, TOL) } 3-3.11.1/test/anisenergyconservation3.mx3000066400000000000000000000010611503346766200202430ustar00rootroot00000000000000/* Test higher-order anisotropy energy conservation. Add cubic and uniaxial anisotropy, so that an off-by-a-factor error in one of them would give an total energy oscillation. */ setgridsize(1, 1, 1) setcellsize(1e-9, 1e-9, 1e-9) Msat = 1000e3 alpha = 1e-6 Kc1 = 1e3 Kc2 = 1e3 maxDt = 1e-13 Ku1 = 1e5 enabledemag = false AnisC1 = vector(1, 0, 0) AnisC2 = vector(0, 1, 0) AnisU = vector(1, 1, 0) M = uniform(0.3, 0.7, 0.1) E0 := E_total.Get() TOL := 1e-5 for i:=0; i<10; i++{ run(10e-12) E := E_total.Get() expect("deltaE", (E0-E)/E0, 0, TOL) } 3-3.11.1/test/anisenergyconservation4.mx3000066400000000000000000000010611503346766200202440ustar00rootroot00000000000000/* Test higher-order anisotropy energy conservation. Add cubic and uniaxial anisotropy, so that an off-by-a-factor error in one of them would give an total energy oscillation. */ setgridsize(1, 1, 1) setcellsize(1e-9, 1e-9, 1e-9) Msat = 1000e3 alpha = 1e-6 Kc1 = 1e3 Kc3 = 1e3 maxDt = 1e-13 Ku1 = 1e5 enabledemag = false AnisC1 = vector(1, 0, 0) AnisC2 = vector(0, 1, 0) AnisU = vector(1, 1, 0) M = uniform(0.3, 0.7, 0.1) E0 := E_total.Get() TOL := 1e-5 for i:=0; i<10; i++{ run(10e-12) E := E_total.Get() expect("deltaE", (E0-E)/E0, 0, TOL) } 3-3.11.1/test/antenna.go000066400000000000000000000020701503346766200147040ustar00rootroot00000000000000//go:build ignore // +build ignore package main import ( "github.com/mumax/3/data" . 
"github.com/mumax/3/engine" "github.com/mumax/3/oommf" "math" "os" ) const Mu0 = 4 * math.Pi * 1e-7 func main() { defer InitAndClose()() Nx := 512 Ny := 128 Nz := 1 cellsize := 5.0e-9 SetGridSize(Nx, Ny, Nz) thickness := 40e-9 length := float64(Nx) * cellsize SetCellSize(cellsize, cellsize, thickness/float64(Nz)) mask := data.NewSlice(3, Mesh().Size()) wireX := -length * 0.45 wireZ := thickness * 5.0 for h := 0; h < 10; h++ { for i := 0; i < Nx; i++ { for j := 0; j < Ny; j++ { r := Index2Coord(i, j, 0) r = r.Sub(Vector(wireX+float64(h)*cellsize, r.Y(), wireZ)) B := Vector(0, 0, 0) current := Vector(0, 1, 0) B = r.Cross(current).Mul(Mu0 / (2 * math.Pi * math.Pow(r.Len(), 2))) mask.Set(0, i, j, 0, B.X()) mask.Set(1, i, j, 0, B.Y()) mask.Set(2, i, j, 0, B.Z()) } } } f, _ := os.OpenFile("antenna.ovf", os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666) defer f.Close() oommf.WriteOVF2(f, mask, data.Meta{}, "binary 4") } 3-3.11.1/test/average.mx3000066400000000000000000000044541503346766200150040ustar00rootroot00000000000000/* Test for averages with a non-universe geometry. Magnetization should be averaged over the geometry, while others should average over the box (parameters, external excitations). Finally, test averages over a region (ignores geometry). 
*/ N := 512 c := 1e-9 setgridsize(N, N, 1) setcellsize(c, c, c) setgeom(circle(N*c)) tol := 1e-4 // tolerance limited by FD circle approximation defregion(1, xrange(-inf, 0)) defregion(2, xrange( 0, inf)) m = uniform(0, 1, 0) // Magnetization should average over the geometry expectv("m", m.average(), vector(0, 1, 0), tol) expect("mx", m.comp(0).average(), 0, tol) expect("my", m.comp(1).average(), 1, tol) expect("mz", m.comp(2).average(), 0, tol) // Average over region ignores geometry, // so here average m feels the surface of the disk expectv("m1", m.region(1).average(), vector(0, pi/4, 0), tol) expect("m1x", m.region(1).average()[0], 0, tol) expect("m1y", m.region(1).average()[1], pi/4, tol) expect("m1z", m.region(1).average()[2], 0, tol) expect("m1x", m.comp(0).region(1).average(), 0, tol) expect("m1y", m.comp(1).region(1).average(), pi/4, tol) expect("m1z", m.comp(2).region(1).average(), 0, tol) // Material parameter is set everywhere and averaged over the box alpha = 2 expect("alpha", alpha.average(), 2, tol) expect("alpha", alpha.region(1).average(), 2, tol) expect("alpha", alpha.region(2).average(), 2, tol) alpha.setRegion(1, 3) expect("alpha", alpha.average(), (2.+3.)/2., tol) // average of 2 and 3 expect("alpha", alpha.region(1).average(), 3, tol) expect("alpha", alpha.region(2).average(), 2, tol) // Excitation is set everywhere and averaged everywhere B_ext = vector(1, 2, 3) expectv("B_ext", B_ext.average(), vector(1, 2, 3), tol) expect("B_ext_x", B_ext.comp(0).average(), 1, tol) expect("B_ext_y", B_ext.comp(1).average(), 2, tol) expect("B_ext_z", B_ext.comp(2).average(), 3, tol) expectv("B_ext_1", B_ext.region(1).average(), vector(1, 2, 3), tol) expect("B_ext_1x", B_ext.region(1).average()[0], 1, tol) expect("B_ext_1y", B_ext.region(1).average()[1], 2, tol) expect("B_ext_1z", B_ext.region(1).average()[2], 3, tol) expect("B_ext_1x", B_ext.comp(0).region(1).average(), 1, tol) expect("B_ext_1y", B_ext.comp(1).region(1).average(), 2, tol) expect("B_ext_1z", 
B_ext.comp(2).region(1).average(), 3, tol) 3-3.11.1/test/axes.mx3000066400000000000000000000014021503346766200143200ustar00rootroot00000000000000/* Not really a test. Makes a snapshot showing the orientation of our axes. */ setgridsize(400, 300, 1) c := 1e-9 setcellsize(c, c, c) S := 8 * c I := rect(S, 5*S) X := I.rotz(35*pi/180).add(I.rotz(-35*pi/180)) X = X.transl(12*S, -2*S, 0) I = rect(S, 2.5*S) Y := I.transl(0, 1.2*S, 0).rotz(35*pi/180) Y = Y.add(I.transl(0, 1.2*S, 0).rotz(-35*pi/180)) Y = Y.add(I.transl(0, -1.2*S, 0)) Y = Y.transl(-2*S, 13*S, 0) head := rect(2*S, 2*S).rotz(pi/4).intersect(yrange(0, inf)) I = rect(S, 8*S).transl(0, 4*S, 0).add(head.transl(0, 8*S, 0)) O := circle(2*S) axes := X.add(Y).add(I).add(I.rotz(-pi/2)).add(O) disk := circle(12*S).transl(10*S, 10*S, 0) m = uniform(0, 0, 1) m.SetInShape(disk,vortex(1, 1).transl(10*S, 10*S, 0)) setgeom(axes.add(disk)) snapshot(m) 3-3.11.1/test/b_ext_add.mx3000066400000000000000000000025071503346766200153000ustar00rootroot00000000000000/* Construct an external field mask on-the fly and add to B_ext. Test that it actually gets added. 
*/ Nx := 128 Ny := 64 setgridsize(Nx, Ny, 1) c := 5e-9 setcellsize(c, c, c) Msat = 800e3 Aex = 13e-12 m = uniform(1, 0, 0) // External field mask corresponding to Oersted field of a long perpendicular wire mask := newSlice(3, Nx, Ny, 1) // wire position, diameter and current direction wireX := 0e-9 wireY := 0e-9 wireDiam := 50e-9 current := vector(0, 0, 1) // construct mask for i:=0; i= wireDiam{ // outside wire b = r.cross(current).mul(mu0 / (2*pi*pow(r.len(), 2))) }else{ // inside wire relDist := r.len() / wireDiam innerCurrent := current.mul(relDist * relDist) b = r.cross(innerCurrent).mul(mu0 / (2*pi*pow(r.len(), 2))) } mask.set(0, i, j, 0, b[0]) mask.set(1, i, j, 0, b[1]) mask.set(2, i, j, 0, b[2]) } } // Add mask with multiplier B_ext=vector(0,0,-1e-4) B_ext.add(mask, 0.1) save(B_ext) relax() //alpha = 3 //RunWhile(MaxTorque > 1e-4) //steps(1000) // Check whether m has become a vortex. // Not adding mask results in uniform state. tol := 1e-3 expectv("m", m.average(), vector(0, 0, -0.001), tol) 3-3.11.1/test/bubblepos.mx3000066400000000000000000000021111503346766200153330ustar00rootroot00000000000000SetMesh(128, 128, 1, 1e-9,1e-9,0.4e-9, 1, 0, 0) Msat =580e3 Aex = 15e-12 enabledemag=true alpha = 0.1 Ku1=0.59e6+4*pi*1e-7*0.5*580e3*580e3 anisU=vector(0,0,1) Dind=0.0034089785 shiftregions=false m =neelskyrmion(1, -1).transl(-30e-9,0e-9,0) minimize() ext_bubbleMz = -1.0 //without compensating for in-plane tilts of the background this fails (corresponds to ext_backgroundtilt=0) ext_backgroundtilt=0.25 //default value TOL:=1e-9 expectv("position", ext_bubblepos.average(), vector(-3e-08,0,0), TOL) // add non-trivial geometry SetGeom(Square(30e-9).transl(30e-9,0e-9,0).inverse()) // no minimization to save time expectv("position", ext_bubblepos.average(), vector(-3e-08,0,0), TOL) SetGeom(Universe()) // reset to trivial SetMesh(128, 128, 1, 1e-9,1e-9,0.4e-9, 0, 1, 0) m =neelskyrmion(1, -1).transl(0e-9,-30e-9,0) minimize() expectv("position", ext_bubblepos.average(), 
vector(0,-3e-08,0), TOL) // add non-trivial geometry SetGeom(Square(30e-9).transl(0e-9,30e-9,0).inverse()) // no minimization to save time expectv("position", ext_bubblepos.average(), vector(0,-3e-08,0), TOL) 3-3.11.1/test/bubbleshiftpos.mx3000066400000000000000000000012021503346766200163710ustar00rootroot00000000000000SetMesh(128, 128, 1, 1e-9,1e-9,0.4e-9, 1, 1, 0) Msat =580e3 Aex = 15e-12 enabledemag=false alpha = 0.1 Ku1=0.59e6 anisU=vector(0,0,1) Dind=0.0034089785 shiftregions=true maxregion:=255 seed:=17 ext_makegrains(10e-9, maxregion, seed) for i:=0; i errx { errx = ex } if ey > erry { erry = ey } } } } ii := Nx / 2 jj := Ny / 2 kk := Nz / 2 r := index2coord(ii, jj, kk) x := r.X() ref := sin(kx*x) * cos(kx*x) ref2 := (cos(kx*x)*cos(kx*x) - sin(kx*x)*sin(kx*x)) val := Fmel.get(0, ii, jj, kk) val2 := Fmel.get(1, ii, jj, kk) ex := abs(val*pre - ref) ey := abs(val2*pre2 - ref2) ERRMAX := 0.004 ERRMIN := 3e-6 expect("max((ΔFmel).x)", errx, 0., ERRMAX) expect("max((ΔFmel).y)", erry, 0., ERRMAX) expect("((ΔFmel).x)@center", ex, 0., ERRMIN) expect("((ΔFmel).y)@center", ey, 0., ERRMIN) 3-3.11.1/test/mel-force-dmxdx-dmzdx.mx3000066400000000000000000000030711503346766200175030ustar00rootroot00000000000000Nx := 1024 Ny := 16 Nz := 16 csX := 0.5e-9 csY := 1e-9 csZ := 2e-9 setgridsize(Nx, Ny, Nz) setcellsize(csX, csY, csZ) Msat = 1000e3 B1 = 1000e3 B2 = 1000e3 enabledemag = false m = uniform(1.0, 1.0, 1.0) mask := newVectorMask(Nx, Ny, Nz) Period := 32e-9 kx := 2 * pi / Period pre := 1.0 / (2.0 * B1.Average() * kx) pre2 := 1.0 / (B2.Average() * kx) for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) x := r.X() mx := sin(kx * x) mz := cos(kx * x) mask.setVector(ii, jj, kk, vector(mx, 0.0, mz)) } } } m.setarray(mask) save(m) save(F_mel) Fmel := F_mel.HostCopy() errx := -inf erry := -inf for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) x := r.X() 
ref := sin(kx*x) * cos(kx*x) ref2 := (cos(kx*x)*cos(kx*x) - sin(kx*x)*sin(kx*x)) val := Fmel.get(0, ii, jj, kk) val2 := Fmel.get(2, ii, jj, kk) ex := abs(val*pre - ref) ey := abs(val2*pre2 - ref2) if ex > errx { errx = ex } if ey > erry { erry = ey } } } } ii := Nx / 2 jj := Ny / 2 kk := Nz / 2 r := index2coord(ii, jj, kk) x := r.X() ref := sin(kx*x) * cos(kx*x) ref2 := (cos(kx*x)*cos(kx*x) - sin(kx*x)*sin(kx*x)) val := Fmel.get(0, ii, jj, kk) val2 := Fmel.get(2, ii, jj, kk) ex := abs(val*pre - ref) ey := abs(val2*pre2 - ref2) ERRMAX := 0.004 ERRMIN := 3e-6 expect("max(Fmel,x)", errx, 0., ERRMAX) expect("max(Fmel,y)", erry, 0., ERRMAX) expect("min(Fmel,x)", ex, 0., ERRMIN) expect("min(Fmel,y)", ey, 0., ERRMIN) 3-3.11.1/test/mel-force-dmydy-dmxdy.mx3000066400000000000000000000030711503346766200175040ustar00rootroot00000000000000Nx := 16 Ny := 1024 Nz := 16 csX := 1e-9 csY := 0.5e-9 csZ := 2e-9 setgridsize(Nx, Ny, Nz) setcellsize(csX, csY, csZ) Msat = 1000e3 B1 = 1000e3 B2 = 1000e3 enabledemag = false m = uniform(1.0, 1.0, 1.0) mask := newVectorMask(Nx, Ny, Nz) Period := 32e-9 kx := 2 * pi / Period pre := 1.0 / (2.0 * B1.Average() * kx) pre2 := 1.0 / (B2.Average() * kx) for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) y := r.Y() mx := cos(kx * y) my := sin(kx * y) mask.setVector(ii, jj, kk, vector(mx, my, 0.0)) } } } m.setarray(mask) save(m) save(F_mel) Fmel := F_mel.HostCopy() errx := -inf erry := -inf for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) y := r.Y() ref := sin(kx*y) * cos(kx*y) ref2 := (cos(kx*y)*cos(kx*y) - sin(kx*y)*sin(kx*y)) val := Fmel.get(1, ii, jj, kk) val2 := Fmel.get(0, ii, jj, kk) ex := abs(val*pre - ref) ey := abs(val2*pre2 - ref2) if ex > errx { errx = ex } if ey > erry { erry = ey } } } } ii := Nx / 2 jj := Ny / 2 kk := Nz / 2 r := index2coord(ii, jj, kk) y := r.Y() ref := sin(kx*y) * cos(kx*y) ref2 := 
(cos(kx*y)*cos(kx*y) - sin(kx*y)*sin(kx*y)) val := Fmel.get(1, ii, jj, kk) val2 := Fmel.get(0, ii, jj, kk) ex := abs(val*pre - ref) ey := abs(val2*pre2 - ref2) ERRMAX := 0.004 ERRMIN := 3e-6 expect("max(Fmel,x)", errx, 0., ERRMAX) expect("max(Fmel,y)", erry, 0., ERRMAX) expect("min(Fmel,x)", ex, 0., ERRMIN) expect("min(Fmel,y)", ey, 0., ERRMIN) 3-3.11.1/test/mel-force-dmydy-dmzdy.mx3000066400000000000000000000030711503346766200175060ustar00rootroot00000000000000Nx := 16 Ny := 1024 Nz := 16 csX := 1e-9 csY := 0.5e-9 csZ := 2e-9 setgridsize(Nx, Ny, Nz) setcellsize(csX, csY, csZ) Msat = 1000e3 B1 = 1000e3 B2 = 1000e3 enabledemag = false m = uniform(1.0, 1.0, 1.0) mask := newVectorMask(Nx, Ny, Nz) Period := 32e-9 kx := 2 * pi / Period pre := 1.0 / (2.0 * B1.Average() * kx) pre2 := 1.0 / (B2.Average() * kx) for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) y := r.Y() mz := cos(kx * y) my := sin(kx * y) mask.setVector(ii, jj, kk, vector(0.0, my, mz)) } } } m.setarray(mask) save(m) save(F_mel) Fmel := F_mel.HostCopy() errx := -inf erry := -inf for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) y := r.Y() ref := sin(kx*y) * cos(kx*y) ref2 := (cos(kx*y)*cos(kx*y) - sin(kx*y)*sin(kx*y)) val := Fmel.get(1, ii, jj, kk) val2 := Fmel.get(2, ii, jj, kk) ex := abs(val*pre - ref) ey := abs(val2*pre2 - ref2) if ex > errx { errx = ex } if ey > erry { erry = ey } } } } ii := Nx / 2 jj := Ny / 2 kk := Nz / 2 r := index2coord(ii, jj, kk) y := r.Y() ref := sin(kx*y) * cos(kx*y) ref2 := (cos(kx*y)*cos(kx*y) - sin(kx*y)*sin(kx*y)) val := Fmel.get(1, ii, jj, kk) val2 := Fmel.get(2, ii, jj, kk) ex := abs(val*pre - ref) ey := abs(val2*pre2 - ref2) ERRMAX := 0.004 ERRMIN := 3e-6 expect("max(Fmel,x)", errx, 0., ERRMAX) expect("max(Fmel,y)", erry, 0., ERRMAX) expect("min(Fmel,x)", ex, 0., ERRMIN) expect("min(Fmel,y)", ey, 0., ERRMIN) 
3-3.11.1/test/mel-force-dmzdz-dmxdz.mx3000066400000000000000000000030711503346766200175070ustar00rootroot00000000000000Nx := 16 Ny := 16 Nz := 1024 csX := 1e-9 csY := 2e-9 csZ := 0.5e-9 setgridsize(Nx, Ny, Nz) setcellsize(csX, csY, csZ) Msat = 1000e3 B1 = 1000e3 B2 = 1000e3 enabledemag = false m = uniform(1.0, 1.0, 1.0) mask := newVectorMask(Nx, Ny, Nz) Period := 32e-9 kx := 2 * pi / Period pre := 1.0 / (2.0 * B1.Average() * kx) pre2 := 1.0 / (B2.Average() * kx) for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) z := r.Z() mx := cos(kx * z) mz := sin(kx * z) mask.setVector(ii, jj, kk, vector(mx, 0.0, mz)) } } } m.setarray(mask) save(m) save(F_mel) Fmel := F_mel.HostCopy() errx := -inf errz := -inf for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) z := r.Z() ref := sin(kx*z) * cos(kx*z) ref2 := (cos(kx*z)*cos(kx*z) - sin(kx*z)*sin(kx*z)) val := Fmel.get(2, ii, jj, kk) val2 := Fmel.get(0, ii, jj, kk) ez := abs(val*pre - ref) ex := abs(val2*pre2 - ref2) if ex > errx { errx = ex } if ez > errz { errz = ez } } } } ii := Nx / 2 jj := Ny / 2 kk := Nz / 2 r := index2coord(ii, jj, kk) z := r.Z() ref := sin(kx*z) * cos(kx*z) ref2 := (cos(kx*z)*cos(kx*z) - sin(kx*z)*sin(kx*z)) val := Fmel.get(2, ii, jj, kk) val2 := Fmel.get(0, ii, jj, kk) ez := abs(val*pre - ref) ex := abs(val2*pre2 - ref2) ERRMAX := 0.004 ERRMIN := 3e-6 expect("max(Fmel,x)", errx, 0., ERRMAX) expect("max(Fmel,z)", errz, 0., ERRMAX) expect("min(Fmel,x)", ex, 0., ERRMIN) expect("min(Fmel,z)", ez, 0., ERRMIN) 3-3.11.1/test/mel-force-dmzdz-dmydz.mx3000066400000000000000000000030711503346766200175100ustar00rootroot00000000000000Nx := 16 Ny := 16 Nz := 1024 csX := 1e-9 csY := 2e-9 csZ := 0.5e-9 setgridsize(Nx, Ny, Nz) setcellsize(csX, csY, csZ) Msat = 1000e3 B1 = 1000e3 B2 = 1000e3 enabledemag = false m = uniform(1.0, 1.0, 1.0) mask := newVectorMask(Nx, Ny, Nz) Period := 32e-9 kx := 2 
* pi / Period pre := 1.0 / (2.0 * B1.Average() * kx) pre2 := 1.0 / (B2.Average() * kx) for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) z := r.Z() my := cos(kx * z) mz := sin(kx * z) mask.setVector(ii, jj, kk, vector(0.0, my, mz)) } } } m.setarray(mask) save(m) save(F_mel) Fmel := F_mel.HostCopy() errx := -inf errz := -inf for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) z := r.Z() ref := sin(kx*z) * cos(kx*z) ref2 := (cos(kx*z)*cos(kx*z) - sin(kx*z)*sin(kx*z)) val := Fmel.get(2, ii, jj, kk) val2 := Fmel.get(1, ii, jj, kk) ez := abs(val*pre - ref) ex := abs(val2*pre2 - ref2) if ex > errx { errx = ex } if ez > errz { errz = ez } } } } ii := Nx / 2 jj := Ny / 2 kk := Nz / 2 r := index2coord(ii, jj, kk) z := r.Z() ref := sin(kx*z) * cos(kx*z) ref2 := (cos(kx*z)*cos(kx*z) - sin(kx*z)*sin(kx*z)) val := Fmel.get(2, ii, jj, kk) val2 := Fmel.get(1, ii, jj, kk) ez := abs(val*pre - ref) ex := abs(val2*pre2 - ref2) ERRMAX := 0.004 ERRMIN := 3e-6 expect("max(Fmel,x)", errx, 0., ERRMAX) expect("max(Fmel,z)", errz, 0., ERRMAX) expect("min(Fmel,x)", ex, 0., ERRMIN) expect("min(Fmel,z)", ez, 0., ERRMIN) 3-3.11.1/test/memleak.mx3000066400000000000000000000003601503346766200147750ustar00rootroot00000000000000/* Test for memory leaks when resizing. 
*/ c := 5e-9 SetGridSize(1024, 1024, 2) SetCellSize(c, c, c) m = uniform(1,1,1) Aex = 13e-12 Msat = 800e3 for i:=0; i<30; i++{ SetGridSize(128, 128+2*i, 1) SetCellSize(c, c, c) Steps(100) } 3-3.11.1/test/mfm.mx3000066400000000000000000000005021503346766200141370ustar00rootroot00000000000000/* Save an mfm image */ Nx := 400 Ny := 400 c := 2e-9 setpbc(2, 0, 0) setgridsize(Nx, Ny, 1) setcellsize(c, c, c) Msat = 1/mu0 setgeom(rect(400e-9, 400e-9).transl(-400e-9, 0, 0)) m = uniform(1,0,0.1) MFMLift = 50e-9 save(MFM) expect("mfm", MFM.Average(), -3.28009e7, 1e5) // golden value with mumax3.9.1 2015-12-05 3-3.11.1/test/mfm_sizes.mx3000066400000000000000000000012621503346766200153600ustar00rootroot00000000000000/* Checks if MFM images can be saved for all thicknesses with and without PBC. There was once a bug (#93) where Nz=2,3,4,5 could not generate an MFM image. */ Nx := 63 Ny := 64 c := 4e-9 Msat = 1 / mu0 Aex = 10e-12 for Nz := 1; Nz < 12; Nz++ { SetGridSize(Nx, Ny, Nz) SetCellSize(c, c, c) SetGeom(Zrange(-NZ*c/2, c/2).Intersect(Circle(Nx * c).Add(Xrange(-c*Nx/4, c*Nx/4)))) m = vortex(1, 1) SetPBC(0, 0, 0) SaveAs(m, sprintf("m_%d.ovf", Nz)) MFMLift = 40e-9 // forces the calculation of the MFM kernel. SnapshotAs(MFM, sprintf("MFM_%d.jpg", Nz)) SetPBC(2, 2, 0) MFMLift = 40e-9 // forces the calculation of the MFM kernel. SnapshotAs(MFM, sprintf("MFM_%d_PBC.jpg", Nz)) } 3-3.11.1/test/minimizer-stress.mx3000066400000000000000000000004471503346766200167140ustar00rootroot00000000000000// stress-test the minimizer for memleaks etc. 
setgridsize(128, 128, 1) setcellsize(3e-9, 3e-9 , 3e-9) Aex = 13e-12 Msat = 800e3 alpha = 0.02 m = uniform(-1, .1, 0) MinimizerStop = 1e-3 // make it go fast for B:=0.0; B<10e-3; B+=0.1e-3{ B_ext = vector(B, 0, 0) minimize() } 3-3.11.1/test/minimizer.mx3000066400000000000000000000007251503346766200153720ustar00rootroot00000000000000/* */ setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 1600e3 Aex = 13e-12 E_total.get() // try to trigger bad lex2 Msat = 800e3 alpha = 0.02 m = uniform(1, .1, 0) minimize() save(m) // Minimize is non-deterministic: see issue #354 for the reason behind these tolerances expect("mx", m.comp(0).average(), 0.966962, 1e-5) expect("my", m.comp(1).average(), 0.12529, 3e-5) expect("mz", m.comp(2).average(), 0., 0.) 3-3.11.1/test/msatzero.mx3000066400000000000000000000013631503346766200152320ustar00rootroot00000000000000/* This verifies that cells with Msat=0 do not - interact with neighbouring Msat≠0 cells through the exchange interaction. - react to the demagnetising field of nearby magnets. */ Msat = 0.48e6 Aex = 7e-12 size := 5e-9 // Well within exchange length of 7nm SetGridSize(16, 32, 1) SetCellSize(size, size, size) // Set region with Msat=0 DefRegion(1, XRange(0, inf)) Msat.SetRegion(1,0) // Relax from controlled random state m = RandomMagSeed(3) // Set reproducible seed relax() // Check whether... ExpectV("Normal mag", m.Region(0).Average(), Vector(0,-1,0), 1e-3) // ... no exchange exists between Msat≠0 and Msat=0 ExpectV("Random mag", m.Region(1).Average(), Vector(0,0,0), 3e-2) // ... 
Msat=0 cells do not experience demag 3-3.11.1/test/nodemagspins.mx3000066400000000000000000000030141503346766200160500ustar00rootroot00000000000000 setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 m = uniform(1, 1, 1) LEFT := 0 RIGHT := 1 AIR := 2 defregion(LEFT, xrange(-inf, 0)) defregion(RIGHT, xrange(0, inf)) TOL := 1e-4 expectv("L", B_demag.Region(LEFT).Average(), vector(-0.005299, -0.022127, -0.552248), TOL) expectv("R", B_demag.Region(RIGHT).Average(), vector(-0.005299, -0.022127, -0.552248), TOL) NoDemagSpins.SetRegion(LEFT, 0) NoDemagSpins.SetRegion(RIGHT, 1) expectv("L", B_demag.Region(LEFT).Average(), vector(-0.0103589, -0.02122757, -0.5480939), TOL) expectv("R", B_demag.Region(RIGHT).Average(), vector(0, 0, 0), 0) NoDemagSpins.SetRegion(LEFT, 1) expectv("L", B_demag.Region(LEFT).Average(), vector(0, 0, 0), 0) expectv("R", B_demag.Region(RIGHT).Average(), vector(0, 0, 0), 0) FixDt = 1e-14 steps(1000) // test for memleaks airgap := yrange(0, inf) SetGeom(airgap.inverse()) defregion(AIR, airgap) expectv("L", B_demag.Region(LEFT).Average(), vector(0, 0, 0), 0) expectv("R", B_demag.Region(RIGHT).Average(), vector(0, 0, 0), 0) expectv("A", B_demag.Region(AIR).Average(), vector(0, 0, 0), 0) NoDemagSpins.SetRegion(LEFT, 0) NoDemagSpins.SetRegion(RIGHT, 0) expectv("L", B_demag.Region(LEFT).Average(), vector(-0.0046074, -0.0391400, -0.53594726), TOL) expectv("R", B_demag.Region(RIGHT).Average(), vector(-0.0046074, -0.0391400, -0.53594726), TOL) expectv("A", B_demag.Region(AIR).Average(), vector(-0.000692, 0.0170129, -0.01630170), TOL) NoDemagSpins.SetRegion(LEFT, 1) steps(1000) // test for memleaks 3-3.11.1/test/openbc.mx3000066400000000000000000000052211503346766200146310ustar00rootroot00000000000000/* Test if the canting at the end of a nanowire corresponds to the 1D analytical result if open boundary conditions are used. 
This test is similar to the standard test proposed in arXiv:1803.11174 If the nanowire consists only out of one row of cells, the analytical canting matches with the simulated canting if open (or periodic) boundary conditions are used. The Neumann BC yields a different canting. This does not mean that Neumann BC are wrong. To be more precise, the analytical result, as well as the numerical results obtained with open and Neumann BC are slightly wrong because the width of the nanowire is not taken into account properly. */ ncell := 1024 cs := 0.05 verbose := false DMI := 0.9 *4/pi // 90% of critical DMI strength Dind = DMI enabledemag = false AnisU = vector(0,0,1) Aex = 1. Ku1 = 1. Msat = 1. minimizerstop = 1e-7 // --- Along the x direction -------------------------------------------------------- setgridsize(ncell,1,1) setcellsize(cs,cs,cs) m = uniform(0,0,1) // ANALYTIC theta0 := asin(DMI/2) cant_analytic := 2*atan(exp(-cs/2)*tan(theta0/2)) // shift towards center of the cell // NEUMANN BC openbc = false minimize() cant_neumann := atan2( m.getcell(0,0,0)[0], m.getcell(0,0,0)[2] ) // OPEN BC openbc = true minimize() cant_open := atan2( m.getcell(0,0,0)[0], m.getcell(0,0,0)[2] ) // PERIODIC BC openbc = false setpbc(0,1,0) minimize() cant_periodic := atan2( m.getcell(0,0,0)[0], m.getcell(0,0,0)[2] ) if verbose { print("Neumann: ", cant_neumann) print("Open: ", cant_open) print("Periodic: ", cant_periodic) print("Analytic: ", cant_analytic) } expect("edge canting", cant_open, cant_analytic, 1e-3) expect("edge canting", cant_open, cant_periodic, 1e-5) // --- Along the y direction -------------------------------------------------------- setgridsize(1,ncell,1) setcellsize(cs,cs,cs) setpbc(0,0,0) m = uniform(0,0,1) // ANALYTIC theta0 = asin(DMI/2) cant_analytic = 2*atan(exp(-cs/2)*tan(theta0/2)) // shift towards center of the cell // NEUMANN BC openbc = false minimize() cant_neumann = atan2( m.getcell(0,0,0)[1], m.getcell(0,0,0)[2] ) // OPEN BC openbc = true minimize() 
cant_open = atan2( m.getcell(0,0,0)[1], m.getcell(0,0,0)[2] ) // PERIODIC BC openbc = false setpbc(1,0,0) minimize() cant_periodic = atan2( m.getcell(0,0,0)[1], m.getcell(0,0,0)[2] ) if verbose { print("Neumann: ", cant_neumann) print("Open: ", cant_open) print("Periodic: ", cant_periodic) print("Analytic: ", cant_analytic) } expect("edge canting", cant_open, cant_analytic, 1e-3) expect("edge canting", cant_open, cant_periodic, 1e-5) 3-3.11.1/test/outputformat.mx3000066400000000000000000000007551503346766200161430ustar00rootroot00000000000000/* Save data with different output formats. */ setgridsize(32, 8, 2) setcellsize(1e-9, 1e-9, 1e-9) Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) steps(1) outputformat = OVF1_TEXT saveas(m, sprintf("mumax_ovf1_text")) outputformat = OVF1_BINARY saveas(m, sprintf("mumax_ovf1_binary")) outputformat = OVF2_TEXT saveas(m, sprintf("mumax_ovf2_text")) step=4 outputformat = OVF2_BINARY saveas(m, sprintf("mumax_ovf2_binary")) step=5 outputformat = DUMP saveas(m, sprintf("mumax_dump"))3-3.11.1/test/pbc1.mx3000066400000000000000000000011161503346766200142070ustar00rootroot00000000000000/* Test correct wrapping for exchange with PBC. 
*/ setpbc(2, 2, 0) Nx := 128 Ny := Nx/2 setgridsize(Nx, Ny, 1) c := 5e-9 setcellsize(c, c, c) Msat = 1000e3 Aex = 10e-12 alpha = 1 r := rect(Nx/2*c, Ny/2*c) deltax := Nx/2*c deltay := Ny/2*c setgeom( r.repeat(Nx*c, Ny*c, 0).transl(deltax, deltay, 0) ) m = uniform(1, 0.1, 0.01) save(m) run(1e-9) save(m) expectv("m", m.average(), vector(0.89947968, 0.23352228, -0.00010287), 1e-3) setgeom( r.repeat(Nx*c, Ny*c, 0)) m = uniform(1, 0.1, 0.01) run(1e-9) expectv("m", m.average(), vector(0.89947968, 0.23352228, -0.00010287), 1e-3) 3-3.11.1/test/pbc2.mustfail000066400000000000000000000012541503346766200153300ustar00rootroot00000000000000// test correct wrapping for DMI setpbc(1, 0, 0) Nx := 128 Ny := Nx/2 setgridsize(Nx, Ny, 1) c := 5e-9 setcellsize(c, c, c) Msat = 1000e3 Aex = 10e-12 alpha = 1 defregion(1, yrange(-inf, 0)) r := rect(Nx/2*c, Ny/2*c) dx := Nx/2*c dy := Ny/2*c m = vortex(1, 1) save(m) run(1e-9) save(m) m1 := average(m.region(1)) expect("mx", m1[0], 0.8146139383, 1e-5) expect("my", m1[1], -0.0001059844, 1e-5) expect("mz", m1[2], -0.0003330991, 1e-5) m = vortex(1, 1) Dex = 1e-20 // should not make a difference save(m) run(1e-9) save(m) m1 = average(m.region(1)) expect("mx", m1[0], 0.8146139383, 1e-5) expect("my", m1[1], -0.0001059844, 1e-5) expect("mz", m1[2], -0.0003330991, 1e-5) 3-3.11.1/test/quantities.mx3000066400000000000000000000006731503346766200155570ustar00rootroot00000000000000/* Test quantity averages. */ Nx := 200 Ny := 100 setgridsize(Nx, Ny, 1) c := 5e-9 setcellsize(c, c, c) setGeom(circle(Nx*c)) defRegion(0, rect(Nx*c, Nx*c)) defRegion(1, rect(Nx*c, Nx*c/4)) Msat = 1e6 expect("Msat", Msat.Average(), 1e6, 1) Aex = 20e-12 expect("Aex", Aex.Average(), 20e-12, 1e-16) alpha = 1 expect("alpha", alpha.Average(), 1, 1e-5) anisC1 = vector(1, 2, 3) expectV("anisC1", anisC1.Average(), vector(1, 2, 3), 1e-5) 3-3.11.1/test/racetest.bash000077500000000000000000000004251503346766200154070ustar00rootroot00000000000000#! 
/bin/bash # builds with -race and runs tests with browser open. set -e go install -race github.com/mumax/3/cmd/mumax3 google-chrome http://localhost:35367 & for f in *.mx3; do mumax3 $f done go install github.com/mumax/3/cmd/mumax3 # re-build without race detector 3-3.11.1/test/randregions.todo000066400000000000000000000003231503346766200161320ustar00rootroot00000000000000 setgridsize(30, 20, 2) setcellsize(1e-9, 1e-9, 1e-9) for x:=0; x<30; x++{ for y:=0; y<20; y++{ randRegion := randInt(256) for z:=0; z<2; z++{ defRegionCell(randRegion, x, y, z) } } } save(regions) 3-3.11.1/test/redefregion.mx3000066400000000000000000000046501503346766200156610ustar00rootroot00000000000000/* Test the RedefRegion function. */ Nx := 128 Ny := 64 Nz := 1 c := 1e-9 SetGridSize(Nx, Ny, Nz) SetCellSize(c, c, c) m = Uniform(0, 0, 1) // Assign the left-hand half of the sample region id 1, and set the magnetization within it along +x DefRegion(1, Rect(Nx*c/2, Ny*c).Transl(-Nx*c/4, 0, 0)) m.SetRegion(1, Uniform(1, 0, 0)) // Left-hand-side region id reset to zero SnapshotAs(regions, "regions_1_before.png") RedefRegion(1, 0) SnapshotAs(regions, "regions_1_redefined.png") // "Reset" the system to its initial state m = Uniform(0, 0, 1) // Now the right-hand-side of the system ONLY should have id 1 DefRegion(1, Rect(Nx*c/2, Ny*c).Transl(Nx*c/4, 0, 0)) m.SetRegion(1, Uniform(1, 0, 0)) // Ensure that RHS now has id 1 and LHS is back to zero Expect("Region", regions.GetCell(Nx/4, 0, 0), 0, 0.1) Expect("Region", regions.GetCell(3*Nx/4, 0, 0), 1, 0.1) ExpectV("m", m.GetCell(Nx/4, Ny/2, 0), Vector(0, 0, 1), 1e-5) ExpectV("m", m.GetCell(3*Nx/4, Ny/2, 0), Vector(1, 0, 0), 1e-5) // Reset the system to initial state, then set the (now undefined) region 1 to have m along +y, and ensure that this does not affect average magnetization SnapshotAs(regions, "regions_2_before.png") RedefRegion(1, 0) SnapshotAs(regions, "regions_2_redefined.png") m = Uniform(0, 0, 1) m.SetRegion(1, Uniform(0, 1, 0)) Expect("m", 
m.comp(2).average(), 1, 1e-5) // Test with several regions: start with different magnetizations; add first region to second, set them both to a single direction, and check both regions now have this RedefRegion(1, 0) SnapshotAs(regions, "regions_2_redefinedagain.png") DefRegion(1, Rect(Nx*c/2, Ny*c/2).Transl(Nx*c/4, Ny*c/4, 0)) m.SetRegion(1, Uniform(0, 1, 0)) DefRegion(2, Rect(Nx*c/2, Ny*c/2).Transl(Nx*c/4, -Ny*c/4, 0)) m.SetRegion(2, Uniform(0, -1, 0)) SnapshotAs(regions, "regions_3_before.png") RedefRegion(1, 2) SnapshotAs(regions, "regions_3_redefined.png") m.SetRegion(2, Uniform(1, 0, 0)) ExpectV("m", m.GetCell(3*Nx/4, 3*Ny/4, 0), Vector(1, 0, 0), 1e-5) ExpectV("m", m.GetCell(3*Nx/4, Ny/4, 0), Vector(1, 0, 0), 1e-5) // Test with resized grid: start from previous situation. If regions.hist is not tracked correctly in the Go source, this will not give the correct result. SetGridSize(2*Nx, 2*Ny, Nz) SnapshotAs(regions, "regions_3_resized.png") Expect("Region", regions.GetCell(3*Nx/4, Ny/2, 0), 0, 0.1) Expect("Region", regions.GetCell(5*Nx/4, 3*Ny/2-1, 0), 2, 0.1) Expect("Region", regions.GetCell(5*Nx/4, Ny/2, 0), 2, 0.1) 3-3.11.1/test/reduced.todo000066400000000000000000000007031503346766200152340ustar00rootroot00000000000000// test derived output quantities Nx := 64 Ny := 64 Nz := 2 c := 4e-9 SetGridSize(Nx, Ny, Nz) SetCellSize(c, c, c ) DefRegion(0, xrange(-inf, 0)) DefRegion(1, xrange(0, inf)) m = Vortex(1, 1) tableadd(m.avgregion(0)) tableadd(m.avgregion(1)) tablesave() expect("m0x", m0.getVec()[0], 0 , 1e-6) expect("m0y", m0.getVec()[1], -0.323819, 1e-6) expect("m1x", m1.getVec()[0], 0 , 1e-6) expect("m1y", m1.getVec()[1], 0.323819, 1e-6) 3-3.11.1/test/regions.mx3000066400000000000000000000007661503346766200150420ustar00rootroot00000000000000/* Regions test */ N := 128 setgridsize(N, N, 1) setcellsize(1e-9, 1e-9, 1e-9) d := 1e-9 * N defregion(0, circle(d).inverse()) defregion(1, circle(d)) defregion(2, circle(d/2)) defregion(3, circle(d/2).transl(d/3, 0, 0)) 
save(regions) Ku1.setregion(1, 500) Ku1.setregion(2, 0) Ku1.setregion(3, -500) save(Ku1) Msat = 800e-3 // sets it everywhere Msat.setregion(2, 500e3) save(Msat) m = uniform(1, 0, 0) save(m) regions.setcell(5, 6, 0, 123) expect("getcell", regions.getcell(5, 6, 0), 123, 0) 3-3.11.1/test/regions2.todo000066400000000000000000000011031503346766200153440ustar00rootroot00000000000000// Regression test for Jonathan's pinning simulations d := 3.125e-9 h := 5e-9 SetGridsize(64, 64, 2) SetCellsize(d, d, h) setgeom( cylinder(190e-9, 190e-9).transl(0.5*d,0.5*d,0) ) Msat = 860e3 Aex = 13e-12 defregion(2, cuboid(3*d,3*d,h).transl(0.5*d,0.5*d,0.5*h)) Msat.setregion(2, 0.0) les := sqrt(2*13e-12/860e3) overrideLex(1,2, les) alpha = 0.8 m = vortex(1, -1).transl(5e-9, 0, 0) run(1e-9) m_ := average(m) expect("mx", m_[0], 1.043081283569336e-07, 1e-6) expect("my", m_[1], -3.903551260009408e-09, 1e-6) expect("mz", m_[2], -0.0037489673122763634, 1e-6) 3-3.11.1/test/regionsload.mx3000066400000000000000000000010251503346766200156670ustar00rootroot00000000000000/* Test regions.loadfile */ setgridsize(256, 128, 2) setcellsize(5e-9, 5e-9, 5e-9) defregion(1, circle(500e-9)) defregion(2, circle(500e-9).inverse()) m.setRegion(1, uniform(1, 0, 0)) m.setRegion(2, uniform(0, 1, 0)) // save regions to disk saveAs(regions, "regions.ovf") // overwrite regions defregion(256, universe()) // re-load previous state from disk regions.loadFile("testdata/regions.ovf") // check expectv("m1", m.region(1).average(), vector(1, 0, 0), 1e-5) expectv("m2", m.region(2).average(), vector(0, 1, 0), 1e-5) 3-3.11.1/test/regression001.mx3000066400000000000000000000042041503346766200157640ustar00rootroot00000000000000/* Regression test for bug reported by Ezio Iacocca okt 2013 failed on GTX TITAN, presumably because of race conditions. Fixed since xyz branch. 
*/ Nx := 256 Ny := 256 Nz := 1 SetGridsize(Nx, Ny, Nz) CellX := 1500e-9/Nx CellY := 1500e-9/Ny CellZ := 5e-9 SetCellsize(CellX, CellY, CellZ) // DEFINE CONTACT posX := 0.e-9 posY := 0.e-9 Rc := 35.e-9 I := -30e-3 // DEFINE BOUNDARY ABC := 200e-9 factor := 100 // DEFINE APPLIED FIELD Happ := 0 // 0.965 Angle := 70 * pi / 180 // SET MATERIAL PARAMETERS alpha_v := 0.01 Msat_v := 700e3 MsatP := 1200e3 Aex_v := 10e-12 // ADJUST SOLVER MaxDt = 1e-12 MinDt = 1e-18 //MaxErr = 1e-5 // SET GEOMETRY AND REGIONS defregion(1, Ellipse(Rc, Rc).transl(posX,posY,0)) defregion(2, Ellipse(Rc, Rc).transl(posX,posY,0).inverse()) alpha_reg := ceil(ABC/CellX) for i:=0; i= Rc{ b = r.cross(current).mul(mu0 / (2*pi*pow(r.len(),2)) ) }else{ b = r.cross(current).mul(mu0 / (2*pi*pow(Rc,2)) ) } for k:=0; k 5e-4) alpha = 0.01 A := 10*1.0e-3 f := 6.88495e9 mSavetime := 1.0/(8.0*f) t0 := t B_ext = vector( 0.0, 0.0, A*sin(2*pi*f*(t-t0))*(1.0 - exp(-2*pi*f*(t-t0))) ) spot := circle(diameter/50.0).transl(diameter/5.0, diameter/7.0, 0.0) defregion(1, spot) run(0.5e-9) tol := 0.01 expectv("m", m.region(1).average(), vector(-0.450, 0.8869750613257998, 0.0804117293584914), tol) run(0.5e-9) expectv("m", m.region(1).average(), vector( -0.7082362402053106, 0.6975658053443545, -0.1068760781061081), tol) 3-3.11.1/test/regression003.mx3000066400000000000000000000007661503346766200157770ustar00rootroot00000000000000/* Regression test for bug with region > 127 which was turned into negative number. 
*/ setgridsize(512, 512, 1) c := 4e-9 setcellsize(c, c, c) m = uniform(1,0,0) ext_makegrains(40e-9, 255, 0) defregion(1, circle(200*c)) defregion(2, circle(100*c)) defregion(128, circle(50*c)) expect("0", Ku1.average(), 0, 0) expect("0", Kc1.average(), 0, 0) expectv("0", AnisC1.average(), vector(0, 0, 0), 0) expectv("0", AnisC2.average(), vector(0, 0, 0), 0) expectv("0", AnisU.average(), vector(0, 0, 0), 0) 3-3.11.1/test/regression006.mx3000066400000000000000000000017361503346766200160000ustar00rootroot00000000000000/* Regression test for buggy intergrain exchange reported by Jonathan Leliaert */ setgridsize(256, 64, 1) setcellsize(3.125e-9, 3.125e-9, 15e-9) Msat = 860e3 Aex = 13e-12 Xi = 0.0 m = vortexwall(1,-1,1,1).scale(1.5, 1, 1) // Remove surface charges from left (mx=1) and right (mx=-1) sides to mimic infinitely long wire. We have to specify the region (0) at the boundaries. ext_rmSurfaceCharge(0, 1, -1) // Relax Alpha = 3 // high damping for fast relax RunWhile(MaxTorque > 1e-3) // relax Alpha = 0.02 // restore normal damping // Set post-step function that centers simulation window on domain wall. ext_centerwall(0) ext_makegrains(4e-8, 10, 0) for i :=0; i<10;i+=1{ for j :=i+1; j<10;j+=1{ ext_ScaleExchange(i, j, 0.5) } } // Run the simulation with current through the sample Pol =0.56 J = vector(-10e12 , 0, 0) Run(0.5e-9) expectv("m", m.average(), vector(0.013319221, 0.018588585, 0.00010564699186943471), 1e-4) 3-3.11.1/test/regression007.mx3000066400000000000000000000015201503346766200157700ustar00rootroot00000000000000/* Reported by Jonathan Lelieart Zhang-Li used to give NaN's because of division by Bsat */ setgridsize(128, 32, 1) setcellsize(3.125e-9, 3.125e-9, 10e-9) Msat = 860e3 Aex = 13e-12 Xi = 0.0 m = twodomain(1,0,0,0,1,0,-1,0,0) // Remove surface charges from left (mx=1) and right (mx=-1) sides to mimic infinitely long wire. We have to specify the region (0) at the boundaries. 
ext_rmSurfaceCharge(0, 1, -1) ext_makegrains(1e-9,25,0) msat.setregion(12, 0) // Set post-step function that centers simulation window on domain wall. ext_centerwall(0) tableadd(ext_dwpos) // domain wall position tableautosave(10e-12) autosave(m, 2e-10) autosave(regions, 2e-10) // Run the simulation with current through the sample Pol =0.56 J = vector(-8e12 , 0, 0) Steps(10) expectv("m", m.average(), vector(0.00035, 0.032942, 0), 1e-3) 3-3.11.1/test/regression008.todo000066400000000000000000000021051503346766200162270ustar00rootroot00000000000000// NaN's when slonczewski parameters not OK Nx := 64 Ny := 32 Nz := 1 sX := 160e-9 sY := 80e-9 sZ := 5e-9 setgridsize(Nx, Ny, Nz) setcellsize(sX/Nx, sY/Ny, sZ/Nz) Msat = 800e3 Aex = 13e-12 alpha = 1 maxdt = 1e-12 // Set a initial magnetisation to C-state m = uniform(1, 1, 0.001) run(3e-9) alpha = 0.01 lambda = 1 Pol = 0.5669 epsilonprime = 0 pdeg := 1 prad := pdeg * pi / 180.0 px := cos(prad) py := sin(prad) fixedlayer = vector(px, py, 0) Jtot := -0.008 // total current in amps carea := sX * sY jc := Jtot / carea print("Current density is: " , jc) J = vector(0, 0, jc) //autosave(m, 1e-12) tableautosave(10e-12) save(sttorque) m0 := m.average() expect("mx", m0[0], 0.9586285, 1e-3) expect("my", m0[1], 0.2039081, 1e-3) expect("mz", m0[2], 0.0000000, 1e-3) run(1e-9) m1 := m.average() expect("mx", m1[0], 0.6440672, 1e-3) expect("my", m1[1], 0.5133638, 1e-3) expect("mz", m1[2],-0.1571195, 1e-3) run(1e-9) m2 := m.average() expect("mx", m2[0], -0.9574024, 1e3) expect("my", m2[1], 0.2069624, 1e3) expect("mz", m2[2], 0.0096634, 1e3) 3-3.11.1/test/regression009.mx3000066400000000000000000000004011503346766200157670ustar00rootroot00000000000000/* Regression test for crash on shift after pbc change */ setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) alpha = 3 steps(10) shift(1) setpbc(1, 1, 0) //steps(10) shift(1) 
3-3.11.1/test/regression010.mx3000066400000000000000000000016561503346766200157740ustar00rootroot00000000000000/* Regression test for crash after resize when adding to exciations. */ Nx := 32 Ny := 16 setgridsize(Nx, Ny, 1) c := 5e-9 setcellsize(c, c, c) Msat = 800e3 Aex = 13e-12 m = uniform(1, 0, 0) mask := newSlice(3, Nx, Ny, 1) wireX := 0e-9 wireY := 0e-9 wireDiam := 50e-9 current := vector(0, 0, 1) for i:=0; i= wireDiam{ b = r.cross(current).mul(mu0 / (2*pi*pow(r.len(), 2))) }else{ relDist := r.len() / wireDiam innerCurrent := current.mul(relDist * relDist) b = r.cross(innerCurrent).mul(mu0 / (2*pi*pow(r.len(), 2))) } mask.set(0, i, j, 0, b[0]) mask.set(1, i, j, 0, b[1]) mask.set(2, i, j, 0, b[2]) } } B_ext.add(mask, 0.1) alpha = 3 steps(1) setgridsize(Nx*2, Ny, 1) setcellsize(2*c, 2*c, 2*c) steps(1) 3-3.11.1/test/regression011.mx3000066400000000000000000000005041503346766200157640ustar00rootroot00000000000000/* Binary boolean operations were missing (reported by Gabriel Chaves). Unary ! was missing (reported by Raffaele Pellicelli) */ setgridsize(1, 1, 1) setcellsize(1, 1, 1) t0 := 0 timestep := 1 Tq := 1e-3 b := (((t-t0) < timestep ) && ( (Tq > 1e-4) || ( t < 1.0e-13))) if !true{ expect("should_not_happen", 0, 1, 0) } 3-3.11.1/test/regression013.mx3000066400000000000000000000011361503346766200157700ustar00rootroot00000000000000/* Shift should not act on regions if ShiftRegios == false Reported by Ben Van de Wiele. 
*/ Ny := 64 Nx := 16*Ny c := 3.125e-9 setgridsize(Nx, Ny, 1) setcellsize(c, c, 15e-9) Msat = 1.7e6 Aex = 21e-12 Alpha = 0.015 Ku1 = 25e4 DefRegion( 1, XRange(-inf, 0) ) DefRegion( 2, XRange(0, inf) ) anisU.SetRegion(1, vector(1, 1, 0)) anisU.SetRegion(2, vector(1, -1, 0)) m = uniform(1, 0, 0) // shift the magnetization every 1e-10 sec shiftM = True shiftRegions = False for i:=0; i<320; i++{ shift(-1) } expect("regions", regions.Average(), 1.5, 0) 3-3.11.1/test/regression015.mx3000066400000000000000000000007071503346766200157750ustar00rootroot00000000000000// Regression test for vortex config yielding NaN in case of an odd number of cells. // Reported by Mathias Helsen. Nx := 257 Ny := 257 Nz := 1 setgridsize(Nx, Ny, Nz) thickness := 10.0e-9 diameter := 1.0e-6 setcellsize(diameter/Nx, diameter/Ny, thickness/Nz) setgeom(cylinder(diameter, thickness)) mask := newslice(3, Nx, Ny, Nz) mask.set(2, 127, 127, 0, 1.0) m = vortex(1, 1) expectv("m", m.average(), vector(0, 0, 0.0001), 1e-2) 3-3.11.1/test/regression016.mx3000066400000000000000000000005571503346766200160010ustar00rootroot00000000000000// test httpfs saveas: should not create "http:/..." directory // reported by Jonathan Leliaert. Gsize := 20 Ared := 0.75 Delta_x := 450.0/128.0 SetGridsize(128, 128, 1) SetCellsize(Delta_x*1e-9, Delta_x*1e-9, 30e-9) alpha = 0.02 Msat = 860e3 Aex = 13e-12 m = Vortex(1, 1) Mstring := sprint("m_Relaxed_Ared=", Ared, "_Gsize=", Gsize, ".ovf") saveAs(m, Mstring) 3-3.11.1/test/regression017.mx3000066400000000000000000000005561503346766200160010ustar00rootroot00000000000000/* Test for tripping the solver with an instable equilibrium start magnetization. Reported by Mykola Dvornik, Feb. 2015. */ setgridsize(5, 1, 1) setcellsize(1e-9, 1e-9, 1e-9) Msat = 1000e3 Aex = 10e-12 anisU = vector(0, 0, 1) Ku1 = 1.3e6 alpha = 0.02 m = uniform(0.0, 0.0, 1.0) relax() m = uniform(0.0, 0.0, -1.0) relax() // used to fail here. 
3-3.11.1/test/regression018.mx3000066400000000000000000000031321503346766200157730ustar00rootroot00000000000000/* Problem reported by Gabriel Chaves: CURAND error + httpfs touch did not close file */ SetSolver(2) FixDt = 1e-14 Bval := 0.0 mos := -1.0 kval := 278607 fwidth := 6.000000e-08 fellipseaxis := 2.400000e-07 flength := 2.400000e-07 ThermSeed(128) Temp =300.000000 fthickness := 1.7e-9 sr := fellipseaxis/fwidth xcellsize := 2.00e-9 ycellsize := 2.00e-9 zcellsize := 1.70e-9 nx := floor(flength/xcellsize) ny := floor(fwidth/ycellsize) print("grid:", nx, ny) setgridsize(128, 32, 1) setcellsize(xcellsize, ycellsize, zcellsize) /* Input parameters */ Msat = 817647 Aex = 13e-12 alpha = 0.01 // Gilbert damping constant B_ext = vector(0.0,0,0) // Applied field in T a := cylinder(fellipseaxis,zcellsize).scale(1.0,1.0/sr,1.0).transl((fellipseaxis-flength)/2.0,0e-9, 0 ) b := cylinder(fellipseaxis,zcellsize).scale(1.0,1.0/sr,1.0).transl((-fellipseaxis+flength)/2.0,0e-9, 0 ) c := cuboid(flength-fellipseaxis,fwidth,zcellsize).scale(1.0,1.0,1.0).transl(0e-9,0e-9, 0 ) setgeom( a.add(b).add(c)) defregion(1,a.add(b).add(c)) m.setRegion(1, uniform(-1, 1e-3, 1e-3)) FixedLayer.setRegion(1,vector (1,0,0)) anisU = vector (0,0,1) tableAdd(Ku1) tableAdd(m) tableAdd(E_total) tableAdd(E_anis) tableAdd(E_exch) tableAdd(E_Zeeman) tableAdd(E_demag) tableAdd(B_ext) tableAdd(FixedLayer.Region(1)) B_ext = vector(Bval,0, 0) timestep := 10.0e-9 //autosave(m, 1e-12) //tableadd(dt) //tableautosave(1e-15) for i:=0; i<1000; i++ { steps(1) fprintln("dt.txt", Neval.get(), dt) } Ku1 = kval //m.loadfile("initialmstate.out/m000000.ovf") m.set(uniform(1,1,1)) B_ext = vector (Bval,0.0,0.0) count := 0 //run(timestep) 3-3.11.1/test/relax-stress.mx3000066400000000000000000000003171503346766200160200ustar00rootroot00000000000000// stress-test relax for memleaks etc. 
setgridsize(128, 128, 1) setcellsize(3e-9, 3e-9 , 3e-9) Aex = 13e-12 Msat = 800e3 alpha = 0.02 m = uniform(-1, .1, 0) for i:=0; i<100; i++{ relax() } 3-3.11.1/test/repeat.mx3000066400000000000000000000004151503346766200146430ustar00rootroot00000000000000/* Test Shape.repeat() */ N := 128 c := 5e-9 SetGridSize(N, N, 1) SetCellSize(c, c, c) d := 20*c SetGeom(circle(d).repeat(2*d, d, 0)) m = uniform(1, 0, 0) Msat = 800e3 Aex = 13e-12 alpha = 1 steps(1000) expectv("m", m.average(), vector(0.525, 0, 0), 1e-2) 3-3.11.1/test/resize.mx3000066400000000000000000000007711503346766200146710ustar00rootroot00000000000000/* Test magnetization stretch upon resize. */ setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) alpha = 3 //MaxErr = 1e-4 RunWhile(MaxTorque > 1e-4) run(1e-9) expectv("m", m.average(), vector(0.96696, 0.12528, 0), 1e-3) setgridsize(128*2, 32*2, 2) setcellsize(500e-9/(128*2), 125e-9/(32*2), 3e-9/2) expectv("m", m.average(), vector(0.96696, 0.12528, 0), 1e-3) run(1e-9) expectv("m", m.average(), vector(0.96696, 0.12528, 0), 1e-3) 3-3.11.1/test/rk4.mx3000066400000000000000000000007461503346766200140720ustar00rootroot00000000000000/* Test rk4 solver with fixed and adaptive timestep */ setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) // relax (rk4) setsolver(4) alpha = 3 run(1e-9) // reversal FixDt = 1e-14 alpha = 0.02 B_ext = vector(-24.6E-3, 4.3E-3, 0) run(0.1e-9) expectv("m", m.average(), vector(0.59293, 0.63278, -0.08152), 1e-3) FixDt=0 run(0.4e-9) expectv("m", m.average(), vector(-0.8740, -0.2713, 0.01795), 1e-3) 3-3.11.1/test/rk4temperature.mx3000066400000000000000000000005601503346766200163420ustar00rootroot00000000000000/* Test rk4 solver with temperature */ c := 10e-9 setcellsize(c, c, c) setgridsize(256, 256, 1) Msat = 1e6 Aex = 0 alpha = 0.1 AnisU = vector(0, 0, 1) m = uniform(0, 0, 1) fixdt = 2e-12 Temp = 100 Ku1 = 1e4/4 enabledemag 
= false setsolver(4) run(5e-9) print(m.average()) expectv("m", m.average(), vector(-0.000, -0.000, 0.8366), 1e-3) 3-3.11.1/test/rk56.mx3000066400000000000000000000007501503346766200141540ustar00rootroot00000000000000/* Test rk56 solver with fixed and adaptive timestep */ setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) // relax (rk56) setsolver(6) alpha = 3 run(1e-9) // reversal FixDt = 1e-14 alpha = 0.02 B_ext = vector(-24.6E-3, 4.3E-3, 0) run(0.1e-9) expectv("m", m.average(), vector(0.59293, 0.63278, -0.08152), 1e-3) FixDt=0 run(0.4e-9) expectv("m", m.average(), vector(-0.8740, -0.2713, 0.01795), 1e-3) 3-3.11.1/test/rkky.mx3000066400000000000000000000007061503346766200143460ustar00rootroot00000000000000/* Test setting an absolute RKKY coupling. */ N := 10 setgridsize(N, N, 2) c := 1e-9 setcellsize(c, c, c) defRegion(0, layer(0)) defRegion(1, layer(1)) Msat = 1e6 Aex = 10e-12 RKKY := -1e-3 // 1mJ/m2 scale := (RKKY * c) / (2 * Aex.Average()) print("scale:", scale) ext_scaleExchange(0, 1, scale) m = uniform(1, 0, 0) E0 := E_total.Get() m.setRegion(0, uniform(0, 1, 0)) E1 := E_total.Get() expect("delta E", E1 - E0, RKKY * N*N*c*c, 1e-20) 3-3.11.1/test/rmsurfacecharge.mx3000066400000000000000000000004511503346766200165240ustar00rootroot00000000000000 setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Aex = 2*13e-12 Msat = 2*800e3 // about 2T m = uniform(1, .1, 0) ext_rmsurfacecharge(0, 1, 1) minimize() tol := 1e-5 // without surface charges, ground state should be uniform. expectv("m", m.Average(), vector(1, 0, 0), tol) 3-3.11.1/test/roughness.mx3000066400000000000000000000004711503346766200154020ustar00rootroot00000000000000/* Regression test for Roughness shape. 
*/ Nx := 500 Ny := 200 Nz := 10 c := 1e-9 setgridsize(Nx, Ny, Nz) setcellsize(c, c, c) setgeom(ellipse(Nx*c, Ny*c).intersect(grainroughness(40e-9, 0, 5e-9, 123))) expect("volume", geom.average(), 0.577983, 1e-4) // this volume was OK so should not change unexpectedly 3-3.11.1/test/run.bash000077500000000000000000000003011503346766200143720ustar00rootroot00000000000000#! /bin/bash export MUMAX="$GOPATH/bin/mumax3" echo "Using the mumax3 executable at: ${MUMAX}" set -e $MUMAX -vet *.mx3 $MUMAX -paranoid=false -failfast -cache /tmp -f -http "" *.go *.mx3 3-3.11.1/test/run.ps1000066400000000000000000000006661503346766200141730ustar00rootroot00000000000000#! Run deploy/deploy_windows.ps1 first! This generates the following executable: $MUMAX = "$env:GOPATH\bin\mumax3.exe" # Enter the test directory to (re)compile the cuda kernels Set-Location ../test $mumaxfiles = Get-ChildItem -filter "*.mx3" -Name $mumaxandgofiles = Get-ChildItem -include ("*.mx3", "*.go") -Name & $MUMAX -vet $mumaxfiles & $MUMAX -paranoid=false -failfast -cache=/tmp -http="" -f=true $mumaxandgofiles3-3.11.1/test/runningaverage.mx3000066400000000000000000000033411503346766200163770ustar00rootroot00000000000000/* Test RunningAverage() of a (custom) Quantity (implemented in engine/customfield.go), by performing the same physical test as in thermometer.go ================ Checks if the measured temperature in a ferromagnetic PMA film is equal to the input temperature. We measure the temperature with the thermometer derived in PHYSICAL REVIEW E 82, 031111 (2010): T = (Vcell*Msat)/(2*kB) * <Σ||m x h||^2> / <Σ m.h > [1] The expectation values <...> are calculated by taking time averages. The sums Σ... are taken over the different cells. The input temperature is chosen to be 177K. We allow an error smaller than 5K. NOTE: The exchange energy in MuMax3 is shifted by a constant with respect to atomistic simulations. 
Due to this difference, we need to add the following constant value to the divisor of [1]: shift = 2 * (Aex/Msat) * NCell * ( 2/Δx² + 2/Δy² ) */ //// Create system c := 4e-9 Nxy := 128 SetGridSize(Nxy, Nxy, 1) SetCellSize(c, c, c) SetPBC(1, 1, 0) //// Set material parameters and initial state Msat = 580e3 Aex = 15e-12 AnisU = Vector(0, 0, 1) Ku1 = 0.6e6 Alpha = 0.1 Temp = 177 M = Uniform(0, 0, -1) Run(1e-10) //// Track average over 0.1ns h := Add(Add(B_demag, B_exch), B_anis) mxh := Cross(m, h) dmh := Dot(m, h) dmxh := Dot(mxh, mxh) divisor := RunningAverage(dmh) numerator := RunningAverage(dmxh) Run(1e-10) //// Check results: is temperature as expected? Vcell := c * c * c kB := 1.38064852e-23 // Boltzmann constant N := Nxy * Nxy offset := 2 * Aex.GetRegion(0) / Msat.Average() * N * (2/(c*c) + 2/(c*c)) temperature := (Vcell * Msat.Average() / (2 * kB)) * Sum(numerator) / (Sum(divisor) + offset) Expect("temperature", temperature, Temp.GetRegion(0), 5) 3-3.11.1/test/runwhile.mx3000066400000000000000000000005531503346766200152230ustar00rootroot00000000000000/* Test for runwhile(). */ N := 20 c := 4e-9 SetGridSize(N, N, 1) SetCellSize(c, c, c) SetGeom(circle(N*c)) m = uniform(1, 0, 0) Msat = 800e3 Aex = 13e-12 alpha= 1 RunWhile(MaxTorque > 1e-3) B_ext = vector(0, 0.01, 1) RunWhile(m.comp(1).average() < 0.5) expect("my", m.comp(1).average(), 0.53, 0.02) // unavoidable overshoot because of large time steps 3-3.11.1/test/savefile.mx3000066400000000000000000000027131503346766200151640ustar00rootroot00000000000000/* Test loading external data files. 
*/ randommagseed(666) Nx := 128 Ny := 64 Nz := 32 setgridsize(Nx, Ny, Nz) c := 5e-9 setcellsize(c, c, c) Msat = 800e3 Aex = 13e-12 m = randommag() mref := m.GetCell(99, 50, 24) outputformat = OVF1_TEXT saveas(m, sprintf("ovf1t")) outputformat = OVF1_BINARY saveas(m, sprintf("ovf1b")) outputformat = OVF2_TEXT saveas(m, sprintf("ovf2t")) outputformat = OVF2_BINARY saveas(m, sprintf("ovf2b")) outputformat = DUMP saveas(m, sprintf("dump")) flush() // make sure output is saved before loading s := loadfile("savefile.out/ovf1t.ovf") expect("elem", s.get(0, 99, 50, 24), mref[0], 0) expect("elem", s.get(1, 99, 50, 24), mref[1], 0) expect("elem", s.get(2, 99, 50, 24), mref[2], 0) s = loadfile("savefile.out/ovf1b.ovf") expect("elem", s.get(0, 99, 50, 24), mref[0], 0) expect("elem", s.get(1, 99, 50, 24), mref[1], 0) expect("elem", s.get(2, 99, 50, 24), mref[2], 0) s = loadfile("savefile.out/ovf2t.ovf") expect("elem", s.get(0, 99, 50, 24), mref[0], 0) expect("elem", s.get(1, 99, 50, 24), mref[1], 0) expect("elem", s.get(2, 99, 50, 24), mref[2], 0) s = loadfile("savefile.out/ovf2b.ovf") expect("elem", s.get(0, 99, 50, 24), mref[0], 0) expect("elem", s.get(1, 99, 50, 24), mref[1], 0) expect("elem", s.get(2, 99, 50, 24), mref[2], 0) s = loadfile("savefile.out/dump.dump") expect("elem", s.get(0, 99, 50, 24), mref[0], 0) expect("elem", s.get(1, 99, 50, 24), mref[1], 0) expect("elem", s.get(2, 99, 50, 24), mref[2], 0) 3-3.11.1/test/shifted.go000066400000000000000000000057711503346766200147210ustar00rootroot00000000000000//go:build ignore // +build ignore /* Test Shifted Quantity Shifted(Quantity q, dx, dy, dz int) shifts the input Quantity over (dx, dy, dz) cells. The shift is performed on the gpu when the Shifted quantity is being evaluated. This test checks if the shift is correctly performed. */ package main import ( "github.com/mumax/3/data" . 
"github.com/mumax/3/engine" "os" ) func main() { defer InitAndClose()() // arbitrarily chosen grid SetGridSize(32, 16, 8) SetCellSize(1.7, 3.2, 7) // arbitrarily chosen shift dx, dy, dz := -4, 3, 1 // arbitrarily created quantity q q := FunctionQuantity{func(r data.Vector) float64 { return 3*r.X() - r.Y() + r.Z()*r.Z() }} // Evaluate Shifted(q, dx, dy, dz) and copy the result to the host shiftedOnGpu := func() *data.Slice { r := ValueOf(Shifted(q, dx, dy, dz)) defer r.Free() return r.HostCopy() } // Evaluate quantity q and shift the output slice on the host shiftedOnHost := func() *data.Slice { v := ValueOf(q) defer v.Free() return shiftSlice(v.HostCopy(), dx, dy, dz) } // Check if both approaches yield the same result if !slicesAreEqual(shiftedOnGpu(), shiftedOnHost()) { LogErr("Shifted(Quantity, dx, dy, dz) did not shift the input quantity correctly") os.Exit(1) } } // Shift slice values on the host over (dx, dy, dz) cellS func shiftSlice(input *data.Slice, dx, dy, dz int) *data.Slice { if !input.CPUAccess() { input = input.HostCopy() } size := input.Size() output := data.NewSlice(1, size) for x := 0; x < size[X]; x++ { for y := 0; y < size[Y]; y++ { for z := 0; z < size[Z]; z++ { val := 0.0 if x-dx >= 0 && x-dx < size[X] && y-dy >= 0 && y-dy < size[Y] && z-dz >= 0 && z-dz < size[Z] { val = input.Get(0, x-dx, y-dy, z-dz) } output.Set(0, x, y, z, val) } } } return output } // Return true if the values of two slices are equal to each other func slicesAreEqual(aSlice, bSlice *data.Slice) bool { size := aSlice.Size() ncomp := aSlice.NComp() if bSlice.NComp() != ncomp || bSlice.Size()[X] != size[X] || bSlice.Size()[Y] != size[Y] || bSlice.Size()[Z] != size[Z] { return false } if !aSlice.CPUAccess() { aSlice = aSlice.HostCopy() } if !bSlice.CPUAccess() { bSlice = bSlice.HostCopy() } for x := 0; x < size[X]; x++ { for y := 0; y < size[Y]; y++ { for z := 0; z < size[Z]; z++ { for c := 0; c < aSlice.NComp(); c++ { if aSlice.Get(c, x, y, z) != bSlice.Get(c, x, y, z) { 
return false } } } } } return true } // Implements a (scalar) Quantity which evaluates a function on the global mesh type FunctionQuantity struct { function func(data.Vector) float64 } func (q FunctionQuantity) NComp() int { return 1 } func (q FunctionQuantity) EvalTo(dst *data.Slice) { result := data.NewSlice(q.NComp(), dst.Size()) for x := 0; x < result.Size()[X]; x++ { for y := 0; y < result.Size()[Y]; y++ { for z := 0; z < result.Size()[Z]; z++ { r := Index2Coord(x, y, z) result.Set(0, x, y, z, q.function(r)) } } } data.Copy(dst, result) } 3-3.11.1/test/shiftgeom.mx3000066400000000000000000000007311503346766200153510ustar00rootroot00000000000000/* Test that shifting introduces the correct geometry from the sides. */ setgridsize(512, 64, 1) c := 2e-9 setcellsize(c, c, c) m = twodomain(0,0,1, 0,1,0, 0,0,-1) ext_centerwall(2) Msat = 1100e3 Aex = 16e-12 AnisU = vector(0, 0, 1) Ku1 = 1.27E6 alpha = 1 setgeom(circle(80*c).repeat(64*c,0,0)) ShiftMagL = vector(0,0,1) ShiftMagR = vector(0,0,-1) for i:=0; i<100; i++{ shift(1) } tol := 1e-5 expectv("m", m.average(), vector(0, 0.008466859, 0.38830321), tol) 3-3.11.1/test/shiftgeom.todo000066400000000000000000000021231503346766200156040ustar00rootroot00000000000000 Nx := 128 Ny := 64 Nz := 1 c := 3e-9 setgridsize(Nx, Ny, Nz) setcellsize(c, c, 10e-9) wx := Nx * c wy := Ny * c Msat = 860e3 Aex = 13e-12 Xi = 0.1 m = twodomain(1,0,0, 0,1,0, -1,0,0) notch := rect(25e-9, 25e-9).RotZ(45*pi/180).transl(0, wy/2, 0).inverse() setGeom(notch.Repeat(wx/2, 0, 0)) save(geom) // Remove surface charges from left (mx=1) and right (mx=-1) sides to mimic infinitely long wire. We have to specify the region (0) at the boundaries. ext_rmSurfaceCharge(0, 1, -1) // Relax Alpha = 3 // high damping for fast relax RunWhile(MaxTorque > 1e-3) // relax Alpha = 0.02 // restore normal damping // Set post-step function that centers simulation window on domain wall. 
ext_centerWall(0) // keep m[0] (= m_x) close to zero // Schedule output autosave(m, 100e-12) tableadd(ext_dwpos) // domain wall position tableadd(ext_dwspeed) // domain wall speed tableautosave(10e-12) // Run the simulation with current through the sample pol = 0.56 J = vector(-10e12, 0, 0) Run(0.4e-9) expectv("m", m.average(), vector(0.0256580703, 0.3362270342, 0.0306527231), 1e-5) 3-3.11.1/test/shiftgrains.mx3000066400000000000000000000010241503346766200157010ustar00rootroot00000000000000/* Test shift of voronoi cells: new ones should enter from the side. */ setgridsize(512, 64, 1) c := 4e-9 setcellsize(c, c, c) m = twodomain(1,0,0, 0,1,0, -1,0,0) Aex = 13e-12 Msat = 800e3 ext_makegrains(40e-9, 255, 0) ext_rmsurfacecharge(0, 1, -1) ext_centerwall(0) alpha = 1 for i:=0; i<255; i++{ Aex.SetRegion(i, 13e-12 + randNorm()*1.3e-12) Msat.SetRegion(i, 800e3 + randNorm()*80e3) } for i:=0; i< 56; i++{ shift(-1) } expect("aex", Aex.Average(), 1.298444e-11, 1e-16) expect("msat", MSat.Average(), 797898, 1) 3-3.11.1/test/slice.mx3000066400000000000000000000006171503346766200144660ustar00rootroot00000000000000/* Test basic slice operations. */ setgridsize(1, 1, 1) setcellsize(1, 1, 1) a := NewSlice(1, 10, 4, 2) a.set(0, 0, 0, 0, 0) a.set(0, 1, 0, 0, 100) a.set(0, 0, 1, 0, 10) a.set(0, 0, 0, 1, 1) a.set(0, 2, 3, 1, 231) print(a) expect("1,0,0", a.get(0, 1, 0, 0), 100, 0) expect("0,1,0", a.get(0, 0, 1, 0), 10, 0) expect("0,0,1", a.get(0, 0, 0, 1), 1, 0) expect("2,3,1", a.get(0, 2, 3, 1), 231, 0) 3-3.11.1/test/smoothdemag.mx3000066400000000000000000000016641503346766200157010ustar00rootroot00000000000000/* Test if smoothed geometry is handled correctly by demag calculation and Mfull. 
*/ Nx := 100 Ny := 50 setgridsize(Nx, Ny, 2) c := 1e-9 setcellsize(c, c , c) // reference demag field for full slab msat = 1/mu0 m = uniform(1, 0, 0) expectv("B_demag", b_demag.average(), vector(-0.02648, 0, 0), 1e-3) expectv("M_full", m_full.average(), vector(1/mu0, 0, 0), 1) edgesmooth = 8 slab := cuboid(Nx*c, Ny*c, c) // Only one layer: should give about half the demag field setgeom(slab.transl(0, 0, -c/2)) m = uniform(1, 0, 0) expectv("B_demag", b_demag.average(), vector(-0.02648 / 2, 0, 0), 1e-3) expectv("M_full", m_full.average(), vector(0.5/mu0, 0, 0), 1) // Same magnet, but halfway between the two layers. // Without smoothed geometry, this fails miserably. setgeom(slab.transl(0, 0, 0)) m = uniform(1, 0, 0) expectv("B_demag", b_demag.average(), vector(-0.02648 / 2, 0, 0), 1e-3) expectv("M_full", m_full.average(), vector(0.5/mu0, 0, 0), 1) 3-3.11.1/test/smoothgeom.mx3000066400000000000000000000005701503346766200155460ustar00rootroot00000000000000/* Test smooth geometry by evaluating the surface of a circle. */ N := 10 setgridsize(2*N, N, 1) c := 1e-9 setcellsize(c, 2*c, 3*c) disk := circle(2*N*c) // No smoothing: rough approximation edgesmooth=0 setgeom(disk) expect("staircase", geom.Average(), 0.8, 1e-3) // Smoothing: good approximation edgesmooth=10 setgeom(disk) expect("smooth", geom.Average(), pi/4, 1e-3) 3-3.11.1/test/snapshot.mx3000066400000000000000000000003611503346766200152220ustar00rootroot00000000000000/* Test saving snapshots on-the-fly. */ setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) snapshot(m) snapshotformat = "png" snapshot(m) snapshotformat = "gif" snapshot(m) 3-3.11.1/test/source.todo000066400000000000000000000001601503346766200151160ustar00rootroot00000000000000/* Test source("inputfile"). 
*/ source("sourcetest") // defines a variable expect("variable", variable, 2, 0) 3-3.11.1/test/sourcetest000066400000000000000000000000441503346766200150530ustar00rootroot00000000000000// read by source.txt variable := 2 3-3.11.1/test/sp4_angles.mx3000066400000000000000000000011671503346766200154270ustar00rootroot00000000000000/* Test angle output. */ // geometry setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) // material Msat = 800e3 TOL := 1e-5 m = uniform(1, 0, 0) expect("ext_phi", ext_phi.average(), 0, TOL) expect("ext_theta", ext_theta.average(), pi/2.0, TOL) m = uniform(0, 1, 0) expect("ext_phi", ext_phi.average(), pi/2.0, TOL) expect("ext_theta", ext_theta.average(), pi/2.0, TOL) m = uniform(1, 1, 0) expect("ext_phi", ext_phi.average(), pi/4.0, TOL) expect("ext_theta", ext_theta.average(), pi/2.0, TOL) m = uniform(1, 0, 1) expect("ext_phi", ext_phi.average(), 0, TOL) expect("ext_theta", ext_theta.average(), pi/4.0, TOL)3-3.11.1/test/standardproblem4-3d-minimize.mx3000066400000000000000000000005331503346766200207540ustar00rootroot00000000000000/* Micromagnetic standard problem 4, with 3D discretization and minimize instead of relax */ // geometry setgridsize(128, 32, 2) setcellsize(500e-9/128, 125e-9/32, 3e-9/2) // material Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) // minimize minimize() TOL := 1e-3 expectv("m", m.average(), vector(0.9669952, 0.12521563, 0), TOL) 3-3.11.1/test/standardproblem4-3d.mx3000066400000000000000000000007711503346766200171410ustar00rootroot00000000000000/* Micromagnetic standard problem 4, with 3D discretization */ // geometry setgridsize(128, 32, 2) setcellsize(500e-9/128, 125e-9/32, 3e-9/2) // material Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) // relax relax() TOL := 1e-4 expectv("m", m.average(), vector(0.9669952392578125, 0.12521563470363617, 0), TOL) // reversal alpha = 0.02 B_ext = vector(-24.6E-3, 4.3E-3, 0) run(1e-9) expectv("m", m.average(), vector(-0.9846296310424805, 0.1256464719772339, 
0.04335508495569229), TOL) 3-3.11.1/test/standardproblem4.go000066400000000000000000000010321503346766200165220ustar00rootroot00000000000000//go:build ignore // +build ignore package main import ( . "github.com/mumax/3/engine" ) func main() { defer InitAndClose()() SetGridSize(128, 32, 1) SetCellSize(500e-9/128, 125e-9/32, 3e-9) Msat.Set(800e3) Aex.Set(13e-12) Alpha.Set(0.02) M.Set(Uniform(1, .1, 0)) AutoSave(&M, 100e-12) TableAdd(MaxTorque) TableAutoSave(5e-12) Relax() // reversal B_ext.Set(Vector(-24.6e-3, 4.3e-3, 0)) Run(1e-9) TOL := 1e-3 ExpectV("m", M.Average(), Vector(-0.9846124053001404, 0.12604089081287384, 0.04327124357223511), TOL) } 3-3.11.1/test/standardproblem4.mx3000066400000000000000000000012551503346766200166330ustar00rootroot00000000000000/* Micromagnetic standard problem 4 (a) according to http://www.ctcms.nist.gov/~rdm/mumag.org.html */ // geometry setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) // material Msat = 1600e3 Aex = 13e-12 E_total.get() // try to trigger bad lex2 Msat = 800e3 alpha = 0.02 m = uniform(1, .1, 0) // relax relax() save(m) TOL := 1e-5 expectv("m", m.average(), vector(0.9669684171676636, 0.1252732127904892, 0), TOL) // reversal tableautosave(10e-12) autosave(m, 100e-12) autosnapshot(m, 50e-12) B_ext = vector(-24.6E-3, 4.3E-3, 0) run(1e-9) expectv("m", m.average(), vector(-0.9846124053001404, 0.12604089081287384, 0.04327124357223511), TOL) 3-3.11.1/test/standardproblem4_rk56.mx3000066400000000000000000000013041503346766200174750ustar00rootroot00000000000000/* Micromagnetic standard problem 4 (a) according to http://www.ctcms.nist.gov/~rdm/mumag.org.html */ // geometry setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) // material Msat = 1600e3 Aex = 13e-12 E_total.get() // try to trigger bad lex2 Msat = 800e3 alpha = 0.02 m = uniform(1, .1, 0) //solver setsolver(6) // relax relax() save(m) TOL := 1e-5 expectv("m", m.average(), vector(0.9669684171676636, 0.1252732127904892, 0), TOL) // reversal 
tableautosave(10e-12) autosave(m, 100e-12) autosnapshot(m, 50e-12) B_ext = vector(-24.6E-3, 4.3E-3, 0) run(1e-9) expectv("m", m.average(), vector(-0.9846124053001404, 0.12604089081287384, 0.04327124357223511), TOL) 3-3.11.1/test/standardproblem4b.mx3000066400000000000000000000010531503346766200167710ustar00rootroot00000000000000/* Micromagnetic standard problem 4 (b) according to http://www.ctcms.nist.gov/~rdm/mumag.org.html */ // geometry setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) // material Msat = 800e3 Aex = 13e-12 alpha = 0.02 m = uniform(1, .1, 0) // relax relax() TOL := 1e-3 expectv("m", m.average(), vector(0.9669684171676636, 0.1252732276916504, 0), TOL) // reversal B_ext = vector(-35.5E-3, -6.3E-3, 0) run(1e-9) expectv("m", m.average(), vector(-0.9692331552505493, -0.12039542198181152, -0.0053076110780239105), TOL) 3-3.11.1/test/standardproblem5.mx3000066400000000000000000000007221503346766200166320ustar00rootroot00000000000000/* Micromagnetic standard problem 5 as proposed by M. Najafi et al., JAP 105, 113914 (2009). Reference solution by mumax2. */ setgridsize(32, 32, 4) setcellsize(100e-9/32, 100e-9/32, 10e-9/4) Msat = 800e3 Aex = 13e-12 m = vortex(1, 1) alpha = 0.1 relax() xi = 0.05 J = vector(1e12, 0, 0) Pol = 1 run(1e-9) m_ := m.average() expect("mx", m_[0], -0.23479773, 1e-4) expect("my", m_[1], -0.09453578, 1e-4) expect("mz", m_[2], 0.02296375, 1e-4) 3-3.11.1/test/standardproblem5a.mx3000066400000000000000000000017771503346766200170060ustar00rootroot00000000000000/* Test for reversal driven by Slonczewski STT. 
*/ Nx := 64 Ny := 32 Nz := 1 sX := 160e-9 sY := 80e-9 sZ := 5e-9 setgridsize(Nx, Ny, Nz) setcellsize(sX/Nx, sY/Ny, sZ/Nz) Msat = 800e3 Aex = 13e-12 alpha = 3 // Set a initial magnetisation to C-state m = uniform(1, 1, 0.001) minimize() alpha = 0.01 lambda = 1 Pol = 0.5669 epsilonprime = 0 pdeg := 1 prad := pdeg * pi / 180.0 px := cos(prad) py := sin(prad) fixedlayer = vector(px, py, 0) Jtot := -0.008 // total current in amps carea := sX * sY jc := Jtot / carea print("Current density is: " , jc) J = vector(0, 0, jc) //autosave(m, 1e-12) tableautosave(10e-12) save(sttorque) TOL := 1e-3 m0 := m.average() expectv("m", m0, vector(0.9586266279220581, 0.20391345024108887, 0), TOL) run(1e-9) m1 := m.average() expectv("m", m1, vector(0.6440994739532471, 0.5131782293319702, -0.1569230705499649), TOL) run(1e-9) m2 := m.average() expectv("m", m2, vector(-0.957406222820282, 0.20698121190071106, 0.009677470661699772), TOL) 3-3.11.1/test/std5b.mif000066400000000000000000000027251503346766200144560ustar00rootroot00000000000000# MIF 2.1 set pi [expr 4*atan(1.0)] set mu0 [expr 4*$pi*1e-7] set basename std5b Parameter total_current -0.006 ;# Current in amps Parameter Ms 800e3 Parameter A 13e-12 Parameter Polarization 0.5669 Parameter Lambda 2.0 Parameter eps_prime 1.0 Parameter mp_theta 20.0 ;# Direction of mp, in degrees set mp_theta [expr {$mp_theta*$pi/180.}] set length 160e-9 set width 80e-9 set thick 5e-9 set Nx 64 set Ny 32 set Nz 1 set current_area [expr {$length*$width}] set current_density [expr {$total_current/$current_area}] set xcellsize [expr {$length/$Nx}] set ycellsize [expr {$width/$Ny}] set zcellsize [expr {$thick/$Nz}] Specify Oxs_BoxAtlas:atlas [subst { xrange {0 $length} yrange {0 $width} zrange {0 $thick} }] Specify Oxs_RectangularMesh:mesh [subst { cellsize {$xcellsize $ycellsize $zcellsize} atlas :atlas }] # Exchange Specify Oxs_UniformExchange [subst { A $A }] # Demag Specify Oxs_Demag {} # Evolver Specify Oxs_SpinXferEvolve:evolve [subst { alpha 0.01 start_dm 
0.00001 min_timestep 1e-18 max_timestep 1e-12 mp {[expr {cos($mp_theta)}] [expr {sin($mp_theta)}] 0} J $current_density P $Polarization Lambda $Lambda eps_prime $eps_prime }] # Driver Specify Oxs_TimeDriver [subst { basename [list ${basename}-eprime1] evolver :evolve stopping_time 1e-9 stage_count 1 mesh :mesh Ms $Ms m0 {1.0 0.1 0.0} }] Destination archive mmArchive # Schedule Oxs_TimeDriver::Magnetization archive Stage 1 Schedule DataTable archive Stage 1 3-3.11.1/test/std5b.mx3000066400000000000000000000026221503346766200144060ustar00rootroot00000000000000/* Test for Slonczewski STT with nonzero epsilonprime. Standard solution was verified against oommf 2.0a0 */ setsolver(5) DemagAccuracy = 29 total_current := -0.006 // Current in amps Msat = 800e3 Aex = 13e-12 Pol = 0.5669 Lambda = 2 EpsilonPrime = 1.0 gammaLL = 2.211e5 / mu0 mp_theta := pi * 20.0 / 180 // Direction of mp length := 160e-9 width := 80e-9 thick := 5e-9 Nx := 64 Ny := 32 Nz := 1 current_area := length * width current_density := total_current / current_area J = vector(0, 0, current_density) SetGridSize(Nx, Ny, Nz) SetCellSize(length/Nx, width/Ny, thick/Nz) alpha = 0.01 FixedLayer = vector(cos(mp_theta), sin(mp_theta), 0) m = uniform(1, .1, 0) minDt = 1e-18 maxDt = 1e-12 maxErr = 1e-6 run(1e-9) save(m) TOL := 1e-5 print(m.average()) // # ODT 1.0 // ## Desc: Data from vector field file std5b-eprime1-Oxs_TimeDriver-Spin-00-0002233.omf // ## Active volume: (0,0,0) x (1.6e-07,8e-08,5e-09) // ## Cell size: 2.5e-09 x 2.5e-09 x 5e-09 // ## Cells in active volume: 2048 // # // # Table Start // # Title: Average across active volume // # Columns:\ // # m_x m_y m_z // # Units:\ // # {} {} {} // -0.953323544827031 -0.279948071263437 0.00528093273512820 // # Table End m1 := -0.953323544827031 m2 := -0.279948071263437 m3 := 0.00528093273512820 expectv("m", m.average(), vector(m1, m2, m3), TOL) 3-3.11.1/test/std5c.mx3000066400000000000000000000013761503346766200144140ustar00rootroot00000000000000/* Test for 
Slonczewski STT with zero epsilonprime. Standard solution was verified against oommf 1.2a5bis */ total_current := -0.006 // Current in amps Msat = 800e3 Aex = 13e-12 Pol = 0.5669 Lambda = 2 EpsilonPrime = 0 mp_theta := pi*20/180 // Direction of mp length := 160e-9 width := 80e-9 thick := 5e-9 Nx := 64 Ny := 32 Nz := 1 current_area := length*width current_density := total_current/current_area J = vector(0, 0, current_density) SetGridSize(Nx, Ny, Nz) SetCellSize(length/Nx, width/Ny, thick/Nz) alpha = 0.01 FixedLayer = vector(cos(mp_theta), sin(mp_theta), 0) m = uniform(1,0,0) tableautosave(10e-12) run(0.5e-9) TOL := 1e-5 expectv("m", m.average(), vector(0.905612587928772, -0.2860856354236603, -0.011005667969584465), TOL) 3-3.11.1/test/std5c3d.mx3000066400000000000000000000014761503346766200146440ustar00rootroot00000000000000/* Test for Slonczewski STT with 3D discretization Standard solution was verified against oommf 1.2a5bis with 2D discretization */ total_current := -0.006 // Current in amps Msat = 800e3 Aex = 13e-12 Pol = 0.5669 Lambda = 2 EpsilonPrime = 0 mp_theta := pi*20/180 // Direction of mp length := 160e-9 width := 80e-9 thick := 5e-9 Nx := 64 Ny := 32 Nz := 2 current_area := length*width current_density := total_current/current_area J = vector(0, 0, current_density) SetGridSize(Nx, Ny, Nz) SetCellSize(length/Nx, width/Ny, thick/Nz) alpha = 0.01 FixedLayer = vector(cos(mp_theta), sin(mp_theta), 0) m = uniform(1,0,0) tableautosave(10e-12) run(0.5e-9) TOL := 1e-2 // a small 2D/3D difference is acceptable expectv("m", m.average(), vector(0.905612587928772, -0.2860856354236603, -0.011005667969584465), TOL) 3-3.11.1/test/steppercache.mx3000066400000000000000000000012361503346766200160330ustar00rootroot00000000000000/* Test if the stepper cache buffers are flushed when starting a new run */ setgridsize(1,1,1) setcellsize(1,1,1) msat = 1000e3 alpha = 0.1 B_ext = vector(0, 0, 0.05) setsolver(5) // This steppers uses a cache buffer fixdt = 1e-12 m = uniform(1,0,0) 
steps(1) rotation_wanted := acos( vector(1,0,0).dot(m.average()) ) m = uniform(-1,0,0) steps(1) rotation := acos( vector(-1,0,0).dot(m.average()) ) // Note that the rotation angle should be the same for the two cases for symmetry reasons. // However, if the cache buffers are not removed, this will lead to an erroneous result in // the second case. expect("m rotation angle", rotation, rotation_wanted, 1e-5)3-3.11.1/test/table.mx3000066400000000000000000000002521503346766200144510ustar00rootroot00000000000000/* Test adding user-defined variable to table. */ setmesh(2, 1, 1, 1, 1, 1, 0, 0, 0) f := 0 tableAddVar(f, "f", "Hz") tableSave() f = 1 tableSave() f = 2 tableSave() 3-3.11.1/test/tempminimize.mx3000066400000000000000000000003651503346766200160760ustar00rootroot00000000000000// test minimize with finite temperature setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 alpha = 0.02 m = uniform(1, .1, 0) temp = 300 minimize() expect("T", temp.Average(), 300, 0) 3-3.11.1/test/temprelax.mx3000066400000000000000000000003571503346766200153710ustar00rootroot00000000000000// test relax with finite temperature setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 alpha = 0.02 m = uniform(1, .1, 0) temp = 300 relax() expect("T", temp.Average(), 300, 0) 3-3.11.1/test/testdata/000077500000000000000000000000001503346766200145435ustar00rootroot000000000000003-3.11.1/test/testdata/binary4.dump000066400000000000000000000726501503346766200170140ustar00rootroot00000000000000#dump002ds4$'PʾM^ :q_mSpLIH(J]MxQTXUamzHCϾNc_A@Rܻ3 ;Mʾ8)>u쑑w⍾ ی9Sre ľ]I 53e:z=~` 6H[09zؾ:Ǿ驻 |ɱi7(OžEо)ᾲ`#6+m)W~ 澜۾־jRվ.׾(۾O`=zjB,RH}dfvy}}t>lɼZHG4i"T<Pe9Yr !2VHkG`Zr~{~n_C@MĔ;`+9C%2 A"-@xCfLNWVc%(oN x/6})huYYXmIntjQaZR1VUkKY_͢gApnw]|z.yh#~o~~~~&!j2ysomn)r0v~i}|eE|{b{L!{z(zV{E{({{'I|8|2}}G.~~i"b4YHz~~9~q}}|$'|]{{xw{)q{5{{{4||~|zc}z}K~Ŀ~/; ~(~X~}})}|y|y|4f|f|z|||}j}}!~R~~HПeݶwI[~^v~1~} }S}{}4n}p}~}}<}}=~~t~jiS <ˠzP$C~9~~~ki~W~ 
P~*R~g^~nt~~ٺ~~S2 ԦosXt?*(>~_~~&0B{bL|z+KttkfFdflwu[$MQ;ɮ5ɭ.L#~],c-[  ?m68ZRu8F/191?|.#-nk%vDK!Jo4((@iLgR)u+iz|W/7{U \.F'/f}m2O_q}KItdnb],PD0mCeE1*5VUyV0 xtsgG'@u[A( /blWC1" 4S~{j[MB;9<FXr{oe]YX\fvytrsxsoot|ycWSU]iwZ@3/4?Obwa2+D_z8;^blht1]3O@t"XEc1"0VmIb%j(o*t X 4VxTHf![s[L9/q}{#rb6Y%Cb9>nA.@5+Q*g5{*@/X/Q[:"1RK!:Aگ~,v7/n$\ߙƔ8G #XmW$Wzw;wzŧe')70%ϫ {pjhkr~76*>" qyr qtL~Hlv,[xv=[u? Pmw`:AL};u+~e}|m|$|u1|?|s<}~4~Yud.o>~1}{ywmvȐuZqu5v2(w2xzr|,-~l+94}{ xzso9lCj2jHkTmRbo0rb4uzjՔo s wzt}y9}wpay;vA +zp L]Q ='29=0FG}PX L`fPl}qux(a{}}%vU?{ܾھ羜U8 =*3&3?RJS#\6@c6JibnrVFv0Oy3{}vJ˝kE|D;ȅSn{>(h<7CN"Xf_υf|kpKtntwzW|j%~?M/Nh=dh4ޔ༾s|-'$=;Ia%zQ !) "4Cs:PTZa=hKmaq|ttw[yG{I}i~[?>h>߁=W<;Ns"N:+=-XL›W0U`UgllRp svxze|*}~y?\[??>w>w!>^0<_&h6|HͬUp_fDlpMs{vx`z{=}i*~  ;~?w?\??>>ՆŽOӰ :-WDT(_+xg!m:Cq@ktFvxcz{=|*}~I@?}?8w?N/_?%?:e>b8þ(y@ARUFaiVo#s9uw)yDz{H|R}P~I~m?.?ND~?Fx?Td?z(?p9L>YP=Ytg"osY~vh~xy{l{D|U}}^~Q~8?v?v?d?~?q?V ?e`>DIcYryz}.g9XFK}?!z?Ɵt? k?W?I,?>Ba6I6PdAqIy1}~EXnpK|?KSw?do?)a?]lD?O?)%>P־@(ٚ?bTeJ:r%}yz'}z~3.dz?t?i? 
U?*,?>=)\F#.CXUi6t z}%D3y?/q?#c?`E?pK ?x> >AoIc<}񵾰Ͼa, C)6{Oftb{(:~~Z-v?g?eE?z?~v>mn= ~@4q|e2yN KѾ1 p "!>Zoy,}L]s?t [?m ?>N >:V<6׽S"Q'zO}K Ը>Ⱦ۾J.(?)*LixĿ}^Gk??;?Gz>9>d"=V;eAmh+1Kfd~Љ⓾S`f׹ΧϾOF|2:Ѹb^we}T?#>ցY>G==ل"{94L6Խ/fyu2qDŘT5cӆqrs*Ŧm¾3e"ZwQ?Zg>U=RB9=9T-R>d 6 ؾ!+z?yN YA`d;g'qhȸhKh;g}zeb^AX|kN=db"쉾 yؼz8žٰ 6;E6 QaXXe\^_E_]ZդVQPF7Yp줾K=uC |U;B).5.='uHORSSѽPڋL!FK!yd< <.б<>ᆭ846&W4j=~[BCB?FT8w .r ۾ɜΉBʽ@;C;=H=0=i =#4x Z&O,B-0+$!| ͩ꾻\}axT@W=Ӡ<P= ==e==^j=q1=1LW˾H I4 B Y|پxư򄁾s$`/j>>i>۵=v4===G>8>15&>UC)>x(>R%>H >jv>V>>=S=2K=#?m=Y\;j=.G=W= 3=E>J!>/>8> >>5tA>AA>4?>:>,4>+>{!>8>M>=tW=E=+s=<-&=)=kQ=C=_ >>#.>g9>+A>F>lI>I>G>?C><>>6>->5#>[>V >==E=$2s=%<ʏR=$R=_!==e >'>&>0>@8>3=>?>^@>>>;>eo6>/>J'>]>>>=`6=+=k=<4==ۥ===P={ >g>>z>t%>V)>$+>+>+7)>%>c >>o>>6=!=:=8=]=<=etA=Y[z==== !==>*B >rN>L>>9>Q >ka>==[==(==;I=m=ѥQ=Rc=3s=J=iÅ=?Љ=M===_= =Wy=qe=rK=)=J;z ];$!{;;;M;uA;O;;;P;+b; ;;;ʜ;&s;r'8xv9C:<\:&b::;)+;H:;PV;p;8;hO;u;xs; ;[;;q;,;s;i;t;S;];M2929&-:?::pU:J;,;7;]P;~g;};p;;;¢;zq;.Q;9 ;;;9P;;;9;G; H9 :U:>:":l:ֶ ;;o5;J;B_;Yr;F;*g;샑;n;F;DԞ;ǟ;Rq;U;dǒ;;i;.4;_!9Y'3:p|:1::b:} ;!;n-4;HF;.X;&h;1w;;BW;;l;Y;;;g; ;ws;KQ;!;::Z:쭒:l: S:>;,;.#;3;ӅC;}PR;g_;9Al;w;;{;υ;IІ;F;0;:~;fwp;#[;><;;X3:~:{:<:4:};>;<&;`4;A;GM;X;Hb;mj;yp;u;x;Nx;u;p;af;uX;ND;|';_z;V:䬖:B::ڐ;;p;L*;6;@;_J;FR;Y;U_;bc;Ke;Q7f;Yd;Ҙ`;Y;oO;tA;.; ;iR:U{:F::d:x ;;1$;q/;}8;.IA;VH;4N;R;U;NW;cW;4U;}[R;yL; E;z:;V,;e;WT;':ΐ:ȿ:!a:';v9;{";z,;x5;A=;C;G;J;&L;aM;L;4jJ;(F;sA;|o:; y1;C&;X;;M:6::D:|;;4";X-;ad6;`H=;tB;F;|0H;EH;RH;NiF;5C;K>;8;M1;%);; ;;$:F:FA:m:)q:;~ #;:0;89;%A;o"F;U)I;nJ;,J;?QH;,(E;@; ;;34;8,;(#;5;l ;_;:::!h:5\:H ;p.!;v2;>;G; PM; iP;IQ;6P;hM;m I;IC;?<;(4;*; ;p;Yu ;O: ::͡:s:4:-:|;1;yQC;HO;V; [;@\;Z;W; )R;+K;B;9;8.;v";f;;:o:ݻ::9:A:X:ep; );8C;U;fa;g;j;i;Yf;Lr`;3X; N;}C;7;);0;5( ;oS::X:1:w:;:NB:p9;(<;;W;Hdj;ڗu;({;(5|;y;Ys;3Nk;`;T;E;›6;5&;; k;:W:A5:9t:K4:9kۋ9x8V!;meO;n;;';'r; 1;م;Ł;%{x;j;kw[;GMJ;7;0;=;.;%j;&L;O.;0;>:&V:+[:;9p8xz|(P'"8{;9-;<;;;G;;$;c];_;;(;!;^;L<; L;::N:69h8À]U=;;g;;y;;;S;2f;p;];x;v;Ѹz;=S;g-+;e>;ua:Q[:9ƃ0mbt_3~ϡ;1;p;=D=$H=J=nI=ZF= 
A=M9=2\/=O#=G=Qo=m<<=v=}=s=Ne=xT= @=(===-=s==ub=߀=Qr=\^=F=*=: =<֫gT,<&R =ڕ(=fe;=L4F=H=y4B=/m1== <*E<»u[ ZAܽ:|Wm*I1/@& 귽o)f;.ɾms]㥾dKLz_G" H%A E޾9zƾX \.zhZr,mp)!~rvz1|%{RytmqAdCXJ;y,*JgO辖о'=(i4tvK 彇|Y |?xpDdUE+5$Yh׾YۉZ,a};=]Խt jd8JyE~a3z/qmb<QO>+4 L3ܾv[ľxᣈnnTL-+½_bNcTr|4`|EqO^sIgm3n0- ݾžaY꛾NtpUj7I^1IIn}x~?pGW}B$p³l I>,(m оش=X~%Kch(K4 ЀPW,iH WLpև@jz-`l+AپCŶh;8&v1](G4b#<]p*Ľ狟?[iYT<\<;X+Ek_GY-BϾٕ tnT?&0|#yzN]%ν |-A===%>P>o>=?,?h>8=͓ _>֖>Ӂ>ơ?_?a?;#?>u>S>>ANj=<BU:򎔼4 3:p;>Q9f-˨D~>[>Y>>a ?+Y=?t?Pu?S?g3?=8?@{>R?>gim>>=4[= <ީ; ͼPż옢QL2>e'>6g>`>)-$?0aX?|?z?rh?T?1@?k)??_}>٫>ze>B>=nh=Z$>>m ?*u=?Pk?tI?#v|?bq?e?VY?K?F:?%%?6 ?s>ڕ>fJ>D>?=hI=gr,2`>>*>#?3V?Tw??I}?.v?o o?1g?^?FU?I?8??p>>Y{>#>X===6>O?q>?k?P}??}?y?t?nJo?^i?mUd?]?1T?G?3?g?s>ԣ>A>=vu= '='>#?@\?xx?N??4~?o{?Qx?Dt?Rq?Nm?Яi?e?^?OU?E?#m+?Ý?>\>>m=`$=۞>p\?G?r?}???~?Q|?lz?]>x?u?s?Jq?n?ٛk?Gg?m`?T?{???Q> t>V~>=c>-? j?{?]???~?}?g|?&z?"my?uw?7v?u?ss?iq?vn?i?Qa?1Q?^/?>q><>e~?`?(z?c~????vG?~?}?f|?|?j<{?lz?ky?x?w?v?t? r?\l?a?;E?O?/>~Z?say?2~????o?`?I?~?}~?~?}?D}?L|?5|?a|?~{?z?z?x?7u?xn?6Y? ??"ޮV.$K/յ/@0W-0Y=0>D0}E0D0 C0VA0Q?0>0;d>0>0H@0.I0-c0j201}1r2/ 2և./P0W909Z0k04r0(s0 q0op0Xgo0Wo0p0t0y~0~0#0v0,1h1}[0idEb*h P//>0z0P020b00ś00h0BѠ0;ȧ0ʗ00O0 )1P0H1 1f0Kl/䂯 f^;p!-|/W)00}[0˱00|0r0000-02810i1ʆ@1k1o`1i 1t00跴/vK,/ 00t000X0x0.090t0 1-1,715`11罍1+\1e1Ð0'10ϐ/f7/\ %I./Nf0000 1˞11l+(13>1^1q11O1h1|=1Th0 Y0 80%/ /yk/a_.b/k0Ƿ0܊0y1H%1~81ܒN1 qk131@1Ӹ1M1s[1j1!1˴0p0J*0 //ҳ/j`&/]/+00|0516:1o/\1L71Go1प11> 1 1 o1gy1F4170'0`60\0/ĝ/Աk//ٰ.)0x,0;\1O1ot1)11N1t1111o,W1cZ1i0P0 0i//0Y/**/.:.5.Ue/ 1A1t 141L11@1ϛ1y1&M1:0v0 0pa/]Y2.CLxͮǮîtՂg姮ҮX111OY1bc1~L11=80g$0Y-ǯ!oDOLV@̟2> ]d{ޯ5J抯kr7dx/_y:و-0߰.Lh򰻾簷ذɰ`=VmP,3rίy@ְH9B.8<=(:X5s-dG%^JްhȰ uᇰh? 
GutΰY|j1BNRU^YXzVQ+PI?]5M) 6p-<찏Ұ.xBޏM{/o^L #96]FQJ-[^ag_\XU.NDC8*) TE}°ެ(~wVܭȰԼ5l =M2z2>x)KYQxY}Z\X/fUrL|E8\-!f$++mZݰ0Ctph|}װ <&ɯ3:{DgG LejKK&FUA7.|h!lV\sc)°2r1Y°ᰶFOq!+.4a4T7!53Ϻ1)#?IEZ욾?)N+sΰ]{ݰOe  7qi1YX't6iϰ"㯰lu7蔰5ҩ5$̰MUX,bwpb S!{ްoɰ[Ѕ{\x)_0}ő*6x°@#Ӱ\հgi(yI$hQ< 氅yڰְ5+Ȱ߾ MpaG0S^arA̰>Ͱװq԰4UܰT]ְ۰Ӱiְk̰˰%Q׻P-',fl*G3w273C1ɰ˰0Ұ_аJ԰d ϰ|а ɰǰx鰰܇vU>4Ս#*h窰'İ˰mаAfҰWdӰqѰϰқ˰'zǰk۲y誰_Bu:\HfvHe(`*𲰓Ͱzmа]ٰհYFܰӰowװ#D̰-ͰEҏZg!X$j}*_~D#W)Vۜ\ǰ ڰdװq!ٰܰX#cа`ذdSðǰ>-ˈH^F#5ݫɰ!ϰLa߰U*Bzhְ]ȰհLѢѤ腋HSbMÝ%{S^o;ذٰ)X`d2=/Q ް$Wqϰ氄=|Ͱt맰VTUhoX%D*!Ի8쾰H'zk&gV 氦hְðsް+pOe30M(0]ȰOɰrm2U7.!U>)?ް'q)ʰRܳ%&Ͱ㙰Cg3y@sT600հ?հƑrgK+$Cn/+ t"|q?Ѱb#ݰ^Qű@, fA>9&䰝ᰓ+6],M ~:J >zR;L`0rX!nذa s%b.FCEﰒ() Xi:}i{1M#nݢ 0Eް`:&JְeC-MH$>1Xs\ҪWwXKիL >K0o.0uP} 1QF"Kn}c׍-soW6LѯhoЧnig*,<[m*FK :fS'h ^C+xȰŕ@ؔr/g4\ܷ&z{W ±yaDZ%l±_ [3lRGtv2z3Y(.jS4mӰI8A4vyǛnK쇱SBh抱α↱YӱM~ӪͱckxVT+>gm2%u+ <Ɛݰ뫰 KGpӅ˱+CD۱՚Pޱ/btرe|ʱ*dJJ;o/F\ ~DxǏ˲5!رGIOK]J;ݮұ_sfVG顱S8e!iJ"+} ꯰7J$ñ; )bM|R'eFԝvA책Kڱ6y±(bզ/1A) OPLjUGFPѱLU^B Yxo;dSi.uErݘ3 %;ϲ%Iͱޱ6=賄9DA>HBl G)fB%j3sloق(6cqwlQXkc*^ۺzh(B\*,Gɲ#]> `<WP\3`  T߲&pӲG)+u@^(f9,j>)GؽS5Vl^[_zmXZXzMyI;42$|P4~"в킯-,kO )dK'`&]J߳ón̏vn<S*Ef$;޲94564+OOh"e>7J9`h @ ,# ֳCѢF gN8Ϫ'߳Nsw 1>Pۯn0,ix>;! U-s_CسIڻ=g"7Sϫ]貳ʳٳϳꨘ[ԝ5Ǵ \@гH 0JZJg޳Pog7h6}\|ZN~ QɔR.lf:ƧkĴlkڴ~ִ3&녴kDBų ǑNh7T2=:aWLH=}^ݳ<dP2^O1s񅒴.$}@r ,zسIԏj#c8C^L;XͻKٯճ6 jF5rY LC>³䖳9co/Q97%ܘGcىo c:`̽>ѳ ᳤w5!&QGy:39Ksmzqd;[AQ;׽@г ϹZ es{}쭳(Mѳضﳿ!:1Higp/1(ٳ#AF]]{^ݳˆHl5J>UWoVP Òa/!ҳP90̞O=V$VBX2)@NɳSٳX.~;"6^?,x>6NEY<`wdh{mJvrdz<%& Ҍ&ﴳM:γne\ܧ! 
08(L:;V{%*:-./1m>5<4_LAoO񳸞y(0<\Qb @߲1ӲYϲ8ԲqCZ P " )$ u0 ՜ Cu})KZ&ʳj\}OHe[3-3.11.1/test/testdata/binary4.ovf000066400000000000000000000740251503346766200166370ustar00rootroot00000000000000# OOMMF: rectangular mesh v1.0 # Segment count: 1 # Begin: Segment # Begin: Header # Title: /home/arne/wd/les/ingenieursproject/oommf/test1 # Desc: Field Index: 0 # Desc: Applied field (T): 0 0 0 # Desc: Iteration: 1623 # Desc: Time (s): 9.184200135885099e-9 # Desc: |m x h|: 9.6355219620542264e-06 # Desc: User Comment: # meshtype: rectangular # meshunit: m # xbase: 1e-08 # ybase: 1e-08 # zbase: 1e-08 # xstepsize: 2e-08 # ystepsize: 2e-08 # zstepsize: 2e-08 # xnodes: 100 # ynodes: 25 # znodes: 1 # xmin: 0 # ymin: 0 # zmin: 0 # xmax: 2e-06 # ymax: 5e-07 # zmax: 2e-08 # valueunit: A/m # valuemultiplier: 800000 # ValueRangeMinMag: 1e-08 # ValueRangeMaxMag: 1 # End: Header # Begin: Data Binary 4 I84ǒ5B??R@A 2r`~=ydigǾ؈ l>8zvſnB\ɿparQ#/]t侕L10)y2j1 /}駾71X/x%<@4<%t <԰^oe<踰ܠT|8" ?[ t7&?yb_4.?~; ^IW?Wٓ?HiZS?~?|>2LQ}6?zd>JA=2?y>`2,^?x{>vN%7?v>B?s]>ۚ?kG>c@)?T?~e?Q?Z~<0($S9"3ܿ6KH6 E׿S!^f ZɿK_) b=y>._e40+jʾW0,xs\1A޿|Q11Y_=&-րY=Rʰ=4Lۙ=Ø$:L6:F\4:ÿȱnǿ:Dҿ:q); H$J;|ұ¿;) ڪ;;<(4ؿ;Oemo;fñ;S Aa;$8;-9G l;vp;Λ1E;&ɯ< ˳˷<#le) fdj7?[\N6̷?wՀI15 qu?}e^??.W -I?v<\$fE?~=Ŵ?|>$+sN?z!>[]?wSK>'e|\?t>:޴?q/>;L?nD>6?g>³;Q?[ t?\p]?;??-ʍN>#?`:M>gZ?yasQ\'ҿb[4 Ls /ÿ0[9*b>j~+!p;GH ٮпM@C..I%R!8/bW x0aQʾ H1\;oᄈ1 t|F!1YO;\yοmZ=)9HH=R$|Y~= =AteԿ = ׷}<0YzhοyJ85'?Tc6E?\6{ĉ?w8 5+_M?~DN2h?v;?~&=ݝ-?yY>_ w?t>Y?od>g6Z?i>Wa?c#>*X?X$?OG?Ee?#? m?G֯{]>zG?j S>Y?z(\en=U?~2 bPk Zӿ29Krh)m?z-Ͽ46/,;Ŀ.5/ҵB;&640kKRѡ0|ZI1OͿm14} 1cb=j:ӿ~i=QkB~z=!_1jֿ(=ێ#w=z[Y5<=38?Ϳ <ژ۱/:1;:lH:˧5ڿ;j%B;ܐd;f<ƒݿc<]b>b<,,,9w>r52Y?I6˫?_/Nˏ65u?xFpL5?d+X;?}M>%OO>?t>1 ?k >?a)>`~N?U ? 
mL?E`?#޳?+@?>qc?z?\@>?r^>9?{X=G?~c!=9BR?Mp".V;ʿdg.zX/P WNy/|ȿ"iſE;0 +`=0fN54W0Bѿ&Z15V1R 41ton1L~ )1L~=G.ٿ~> =C.ݿ~9~=Bڿ~Ĝ=Ȯ=69߿I= l=\ ќ=F쉱) ;OHݱ;afڿ;uڲ s;'QۿW;̲"b3;=7^2;lUL;3l;yw*<_1< Ń<3`ҳ>jX=W^ _ =e-=i6 |,=`ۆ3 mÿ:=F4L3[=)44B=<$4 N4NETw5F{t:s5< k+6;y.R6{_6$s 5&=hN{%5(9 4Hy!w|Ȁ5Dq>οn58n?%B>6gz?dT6?~޽E49?z >P鴇?ig>ЁӴP>?Wϱ? aϻ?Dl]?$-) ?,*?=u*H? Kp?V3>k?k>v~?xx@> N?}˓="d?];~.=?0 <9?Dzߒ@ ^tsR/K$ʿMk/񠱿a/Y 0)WTQ 60t09Hu'0,6=j0ܿ=!,O1:6U B1)r)Ѿ 1ep1~H=W0-}i> _α8ѿ}q> eNʉ~X=PF]=2M=z=ſ=84:޿M˿;}\s;n; ;"{h;-X+;98IG;GMb;V˱m̿;g2տ;{(߲o;r'MS;둲 ȿ/;|!a;ղB^;Gsաh;䔦ɿ<=ʲ"<F`ҿ<60G)<\(P!}51:2>e:p$5ݛ?(z@6X.?qؾk5Ñ?rt>o?N)?ƴ?,I?=Y+?O?Xa0Q>?kP=>x?wT> ?}P=nm?N ~`A0<٘}>'۱UR}> {Q~[=ʧ>2zP=&<=UO=O<ʄx<.\<5\>8;{Ա.;Km;h];0;̒;wd:`&:7:&@+̿:Ա:~:LJ<;=F[ -;lڿ;JH; ֱ̿; }l;,h;>ҝ;pⱶ;$1­;,z;6daDC;A%O;MP |;[ F;j;|5(o;1 [U;*%O4;:TD;X+Y ;񁲅lt;˿<wrFD0<ǎ<5nSؿ <\γBNY\H\=\ Qax=u-{=f۳Vpy=)γ=r532L=B4y480.L9pzj5? VؿG_6?M?=h>>?_n>?t[>%)?|=?It^}<?󦳫KӼoA?o ?زZϽ6?5lAe?"49{?b?oYqxv0@)r09W:ؿk0zؾ d0[}\e0οR0$C1%H>W-+017L_|1NƿzE.|Ee>.#=|>&Y^}>g[-J~v^=! K)x$=q3ɿ=qǛ=)k e$Vr=\t=qI3PD}=1m/4q<]4{-ɛ&ك5\n}l~*5;5ru6q@sRZ6߉(,@6׿z+XL5ÿd:4 z׻jfP4m߾m3됾 Qp^cA"sq/cؾ&qEyOp?sXοl´q(Y`- <`e-Y걼\o-?,"?a0aB?uPP?z.RҾ)?|v#<P?}I鳯I?}賋c ?~4e ?~>Jm?~^6L?Gv(bq?`8_y0-Wu>tc%0Z9n/0P۲g;0˹^0X%Sˍ1 &B18~C嗿$1oGg;1t}1yM=-Y|>!JL{>9g:(|'$>0ۦX})>^Ҋ~1=ѱQY~C=吱:so=6!q+=>bޱ ݂=lƿ;BO;;>`7`z;:HsUs;7뱃z;5o;4-n߿;3;4`Xw;6*;8}a;=AY;BtP;I)UJ];QIe';Zʲ&#;fYc;sY ,|;žʿi;7 O;4;+; 1Sd;]c;f2^T@;< #j<+e^lh۵>eD?#;i,?S5޾օ?hrfl?qbw?v.՛cڋ?y`:@~?{o{s"S?|QUh?},?Գ6?~:L?ICqSmzv0=Yuo]0kow02ľ։hq'0eP_0x͒S1˿*r?1NܿLf!1pA澰x1¿ͽl[01M&~!=Ư|:>/ϰh{b>A+5X{]>8@Vzʿ|>za}>Hc;AK;@ɱ[<,;AI.l%;C~M;Fʱ/;Jn;;P6F;Wܿ;`rL;kN3";x{%ճۿw;Ob;] /(D;0>.i;_c6(;p²t;JS<#<"[;YA7ta<~ߐ=8J7cI>>x6I6?3g ſ@?Tָ:\?e˸2PdFG?o o6}xf/?}ф;$?~Lpz 0D>wuU0r4 ڿp0bRjh0|9_E0.2P11ؿ8TF1kq WNٿ |1àwn197n0:}>5l!{>8ӿ{!L>Fȶ-s{>=3P+Q|y>%t_g} >Z}~=w]Gg?t=2.տ=cR7=!]ڿ]<dy(<۰]T/bDۿk=~L=28v=@4e};{MCƿ|J[U"Eʖ+:>O<63mgDW#L* UR mH}gtxt Y޿<͵ئrY>u!;P?8=(?@1ƯF?YV̴O^H?g1 ؾ?oJnѥ>2e?tDz'?x>]PVK1?z&N6y?|fV ?~}Iz0E} v+0s(|p0.hK0rY]09 L1(+l;K. 
w13cV1 >|]$s1Z=:40v}T]>GDo{>> 羻z>Il%Gd{wx>?DZI|y>)V\ſ}S> B*\~=[L (*=4=s3qQ=.N,U,;wC;LV;0Gٿg; dlu;8˱4;}ɱ;rY7;h&'K;_g1;X> L;RFKF;N4[ _;Jkcſ;HE|e;HQ?;I m;K+;L;N ;T2;[wkVѿ;ew)r;sz;.r^;(Ql1;x,;'&m;95\P< Xmz<2|s`$5!x }I6NfܿeC/96\}LW56817"5-;<4!'=Aj3+&3BF(@O$I:-Ի;մ45+󴧚=ÿ+ളH|0nZTL=aF(9o"CznATnĴD9Ͽzy>Sd>{@??)kk#?Ks17N?^Ƚj?i^ Ny?qR좾O?u餳 f?ym"YE׾2u?|%{ ?~H罿{H0D쾌v 0q o'0Ś۶(g;0䑿Z0t"AF!1>3H1@o(%1Ң .z`1W,o2=0 ȿ|>8O{c>At5صz(>I{q)>@^?|f4>+$UX}{>NrX~ik=Kje>=G4aK=J1i;=::ibܷ=dhZഷ qA>?R@\Tb?ힴkl.?:F p?UF5F烬?dUm ?mNM(K}?s~d?wu`&51L{b>AAɽ{V>G{5>>5]|f>+N.}n4>LUf/~W={K쉿=*07Tt=i=Dw,=ymige>}_~C?%%.p3?IYr?]wK?iг ?qJû?v7dwT?zl-:-?}Dѳ PM]zϰ0AVž9ulZ0ogX(7mo0h`Ob0ĿPQ1-)CU@V{^>?4Ұ`{E>C?{>;)M|z>)7+CD}p>Lr~P =F&~=Ա35!kt=?Yڿ=K~=߳ |z#|9YG˴@ <U}1>㴅Hr:J>ٴ&3X? 6봽1*?8,a?T1!5䅸?eц?n澓?u˳hc5?yk.?|L "Qxz#0?Qt10oWO߿l:0B^Ѯ0-!F17,N5<1s1[sh)Tx14Fh=ye0P}ޜ>2LC|}T>(x2̿{>:=ؿ{(>><^{>6oe |>%8}~>9E|~R*=-.AU~_=41Ͽf=M5=Qp=xk<ːՓ;>KJ޿;43Vf;*b(;"vm3ѿ;0x;ޱU;J1; M%v; ;0Ͽ;L %;+-gm8;J9޲;|zT *;n;x;:<;rp=f\⚿gŴƿa𵷴A|gU3L#ilD`eqC:60FYu9X2iyž]1&`}$Hy!e/;##, :UB-~=ЃJ0 y}%>ezش¿iU>Ҡs$Hf?  
?G˴GQ& 1?^ԧؾ>?kٳa־S?ssm{q?x/(r?|5 $)XTz80>ﹾSsƜ0pEi[0;z=XA182271`5a1O{VyB1j=W@07#=0 }>x|\>%R >|>4,{>6ӓJ|4>/6|> c*}> Q8ʐ~^g=d7~=j)dF='X=TI bc=Yy޲%:ښ/ ;>epԿ;-_>;GҲt;4;I<s3<9<±#|5X_X5:/XR4YڿVb 2JV d,uXw\#pa_fEnd ;OPh=h˴|,¿ll[v]pYaAtk@A8+lw~{G(]}zѴ :+& ᡿9=[4Z}'z>BDkt6>}]4>p6)C?3ɴy p?UOۃ?gG/`?qivJ޾r?w11?|a 0uaUy0>d;rrX 0t儾)ey0ʿjNk|1i0HVY1qn`B61h~1!<٠0'~ڊ=a/i}>п|-> HͰ]|^>+|I'>-̽h|>'J-p}>)}<>ak-\~tn=୅.=J!#ȿf=˱t=UGSֿ-=!G֡<1<<ȶB_4<|!R<='<)<`t<c< v܃<$O#bE; ;ۘOa0;ʚκ;ư;Q.=;;DԿY;Y@ݿr;Iݿ;xN ;dY ;R[}Y;As%2;1M/o;;#(8S;pʱA1/;IC:SoQ :\ :l~j:hf:CH:V&I: :auL:J*; Eҿ;L۳$2T;\l^;Чÿ<K"-E<`5*Jf@n>LO{?g3:"?E:!J?`m!f?nv⾉s?v5>m<5?{~ վmyJ0>ep$0~y`_0,B=1@ʿ`Gk1zNO*1=|z@0=P06`~o=/}*>iLJ}%>vj|ȳ>!{mV|8>#5㰲 |~>]<}j>o }=ɐf!~=׭y!h|=?l=_=R+![= +5D rx">{Y>Cf>s>!?+m#iH1(.?Tľϧ?i䳕dr*?t־Y0OHR߿d1krZp1\+~h0hTt;0pĬu= 0\~= R/Y0}'=ۮϿ}>V{d}<4>8P}2>[u}cz>㖰⏿}>T~==ﰡ+$~=sl&=BIw= 6=M{=1ڿ<TfP5|شj6lP㵴y]gnbjdp侯XPtr̾=5,xt|Zʴ؊vʿxtzcU8ٳW{4[}UѾ]<~mQ쳽;iz3IX㳾<$Zӿ%=?z,|}>#Ƨt>Դ9Z?ôpg)???{09PO?aQz?r L_4[?zCľzvH0I.]e#0#+6>R-1) d}b1`oӿ{~=K1e620Y 3}y>M3,}> Vᙿ}z>߰.޿~!=6Eڿ~=[m+~=:VB0=vEu=yWði.=Coғ=zc<?5"|1u/5Dw(2hd5r0ʾ]5Kſo74nz0ʲ݉nrgR*oվhwY0q}SYrkFtK澙 -ѿu߾i(|wt,xnn(z`Up͇{; ೼U|H1J}۾pn~'%]7PCX;p:n PAKoڢ>1/L*?稴O̿F?Q1<m?l\oAt?xHrv0c- IWo0vP<d1H0Pvf"1 i~u0Ö~j08 Ǟ=)q/ XY=e/ę~=i.˫~=ג᮫տ~T=鯥J5~1'=r~.G=h~K=ͰBx~R=!}~t=ňZ=xs\b{=Z=eqϏL=5;[=P鰝 <f,<ְvpo8[OD\(i>Q$V=:2|?/^&%3?aO5?u7)}uCj02j54[1,o( 1 }}y 0tӿѽ C01'/%}=;>S/m=/kԿ&=.:~n=g~’=Sïؿ~=Wtׁ~=Ъ?~=6`޿~=:߰޿=(S=V)c=y$=Kr"=#wп =TM<İl<Uv<ñHv(4N ۭZ!56۽5d}#x>t Vb>"e?E;辷?nxZKNO71Ͽe3/1h{y60fty0ґ/ސϿr< / ʿO=0=H/ҿI=j^/ڿj!=l.5G=m<ܿ/Y=K2kڿ"i=EΎ"i=E۰G /=+M޿H=8~(ij=0=r֋2L=M?ǹ=)l= m\{<ݘapR<."G*|<۰4>˿Af;`F.;4N@@;Wb;Sð`oٿ~;9j3;iu;QK;<>e;'|c; @;TW8:M:F:ʿ:sڰ:Apѿ:BNɿ9k#8ReOȿNȰ$o^8(AD]ñtұ_+,x@-tv 9E2k:iJo;೓(%;!y;ʳ`;賕^: 1|"@4CSٿۋD5bP⽕5V~ 5h,||r* 5'{ѾG_4пzjTh3_zMVJkUzRųҁVz¾J4̀{a(AoԿ{37i|W,rV|U ZJ}I=}*j؄~*iX~óKnz~IWPBvӿ8Խ<y⽂9 M8N˳7"Pճ75m7;9+8c#<9Q/-=' ;dqL=mA}>~VXBw^>q Z?O{H?Y6&_cþz1}z:MĞ0[}%̽/lKآ/;U|/7f<./ky{= i/&`jn=1q.߿=L}eU=`^=m?#d7r=s+yb=s2$u;=k㱰/{=]VwS=I;tC2=1Er=)U{@~3_2Կn𐶿u#(^=k,hFm#H^u2:ۿ:1afӠ:߳: 9: :6dٻ 2Vj 
34=Y%4݄g5)~451~-,4-} "4vG}8þs3Zn}*핳+ڝ}Eq}}tƾ=} ijB}设qt\~%j0A~aj~i]R~@V 3@Ik)m轈,"+i[?A-ϿDSh4BT>wCؿ< l<'A[=$`F^=2}e><&ҿw>/yUE? \j # End: Data Binary 4 # End: Segment 3-3.11.1/test/testdata/binary8.ovf000066400000000000000000001666611503346766200166530ustar00rootroot00000000000000# OOMMF OVF 2.0 # # Segment count: 1 # # Begin: Segment # Begin: Header # # Title: Oxs_TimeDriver::Magnetization # Desc: Oxs vector field output # Desc: MIF source file: /home/syukri/workspace/oommf/std4.mif # Desc: Iteration: 5, State id: 20570 # Desc: Stage: 0, Stage iteration: 5 # Desc: Stage simulation time: 5.7528e-12 s # Desc: Total simulation time: 5.7528e-12 s # meshunit: m # meshtype: rectangular # xbase: 1e-08 # ybase: 1e-08 # zbase: 1e-08 # xnodes: 100 # ynodes: 25 # znodes: 1 # xstepsize: 2e-08 # ystepsize: 2e-08 # zstepsize: 2e-08 # xmin: 0 # ymin: 0 # zmin: 0 # xmax: 1.9999999999999999e-06 # ymax: 4.9999999999999998e-07 # zmax: 2e-08 # valuedim: 3 # valuelabels: Magnetization_x Magnetization_y Magnetization_z # valueunits: A/m A/m A/m # # End: Header # # Begin: Data Binary 8 @w!BҀLV!y.!&Sx@sBxa$6 'D+L@>]v%k|oD̽H@}3)&ƳGxvZ9@lyC&d~@(&,[nZs@<y&ߢÉlj@T@k0 'qJI?ɩ@\]':FR $@ 't[L7"p@_6(ɸw1A2Pf(mdJ֫g+i(_|OA2]@6/0X2cg(tq@c\ b)2h(D޵v@]fHh(Mho@Rvl,ni(LaY@V[ [Ѵl]i())b@FԿHi(lإ@L5D7Qvi(ܪ@UIA듳Ti(K{Gv[$^X*i(;=6ZoW9/\i(P=bUjb2"oIi(/:פs^$i(Xs=*lb i("P? HoOi(o@z3| $;ti(T7T?w i($v j!$7vi(`?wCScSi((2VKn;&i(ZpȰпi(8Y&Li()ug$! 
i(|Haj˟AnN$i(<ԉ.S[i(2-,tYյti(+ O} t_g%i(Dі\_Kf>i(Հ>W@X[nNKi(2Eu@fOc+wLSXi(@΋U:{i(eQ@T6@.8i(v]O @Atdi(-Oݓ@Pki(0آ@Ӫ-#qni(>x@ 3۳ei(ym@dI i(y@m"'4i(:0@dqgQi(X@/:7(]ci(F}~3@OSȴg|B-i(Z@,|, 0^h(W@Qh([o@Ό#a AWh( h'@W U͵#h(6f@h}Ig(@`G& @̎`h(`@KD_2]7h(묝@Ɉ!d׵h( s->g@(־؅pi(X2dV@Fflfטi(t #@h(A4\i( b@n)"Uai({24g) f(d')k)[](vJ(lR8>a:(7>!-!l ܋@8'P0W e$,eR"" pRm@lG:[@?| (@+Ƞ$Azg]@!MYT'AtO ׺W@lg0;(A-fEvgӷ@A\(A`Fjp@lr;h(A+g.`Fմ@sT~h(AslL@>z@L\(A9@\@B)D(Ane9@L@r8'(ATu8W@GM&@ ͣs (AG/@"ANߐl'AWhAoO6xρ'Aa^ A^;ˢL0'AAtЩ3r&AQ 5X1A6\{$ASUA6ܮDJsi(0A5r'$AyN2[w&uļ$)Y@5V!I!.s;!"*c#@$i}4#' #8 @ԊJ$Ak.$_9Uz@DS$}`Mc_6@|,V%sC K@M2ҳ%F4ҡG+N@e%/haI%ڽ@}Dtp&̌;l#@eD' Ee ,Hí@0(ƁWz4@|>'\h(eVV0ՠ#ye(jnF@d<:ᡲb(AXi@gd od(4@R&g(;cJ@Iͩ}̶Poih(17@hЕgyZi(ҝ,x@G0KWi(LM&'_O@_n%VӼpi(k;,@ kD|)%ri(T:q@!{Pi(}Z.eoFch~!i(B }AHi(ɟ-l0i(DF^w0"$zJi(f4ʍw(o!Di(,v~hq-29?i(Kٵxp07i( ܢW,#5 pi(ӟR$E';Ri(Zi8)ۛ+")i(A+@Ѐ/Yi({Bʓt~34qQi(ZXK6#_PQ8,7Mi(fzƖϨDQ<2Ui(tU vdS>@Evi(2V$DW#i(Н@8H6}i(l^* Lqqi(q*Pki(' b=pTTi("]%_ X(Ai(3q ܓVŧf];i(B~労b6i(hU4Syh[~Gi(OZmi(h]6_;tɱi(Jy!S{cvi(n0^ahOr;iUi(fF@I$vwi(0.'s@@򌔳¿i(X@ฉ0N$i(ס뤖@R>N i(7@kaiZi(iN\kǝ@tz˳1i(a­@'Oi(eo@c6Ri(צ@[¹i({9 @utF%gi(F@Nzy5 J~i(U0@>z(k3Hi(/di9@0fqh(#x]!@b(Z2]h({"Ǎ@ ˺T:h(kD"q@] nvg(}LBU@!5~َf(4̴@%жtѫf(G(q@J &Jf(DF@WQ9(?c`f("'@w⪴ 1f(3hT@@@݄۶(sg(]V(@mD`ih(!Z@NsXxa i()˷w˽@ƮpL$ii(櫬{SIeg(VN#,1qbg\((zwX/3(ejڛ 키1y'iI e)+٨Ǩd#:#_h5@qa ߢK@&Dzr!@w;@d(R'@K۰A|Ho%R}@w $AXyC@e.'AfNq25: &geך@mU/(A_>9ѷþ@>_(Au 1@"Ph(Amw@0L@(O(AC~@pf9@! 
"(A:Rn(Q@wrd@ma}'ACkAIoYn\ 'A3\]-d AsѷA=b<1ci(.<"ˠ gfOi(cD缟j^}i(8)2nXi(/ r/r uwi("~#}wǨvi(X+mz{8Zi(Y($gi(YFDIu$i( ;W9խ|i(kuMiV,ŽCi(Q; r)]si( ІKQڋ'욳4i(9%;e1z:ϡ4Gi(Pl)i[w9i(YeC@Smi(K^: s@6`*i(}K@e*tų>fi(Z9@anѳe:^i(%!@|]߳,Gi(H,@ ^i(-n@ei(H|<@7]yٝi(7="@#D;oi(B{:4@6e(h@o( AY^'UhY@TݛAYlqb$2f@$AZ:^-85@ GRu'At>E W)@_b;(A*d=y7@mi(A7贫@9G∲@0la=(Ap'K@Tl8@?}P'AsssAV O'A#[ AXp(vQmL&AyAawARAE%tB&A&.ft A[n%A+A`fL6ݑ$A =\JAlW,ﮝ"Aen]mAEy0 MA2!6#AVJr@a|NAv]gj&A!!0~HA.'ATP_Ipm@*L(A2<"ahayM&ē@Ol$3J)Y\@ _Q1eA#۪X/b@)Lc f"c@T!KA[SW0!K ?@=؞") I.s ^]@ "q3@th$iJĽlWpq@ K&Ɗ!sK@_wx-(gx.(pzQg(Vf @mFJ;X0Q(1U@(voiRɽ3+N(vpc@:'/_W(m @ #a ̐_( A@׳jJd(Hb@q;6έX g(;B@{JU$gh(>n @N!I#i( 4*@kcѴ i(̭@iEJ|ּi(IU!}Q@!9xv-i(t)~Tji(WKAb 9?ɾi(@VŴNnr&^wIi(wrޣB`F=lÒIi(+e̤ =hX]LHi(fLAΤFg g&i(C uL(i|Ei( VY#J0'msFji(S&(/pi(GVDs%tlbi(7v(ܠ&gxgi(td!-Lme|p'i(:PJzQ*i(["Fi5teGA: i(lwC݆/vi(}'K5-ϊ,i( Z5lWYlۥ=i(FX7(>}%i(3yEa<Жݫbi(.&sŴrBCi(|V{h[Ro]i(,!sեbi(m 0#-qc{ i(x|ޏ;] i(:L pzhBi( =A]1}i(yij@hɳԈƁi(ӳ~@$ [ӳ6i(1Ay@Yr~?ݳ^i(s5@Kcc|3i(X#ԕ@Gڌ di(`C@!f Xi(|X*@K RtTi(@@t:+i(mL@)e\[Oi(pV'$@{ i(- fZ@l_Zzoi(q/.=#@8_Q,+i(np;@ I ]5:h(S+@:Xɵ*'h(̹ (@a. 
nHRQg(^Ji(@pYֶ]HE)Cf(vyh_@I:j&zie(zeu@wns#cc(̊ @c u:b(A><@ uh˸NTΟb( @MO_c(/0@qb¸kMd(Cبe@q@  _f(iZ@xѳGXh(3aR@@w0i("SSܦUX}'d(Rݴtfh^ ,H(ވIgO"!Tt'.Ux <.醂+Rw%f6@#*,"A@pN3u&ϐ$%@c|;I'7:6@mӰn#E_(ɏ~20@ +h@ĖnF(Y}@ߪ A% ^E?'t9?@οAc#%ifP@Ħ#%Aլ!cD |@-q'A@#E@h e(A+Z|>@Բs!(AM#D@0q&ޖP'A4 AѮĈ6_&AkbA3Fį 8Tg%Au`{A&N2.@Cͯ:F$AW]1A34Y6L"AK@3X?R(AaTs/"S@lbDb(A&xg (K~R&or|y@ ĕ'%vE׸@>q8}$yP)@fO#"4#+D"f@F`8g"x 1@+pD q/!Yr8)@_CZ{!ā|!įT @0S'"1^bU@{_N$.,I~3 @^&^1IW`\PԵ@(@H(՟M(m ^(E@at 5 B(L@t7UUyA(P#Ӹ@ྀnN(Jo:@}j޽K]YZ(cJ i@^j/b(8zo@k,#7Df(R)Y@ җsU Zh(a@`,TVLi(N@q8@t i(bl~@d/FRTi(fM#@P_g,!1i(v!K@ms/zi(]䒇5k%i( |t{a@𵣏Ri(s@vG;شڴ`i(vH f@Ё%pi(*2@'|zXTh(Kl@{Q&kvg(o"@:f(NEc@ /te(ι/@$Eط=Gc(^@${ujywb(~_x@>w#\a( t!I@HfO`(b@Mba灹Yw]a(#\Y@+CO`c(Z?i@fMţuz~f(-@z ;1b]Zi(t: @&rJ4ZSh(-=[q M0fW(@KO,ZyjV(AˁdoBxu&סvu1DN@g7"mKt 6lx'@=)0Bm$^,%+V@a꧄i'_@6mѼ}s'ƌP}@LzG(2i@_v^@SGwe(B@ꥬ@&(nƐ!@K|| -A#E&_R@bB75Ad#"/AI@Ξ̥%A/d_L@@y'5(AqM$s] @v'AC>'s A[Lp(A&AWC/A+#Ȝڽn}$A96 AfVkZ"A2]Aq»2n=9 Azp<"A,e;A_gp$AiZ !FWATp&AFJہ{P3ٽx,A'qa'A1^&}@?R9(Ay1iNR'@wD;](AwϩHXM@O5rne(AZj+8X$.25f(AB&)qk  o7'@8?aT˪G&P@9fZ~]%<=hz@+wB _Xt$q yP@$ :5#锽:@0@=Z:n"zR@@08 t]!Kn@Мs]5"uA 9Ok@ |$6KD@b;R'&p wű@i_(! }wQ( y:/@?]aN;1( 0@@HV4(֘s@qD(f1@7Z4*sZT((PR$@5yH` ݻ^( 3t6@Ѻ|<Sd(~@_?ןGg(SsPK*@v¶j h(*@&Qhy6i(+@)3[j3i(pxް@ᵳ-dLi(L̘İIci(?Jin i(yTaóx1Ki(=4+-o\kɳFG%i(KMB{P@ϳ=3i(pESaa@vz ֳZ{i(jw@pFK޳a/ i(Z1]@G 2Ri(Y%@.{t3i(U0~@]wAvi(3hK@tW? Ai(_4 7;@ϺBG7=ri(D 5@x=5Xwi(ڐ@TݠQ|i(jj@,vzi(춽ַ@q:ަ9i(}@MV@sVi(m;U@~L8Gh(Ol @. bih(M@ZDFJ%,g(Z@dx8Tf(S4@՝`rSOld(pl@&c-Kb(TjXJ@ .0a(Vʢ|@s|3$L_( @DL_(RQ?B@lyHeֽ`($@\m jc(2'蒨@t2?ESY0g(/n @%ֶʋi(Z@];?޴OX.CAc(F? 
|!P{2(Kc¹'"I-@L (& @,%wybI(Z@Gyf(S채@@ZAg6(u:^@!ZOA=&cwBf@:ګAmEo"YHe@{&Ar[@ûG'AGsAfƧJ#AFAQg[k> AqEu07"ApaloAb$AgTa^Aқ&Af >JfY,A#u['A}5]N&>@VW;2(AJO}Vd憘N@XT \(AR fs@e(A`w]ve(A\eNtTe1&d(A)0h30*yyHc(A=<ऌ< i=<.x')qm@r._ ٸ&/Sv@>|&H;A%hg@WԜ N!9%;U@rafv$JH;@q 'YU#nm@ RBFV]"QV"G@a1" "g O$m@vL$-Cݙ@{?!'C p& Co}v@?Yi(Ccc{ &tVAuIA(w>@ٰ+diU"(#R-h@+3]'(/[@IJնwt;(pb$@;,tyb%/M(tb@z[(>@ m,=b(:3@7uGDf( @@s(VC xh(IA:b@:[<Ti( K@9y@Ti(Qӊ@R'i(뾛v@*CElV,$i(t@ aQti(xiv NJ`@OGܳi(U7߀VB3~ų"i(jEn-Xdi(%MRxi(nSk8 r?ai(,1s3i(A@¹i(]D}܄ꍰmi(_ M͐!`4voi(šLJIIV#b]]i( `T,ubi(I6L"i(ߺsdI4^2J?i(>oOJ!58]i(lQuг{i(RLpI'ߜ՘i(mt#u-8d³.Qi(q A}ų .i(Ÿ.uݲhmʳƗi(hscϳQ8i(;5Æ <ճRi(zc@5۳mi(4(&w@\7Qi(tZE%@wj5^i(Z@b 3`Wi(5S$@:wQ]i(?dTc@l'<W.3i(T쎠@2 %i(/"2u@Q16PtTi(Ϋ@%sRŰi(= @}wwe~4i(DNt@tbM٪:i(Е @'4sRi()`@MrʹA h(Y2uΥ@}n2YQh(X@@! =aAd#Ā=@aRG#A/"AzqtŅ@SW"(A*(lׅM9@8CS[(A<{3ٗa@e6f(A3Ϩ\b(A֝)|8P](ANAbw%/l$Z(Adt":[~2Y(Af|v<ߕ cZ(A.cЭ<:y:E^(Az" k`_5a6z'+@"4f>n|'"r@1%'\M]X&?-=@k{S΄Vje+ݦ%36@JF2Q$yş@-q9t#Mm@w,c"pa{@ZI" ˴+  y@;_%-%r @?'xy@|7edf(BO @Vĸ2/()KIx@<}2eQQÄi('y9j^:óPEi( ۳{Ԇiųi(ŊY<~! 
??ȳ2\i( *xP_˳$i(^ q(Tϳ>gi(Eڿ-rdh,Zӳgi(CŋO5IOس7N֣i(]sa@~޳pqi(x/t@i($@Ǽp7oi(U&@`@-nsi((]@}7߱Mi(,u+9@ÝC ~i(8Ք@lSy]i(=@iB:C2_i(>SD@ 1N4i(M@\s7i(s|Wc@U\ɥ`(1i(㣬t@yD=Ti(0<@nȄ Bh(h@i-oAFh('~@TLHu^=g(5ň|@%KRUe(tqI9@(Sķvc(K_.@Zc ;jta(qD+@ nejB_(8g@E"Nv](€!@Z=&Y](hIj@MitT`( 4z@$N d(BQMs@L #i(pSjɃ@BBϴ_JB@d(:٤qHG2*(jXjO=g'}-Ήr@ ~['$PŊW8Wh}@f69A .4"Ěj@:K|$T9(@ʴܥ%xS%@7Cw3&럗@µ$Ћ&t& .S@Ml}Ш8ֿ&UBJ@7H;$;+A&0@,@ꍚs{&.a{.@t&ZW-@}BKk&20V?V@cRDC?%60@"CNi Z8@Pl%GnBAg/WI%AY#И!:q Yb/A'Asґ巃{^a'AزEҌ?'A./Ҩ2k\Yi8'̬'(AԖh!cr !(AiYvS^-c,(A(ŕWĻ?T}-ʟ6(AIY+ooEA(AwS܃#F[M%L(A|2\JzxaJ;W(A +hzhӾ' @;hk _cCa0'@~>@?ƈ2:H7&ܥa]@j ;zٛ%qqV@oܺ5ht%Dx_v @\3{T'`Z(i`@}2,F (e^@ /뱧(pS ADI( NA-Lp)(s@|O!XA({Z+&@U$u&PS(='@;㟅^(@ sq~3d(b@-4Цo)SIg(A7dJ@Hh(%\78@^NEZii(D@xF+zM'i(ɜF@M-_i(j~@V@_i(_I a@)ԣ"F޶i(~ 9# @Dfxq#i(aj@Gi(G[j=i@9"iI i()̲@dN9ܳ i(d^ ׳_i(sa*xYjJЈԳT\i(fq$)bҳWi((Gt-&ϳki(7AxRγQi( z M̳\bi(y@ }*|.˳hji(c*2~qWi,ʳy1-i(~t7K7ɳeAi( hLɳi(7pfdɳRi(|T~!2ʳ_i( 9)|/`˳vέi(lyPͳe˵i(MA uMOϳGi(1-p[fҳi(왛c(ճ p)i(8;6EdٳLi(Z˜W@R]޳0i(ض'o@Ai(8YM{@gi(J0@vpz'i(!c:@NFTG i(@Q0@ۙS6l-i(A @eٰ]ri(? @|wG+dsi(|x@I F;i(C쿻[!@r Uekai(W8@6Fa 3i(0@M@ ,]8Z[i( Y@pj;fh(έl@ԀBojkGh(>-w@fPXIG3g("4¨@+Me(PJxt4@Xַ|c(Ϗ@yw.a(3W@LsQvH<^(]5VD@(JJ](I0@(:,](|7?'@ }AWc`(Ub5%L@1ʪStqf(@&hNi(PG%l m]V(&xc_g'taWJp@Mw&0J T|;@ ι">iQ]f@V Sgu@FVhTro%'eEU@-2M᫴Ja%z~@L7 ώ}%3˓&@ ^Ppv!p%UwR@M 7bfK%O7@wwZ$!xR@88;N"]"IOfZ@xG3="1C 9 Z^ @-R& ko:kޑܔ'/ANq]1"RAO,rY)XkgIm#AU|2+O8n%Ak>}7&A6&BrAΡ~9s,7x"(4~G`@zH;(Y@֚3)nNwM;CO(?_@vK½=[(.*@"es@b(ԔX@I"';ޙf($w+@ V-fh(ڤz!@.K0Ϥ;i( RO@ÛOyd7f~i(ur@L<}ڴWi(JQu;@'7b(])M<@E}q+q6h([@E!V7>#]Ng(;6 9c ϰ6_7(~I==VE~_{J'F A+0e@YGh$}M:9<`'@ܥ!!r UR@EkO&"Rܱ@ta(#'F#@s[LpK$Ò2@ ϣ@{$ZF0@ْ??ЦQcr$V &@(8X&\U?6$>O@k+#8-*@;>:bk`QX"cDJl@a I|!\ќul@:1"2=B4(@(m$ }^'X@KMfY'5dɆ #xG@w6&J8P(2^@hoFt%]MA! 
,!o AA)HW3$A=HrEW ;ZҠ%Axƶrvc?)@&A~= ߋA0,F-'A]܊ xM'AeIɄ> r]'A{j]S>k'Arއֱ+Q`H"(AԉY[0׻zEB(ABY@ļT9{኿8'M>#S@f ͮX'^|@=)NYc 3&p&@?G=H &*@drnTeN,%@طNc#c#@M^2}!!j!J,@Sǟ$,v'Wx/@^i'wz6'5@'oJi(TN3t{THw:(Pf@践5m!^7e (VALE).'ix^_A z:e( $uAj_ix+81(;'u@R2']HY-Y7(sE@__^K(Z@$ dY(6x[@?(wa(|@1e 7=e(#p@2۟h(`ͺf0@uzhi(S|EK@guԐs놁0~i(y3@k0 }i(R~ί@N V04i(d[r@vhi(qv(Aaѥ@?C?Ri(Puo@&%jhi(r^Y7}N@5&W]y@i(@yzI!i(h[<@w i(='@#3i(B/p*e@ze<.i(H}@BF/ai(9, u@!54i(j@lwi(:0:Y@ѥ\ Di(-c%@/iݳ̳i(dTn0/okڳoՆi(uM+*ck ׳~oi(ߤojҹަֳ#i(ճi(*ؔqӳ]Ai(>q5uӳbBi(>'qMiӳ]i(ꡎo6ioaJԳ(,i(D@qj\@ճEi(+.Jc׳'i(O\4:SNKzñڳ)i(hA3@)޳(hڔi(Obb@U2&li('#$r@73l"i(l]@Oapi(.5@0}7^*a#\At@qǯu_#F@M3K~Ԩ##{I@JNkY "(,@WQ {Ľ!/@"+!9r (]|@T%"?9R@n$^Tͷ( 0@CU"&,ވ]~Ӥ@A M#p's=ZBˇ@,ܗh(I@2od85'XHAn{zpr$d"-yAhhi8_xW ~v"AlwqL$An,u{0x=mz%Aƒi xt) oe&A2%+<2$rvD Sx.'AYqRxCtv~ `C'AxYPX'AEw~uCC|f (Aה-dZ =*S7(A>GTdj8 'gmY@X$ 9]'l$@|e[v&*OC@R_M%tWo@'JZ,]>$ H@A`Q_? V#p@ "p > D@fGEb%[*:nC@8ɡ(4WZVb7UZ?a5d(:5@^T)Qp,(Q0/@̊:DH(u{AӥzJW'~AMTI(Q }.AȤ47͹(>}(@ dvxY\3(3x@RD;H(ϭ@D'/}azW(@-+T`(Qgz@mqι<)e(v"@MK 7g(? @CM2ܿ?h(+ln@DZ׵ ^i($Ⱦnz@{ݠ8 =i(X^ @Q δi(A!?@)DW1fi(5f@hW\ i(*ŔХ@p?R1xi(e@+aQi(zA@/i(`͗@GvBtZ"5i($+Q%@Ooճoi(fp 4wԳ0i(On#qT@Գ!lai( oHpJ6?Գvi(nGMm.ճii(m{}hm[A!׳<㨳i("B|`EKnmٳVi(=:#Efܳ< i(]xV=S@KfoF{i(՛Wn@QZ.i(kX}@q7di( ̃ `@x7;i(-}(@bK %/{i(I@;bǛi(D1@MT.?Ui(߯@ĵWlEi(YfL@kϜi(|@S3̵JX*i(9%0*@/t{d퍋h(PS@>ng(:2X@THwZ*ֶȇ)e(@+z@tfȔԤc(*W@ 歸Fjea(t ]@V"g^(F;@s&K_ ](#6z< 1'v@x#ё^$ # Ty SP@.fh%})vwdE>@~ w#bCthE,@S,#"8V ^кm @#G p!{e!ZV@!D4! P!2{g@qL!Z~ 8!;Ԭw]@]^|! wS{n@ p("jO =@j"i2|@XZ##T]-L@M:w$L=EEƀP@'o e%"N.W;@X&6sspz o@=?')]@ Lq*@Vmc'J*f!. 
M@pr!c(JE_ߊCӦ?Tuvb<(qy@QY`žof'vwA:o/-$h$tcȳAh3ipR XYd!A*8Z BH$A޽3F[{#j oē%A5M^AK8*7Tz&A@Aie-"'AJ8Ir Wuن'AшIaV% R}VI'AS* TB (ApWCL"DwV'7NR@l RV';}j@ձ!9&FWe@kS85D2ڕ%€@n?nckM$ <#@1E NZo!Ooz@O($1< B@?4'L j @ֽh(fxvomHtU$/K(MN@q@T*(Fb@̓Z5̦o'A,lU'{kBtAى&(۶"AIB]u(l oz@FDv4Jr I/(~ #@v1i7I D(X&+@]sgRAUT(GI3.&@ln*ȼZZ1^(gs[@3Wp}c(a@/*t▸pVWf(c)Y!@g; 6s[[h(rvg@&: i(34@Z Pvi(N4 @\{1ѣi(hsvAT@s?)bȴJdi(u@f﮶蓴Zvi(s^z@}XƄp i((~^@NTWNi(DN6@+Ejji(#%Uy@O6G잱i(?p7@{)Wxqi(SJ@_is ]i(ͪw61˘@ O"i(Η@:ha i(ħPϐ@6`Idi(HV@5i(@@CY 06i(4pi|@ZpXUSsqi(;nsq@kx–i(m +`@T =i( t?Y-~ݳH-Ui(K1"\<ٳii(HàLjEla:ֳi(fB4r Ѿ1_ӳ_i(lv_B ѳw!i(/##,z!Cϳ6xi(Q|L%4>γi(, }Iͳęi(Y~.ͳci(0]i}t!ͳZi(zb?l`ϳtqi(.FWuiz@ҳIi(&Xk@0L׳Pi(d(PEoZԟ޳tTP^i(w9@t@!Gl0uai(mJ?w@E}ofi(JHrQ@{]Fi(s<@,_?Ei(KqsA@txJi( `@Ͻ7ʴwP{=_i(*p@0#9s =h(0끸@3y̵ g(| U@bz܆f([\?[@Pd0Gd(9 s@ # WYzb(og_Y[@P9WL>h' F@ )/ ֢J'HnB"Q@bɒaݪ\X<&qY1@~dL14nBy% @9 >ݲ#?,@jOK "T$\ B=^@~v C&;.@$n@()%(Q }g(J*tc@ȑנ0@( 6^@*dӿ-t-M:(6@eDŽ^^ӊ(Ro'A5Dpc_\'/.A$(O]JAu^X\㯮(lş8 @ϕtk:/(r y@Ûu94x:%D(igZ_@0DYRBm#lS(09/ap|@\;`9so](c@N wuPc(44t@+ sʸv^f(&rv@udHK%h(P0@r(cG`{h(vm⚟B@1gȯ_i(\bc@2q5 ǔi(K@E@ɬwi(n=O@Β#1;ӿi(R@/:oi( 7>@pk$2|i(JIF@*:hVi(V}@EWRi(2NF,S@'pm72ki("Κ{@P8*4Ii(Tɝ@~\3Ui( ?@ ԥi(l6@;u` i(쪖}@__Eد|i(tgu@?o?i(8@Jo ;Oi(3 x@@Ԇi(Ρj@HjYߦi(ޯI@u޳+i(m݃[Whٳ|i(B#Jkp*\hճki(PАtWѳ(i(zv6γȑi(uJ-`˳"Ҁi(trJ+ ɳJpi(䏚ߢZ>2dz^Bdi(#F{Bų\i({Eų(O]i(BH-݅|ųhi(d=a跄QƳ]ki(N?L  ʳ&|i(YqFy<)ϳ i(UaTduٳ\gi(E?@ s@b^v3i(v2@=nQ9i( E~@zˀ%%;i(WoX@Lv)[0Wi(`or.@D؄駴O^zi(lg7@󱞍ޜ i({#@'oT@h(]u@>QcPf(D)$s@q?) e(ȥX@`u0b(WfM@s(H)Ua(xnP@` hҗa(\@pGSkOe(/0~l&@6÷ai(cU`³zJ T(j!ن#' I=@&eYIw-H@GR}W%6gf(&65@C(1#^ږ@b wu%Gd @1ļ &$1+Y[@`|פ&"@Z*hݵ@b#]4'EPX%;U@[''2yqb> LF˙@b'("t<P7Y(K$ g(w_@>Z_)X9&(,@8E·)'545A^.īI<$nAjw1)06!fL!A12<7U}Z*;iGn1#$Ayr,UIl6%%AIkta +&AxAy@G W<'A8Ie:6S%'A^'D&N`},` (AjgJ%q퀝-?'tO3@ e 9'!b,@Fdm/c&A{@io~ ߲%q@N6ݶ͚"9@@/#0n@zaA'|7@ ek_@ZZ(O#y1'Vy_(sn@]0JEu9(-P+5@KSm#(gT9&@]ʍ({^8AoI(g=A*G|D;\? 
(ɷArI -6N$(O8)Ns@k[D|Ǜi1(ڟT;"@(VD(_Q@Eϲg: S(`z@C\ x#\(RoE @1jźBb( T@Y(E~8f(+E@܈[wg(jU*@p)'Th(θ, @y;/Rε2VJi(*r@LMIQȊɅi(l&~j6@Ke i(B@Mo´[;i(6X@4N1뙴j[i(gA@ZYc|DLi(-W@|5ܤe,xi(J1FǪ@ϰS7Xi(#2@ C͍Zi(Ȃd@$4i(3R@!+%'wl-C}@݈ ˛%XX:a@SsL&9F>6 @2&kePҶ@y7'02Wg=J @Le~v'"0 >_Cb@v&'d uʑ@av#(*@]Dڋ$V(z5Cyc,?gi(j&[o듦ɤ7P(#j v@I j(B' XS5AS!tJW&A&zA_ /q2eE#RA3Oyt mmT"Aޗow f'O %Awa9biK&A\ؕw$)'AdMVP-R('A#܂9+;_h(A`FzJ'-b}@9ג!'\@r& @X%&P>iz@*멂~$0֝eC@ӓ9!M.J!q@Xה%*p%D)a@gx'ܡGDՒZW4Ki(>uq˲V:V(x@={ӻNb5(tb=@{ȬNb(0H8@xm"f(יA?@`f˨g((&@>~dh(zM@mқ)¶6i(CZ@/Ygwi(~mu@P|i(E(@,϶ԴJkDi('^@$,WhV6i(C!Գ@O ֊9i(@Œ8@ri(F䯐o@?]ti(.cZ@Kxq{Lݛ!ii(td]#@[^D<QJi(QR@jʃ.foi(@@!Hy!Qk~]i(x3Ju}@mSޏi( r,l@m i()7ҽ@&Iu,Api(9@Y׈@vi(I`}@BSC,Pnti(<&s@,b:>i(2^K>V@ |ݳGvi(H)KZvXv' ׳B( i(պqlг^mi(5|Ƭo$˳i(ڂ޼{ųLWi(;I6Xw&- W#i(yV5xJռi(Qݳَ7丳 i( &<Ư{N`Lxi(Sxd&m1\)Hi(3?%l'i(ƩLhYiF$i(^wZLi(՘v?ðɭi(Ns4ɍL@i(S i(m 'SqRҳ9"i(\O@U ǤBoi(c=@4InTXi(L{j@+1$\Rʫi(L8tx@6vw Cbi(S5{'@g_c2͇h(% @B->mѵRg(U ;~@Y㜴X^mf( w@5g0d(@k(SCd(?@Əχ7GWOe(Գ@ & i(+; @%`,$xe()rO?uHC2(,WoE.'6D#O ~lY@5f&0lt;늹@tGb%eۍ!C@Lp w$:bd#@iV߇$`y͔jJƆ@~F$-O4}u@'u<$罪2LR@Җ%6cͩ&\@Jl%4Rh߯B@_%s(N@`hC|1&tzWC @A& e*I(@R&?E@ں%S'jŸQy \(ɩ@"O'ȧ̼C+]4@o'Գh -;y@f,( 2ZhJPӛdT(?D ﭖ܈xh(n*Deg@ Hb( 1&@}G0-@&)(r[:@-&à(^ke'Bt} AQhԍH%idƿA<m\Ur! 
<; AacY]nYib&S5$Ac'{ 89gB%Ak `eKPB+p &Ah2w [ Y#Q'Ať({F]'ABtI?cL^v'#_@S$@kr[|& ~W@Lw 83%ĩ>@T9LM#Ai@FNE#cۖa:~@@H_Mc 'nz iME3q@ĘJ(s8d`|/{h(Աt@&Ԛ'j3P(܉D@PӴ~0]5(k"@3:Nf^!(5}@s|Ư(]B@p0K'(Œ@*3%Dqڇ(MŪa@D/Tɪc7((=-@yQXgQw7(\3@7춬@U,hG(V1m@oꌅ T( /=B@hAXpӼQ'H\(9@TŅb(b@^8e(XoRY@-ޢg(.5i@Sjz=Rh(<0}@f9&i(rdi@Ux*ki( 6$Gv@ nǃi(c޸@b?i(@p궴~8i(% "@M@T]L•,i(# l\@cF\{m/^)i(J9կ@aJeR\i(hƗ@MR.F2i(#`ͧ@5ByAi(=lX@)@2+i((@ ̴$%i(uAl@qrGbbi(Y~@$k b+Bi(?w@) 6#wi(%/u@!T7i('`ρ@HH.dyi(3xs@0s˥5i(tYmvS@XQK۳qJi(n}avdYӳSSi(Ӥwug%c4̳1i(26MUƳji(Ѳ@cSc*i(7=6%9i(oy~JS$i(v>0{.i(IJRŤ"yCi(L-8\ >yi(Jetޗ%ӆS/i(+D6))vgϡki(pZ[V $ i(<(ęΠV`i(nn`I Gi(?dq=Ui(  YPд3Ci()摣iȿӳܙHHi($:@#ayPri(b+#1@S33m}i(}j#@تǜi(4\䶿@ETݑmi(~}z @ƈû֥[h(5O@ɒʇg9!Z<=g(X@?̑|e(]c V@~^ԩcB re(dڗ6@$nf(AC}G@d?;Ii(~N@j!Rf\a(Hhtim Vج硽_ (LG3CBՇQ9n'To ~t@&Ix&=˧OÆ@ j%q9fa0q'@xb-D%Y]kjQ9 @>'%׮t H_9֊@eE>%Xm8os@ڥUMt%J#L(@E%][A-?Ⱥ@p-R& 0b䵼@idY&L/Jo`@;`&Ly' 6ŵ@vʳ&LEE@ G5nE'{. {s@|c'޴ 冢@FxHl'Dyy?5@9|q(A34F<[0J;\4( b7rY !՚$CT(lBoKd{;Sf(4َx馎i>'&_:h(pe0@Soo%Ѩ)P(lh)t@=v]'L.XA:5&6WA% O,l#h2 `AUx9$ = ٛ"AԩDV.^,S6%AA3c4#Έjs|&A-m /( .d'A^$G%'A2vҮnS'}@O&:&Y<@_uIS}AnL(󿂫@*ܽ fߞ8( qČ@6)Zx((fg@r‘핤 ( ~_Z@P[n[`(v@P#2N$(@ j8h/(9%@x14Dq<( >s[@SZGVhz1J(@HګiU(4z@Qlv- 1](i@Bb(v@uRe(g@@e7i3 g(y@)x ~sh($*@3&C- i(?ER@ ܁"/bi(S fN*@x*"i(\U@qX|:0rWi(jC=@~ȥx4i(3^@)Pmi( +c@![|;xi(wۊ@`i(>P@F4Ui(+×Ozi(OĜ󇛗= hcxi(Hෳ-Fa i(E4?QftF%[( <6(~}č(z,=EĜ'u0v6k' Ix6g%@Ë{&ע8O0߲@ v@ &~SL@n{%aL|m@}%%\46@ײ19%9CWn@h@x &drJG@ANJ&&,eY@`,Fi&-ᅽ8U?@&?Ӱ}/n 8@V\ '"@#!@_ pH'񤍹Y w@7d'-Ӟ9 V&ȣ@z'S;Y@j' ; Ye-T7@3X,(:|O( h~);(,dס$?%5T(հ`\5糥Mb+od(^J%D cti( =,"@dg La(v׈{[@*rUꖹ]d9(A@i G\rcV'QYQAv;,hYq &j|xAo/yȒV"W An $Aϣ C]-@^$&A2˂b 3 Cd8'ABLrfEg'Afk;lx [y'_[@AKGl&6@2a9x"h$rΉ@3: 7#50 b\<@ '. 
DBy1@nOE( Qm})i('p5ғ@^3=dE i^^(( @mg.Ҡ5L(BY@t^QG<(;f@@6{Y1(՘&@x`Bj+(|z@R]j8P*(mܫ@7ƀ;z.(< @cU,9#Jd7(}fC_@Di Y~sB(M`@Kyտ,M(VOx@ѫˊVW(Q7@``4|^(Ex@{P)6c(Ćn@êɸU49f(պ~@9@-ܲg(^2;m@2Vaަh(XHH7@NUi(\?t[@6^i(?N@++pi(] @lkuGO~i(8@oׯ)̘1i(X%@,"?i(^@8>>Gi( ~@c=h^,Vi(mj|@,TiטCi(Y>O@9]AQL i("2CB@+,07E7'i('^@snF!j_i(}@/XY,Mi(ٜ@.?ѽۤi(zJ@t@i( @JՉ@&kAi(sfMB@``#{i('$[[m@Uٳ!i(IwTB?Trгi(-qqD!dzD/.+g(Iz@ ÂZ'in{ Rj @'8IN1 r@M1db'ީ-"S @?RЛ']1yI炸`@b*(F>աA'NZ%F((AP?Hpgȏ>VuB($5l7;؅9?V(Ϯ}*B#.EFm*c(@m+Kqpii(B+CXԳ"vU g(o@Kn:kW(9Ou@}CIkB(ysKG@Ù _'^2 Ac)Qbb%o2AQqts C~d D"AzlYGa7%AG@Z%Aq9I/=ʽ&Av)R |%*G"X'Aޖ* E`'H<'@!XN>,^@s-\%q@pW' @V &mҗFVDc(HT; j'aA}_Lg(?vY@iLB]C E[(~)6F@C)璼s $N(ѤՑ{S@7/B(+=@傸%E:(™@O @#h^ 6(cȳH@j@͝5(m%@#]h9(J@3z|?iv?(ȋ݈@ icH(CL @InZ?Ҿ%HQ(Ej@PԯIA6H%Y(yi@z H_(c8@,64c(iq!($@^օy[f(2@gfwΉg(I`S@͊ltܛ$h(:Mx@@b ìi(7K]@7́vr^`i(5`@T#ٴ;i(K(@,ˡi(aW׺@o-q>9Ufi(ADWZ@)j3H%i(lYy@ ʙysi(@{7qab{i("i$@t[LP-i(|@@c9ri(-o" @hˉe(i(@w/T]i(>v@ś:8 -wMi( i@i(rȷː@)HeѰ~i()^L%@5* f|i(jz@ΰڳ`,i(yGa@\ гwUi(RW`Ƴi(xnxNؽ@i(IʃH񾇃v[jXi($ U(Ei(r=h,OY]i(II>LGӞXڼi(+$Fq@__՗}i(eyP 8 ޚFg(sؖ0>?S(`װ2JB8 ( zG͆dl(k' S V@u'ߗl/?riuH@I&KIMO8 @2Ļ& b "k}[@ &޲NK`uZ@@z&'D%UN,L@SM?'zM%y@: E)'t3rc7Ѯ@cQP'Uj չ;a@2~v'@S# Vr@9'CyϓrܓȠ@^Gpp'e@dM;@͟f'-N Aˆ@?(C *]XP@CT,(Al}e;Jb;|5(kcC:YW`DI(љ8G_>oyGX(b[v%!b(Vt@_xv|-|t+h(нk Lx_Ci(ar,i@T'A ?c(6,2h@0VnǸ[iL(j{@ЈC}sЍ($¯AH 4l&*A4bE-y%#&ShMAi_i9(DP#AT"M"s0{*7lXp&Aw^ʘ^_5'A/bGo5"I$'gd@ 5oIc$Ŭ7@O]#C\p#d>.@ғ='3R-< 1d8a:v5"'N(͂gp酧si(c v`yʱI$Ie(A @=| `XI.[[(#>m"(Y@p38FQ(Bpk@$dn6J(W\@W8fiTZD(oBP@t\ro=A(nʃ0.@_xHܿtQ@(H>@W+\dC(/S@;͋6H(xck@Jr]7TiO(k~(@ ;ncV(W_@QE<jߗ\(X:!Z@1VĺEda(k$?@|&m]Nd(b>@Wrd"$8@f(,cS@\!"n h(3D@ϑZ2oh(xd%@ǵЮK ,i(8@{H[@vii(ƫ@_%n i( A#B^־@(+JӴ_' i(̩@*tǼi(2L@ѯԸi(r;qݲ@zf hB_ſi(\>d-@ͧPCʇi(,n@Q;s[ni(<6 tu@{')Wmi()@!VBi(]q(@̤O(Ti(á)@cipi(m@gçYvi(g hȌ@N1V?Ui(?}|@0w޽ճvi(Gd߾q@b !˳a@i(_3"a.\}4 i(Jׄ-qت=ⷳ Fi(Az i(֊B@L)'S¦(3i('f8`ОX|i(p;- Qe 1 i(3AԏTi(c%Tљ+|wi(7"ji(-֐E1?z'yСi( B=ti(u@"N!;m}i(6y瑤cg@Kji(AD+VTassi(+ ̣k\i(C!$Ƨ^꫼Y i(*L>X9 i(`n5[[i(A6$brZc@i('sPwi(W'|㍳Hi(5{ɯui(M\!M@<83^"i(:~L@ۥg9mi(Euu#@FglӤ|i(gVV@nVi( 
v~@h֋4-)i(bWĠ@VFyKi(ZBDA}S"f({op%!'XrKoQ(Fn0#/EZ{S-~(ЅFG4؀yhݫ'kܸ 1@_Ure'ſr 'zB@KԔ5'`N;|KbNŭ@F('$њN A@RU 3'srjx@F90H'BJh\ v]_c@vs]c'O,Z ^o2@wx'bT BZB@Iܢ'},@kM'=x/V$Eۙ@$Qw3'0:Md"T.@ +E|'- W츁@~p(_:ɄW$-9nwT(4 <0d]vo/(+| 0cRNA(k9?M|?#8ry4O(id(rk}@Alݷ}طw](Hhl@҆66=*W( @hϻ)AQ(b@lW4CN( 7 mI@ LXTYAL(:@@v+L(Uޘ]@o#]s$ N(a[@JWlTQ(#@V+'MV(&^@sSz+D[("q@d~cU_(xq @PcϹIc(Y@Ut^+Ze(E.@2zDK3-cg(oh@:^e6J]h(@y@bk] г ch(P@3n爵Gi(E`p17@xb(}*:{i(y<5@vi qʚi(8-wm@mϫ =i(1ӷ@>₴Y;i(,@˰FdbiY*i(~Sض*@T]1TG;(i(bc77@\['0+%Si(A@k Fi(I$佤@HKR 6qIi('MԴ@PMFݒi((}@‰9DGi(KR7@v_߳ׄi(NӴ@E@ҳz`i(U͋G@hdzȒBi(w} w@ UH(^6i(hZ(S@`%8f2Ji([ʓph,J$ɨĐ0i(y|o~石:Vi(O_m'z\+i(ۆ$)C*Ji(O/oWytۇo}i(p$|fNIi(*u1-|"* UyPi( i0UvXr;p)i(g P<{kŢ 5i(d0!naId)i(o>G^ i(OD%+mNtWqi(I^&%CQ; i(%rOrKąi(hs=AEbi(|8ҪA4ni(LZɩ>vb?~Ui(+?i(CRD!**i((iFd5O`2]i(Mai(Ժ.~23i( ɨBt\ i(Uf@u3~dYi( Ad@T"Ni(HxOT@E;"[5ei(H7/y@k2%Pi( T,l@̧ƌ?'۲i( IBI!U]e( 4-tݯ}S(l{k]d%( R(H^'F&2aC' 7' р@5&'~sLS@a@n'77 v@]'E+G% e@%'xYbb M9@+'/ijxr5]f`@ uI״'HW&@"'`/3W'T@:'G4:@Z'F] 0@3`:(R^sF-ԏ[@T(-:WbuHK'"(\dBodbnx2(0 jڗb{+ @(9#QF 6&4L(:N5VH#wV(/D#r%j ](GB_ΩTc(%]Ŋ{Yug(8;0~PAT4Oi(T΀5i(N]@Q埴n>gf(t/%@U~`\(<}@\'\Vgr:( qh@{j W ;'̴cIAmɉQQ;ZV%+cSA&دK ;݊ e["A4>69;d&A)W1#.,o@(cca%ͺٛ@33#5]([C]ߦ_9P\(FW5ݣ$ {i(f!+w5hh(XD@qd;&z#d(T"b@agŁv}B`(:@_AT](LB@ j9Y/Z(a3@m3T^W(vi{@Mdj;V(0zh@BӻL4PV(Szg@?*ػn5W(m<+q@р73ώ3Z(h@&sX}](݈@a;`(j~O@#!s~c(/t@W[P`e(l)+bt@ћ7 X3g( 5l@ֿȶrh(V6e@Bch(*ip|@gu툵__1i(,T<@ Vpi(;O,@,M˴ti('f@qf`Ni(WU[K@J`:^ݼi(-?@@VGa,i(E9=j.oi(GTtY=e(,b/>i TM}X(txu/io /n:(PO ݂YeB(!C<Q)s8鴶'fX2Im~@C{]'V;|)ڃ:@b~'aQ'nX@,'j]_9u@,('\FO|N@ 9'AepSs@ޑ()yc(/7@R(er3_%XqQ+(Pc3v=;c'(wÊIp1h82(.<(͸ՄRD F(Q8mEyJ N("n.=!{|!V([sn--𴼥c\\(EƥhDj+q% Qa(q:ANlYԇ e(DvN?pyMg(EzjdwvT|@&ݻCߐv7(8j%|@uA۩^4'B A,^-&ʠ$c!AxFzaN/eX$A?eޯ;:R%vTe"@؜a'o;; &NEe}ue9U(=BIQrZ\Rg(ʊÈ^i(EN@hV4Z$*h(ގB@lҐf($ @y8nd(`|@}[b(JN3&I@ulPͥa(@PRX@c۸{`(ν#P@4luӌ6G_(Z@+Ld|_(@Ȳe`(F{k,@GՐ9a(&—@pg~eGk0c(Ӱ{}@0cڗ6d(R*k@񤌩ʷmDO+8f(' ы@K$?^hg(g@XeTGh(P}@6 ;_bh(_@=Bf >i(hJq:@Y'Ryi(q#`e@ %i(1C@\2i$i(9 ૢ@UXa8>i(Lh㖲@NԳi(yfЮ@rhd~Mi,i(Z˵t@kݳi(7Gc@{)ʳ2#i(F ^@L#ei(sd@vd 
y,i(U^@fa4mmi((9@VFUi(a9]@B? d]ni(2Gu@EA|~Di(⹋6t@='u2i(YYZDX@sWm)(i(E}-G^eE7!i(oW|tB\x^EA]i(73(SQZW)o(i(TIRM UQt\EUi(A3p Ri(.ZD4[W8i(\1@r05i(ksΧβ M8Zi(G"쁬G'Ϸ8f(23QnaJb_(L9-#P(&1 N,<(8}olj8wRDA-(*.O̘be7u%(/ש5$(ryH~G*'(0L|rG5+(9ӱo`)oZ0(iJ[Y{U>]=<6(ǧ8ݠ@gD<(ژ첔WuA(nmTjG(+s vfkKM(,AjQȸ]5R(LЃI{KW(I& bU[V>a[( o)0֧ʻ0_/_(k-Wh?(jab(I3ڄI0>ݗd(W_Q(xCf5}f(-ƛ&Oջ]h( MxY1B⢿lRi(yC+3} i(;#mlL1Ri(􀭱@"6 *Wh(N`BH@˵e(4IÑ@LZqb(\(eE@ØFQ;(W+x@ˇv1cڧ'=TA!_*c:0y5>:0y5>msB_ext_peTW; <;;ը;bq;J;;T;;;F;&;;^7;kٺ;4x;;;6;M;8;;";b;4;;T;;.;;^1;@;; ;L;;\;%;5;};4;n;;2;x:;8;;;;E֤; ;P;ק;V ;4y;f;9RQ;{;; $; ;S:­:D:(:LS9LS(D­S麭 ${;9RQf4yV קP E֤8x:2Ļnǻ4ɻ}˻5̻%λ\ϻϻLϻ лϻ@ϻ^1ϻλ.ͻ̻T˻ʻ4ɻbȻ"ǻŻ8ĻM»64xkٺ^7&FTJbq;ը ;^[;\;;jl;4T;:;3 ;ex;:K:\E4:p9p\E4KϺex3 :4Tjl\^[>寻0[K»tǻc̻F6л\|ӻk?ֻػ aڻwۻkܻݻ޻'޻T޻ǭݻ;ݻ;_ܻuۻfڻ6ٻ׻ֻջrӻѻ лdλҞ̻ʻȻD"ǻDŻdûo߻"$xJteҰ^Bũͫ;O;r;sM;-;;;`;;ڼ;־;~;;;-;s;;=;];;;;';r; 9;;;;;";;W;;Jl;o;l;;W;*;X;޼; ;.;;T;x;;k;;M߹;;7+; ; 0;t;F{;b`b;IhG;!+; ;b:#:A:696A#b޺ !+IhGb`bF{t 0 7+M߹kǻͻxһTֻڻ.ݻ ߻޼ỜX*WloJlW㻪"Ỡ໙߻ݻۻ 9ڻrػ'ֻԻһл]λ=̻ʻsȻ-ƻĻ»~־ڼ⺻`-sMrOͫSЭ;!;;;ʃ;;m;Ѝ;✽;M;;;Y ;/;KW;;g;;; %;G;Ac;w;́;;Sn;J;;ƿ;GP;;;!;} ;;));P;;&;;I;f;;E;"c;~;; ;F; ;S;劼;; ;NԞ;;0y;q;adU;FQ7;*;"::t]O:ޛ9ޛt]O嫺"*FQ7adUq0yNԞ 劼SĻ ˻Fһ ׻ܻ~"c仛EfI컎;&P))} !컒GPƿJSnỡ߻́ݻwۻAcٻG׻ %ջһлgλ̻KWʻ/ȻY ƻûM✽Ѝmʃ!SЭ˯;-±;9;ŵ;@ӷ;;7;';Q;ނ;=;;{:;Y;;p;q;;;to;';>;Z;;8;;;j&;0;=;;)>;2;;; ;";@N;p;ҋ; ; ;;_;y;';Wq;;[;^G; ;0ؾ;a;b;;dُ;i;d;D;E#;{i;:uB_:tC9tCuB_{iE#Ddidُba0ؾ Ȼ^Gл[׻ݻWq'y_  ҋp@N" 2)>=0j&8Z>޻'ۻtoٻ׻ԻqһpлͻY˻{:ɻƻ=Ļނ»Q'7蹻@ӷŵ9-±˯;ɳ;ܵ;N;5;G;|;T;%;N;ң; ;e;;5B;*;4;;W7;һ;@;;E; ;J7;;;cQ;>;x;g;ן;^;N;L; l;"<.h<<2f<<;N;;;P;;p;$;u;;@&;9;;';3;j$;Ev;XS;/;} ;:L q:292L qǺ} /XSEvj$3'9@&˻Իuܻ$pPN2f.h" lLN^ןgx>cQJ7 绽E廦@һݻW7ۻػ4ֻ*ӻ5Bѻλe̻ ʻңǻNŻ%ûT|G5Nܵɳ;)ȵ;9; ;Z;ݞ;;D;P;a;Յ;x;;;D; O;;Ġ;R;;e;z;S5;;;gQ;;ŏ;;;A;\<<ٙ<y<`;< <V<֣<7<;;:4:94غ>dRΪTԳPͻػo!⻅}`g6Vd0qC7֣V `;yٙ\AŏgQS5ze仮R߻Ġܻٻ O׻DԻһϻxͻՅʻaȻPŻDûݞZ 9ﷻ)ȵO;*;;>;;4;]S;;B;N;^;;ԥ;4Y;;;k;;uh;gP;G>;0;&;;;;;;4\gPuh㻀kݻڻػ4Yջԥһϻ^ͻNʻBȻŻ]Sû4>*O^c;;T;R;;.;;N;;w;;z;,;;3;0;js;c;g_;Wf;w;*;r;;a;@N;o;;< <<F<î;S;^;`;0;:@:9Ϲ@0`^S>îл<߻@Nz ' 2(5|{n s % k PF 
o@Nar*wWfg_ớc޻jsۻ0ػ3ջһ,л;zͻwʻN;ȻŻ.ûRT^c.;9;';[; ;{d;;w;fY;;;;!;;";;9;:;[};;(;;;+{;;<(;;$;|;ޠ;;;;;;I;>;w;v;;;1;;T<@軐I廋;޻ۻػޠջ|һ$ϻ̻>ʻҌǻĻU» п@X);;;`B;W;4;\p;0K;7;g4;C;d;r;;>8;Ф;$;;^;;;I;[<)]o <+"<#'<'< (<'<'<%%<>#<: #%%'' ('B>'U&b(%#+">o }fiEdw kxf)][I^ﻴ$Ф>8r޻dۻCػg4ջ7һ0Kϻ\p̻4ɻWƻ`BĻnC;-;p;a;;K;t;;;; ;;k;;V;;;[;4;#;);k<<<;< :<9<7<5<1<+<7%<<,<{;:':8X7{5d31.4,b)F&S$Y!r SnV [īuﻎ_θ)໇Pݻڻֻӻpлͻ ʻǻ8)Żx»׿[; ;y;4;;A;;x;;YN;;7;;c;P;7;:;iZ;;x<4<<\<9 S S oj5C 9 \4xiZ:7Pc滪7߻YNܻػxջһAϻ̻4ɻyƻ Ļ[@;p;^e;M;,J;y[;];;;};;ؤ;^;3;%;4;b;;<}5B<Fx:6s2B/+x,($q!6<6n  {"rRkﻫ^:řaݻٻ{Rֻӻϻ̻ɻǻ.Ļ|;];S;O_;;; ;t;;t;T;7-;%;<;u;;OQ;Y;E;);');;z;O;bD;fY;};+;k;$<'<+<۩/<3<7<W<<;@T<_Y<9^<'c< i<3n<|6s<\w< [|<<7T`6OKJyE;@W<73۩/+'>$ QIGb * ZYpk+}fYbDOz仧')ݻ)ٻEֻ>һ9ϻ̻4ɻeƻ;;;;4[;;/;;q;R?;,;:;k;;^;;;T;<@<-E< xJ;< <>î;<:<>î >;xaE;B e1;56\!2-)%"Mp.jT\G scYUF@=iܻٻԢջ'Pһϻk˻GȻ;;(;Do;;@L;;;tq;7h;l;˾;%";g;b;"<V<*<\<]Vyn&)·䕃3w"o#hXax$[TN&HIC>95/0E,(("9$,t x` =@  nJ"bg%"˾l7htq廉ݻ@LڻֻDoӻ(л̻ɻy;!;\!;Kt; ;l;;;;;a;H;o;e;<DT@9G4x/+&%"61sIY = +%h@^nxur޻8lڻDֻ&oӻл)̻a;7;~;t;E ;;K;;;;E;;;}<DT@9G4x/+&%"61sIY = +%h@^nxur޻8lڻDֻ&oӻл)̻E;x;P;,;;R;;;;b;I ;V<*<\<]Vyn&)·䕃3w"o#hXax$[TN&HIC>95/0E,(("9$,t x` =@  nJ"bg%"˾l7htq廉ݻ@LڻֻDoӻ(л̻ɻG;k;;'P;Ԣ;;i;=;@;F;U;;Y;;c;s<<<;<@<-E< xJ;< <>î;<:<>î >;xaE;B e1;56\!2-)%"Mp.jT\G scYUF@=iܻٻԢջ'Pһϻk˻GȻ;;;;4[;;/;;q;R?;,;:;k;;^;;;T;E;);');;z;O;bD;fY;};+;k;$<'<+<۩/<3<7<W<<;@T<_Y<9^<'c< i<3n<|6s<\w< [|<<7T`6OKJyE;@W<73۩/+'>$ QIGb * ZYpk+}fYbDOz仧')ݻ)ٻEֻ>һ9ϻ̻4ɻeƻ|;];S;O_;;; ;t;;t;T;7-;%;<;u;;OQ;Y<}5B<Fx:6s2B/+x,($q!6<6n  {"rRkﻫ^:řaݻٻ{Rֻӻϻ̻ɻǻ.Ļ@;p;^e;M;,J;y[;];;;};;ؤ;^;3;%;4;b;;S S oj5C 9 \4xiZ:7Pc滪7߻YNܻػxջһAϻ̻4ɻyƻ Ļ[׿;x;8);; ;;p;;;;P;;);θ;_;;u;;;;ī<<<[< ;< :<9<7<5<1<+<7%<<,<{;:':8X7{5d31.4,b)F&S$Y!r SnV [īuﻎ_θ)໇Pݻڻֻӻpлͻ ʻǻ8)Żx»׿nC;-;p;a;;K;t;;;; ;;k;;V;;;[;4;#;);k<<<8;Ф;$;;^;;;I;[<)]o <+"<#'<'< (<'<'<%%<>#<: #%%'' ('B>'U&b(%#+">o }fiEdw kxf)][I^ﻴ$Ф>8r޻dۻCػg4ջ7һ0Kϻ\p̻4ɻWƻ`BĻ);@X; п;U;;Ҍ;>;;$;|;ޠ;;;;;;I;>;w;v;;;1;;T<@軐I廋;޻ۻػޠջ|һ$ϻ̻>ʻҌǻĻU» п@X).;9;';[; ;{d;;w;fY;;;;!;;";;9;:;[};;(;;;+{;;<(î;S;^;`;0;:@:9Ϲ@0`^S>îл<߻@Nz ' 2(5|{n s 
% k PF o@Nar*wWfg_ớc޻jsۻ0ػ3ջһ,л;zͻwʻN;ȻŻ.ûRT^cO;*;;>;;4;]S;;B;N;^;;ԥ;4Y;;;k;;uh;gP;G>;0;&;;;;;;4\gPuh㻀kݻڻػ4Yջԥһϻ^ͻNʻBȻŻ]Sû4>*O;)ȵ;9; ;Z;ݞ;;D;P;a;Յ;x;;;D; O;;Ġ;R;;e;z;S5;;;gQ;;ŏ;;;A;\<<ٙ<y<`;< <V<֣<7<;;:4:94غ>dRΪTԳPͻػo!⻅}`g6Vd0qC7֣V `;yٙ\AŏgQS5ze仮R߻Ġܻٻ O׻DԻһϻxͻՅʻaȻPŻDûݞZ 9ﷻ)ȵ;ɳ;ܵ;N;5;G;|;T;%;N;ң; ;e;;5B;*;4;;W7;һ;@;;E; ;J7;;;cQ;>;x;g;ן;^;N;L; l;"<.h<<2f<<;N;;;P;;p;$;u;;@&;9;;';3;j$;Ev;XS;/;} ;:L q:292L qǺ} /XSEvj$3'9@&˻Իuܻ$pPN2f.h" lLN^ןgx>cQJ7 绽E廦@һݻW7ۻػ4ֻ*ӻ5Bѻλe̻ ʻңǻNŻ%ûT|G5Nܵɳ˯;-±;9;ŵ;@ӷ;;7;';Q;ނ;=;;{:;Y;;p;q;;;to;';>;Z;;8;;;j&;0;=;;)>;2;;; ;";@N;p;ҋ; ; ;;_;y;';Wq;;[;^G; ;0ؾ;a;b;;dُ;i;d;D;E#;{i;:uB_:tC9tCuB_{iE#Ddidُba0ؾ Ȼ^Gл[׻ݻWq'y_  ҋp@N" 2)>=0j&8Z>޻'ۻtoٻ׻ԻqһpлͻY˻{:ɻƻ=Ļނ»Q'7蹻@ӷŵ9-±˯SЭ;!;;;ʃ;;m;Ѝ;✽;M;;;Y ;/;KW;;g;;; %;G;Ac;w;́;;Sn;J;;ƿ;GP;;;!;} ;;));P;;&;;I;f;;E;"c;~;; ;F; ;S;劼;; ;NԞ;;0y;q;adU;FQ7;*;"::t]O:ޛ9ޛt]O嫺"*FQ7adUq0yNԞ 劼SĻ ˻Fһ ׻ܻ~"c仛EfI컎;&P))} !컒GPƿJSnỡ߻́ݻwۻAcٻG׻ %ջһлgλ̻KWʻ/ȻY ƻûM✽Ѝmʃ!SЭͫ;O;r;sM;-;;;`;;ڼ;־;~;;;-;s;;=;];;;;';r; 9;;;;;";;W;;Jl;o;l;;W;*;X;޼; ;.;;T;x;;k;;M߹;;7+; ; 0;t;F{;b`b;IhG;!+; ;b:#:A:696A#b޺ !+IhGb`bF{t 0 7+M߹kǻͻxһTֻڻ.ݻ ߻޼ỜX*WloJlW㻪"Ỡ໙߻ݻۻ 9ڻrػ'ֻԻһл]λ=̻ʻsȻ-ƻĻ»~־ڼ⺻`-sMrOͫũ;;B;^;Ұ;e;t;xJ;"$;;o߻;;;;d;D;D";;;Ҟ;d; ;;r;;;;6;f;u;;_;;;ǭ;T;';;;k;w; a;;k?;\|;F6;c;t;;[K;0;;;>;^[;\;;jl;4T;:;3 ;ex;:K:\E4:p9p\E4KϺex3 :4Tjl\^[>寻0[K»tǻc̻F6л\|ӻk?ֻػ aڻwۻkܻݻ޻'޻T޻ǭݻ;ݻ;_ܻuۻfڻ6ٻ׻ֻջrӻѻ лdλҞ̻ʻȻD"ǻDŻdûo߻"$xJteҰ^Bũ&;`; ;;.s;+;;F;b;'$;;^;k;-;;©;`b;;i;f;;;;R;;;,;];t;xk;7@;;p;;D;;`;;%;,y;f;&;L;_;);a;y;;mٳ;w;+; ;˖;ݍ;D;t;ID^;mLG;A6/;;`7: ::(: aa9 aa( º`7A6/mLGID^tDݍ˖ +wmٳyaû)ƻ_ʻLͻ&ϻfѻ,yӻ%Իջ`ֻֻDֻֻpֻջ7@ջxkԻtӻ]һ,ѻ;ϻRλͻ˻ʻfȻiƻŻ`bû©-k^渻'$bF屻+.s `&W; <;;ը;bq;J;;T;;;F;&;;^7;kٺ;4x;;;6;M;8;;";b;4;;T;;.;;^1;@;; ;L;;\;%;5;};4;n;;2;x:;8;;;;E֤; ;P;ק;V ;4y;f;9RQ;{;; $; ;S:­:D:(:LS9LS(D­S麭 ${;9RQf4yV קP E֤8x:2Ļnǻ4ɻ}˻5̻%λ\ϻϻLϻ лϻ@ϻ^1ϻλ.ͻ̻T˻ʻ4ɻbȻ"ǻŻ8ĻM»64xkٺ^7&FTJbq;ը ;>ADōGOIK{MNOOOON{MKOIōGDA>>;73]/+~'A#ri&K k ^2!ǭݻֻ лɻBû-L#Eڦ֜"抻!ц₻2~"vo^iEbJ\&;V^dPiJVEM@ ;]/6j|1g,(nX$!GP%۩).,. 2P7ɻLϻDֻT޻Jl} Nٙ]% 9 Y!p9&*B/37{ <@CbGĞJ}MP\KRPT)pUZV|V|VZV)pUPT\KRP}MĞJbGC@{ <73B/*p9&Y! 
9% ]ٙN} JlT޻DֻLϻA>ɻ»d%ڶZ;KE zs܃l9e-_ X7RL\GA(45)9\[>CF\I/O$>U[bho`Xw&~7w%m\y٩^2}» Ȼϻֻ'޻oLys frS$7).s27W<]@-E&HI= MPSsVY{[b\B]^^B]b\{[YsVSP= M&HI-E]@W<7s2.7)S$rfs yLo'޻ֻϻ Ȼ»2}^y٩\m%7w~&`Xwohb[$>U/OF\IC\[>5)9>(4TV/j*7&!7V" '+$o0Mg5U:?ERK\QW`0^dTl7zs+{YŅ\cPs͢a41g>^)ǻ\ϻ`ֻ޻l)) l`;0 /}A!F&9,u16;;@E xJN*SWƔZɱ][`̈b1dPeOeOePe1d̈b[`ɱ]ƔZW*SN xJE;@;6u19,F&A!}/ 0`; l ))l޻`ֻ\ϻ)ǻ^g>41a͢sP\cYŅ+{7zsTld`0^W\QRKE?U:Mg5$o0+ '"V7W:#k'k,s16;sA9Go vE&4,72D8R>BYDKJPUx$[J<`0dlEim_psu v7w7w vus_pmlEi0dJ<`x$[UPKJBYDR>D8724,vE&>o '#p V.h@N;&Wkܻ%Ի5̻eŻ9`+ͻRGKC-w5{zrjmc[\yUOHB](= 7`2XM-m(S#E=B<@4ϴw- I$$E(|-03J8O>;DiJPW ^e^mu)~iu둻3o󐼻û}˻,yӻwۻ*从p֣w 7i+"0a(. (5;}5BH`6O[U[XaMglRqUu=yt@|~ss~t@|=yUuRqlMgXa[[U`6OH}5B; (5.0a(+"i7w ֣p*wۻ,yӻ}˻û󐼻o򮻋3둻iu)~u^me ^WPiJ;DO>J803|-E(I$$ -wϴ4 YcCg$Y)b.3|9X?vEKIRJY`]}hpxJրpDS3 +*Y0n4ɻfѻ aڻXIҋ2f7;Q IxV#[*18o?FL1M>T2[xa#htnt`y~aoa^Z?Z?a^oa~`yttn#hxa2[>TL1MFo?81[*#xVI;Q 72fҋI컜X aڻfѻ4ɻ0nY*+ 3SDpJրxp]}h`JYIRKvEX?|93b.Y)g$CcY _+ =TjP$;).k42:=@ۏF,MT Y[bj@s|>l|ΑeErİm򷻤~nǻ&ϻػ޼f 4p {X]b(%,,d3:yUBIQ_Y`wh"oלv}Wt2i\ߊ ` `\ߊit2W}לv"owh`_YQIyUB:d3,,b(%X]{4p  f޼ỡػ&ϻnǻ~mİrEeΑ|>l|@sjb Y[T,MۏF=@2:k4.;)$PjT=+ _ѫD < #;yt$ )/4:B@2G$WN΂U3]d5mu(V\d O3hsխ!aĻLͻk?ֻ ߻ qC{N ~@`'U&-{5i=EM+V9^f.o3w~ojl1 ܏MM ܏1ljo~3w.of9^+VMEi={5-U&`'~@{N qC  ߻k?ֻLͻĻa!խs򦻶h3 O\d(Vu5md3]΂U$WN2GB@:4/ )t$y;#< D ѫI.*| ގ49r2$;)/4 ;A@HROӿV^If `oIsxU쐻і *Kٱ2_ʻ\|ӻ.ݻEN0v 5|hB>'$/X7?H2QpZ'cm#v΃ER JyG`ɫSSɫG`yJR E΃#vm'cpZ2QH?X7$/B>'h5|v 0NE.ݻ\|ӻ_ʻ2ٱK* і쐻UIsx `oIf^ӿVRO@HA ;4/;)2$r49ގ| *.IckȲP 5#-)."46;AfHPP,W7_ch\qzqg;SR7䟻Sצ82x:)ƻF6лڻ"c_ﻀd% < '108gBgK<UN^ iN*s?}䕃 e=[~QddQ~[= e䕃?}N*s iN^<UgKgB810' <% d_"c仝ڻF6л)ƻx:82Sצ7䟻RS;qgz\qch7_,WPPfHA6;"4.-)#5 PȲkcYށƜKbn std"E(Fo.4 ;A IGP`X*`iF!sG|:S?̛.8aûc̻Tֻ~y컏6V 2( (0':CM+kX1c3nB[y4E·5b;*@FQd ^j\wł@؀0i顯A 4KṼKṼ4A 顯i0؀@ł\wjd ^Q*@F>;0'v<' UWq ׻ͻ»oYF. OgX$a=kvR2\ey6E֤w0 ˻[׻${-zX6>##6Xz{-$[׻ ˻0wE֤6y\eR2v=k$agX. 
OF׉>6/ )"}y iet&# ݺֺXкtʺú740Siiĺʺ>HѺ/غmߺI'm>" U6 &-4fD  +寻M߹SĻ^Gлuܻ}n@M : +7DRxar7й?y0I4,Ώż!μUռ]vۼ߼··߼]vۼUռ!μΏż4,I0y?й7rxaRD7+: M n@}uܻ^GлSĻM߹寻+ Df>lv$k`VhMYD<4-&6 U >"'mIﺸmߺ/غ>HѺʺiĺSi083U#\ĺ[ʺ Ѻ^غD+{ {Wn W#*t19E.BKT^iUu\쀻ɤGߎP 劼 ȻԻo!⻋𻀀XF QI(5PBgQiaչrNU<>Vѭz żU,мtڼEp㼐Wdd0EptڼU,м żzѭ>VGQ$\Tg[[sr&2 ק˖>7+0ؾ@&˻ػ?@NsP@U$1-?N_Ero1*AXμۼg66gۼXμA*1oEr_N-?1@U$Ps@N?绋ػ@&˻0ؾ7+>˖ק2 r&[[sTg$\QGk>+f6o.&Q_6,/U on$IY,LٺUѺ}ʺúF`#JeQmw]ǐKl,🩺X&\ЮSVºXɺ,кغwTOnbσ"N*W2K:CLM ~XcNp}V ݍ^[ a9Pͻg5ܻ컦hoS+Q:nSJB \o 5xe\E/Ǽ׼[FdY  d3d3 Y Fd[׼/ǼE\e5x oB \nSJQ:+Sohg5ܻPͻ9a ^[ݍV }Npc ~XLMCK:W2N*"σbOnTwغ,кXɺSVºЮ&\X🩺,lK]ǐ/䄺k+mJ͖֛i!OS2 FǺκ}ֺߺ~>Tw+t /4=$,5> HySS_k4yD\ 0NԞbԳϻ<߻d𻎶J 7%3SD\VAk፼ ]9!μἪF-o ``-o F!μ9] ፼Ak\VSD37%J d<߻ϻԳbNԞ 0\D4ykS_yS H>5,$=4/+t Tw>~ߺ}ֺκFǺ 2SOi!֛J͖+mk/䄺qAAy鸺ɺgӺݺ`@󺁕n hW`;&0{;mLG4Tb`bqij$R&>îӗMλ5a{%&%no8NMgm`@tڼtڼ@m`MgNno8&%%{a5Mλӗ>î&Rj$iqb`b4TmLG{;0;&`hW n`@ݺgӺɺ鸺|> Cܖ-eϋs쁺Kz@rijbyZSJLwF*Z@.G',r628E@> DӋK]RGGZ?bYjƌsB|s!RCJQ|)["PʺԺRߺDu@ Rc $A6/:IhGadUdEvNSߨ54ɻݻwrr(u=VtԼԼԼԼԼԼԼԼԼԼԼԼԼԼԼԼtVu=(rrwݻ4ɻ5ߨSNEvdadUIhG:A6/ $cR@ uDRߺԺPʺ"[)|QJC!RsB|ƌsYj?bGGZ]RӋK DE@>8r62,.G' FEIEυ! ',42h94@.bGNV}_hOr:|6Gm7LrȺӺprߺ4C3Z 3 !+FQ7DXSdw^_t7t;ֻP@b) EAl]E@ ԯX9X9X9X9X9X9X9X9X9X9X9X9X9X9X9X9 ԯ@El] EAb)@P;ֻt7t_^wdXSDFQ7!+3  Z34CprߺӺȺrL7mG6:|Orh}_VN.bG4@h942, 'υ!IEEF 蹐X/4 : '0;#)0-7X> F"FO3XIakIv+e^yFRcp~?ºκ)ۺS`7ex *E#/>S N`|t慻Eǵ4ɻk⻒'bAxaqttttttttttttttttttqxabA'k4ɻǵE慻|t`S N>/E#* ex`7S)ۺκº~?pcRyF^e+IvkIa3X"FO FX>-70)#;'0:  4/XX{@QOp^ǹιչk/ݹ uYtY7ͼ [8C 6!r(4;018ͳ@IۖS^)ciru\{w4s­ ºϺb޺"{i} "0\A#TLih ?A.λ ۼ>;_A`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`e_>;ۼ A.λ? 
hLi#T\A0"} {i"b޺Ϻ º­sw4{\ru)ci^ۖSIͳ@184;0r(!6C [8ͼ 7YtuY k/ݹչι^ǹOp@QX{ҁcՓ똹F꣹jݩw$7ƶɽB6Ź͹mչL޹E|v] ~6X'09]CN[PKhIvDK#嫺Ǻغ~ )8 <QYQkuZխ4ɻ컜 # # # # # # # # # # # # # # # # # # # # 4ɻխuZYQkQ8 <)~ 꺳غǺ嫺#KDIvPKh[N]C90X'~6] v|EL޹mչ͹B6Źɽ7ƶw$jݩ꣹F똹Փcҁ $>&+m17)>ND5LƧS[Ddmk{v$DܚH1Uȗ iNSɹӹV߹,빥*wO ((\E4At]OuB_L q4Ӎ@ND#gͺ;H?&" 9 UxM->îĻĻĻĻĻĻĻĻĻĻĻĻĻĻĻĻĻĻĻĻ>îM-x U 9&"?;HgͺD#N@Ӎ4L quB_t]OA\E4((O w*,V߹ӹSɹNi𭹘 ȗ1UHܚ$Dk{vmDd[ƧS5LND)>7m1+$>& fPV0]f*eluU}Ba(2)^ EsA)Je๸ʸӸݸ->cu b_&A(0%:FLS aap6ޛtC2ɽϹ4] s+5f@Ycy3ź-踰ݸӸʸe๸JA)Es^ )2(BaU}ulf*e0]VfPfP8V80]8f*e8l8u8U}8Ba8(8288)8^ 8Es8A)8J8e88888-8>88c9u 99b9_&9A(09%:9F9LS9 aa9p969ޛ9tC9299ɽ9994]9 ::s+:5f@:Y:cy:::3:<:o;o;o;o;o;o;o;o;o;o;o;o;o;o;o;o;o;o;o;o;<:3:::cy:Y:5f@:s+:: :4]999ɽ9929tC9ޛ969p9 aa9LS9F9%:9A(09_&9b99u 9c98>8-88888e8J8A)8Es8^ 8)8828(8Ba8U}8u8l8f*e80]8V8fP89 9$>&9+9m1979)>9ND95L9ƧS9[9Dd9m9k{v9$D9ܚ9H91U9ȗ99 99i9N9S99V9,9*9w:O ::(:(:\E4:A:t]O:uB_:L q:4:Ӎ:@:N:D#:g:;H::?;&"; 9; U;x;M-;>î;;;;;;;;;;;;;;;;;;;;;>î;M-;x; U; 9;&";?;:;H:g:D#:N:@:Ӎ:4:L q:uB_:t]O:A:\E4:(:(::O :w:*9,9V99S9N9i99 99ȗ91U9H9ܚ9$D9k{v9m9Dd9[9ƧS95L9ND9)>979m19+9$>&9 99ҁ99c99Փ99F99jݩ9w$97ƶ9ɽ9B699m9L99E9|9v:] :::~6:X':0:9:]C:N:[:PKh:Iv:D::K:#:::::::~ ;;);8 <;Q;YQk;;uZ;խ;4;; < #< #< #< #< #< #< #< #< #< #< #< #< #< #< #< #< #< #< #< #< <;4;խ;uZ;;YQk;Q;8 <;);;~ ;::::::#:K::D:Iv:PKh:[:N:]C:9:0:X':~6:::] :v:|9E99L9m99B69ɽ97ƶ9w$9jݩ99F99Փ99c99ҁ9X{9@Q9Op9^999k/9 9uY9t9Y97:ͼ :[8:C :6:!:r(:4;0:18:ͳ@:I:ۖS:^:)ci:ru:\:{:w4:s:::­: ::b:":{i;} ;;";0;\A;#T;Li;h; ;?;;A.;; <ۼ<>;<_;<ۼ< <;A.;;?; ;h;Li;#T;\A;0;";;} ;{i;":b:: :­:::s:w4:{:\:ru:)ci:^:ۖS:I:ͳ@:18:4;0:r(:!:6:C :[8:ͼ :7:Y9t9uY9 9k/999^9Op9@Q9X{99X9/94:: :: ::'0:;:#:):0:-7:X>: F:"FO:3X:Ia:k:Iv:+:e:^:yF:R:c:p:~?:::):S:`7:ex; ;*;E#;/;>;S N;`;|t;;E;ǵ;;4;k;;<';/;E#;*; ;ex;`7:S:):::~?:p:c:R:yF:^:e:+:Iv:k:Ia:3X:"FO: F:X>:-7:0:):#:;:'0::: : ::4:/9X99 :F:E:IE:υ!: ':,:42:h9:4@:.bG:N:V:}_:h:Or::|:6:G:m::::7:L:r:::pr:4C:3:Z; ;;3 ;!+;FQ7;D;XS;d;w;^;_;t;7;t;;;P;@<: D:ӋK:]R:GGZ:?b:Yj:ƌs:B|:s::!R:C:J:Q:|:):[:":P::R:D:u:;@ ;R;c; 
$;A6/;:;IhG;adU;d;Ev;;N;S;ߨ;5;4;;w;r:8:r62:,:.G':*Z@:wF:JL:S:yZ:jb:i:@r:Kz::s:eϋ:-:ܖ::C: :|>::::g:::`@::n; ;hW;`;;&;0;{;;mLG;4T;b`b;q;i;j$;R;&;>î;ӗ;M;5;a;{<%<&%î;&;R;j$;i;q;b`b;4T;mLG;{;;0;;&;`;hW; ;n;:`@:::g::::|>: :C::ܖ:-:eϋ:s::Kz:@r:i:jb:yZ:S:JL:wF:*Z@:=&Y:`:;g:n:Iv:::.:\:&y:-ݗ::Ñ:i::Ϸ:c:2p::x:U:<:V:;};W;[;;8';4~0;R:;7sE;9RQ;ID^;jl;F{;0y;dُ;3;Ϊ;f;;p;37;l6;ld<,<)!<B2<0E:Tw:;+t ;/;4;=;$;,;5;>; H;yS;S_;k;4y;D;\; 0;NԞ;b;;Գ;;<;d;;5;,;$;=;4;/;+t ;;Tw:>:~::}::F: :2::S:O:i!:֛:J͖::+m:k:/:]ǐ:K:l::,::X:&\:Ю:SV:X:,:::w:T:On:b;;;;σ;";N*;W2;K:;C;LM; ~X;c;Np;};V ;ݍ;^[; ; ;a;9;P;g5;;;h;G;Q;$\;Tg;[[s;r&;2 ;ק;˖;>;7+;;0ؾ;@&;;?;@N;s;˖;ק;2 ;r&;[[s;Tg;$\;Q;G;k>;+f6;o.;&;;Q_;6,;/U ;o;;n:$:IY:,:L:U:}::F:`#:J::e:Q:mw::::83:U#:\:[: :^:D:+{: :{:W;n; ; ;W;;#;*;t1;9;E.B;K;T;^;i;Uu;\;ɤ;Gߎ;P; ;;;劼; ;;o!;;V<ѭV<<H:/:m::I:'m:>";; ;;U;;6 ;&;-;4;<;YD;hM;V;`;$k;lv;>;f;D; ; ;+;;M߹;S;^G;u;};n@;;lv;$k;`;V;hM;YD;<;4;-;&;6 ;;U;; ;;>";'m:I::m:/:>H::i:Si::0:74::t:X:: :#:&:t::e;i;y ;;};;"; );/;6;׉>;F;. O;gX;$a;=k;v;R2;\e;y;;6;E֤;w;0;; ;[;$;;{-#<#<6;6;/; );";;};;y ;i;e;:t:&:#: ::X:t::74:O::\:!::&::n:;ve; ;;; ?; ;$;+;1;٥8;@;$G;P;|X;)b;r l;ɇv;؀;Ɇ;;H;D;ʢ;;mٳ;[K;k;F;;p;`;&< <8<&<%%;<*@F;<0<'<'<$/'<l;|;Α;e;E;r;;İ;m;~;n;&;;޼;f; ;<<4p <{<l;;|;@s;j;b; Y[;T;,M;ۏF;=@;2:;k4;.;;);$;P;j;T;=;;+ ;_; ;Y;;;c;C;g$;Y);b.;3;|9;X?;vE;K;IR;JY;`;]}h;p;x;Jր;p;D;S;;3; ;+;*;Y;0n;;4;f; a;X;I;ҋ;2f<7<;Q <T<2[T;;D;iJ;P;W; ^;e;^m;u;)~;iu;;;;3;;;;;o;;;};,y;w;*;;p;<֣;J8;03;|-;E(;I$$; ;-;w;ϴ;4;@;<;B;E=;S#;m(;XM-;`2; 7;](=;B;H;O;yU;[\;mc;j;r;z;{;5;-w;KC;G;R;;ͻ;;+;;9`;e;5;%;k;W;;&;@N;.h<V<p <<'#<<>o <D8<72<4,o <<'#<<p <V<.h<@N;;&;W;k;%;5;e;9`;;+;;ͻ;;R;G;KC;-w;5;{;z;r;j;mc;[\;yU;O;H;B;](=; 7;`2;XM-;m(;S#;E=;B;<;@;;;;W:#;k';k,;s1;6;;;sA;9G;;^;;);\;`;;l;)); ; l;`;<0< <;41;a;͢;s;P;\c;;;YŅ;;+{;7zs;Tl;d;`0^;W;\Q;RK;E;?;U:;Mg5;$o0;+; ';";V;7;;!;7&;j*;TV/;>(4;5)9;\[>;C;F\I;/O;$>U;[;b;h;o;`Xw;&;~;;;7w;%;;m;\;y٩;^;2};;; ;;;';o;;;L;y<U;/O;F\I;C;\[>;5)9;>(4;TV/;j*;7&;!;;!;GP%;۩);.,.; 2;P7;<;A;\G;L;7R; X;-_;9e;܃l;s; z;KE;;;Z;;;ڶ;;;%;;;d;;;A>;L;D;T;Jl;} 
;;N;ٙ<]<% <9< <<;;;d;;;%;;;ڶ;;;Z;;;KE; z;s;܃l;9e;-_; X;7R;L;\G;A;<;P7; 2;.,.;۩);GP%;!;nX$;(;g,;j|1;]/6; ;;M@;VE;iJ;^dP;&;V;J\;Eb;^i;o;"v;2~;;!ц;;";;;֜;;Eڦ;#;;L;-;B;; ;;ǭ;;!;2;^;< ;<><A<>;<7<3<]/<+<~'HA @ A @ A HI\t>L:~AFMf͢KҎ;(::f}2 Id2ۿ?X3Ӆ xM2Lj_\\!DD4u?P(wZ퀿i4 ߋlv*--<;;[i^0"5BA^~M('JEBUUU۷oSrr2i4R*V)""vE#yJ:;;ÇTRRBGu… iDJr9jh4xbڵkSGGWx) H03[Ve___ᱛ7ofG---+#"޴iz[uWHʓIIIaiiiz m3??_XGzzSO.ٳgmɕDZ,NLL;K,auuubwlll,y ޖ'{Kpp0Ka4yJ%::{{{h4Wǎ;˗/O}9L~=O0؜H 䦦&vZZZ8((H2~ܹz̙#Vݻwꉊr \ ,p*ߟu: myr ۺy0~Ϟ=bv!yL&[ns@] ³q8_LUWy8q"gdd˗`0baN/^^hG ;t0W =vؠm$%%ׯ_%PQQ1qƹWLEEE,:3_3g۷ow*iٲe\VVVծX,޺ud{'Oƛf^tdqӧO;lʕlX$A@@:tkjjXqOO777s^^3fЉ ԩSfffJ̛7ϩa ͒ی#?^XGLL'Oϟ?]\Pg'Ȃ ?I]>}*7a7lxbjJjW3كl;e_V%W,„iii###%t:S}w7'HTUUE_oߒV%HFwrss)''ǡ>}/_.<oRWWuuuj@6mEEEQBB%%%JrBd0vJT*y欬,I{'Ay͚5v׽qF.\D DvE&婵UlqNh46 ݸ4=z.HxV[!S^w˓d6iÆ ͛aBVy\LL _Gݹ2=|xbyJŋ1| L!_Eiiwk׮ycE ߾}^pa ?8=$ &!U E(Ւq&#""F SL>z?=>!{? J=9b>|?Y=R>q|? M=|?'< >O|?|?Np <`>|?Մ; >|?x`q">|?$>|?5&>|?-̼=*>W|?E d0>{?_ L;>J{?TDK>z?[cb>;w?#܁>7t?d=>Mm?Ю̴>(b?[Ծ>N? >}6.?]|,H?>bF*Z?ӳ]>8^P?HȽwDa>̾X#*>7>"] <>°K՟Ӿ9>qdI==$rlg E#=*y(%ԁ|I4A~E'U~$ҋ ~9(Qئ~S*>ܽY~Ѯ}?ZH F}G:` @}T; W}xv:n}zj j}<Io}VA}CB}w-]t}E }Gt}`H ɍ}nM"b};Z}B 7tzu>S=]{?/==Y0}?P=M=}?[=>=~?==/~?K;<==~?m<n=+A~? R}? > }?. _y>|?顽#>{?7o;>z? "}X]>v?db>p? >we?ZJҾY>7S?- HL>V4?lh*E ?6R?EN?U>Oa?e3E"?@l+>.Y >ZFO׾^e>a `+>4pl=y2x,S]$=Q'|CI:0+Խ*{>Z=|?[==}? i==s~?Ź]==H~?j= V=~?3<=~?͆<ٲ=~?<=1~?G ;B{=C~?_Q̺=|~?˩%վ=*~? Њ=M~?Rɼ=~?~}=Ih~?[=}?5>y|?ɇ㽚>L{? @>+8x?j^9o>=r?N{>th?>X;>}LW?_->C:?&? ?By~?u>xNT? F ?옡3Y.,>kf{۶>W@9ݾ4^>3]ƥ7>7mIcs=>v$s1=kk{ %1=X}½`ѯ<~`Ţ ;Up\RZ#_*(Nbr9\8X|>e=|?ދ=5k=V*~?&= )u=~?_=2?=?0=F=F?=zc?<9=e?|.;硍=b?GBM=[?I=P?r}=@?³ļ!=j$?.=~?V=~~?;!=Ů}?ݽ- >v,|?4w5+>by?KWuY>pFt? ϓ*>k?ƾt\>[?>)@?"7?!l?d>,?>˰LP?7=nG?o10>AA>8e>zXԮW>k }3k>1u8j=vz9Fc=}*ʽ">=y~o%9<;1Ef:@;1*#;v.ż/!z2*X?!JlC;9sԹX1Tx+$92]eISŢgk3UhJh>D);Rb>٢4̻ԇpX/ڼk@Gf<\K;;0]x 0<^ƛI<ۈj9<y`N|V>*=G}?˫=ע0=JV~?x=9=y~?a=0C=Q?p=K=|?[Iz?ϧO9G>u?\󍾱>tEm?9>^?W>E?_\>}9?/9? 
>d9J?=H?Vn4@)?py>06>/3Syt>gvt0>+MsA=Vy:=p|J2Խz= ~@7.8= ~[Ors<{6 ,z; Ҽ*3 ?~vW"{ ;WIQr L;lbt*пL*'d<\`odcL<[ *Wtz!&#|>Uc=]'}?Ʀ=c"=l~?j=p=?lc=a=wm?j!=\ =К?5<&=9?_Q<*=?I<-=??K;s1=?\s6=?&==?mkqD=?ܷwQ=?Fh=iq?HZ=_?Tj=A|~?Rνľ=H}?/i >u{?FZ9>`v?r~s>c>o?J>a?ˈ>XK?! >&?3{3? >Fʛ?&>H?DldU87 ?YݾT>:>'Rz>4"M@}> &d:xI>"qmLRZ>i xJW={Pཤ4=}o^rw=~0=Qn6[xE?=r}=süoi=.Լ8 3==N~Ҁ>~d|G>|<1}?z{=Xg{?X<,>w?e> q?JDY> e?pj>P?:_|>$/?- ?:>BB?e>hH{j?G;?!>^J%q>*FL7ƾP>X_(6`>nXXb@)>v>z;|r=&}$t=]M~)H=*KBC<_];+T`yɻdOU!s9ҳ ]T37Ճ=E삼ݧMSףځ)%2w?,$r }H<7*uJ8*8< L<Fc<fV<ż%A<)os<Õra<~Һ=%jB>=Ԣjc?|=q{Ҩ=T.Z=\~J>Eo}>Q<4}?K=Ǡ<}~?=T<*?Wf=h<?`#=`y<?ax?/ s-Y>r?13>g?~վU>6U?%]>p7?& ? ?$>? ]>G?D(O>y?&?j>0>HѾ>}>H[v>kf)I<>t_*>Fyj=e|_=#}Sq=u~Oϩ ="`<|z豒:N8..6!b$JSҺ!޼bB2J-<#,A/X?/֋-/PE˲S$w"Nf7kS-:-Zc"&Ԃh}},>Gei<4}?>=1}<|~?=<-?h=3<6?Ɋ%=<?A3Šy?WdJM>/t?z>5j?ɾ2>SZ?з>T??" ?4?H8,?<dz>n+FW!?#dt=AO?{;,}?^ pr>5|Zݾ<>wU狦D>Fhhv*O>osK6W!>xs_=T{ϽR=}О=~^m+=ze0S Y<n닼DM;;i;;<;fr;)fPǕ;(oKjE]G\3aɻϣM ̷:6;h9\:'/I= #=C*k~ٙ>C}E>6< 2}?=Y=^nJ<~~?=&j<4.?)j=p<?x}'=Bג<?%<]2<?h<?G<?(=?XrrR={6?=#=w~?=A齀= }?K>>az?5VA>u?aw>|m?d3>^?:>F?޳?!? L2-?T>,C!?>OC/?151K#?7ųL>7*@M꾈>8Om(>GdM׃b>p}C1>\wku >zX޽= } =f~ooG=A'k<GF}y[> <-}?=H!<|~?UY=ɹ@<7-?l=a{? I6>v?3zt>^o?Na>b?侧>M? >+? t+|?T>?E ?@T>D"?GȽ6 ?7Ǿo1>8>pG -O >I``;u>uGnR A>zuV> z[-=܂|óV=D~a=&= I8~< Py :@< ~%LI<TdGW<H3m:r`m˻79F$x34c;bGJ<B@m:ỵ:qU=rݻa }*>;'}?<=z>;w~?=w<*?o=?<"w?1zf>Xnq?ZŤy,>^S?5 >5?~$ ??;?_>D#?oF;2?P{"17???+> N?ɾ9h>QO[lw'>MkbQ><tr*-J&>y2O>_{½&޼=}JO{=.~\4= \n<(*<ɡ/; K9wFq7Q!;&Hp<<]hp<CQa1;:W8B<{?bwʹ6$}Q9>Y;}?^\=+;p~?i=;O&?cs=<?[0=2<2?x?gh Y>-Is?W͎>Oi?^ǾĶ>]Y? 5>;>?ta?N?.6?~?>~C$?Gy{=?)?|+H) ? >Y5[׾1>mU3>gbs8c>CrA7'4>vwR >.{ѽv=`Z}hΖ?=U~EDQ)=|Qa׼+<ֲ:rd''} W>nW8;}?Z=꫃;yi~?BM=D;-!?x=g=<?4=y?2 XL>Jt? qz>~l?Zz>mm^?@>2:G?eQ?Su"?L 09:?>A,#?_>B^C!?On,W/v?羰vVu>*,徲H>`N>cSu>(o G0gB>^vy>[zi㽭S=|4t=az~U)>=_Do=<4n*<jq<8CŨ<7fˀ<+~;?cZкt3g;-)F<sҥD<K;5:q:[H<_ ;5 ;L-= z~x> ')}3>Rմ: }?HY=.*;c~?K=s;?4s{=e; ?{7=;?=y<8?p<<*?ic<?g{m6=B?p=Q~?=-}?C>z?@/Hc?>^nv?dm|>%o?>b?޾>N?U >3.?lP)?x>>' ?)d>vD#?. 
6?zľw?H$|>FJo>^%K>l XP>tZ$`!>NkyUj8=_|c = 5~hLS=>q =pS:<*;<N)e<ҭ~&<޿;quKYٹ*n֌b,M?sи;=ES<<N<wO$!;::I*}>A7}?Q="<:3`~?=5R;?j+}=0;ׁ?U9=6;?P=:/;@?k&=>]?3✽O=M~?нü=n}?0I k>Zp{?9>3>aw?}tb8l>zq? Uʗ>f?оH>zsU?!TUl>S8?'"v ?I ?:K?>'EP{#?e4; ?1ɞ"7)?e(>=˾>fxYJ>ijW_> rh2o,>Xxh={8M/=},N}i=V=@饼֣<i=<_t  ڵ<4W *W)}>d(}?Q=#)^~?oB=*;U?}=A;́?{9= ;?)=$$;?%<;%?7d\=r?‘n=9~?pA]=}?|=|?*'>x?.ao\>s?>j?>c[?k>UbB??b?55q?Q>sDY"?y={@l?mU* ? >qc3ھ>&S4>eO{m>poBH7>?w e>2{vrϽ=}N=~"%e2=z%[=s=˖<2_Ǽu<|]ռ><638k'}>/\}?!=0K_~?F=Jw:?8 }=?;?8=T2;?&=L;?.y?PzM>Iu?@>m?%Ŧ>{`?V>J?&.֧>t'?J0?H>CZ?7'>osD|?Hj32N?!K>'뾖>L|>am|>4nRB>/u_>n\zI 9=3(}ҕEˋ=$~T7!F=qɎ߼hG=⧼= =<3ؼí9Q#}>9B}?S=ܺja~?'p=`9?{={; ?M*7=ޫw;?{=ό;?01<܉;?Wvz?@*?>v?<-u>r3p?(">e?2=۾Մ>"R? >y2?.+* ?kV?:AV?y>[^G?㍽l9?%w?t`&O>C sľ>M\]k>ikd%N>*%tT*>}y&X=5||=z~LRY=R.=.HļG=Ws˼\=YCp<I,<9Lʼ9;!;;@:}>_uGY }?=e'f~?O=w98 ?rx=%;?4=ιy;ۿ?$<V;?ak<č;`? O<[;A?`;rt;)?:Pl;}?0g;[?IK=Uz}?Tͽ>bw{?2`2>x?/md>nr?LŜ>{h?t;/>OqX?iG>}v IW?Y;P*Ծn>cW}>EhuxM`Y>A\r8C">rxP =*|6경b=3~c tl=U-]*e'==+$=swJ =<g?<WҼƠv;|9C;[<#$X<3v><^x;r:I;c3F<^;6U:}>2}?~=m~?=1:%?t=5;?w0= B;?X0y?_\kS>LSt?t>`l?Ws>|]?hr>xE? >?a9?U>BI?/|=|Ex?5"c/?]>#1>)YQm^>!d3$~d>Zp'G+>bw={ýU=}}t~=V'ҖJ=xVZ-=Ӥ }=7:r=MP;q0=I;Kp=K;΍,=ʿ;)=K:Zm~>9}|>%}?=0yw~?B==鞙:,?ro=yxX;?x+=g;?]";?Uܻ;?50 ;?|g;!?6<?}x=!z?9NC>.u? w>n?4{|^>8lb?"eY>LM?;!>^*?4? ?r>'%I ?50>tPJ~h?+j7] ?ݾ!.s>V&?%>Jax>v`co>nXۖ4>^!vE$>z*ֽ=}}.]=~Vl=VW=)_H6=!o'=a !<z;f<+d9;{ 廼%;C:-$;zG<0%ˆ9 }8'>T1}?W=P~?W=M:3?7^j=g#x;)?&=";?S<;Q?oߒ<;0?Z)<<;C?;;?ס":4;?Aq:;? Իa;?F';?imR#;?T<?7ܼO;u<?Qp&<?!%846=;?Yq5Rf=B&?_$=2e~?dؽu=}?h4>z?A_5>Hw?r5e>^q?Ԫ>BVf?\Aϳ>gS?<>4?/˒??GE?}>M?勽? 
?'j#>8>=EC$;i>i\˔1z>_ki2=>2t/({ >z$=}ƛ竐=~Svnc=C">=T;A=^!;n=i;d蜥=/y;(+=7:g~>0]97\}/>:;}?*=~?=~;48?uf=ޱ;J?b#=k;?~{?W5{P(>Cjx?qyT>s?Cv>ʫi?x}Ծ>pY?0 >=?*̎>;v?@E ?2>qP'?j;IlF1 ?],\b>ft ?>2E;NܾM>:W)->hE|yE>%s 6>Cy=Ȯ|~-[#=d~jCn=$F2 G=s; |$=|hy<߳?<G;ɼY;ߴ];-=<)<i-kxp;19y;/J;<;Rc<%;ǚ< D; <D:< ;Xwƙ}><ɋ>}?r=W~9"~?=#;9?e=潔;*?"=Z;)?<;?Zy?cZE>t?Ow>l?]ɾԚ>F^?Y`>UE?t$s>ޘ?D:BW?b>Q?¡=xL ?cVs4@>t9>n2F>ERkC$>e[M>eqzE>Rxso uk=3|-='~π|x=C6O=a[-*=:(,<5P3'}a> <}?]=fM:~?G==;F8?g=m;ǖ?8$=;?p!z?[W$7>2v?ve>n?Pw>ib?86Ʋ>ݭL?>QT(?i>?1>sR ?B>辱>)z>LL¾ȋ>,AbxYT>pwoMUYI>SIw%={˽L=<}=~TV=3M:0=~2<W!9Q;R<1v;䉁c}>!87}? = :F~?=mb;4? j=z;?z'=i;?B<}U;?՚@z?"!L=+>.w?-YU>>p?f>pe?l->dR?bJt>t1?m:?\>>[@R?YY>U ?<۽RD;?Ll;eN&m> ->wFϾ>^h/e[>bmVfj3!>)v*R"={{Dݽe=}嘽=i~6fTf]=8H5=pv=-<)9<;f)}<;Ei<5<(=0;Xk=`;.J='YD;,^=+9j~>Z};> :V1}? =q:y~?y=ʀ;0?Jm=s;ː?G+=;(?~<;?YL<;D?ٽT*x?Ђ!F>ʫr?xv>"h?徔ԙ>ЧW?˞>9?_6y>I ?9Q?gR>N Y#?6K>~.0>( e>F @oݾt>=ZJ +a>,kw %>t.=}zWd;=O}#>=~yb=?!V8=bDH=6ew0F< ]; ڼ\;;h%I;ɼZZG;7<޷;ʺ }>_O:,}?9z=;|~?=-;).?Xo=7;?-=`;g?&<^;?z{?jg9>y?Nw7>Q"t?yz@d>qk?۾q>[?m>@?j27>l{?͒O3>h>G[0?<.QJ>zY6L> YW>^9꾤:>Ve>h$ヾ_)>sLF;(=y=a|ѳ7=h~Af=).f@:=TR`<869$<~i;CG޼FR;Y;U1ʼVH< ,ͼ 1</@;k8f;7<5*!a}>9j+}?=b:z~?z=Aq;>-?p=?8;ݍ?ɛ.=J;?cӿy?mU*>`u?ޞLS>ˏm?Ӿq1>_?)o h>F?8.>Q?qM>l>]d>|=V`>Nw}=.|>Ť>|2bC>yRUi>}f~(W,>QirG(=$ym =+|L½C=7~X:i=~zum9=)4~}[>T,}?=f:{~?£=:;-?*p=\;?6.=t;?<;?<&;?e/C?abԏ?B :a?yaѼ]N;?+XH܅<?W*<?_4=B^?Az=M~?(ýT=)~?0g=֧|?*=bz?)d>,ov?5C>JXo?;m̾Bu>b?9*>K?;$*H>%?/Jf>>] >>w[X>BDQ>@]$>Gy+>;N[Ⱦul>( dS>/>qT(=lx*ڻ=d9|ѽ=~hk=~ソ$8=7fzp<{3;o e;7żhj;tμF}Ɛ>c.}?>=K|~?cU=:&/??s=JIK?%`0Һ?u :?6ʼv;p?2Lx<? 
#MUw?e@5>6p?žc>"e?Yя>CP?Q6&@>A,?mIH/>G=>Z(^>D3>^L>hK >gھc+++>d$q- 퓚>sIҾ#n>@aYlNi1>o4c`h3>Qw^={߽=} l=@~<6=p(.o(;;}~>(.}?=}~?y#=G:/?n=.;U?Q,=K&;C?p6;?R3&fx?9{F(>'r?῾S>[g?w>E>$T?.p"Wr> 3?3{EH>>x] >/a>ka>P>ƾ2Wھ>M_%>cE)ݾxAp>X^I3>Anl>mvѯ&dh=i{Tf=ɑ}+i n=ؔ~Qp4=lMxn20}}>!?,}?y=o|~?8=E,9.?qo=#H;?`-=X;?;}<{4n;#?R<)N;?R`< ;?jGs?2[E>Oi?`z>^gW?;>,8?[B>8?2A]!>I0>ZdDj>շMU>̤]18r>>KX0r>@EWZ~q>@\͏d4>l x>v/V=}{͗=Z}}xco=ov~O4=c6MI }5>A'}?=[x~?=) 8+?q=Gg:?/=L;S?C|<];?<68;?_k<:+?<_9?;Î?%;#?sOw˄q?ҩ? R?gRp%??ZVS?A};<?8<?'F<>?L0Ck =? B=x3?Hi}=~?mBD=h}?;=|?G(=Ry? }>Et?88> k?Vtj>eZ?FK>z=??d>d?JT\>g>ryfq>iZZ8(>柾A=S>C){>B<+U"Qr>Y 85>9ku>sOuī8ko=z9.&ј=x#}Dƽq=X~5=7~〆 aD}># }?=s~?9=9'?/u=`e;Q?FV3=Z;?/ =c;?<74;?{?E馽yq=B~?|۽–= }?H =_|?A=y?]w/| >&u?yyP,>l?龥A[>o]?o>A?W?,*[$Z> >g>BٍRC s>b! ;>y7lr>Vsx5>i|G>|tGA=P-zqN =|Ͻ'9s=l9~A7=~ f$y}->_躑}?z= ,m~?&٧=:}#?x=7;?7=aw;?=$w;?l]<==;?t׆<:?><?ru?wO!>m?㾀OM>|_?>E?9i>M?iY e>2>i>g=au>xGPH>%@>o3/5r>sSľ\5>hI&>sI_=TySy=4|ؽu=~e$9=~1}[(>3}?=8j~?=: ?z=W3`;?Ž9=;?=0;o?f<L;?Y'7o?ݾV@>Sa?9nLfv>yI?7>f>l!? -X=շ>>9i> r=Qe>d5WLY>6*>.vr>Qx^˾b$6>{f蓾}>fr4 RI=Py~lk=y|xw= }><=~I+< 2loQ<k-/u;7H;"ռl<޼j|M<PLۼ0<ľ(;u*<N<%@<;<_? <Z<2)<]<d;= ;U,=LO;Xel=`Ʉ:*=0.Y=[q~(>s}>xs}?=eiJ:Dj~?=k!; ?; {=;E?9=(;»??=憖; ?$<:c;}?A[<[:?7;Q<ц?.t<5:?ӿ;G C?;rnԻt?hԺy?0Ż [?7K?׈$?]?xDIp?ؾDk5>c?uYurh>.L?A%4i>&?\Vį>>#jD>i >0g>6`/>*N Fr>\N3RҾs6>dK^#>qbZ=x|==:|y=~}b==~s$4t<*%iv&U;Oܼ;3l@G<U޼*<3(E;V@(<`B;׃<k2<:mxX}|>Ɍ9}?,@=&:n~?=AG;#?ڌx=;?{7=;E? =;?D:<V;?a<;9?3L<9?<~=?;Uxp?;3»??!+?uj0`m? &?j-<?=ݼ;!?>l s<?X7O<?lo-=5`?N=~?uȽ=D~?̙=*}?- =>{?Gi=w?t>8q?"hӾ8/e?\ \>ʔO?.1'>L-+?\T:>*3>`*j>)>7jb>0UAN>WܾY3Eؗ>5%q>Kؾ37>kOc> q[bH=e\x#2=4{%z=;}OýN>=~ܠ\<@}\<{Z;3;ۺ :;LVf;M=< #< ;J$&N}\>zJ:z&}?==V:v~?~L=Nj;)?s=Tw;Ȋ? 
3=r;2?y=v;?Ҡ r?Uξ ">if?u&P>K6R?r7.>(z/?W6R9>V>iBk>ViH>/CX?>ϾW7Ȗ>k!Ϩq>H:Z߾8>a%ߣRQ> pkZ=vw))={|=}ɽ?=y~j?k<_ u%eњ}>Rǝ:q3}?K3=F";p~?=1;0?in=8;Տ?-=;?L<.;0?H<2;?ױz h?̦F>AT?H+ }>n3?O>Z>Zi#>oe>m> \;>%þ;yƕ>I uOq>;Fyq8>`|⨾C >6o s=VwT/}=&f{\}=)`}н۝@=b~A;yBq;d(<~㼗<;t:?#<|6;~<D4=<*<;<-F<+x }>':z?}?=é+;S~?^=x%;W7?3i=;?}(=wL;?.Ni?[=>fV?m(q>7?MД>&?h>o>o=o> y*_>¥n?Δ>>Wq>C9>^Ʋ >xMnz==v95ǂ={{4=5}·׽,B=I~_VC }>2 :_H}?ɯ=*;~?=&;=<??e=;?$=e;[?<;?l4vj?xl4>X?@%ؿg>X]:?MKf>  ?͗gq$>fk>cGpͯ>{ fa>Cӓ>{O p>@ X\:>W\?P >gmQ =Qv=;=z> Z=<} ޽sD=/~Y;kXr;a7<2FU<玼|;,E"_1}<>G|:M}? = ;~?=ހ;Y??Bb=ݤ;z?s "=;4?{?ƻ;d:?IQ;LZ8?n&:к? FG?Gn?#=K?T]9ۺe?"vv:?}>a;? |_<?tR2<?J8=?䢅e'=v=?^K=Ͼ~?28 r=}?,X=Kw|?uO= y?Dc =t?g>6k?[T,>Z? #e^>`=?HJy>?of͗>e> qV>tфdB>k?nFΒ>!گ fp>z>:>QQ[/ʶ& >l$=-uA\§=z=|ÇE=~z<~:4i~0.|>l8O}?=Qf:~?F#=F;@?ia=a;?ʝ =3;? !a?$2?fV=?R?ց:?ϩ;s??ڼ.4eA\?* U>#@?F>?#e_>e>טq>9IƑ>d "#%o>G:;V;>Y70&>Lk 0=JuUGon=2z&ą=|WBG=Q}Ѿ,<~b;O"=ܓ;q<h=2;=A:+y<=s$d~>S:|>ƀO}?=gr8~?=t:JA?a=g;? =]/;J?V<` ;5?b<?>x>>=cW?99=l~? hٽ%Y=1~?_}=|?9FF=Qsz?V=0u?hWn=Um?`]`y>]?MM>#B?ZGDr>?ci>p>q>.=0h>`L|ϐ>hi4'_o>g8!G;> X:a>jZ=tLnά=y2r=w||^`I=}ý=ײ~aZ<FS^;˩'U; ? $;K*<>!3;Y5J;R)<,M;<< /<<S<I<~@v<%x,|iL>TL}?ڎ=_~?=-!:=@?a=;?F =([O;H? <jJ;o?Εh?ػꈻ?Q#`Lh?\?mb:?9y ;~?oμ,)a<?e{ -_?G3F> E?Av{>?Hb>>5r>4l=0jҠ>szO>% _*n>(5Tk;>WVj$ľ>Ϻit=%= D;j=m;f=j ;m)7=Co3:a~WI>L|>KdRG}?=ɼ~?+=)=?c=Gt:c?r"=y:I?Oۃ`?d>>qSG??s> O?`>W>rR>8=l;3>cRh>>I@v~-D"n> 2~w;> T:Ⱦ>7hk=PsW=QGy2$n=|g O=}xzν=~,<"4tjg <.,;8 <>YP)<Y2;f;5IdMd2<;D<(@:|$>Z?}?`=ct~?OQ=:EC9?0g=%?X%=(9j? T=}b}?w9z=kE{?{}X=_@w?ruw{=po?(G} >a?$7>nuI?#=4l>!?-_>V>q>=n>LM]U%>a0m>/[p x;>R̾U>^gC=Z0s\=x(ԍ=W{* 2S=}}Eӽ#=}~1Ì<**'(l4N:|*>@һ6}?$=T~?< =x4?4j=@K?(=>:?r<Ą/?h<)$X?pY<ڛw?c ϒ=w?n8;=:gp?B^㾏>c?d0>TK?B;d>$?щ]>˗>mq}>K>b|o>>7Wԍ>3UNm>,| y;> QnAѾ>f] =rDan=xu+ა=ڸ{TX=^}VؽS =i~8Z<hq!<30i4+<˽A<A0R<ѡ@<Î 4;iMZ3;h|L>1 ^.}?ʍ=G}~?k=wq0?q%m=a?B+=f%?< "?f<9.?;-a?1;$i? 
vT?<]?:?м<?6<?{,jM<)?_=<?8a=11?^R$=~?ߦ;=}?T1![={?+sAv=%x?(D = q?|k߾e+=9d?çp)>mM?9\>-(?[m>>/;q >5>pB>G!kZ>&⾙Y6m> )k;>BOuվ=_>eu`y=!r%@f=axi/=I{ h_=@}ݽ%=V~ҨB`<U:vd8a<<%,**;I}>/(}?͏= x~?d= ,?9n=껩?-=)޻?ۀO?6T>5+?Y$Zא>n>2pz>3>qq> \n>Hپ9Jm>&6F*<>sMtپ:>>dK+=lqj=5xf2{|=\{yz g=R!}Ƚ὜O=A~8Ȱ<I{GY<~Gp;K-b;}m/>vTa!}?NQ=Ss~? =<(?fp=+s?^.=n&4?<2?e<Ia?Ol?_|ּm;?ʧB<?&+3z<:?YZD\/oQ?$y4@\L>@:.?fX^>k>79pϑ>H>\r>'^`ҋ>8JѾz;^tm>#̸K=>Kݾ>)c*&=q~n8=w5蛜=,{ s=_}wf&=,~/#<~yUA6{ <}0p>nבD}?H=;Km~?=- #?sdq=r?R/=?2o?s<;y|?5;B?c;.?';Lᶼu?.`V۾1?jW¼?ֳ ?FS?B₼R?+q?쯵s x?ټ94?L ;?+w<?Y\<?6M< T?e<~? ! =2 ~?~%r&=k|?c_T=>y?={"s?Ӿ;=6g?O u>OS?b82+jC>)1?Vv>%U>XoЊ>s\>Os6>ĽK`}Y>ԲȾ#6>]n>C tIK>bg=p]r/@=ww8 =|zS=*|q轮7=8~m~KE<}o>G}?=e~?=Y$?q=?/=ѵ?mO}Loo? Rܼ ػ?t:?a+<?W<?z5K<%a?%/<~?;콩<,~?%! =ՠ|?W^7=y?cwg=)s?Ͼ_=h? >x!U?/h9>T4?lTm><?n,>q>tt_>ɛbt>6y@p>=7pB>#Go>aAd=ov=0w<;=zۋ=|'M=}# =~|]ӌ<}y>6}?|=/tX~?=K?q=s?/=&?s6?S'Od>`8?!n>U>yu@9>Cghd4>ηB6s>HF>2#F2辻>E`>FoAzU=v=7e={~z=!|p콕j=} 鳽L+=~Ȃr =TܚAk=!=-b=h=oC =<=nL;98=?J<\6=^˕<$=kX<}M>ʪ(|?=&:A~?=)"~?Bq=, [?'/=a 0? @<%?<<+?r<3?~#{]?%Rb:Xw?ɪ;?Ў <)g~?M׹t<}?UFv9?7QӖY>%! ?Am$ԃ>> u>"f>pVDmZw>%yM>@D;2뾙&>m_UL#' > nvI}$=\FvS?N=+z1д=yL|=} U=j~`6=9"@605=wU4^D=5K=]>=m17=S}M=^; g=ݕI<}{d=T<Q=w"<"G=]E<F=f<[A=<+:=;h<3=]C =`-=+ua=y9%=f =*=9==CM~>$=|>\ec{|?Q= a~?b=g^~?8q=F\B/?:X/=l1]6d?e<`a}??P;:{|?Jʃ+x?k愽qu?[փJv?0>!,{?t?c?J?ۼe8)}?p99?'"ͼ?OO? ƒ^u? ?:߽x~?S\4&}?Q@.s8uOL> ?elg~><>*uЍ>jg>xFǫ}>sV>9PB1>Q^ a>|mn;>uC@$=(yK={ l=5j}Ug=q~a/s=@ =T4s=Fb`=Lb=I`D}=XbQ v=̺J=r;& =UD<=iU?=W=5~=O=&~6>J=|>-p|?==뙽}?e=/7w~?xp=~?@.=K?<漚&?y<\f,?[r<'*?AR%i>?M~a=>g?k~t>>*?v^><:hW> ̞Ge>@c>G@Z\/t'>lo>tA>^y+=h{΀,=|1=~[=~w:=~^=~9TX=~y|=7$ߥ=Y$F9= ;p=~a>=|7>ӽ{?X=Aӽ}?5B=HѽQ}?do=l8ѽ8~?|-=ҽk~?rA?j g>>Ryv> 1<٭iZ>IP>8 v> >T(X>BR[P#o@>Ȃką/>sAz!>Bx?c>z<>+|3=[}| =Y~O7x= :~ U3=0~]=&~ ==J~-=,l~7 =K~;=~< 8<=~\=?{B>[z?L =u{?m=8|?n=a|?,= 0.}?<?}?em?Ij W>>bvD>4%=h>j~>r1I>M [ >og;K'x>Y c>i؇T>rSAG>=cvGr :>x&*>szͯ;>|Bz>|4&>|? 
>| >|Ds>|.;>>5}5J>>|;$>|3<#>ٰ|'|ڮ<ă>|M<>|}v<>7} <>e"}?=v?>}\=>}՜=q>|D= > |S>Be >!z# End: Data binary 4 # End: Segment 3-3.11.1/test/testdata/oommf_ovf1_binary4.omf000066400000000000000000000072321503346766200207520ustar00rootroot00000000000000# OOMMF: rectangular mesh v1.0 # Segment count: 1 # Begin: Segment # Begin: Header # Title: /home/arne/wd/les/ingenieursproject/oommf/test.field0000-pass00000.omf # Desc: Field Index: 0 # Desc: Applied field (T): 0 0 0 # Desc: Iteration: 0 # Desc: Time (s): 0 # Desc: |m x h|: 0.39767182754665648 # Desc: User Comment: # meshtype: rectangular # meshunit: m # xbase: 1e-08 # ybase: 1e-08 # zbase: 1e-08 # xstepsize: 2e-08 # ystepsize: 2e-08 # zstepsize: 2e-08 # xnodes: 25 # ynodes: 10 # znodes: 1 # xmin: 0 # ymin: 0 # zmin: 0 # xmax: 5e-07 # ymax: 2e-07 # zmax: 2e-08 # valueunit: A/m # valuemultiplier: 800000 # ValueRangeMinMag: 1e-08 # ValueRangeMaxMag: 1 # End: Header # Begin: Data Binary 4 I8|-+=^'R6X2ȱ?#R?8Z> \Rޘg> Oƾ`yZ=Կx#>3??;*;0?zN3.?tH>\>?Do?=P??wjo >IZu!>X0C> ?ve>gbl> ޽|Y?gךֻ?[Ϊ8>PzUt?Kӿ/>bi >2خn/1#,>UQ>Sb?Znˏh4?V?v:νVbӞeNſG?ZѾ<ǿ&BL?e/>`߼>|">g¾Dk<)? FV!?_>g=E?t۾y3&%l>^ fyahR=PH =M>›fh_@_ЅU?%@g?:ofk6?0>?KV= ?q?Nf=O#?ܾ?RA>I?ql@0[>l> h8=Ѣ>޿|=Y=\?Yk?ltwA>Ժ˿OeB3%>ij ?1'y?'¦G9.?) 6|)9;b >06(?Yn?[Y6>X?Lv3 ~>bdݖ=?L_>̿&)0f"O<?JV>K"?7"m?s tX?틿d%WG|?oD1Z=L@?{K?G"?o>쒿a )%?I? c>>+Ŀ?hf~?cs>#?i@>ӱy?xʽiW4?:={?.P?Q>P?A>hR4^z,?a">۾da.$<>* ?3pϾ̿.'!ֿ:MT>?o >+οNa땿Jo>hm k>?$B? =?Yfi>لt> X? ? d>n>W{?w?-^>4Kÿ6El0=Ͳf?~F|>a>!?]?LMs=_=k?_;ԽiD?b>hA߾GDB>F!?&Yl>̍+?%Q?AE?!>%?HJwB?tqԻԹ£,?h:H@>od>4?_B3>/ cy$>?p>T?x]}?zGgv5>6=>??n*u?vfkR >9hm>ki2]?5y>2iڛ>e L>\L?ZF?>?'4;>D>V2p>͆?%"? &6>: 8@?o>? <@T?/2?8x=Q>$IFZ >y eh>= {?v=?Q|?(? ??:?A) Ծ%?$^qW1vf>G6=fb?tpa?ys>\ED=-m?x|=R~0&>q 2K2|W?>+[þ R;ۅ?ǿؿ?8m,?Hܾ]>@] V>j 3?g)ښ=E`i>Wifo ?U>?6@>͡?_?  }m>4׬,> ݿ2?4m7&#bD!?u=ϔ>? ?9k=`+?v>==m ?o&?![4W۾=L?cx:`~0FǾi?g?+B>nȿu5=|G`>7RQ? X>O?8>?@_?.I>pֿY,Դ8.y?sLyP6t >WQb(z=>4? 
>._?0D>冿?.&n>|?(,??&>Ὰ΋'j?f0߿o]s:M?+ >wJq>ɾ=(Tc;,?Xԫ>J>ܢb 2>>?$c'/??q=F ʿ7N?4|e(ο67>I>s?jy"󁴾}XX?Z?0>ȸ3?UŔx8>2B?$bX^?t>"2}z=ەG}?Zx>B>H*:?}Z>? 3^?0L`?lJ>>D>q0?dZ>$xI?Z>H|sfof>pKQ?x;?/{8}ӿ2?6"g-?vc3< ߾&>1>"rO+ >GHt1p ?a4?)?"a>˿ 很?J`?wMx\>C>0x!?to9p>W=i?&ʿ=>'R>t7ֿ(X?d'o?+<?dxh>T7?.PfP<ƿr >cNF\>.W߽3()b??ļ?kZd?\j>:>i UZRC!Fɾ,a?hΊ[WYh߾L!=Xb> />?I?5 A?T(>4?.>N>?N=?VOZs>5*z\{x?vW1>@WJ?rԾ4Xl>?>x-?6R7/\.pO9^iD? 6 # End: Data Binary 4 # End: Segment 3-3.11.1/test/testdata/oommf_ovf1_binary8.omf000066400000000000000000001666771503346766200210010ustar00rootroot00000000000000# OOMMF: rectangular mesh v1.0 # Segment count: 1 # Begin: Segment # Begin: Header # Title: Oxs_TimeDriver::Magnetization # Desc: Oxs vector field output # Desc: MIF source file: /home/syukri/workspace/oommf/std4.mif # Desc: Iteration: 5, State id: 20570 # Desc: Stage: 0, Stage iteration: 5 # Desc: Stage simulation time: 5.7528e-12 s # Desc: Total simulation time: 5.7528e-12 s # meshtype: rectangular # meshunit: m # xbase: 1e-08 # ybase: 1e-08 # zbase: 1e-08 # xstepsize: 2e-08 # ystepsize: 2e-08 # zstepsize: 2e-08 # xnodes: 100 # ynodes: 25 # znodes: 1 # xmin: 0 # ymin: 0 # zmin: 0 # xmax: 1.9999999999999999e-06 # ymax: 4.9999999999999998e-07 # zmax: 2e-08 # valueunit: A/m # valuemultiplier: 1 # ValueRangeMinMag: 799999.99999999977 # ValueRangeMaxMag: 800000.00000000023 # End: Header # Begin: Data Binary 8 B!w@!VL!.ښy@xS&$axBsD' 6@L+%v]>Do|k@H&)3}xG@9Zv&Cyl@~d&(n[,@sZ&y<ȃߊ@T@jl' 0kIJq@?']\ RF:@$ ' L[t@p"7(6_w2A1(fP۽dmJ(i+g@]2AO|_0/6(gc2X@Ԛqt셥 \ҝc(h2)b@vDf](hH@ohMvR(in,l@YaLѨ[ [V(i]l@b))HF(i@غl7D5L(ivQ@ܼAIU(iࢨTG{K^$[v(i*XZ6=;9Wo(i\/jUb=P"2b(iIo:/^s(i$޵=sX bl*(iӑP" ?(iOoHz@o |3(it;$T7T w?(iޫ vȷ$$!j(iv7?`SCw(iScV2(;nK(i&pZ԰(iۿЉ8&Y(iLu)!$g(iޘ jaH|$ZYY<(iDT2Vz4,|'mm(iݳb|V+`f۳(i@ &.$W(i}Qy42.v R(iڜ?ݑpΕ5m,R(i3.5"f59O(i߇xc}WW=Dt7(iWECZA((i"T{(}ED9e(igzƱKU#I(i>}6NnA(i$Լ@W>Մn[Xܨ(iОKN@uE2w+cOf(iXSL@ӿU(i߉{:@Qe@6T(i8.@ O]vtAƭ(iυd@O-P(ik@0(inq#-@x>ۑ3 (ie@myݥd(i I@y"m(i4'@0:Qgqd(i@X(7:/(ic]@3~}FSO(i-B|g@Z 
,|,(h^0@WQˏ(h@o[a#ν(hWAǪ @'hϳ U W(h#@f6}hə(gI醾@ &G`(h`@@`2_DK(h7]@՝d!(h᠆@g>-s ؾ((ip@ȔVd2XlfF(if@# tA(h(i\4@b ")n(iaU2{gˤ4(f )k)'dّ(][)(JvRl(:a>8!>7@ l!-'8 W0P,$e"Reà "@mRp@[:Gl( |?@ݻA$Ү+gz@]A'TYM!Ot@W A(;0glfŽ-@gvEA(\A`@pjFA(h;rl.g+@ՂF`A(h~Ts@Lls@z>A(\L@Ј9Գ@\A(D)B@9en@LA('ٻ8r@W8uT@&MGA( s A"@/GlNA' IEuAhWOoA'x6A ^a;^A'0LAtA&r3A1X5 Q6A${\AUSJD6A0(isA$'r5ҳ2Nywܺ[$u&@ӈY)!I!V5!;s.@ʝ#c*"#4}i$# '@ 8$Jԡ$.kA@řzU9_$SD`ᗩ}@6_cM%V,|s@K C%2M،4F@N+G%שeh/@%Ia&ptD};Ì̚@#l'De eE @æH,(—0W|@4z(h\'>Ve0V(ey#@Fnj:Sd(ivEV2D$(i#WH8@(i}ш6lL *^(iqqڠP*q(ikb 'Tp=(iT_%]"X (iA( q3]fV(iߢ;Bb~(i64UhhyS(iG~[ZOm잕(i6]ht;_ɕ(iɇyJǒ{S!(ivcha^0nrO(iUi;@Ff”v$I(iw@s'.0@(i‚@XN0(i$@>R(iݨ N@뻲7ak(iZi@k\Ni욋zt(iױ1@aO'(iܡ@oe6c(iȐR@ׁƳ(i[@ 9{Ftu(ig%@FyzN(i~J 5@0Uz>(iH3k(@9id/f0(hq@!]x#Zܐ(b(h]2@ύǣ"{ٲ (h:T@Қq"Dkn ](gv@UBL}~5!(f@״4%(fѵt@q(G J(fJ&@FD(9QW(f`c?@'"w(f1 @@Th3ۄݟ@(gs(@(V]`Dm(hi@Z!XsN(i ax@wˇ)p(ii$L{(geIS1,#NVgbq(\wz(/X(3NJje 'y1 Iiٳ+)e#d#:@5h_ aq&@K@!rzD@;섾w(dܹ@'RA۟K%oH|@ơ}RA$ wX@۟CyA'.e :52qNf@͚זeg&A(/Um>_@÷ў9A(_ɤ> u@嚀1żA(hP"@˩wm@L0A(O(@~C@9fpA(" !@Q(nR:@drwA'֗}amAkCYoIA' \nA d-]\3=AsA'P<A W2-^BA'^A.'dx/A&'zA"夗E!D2A&&l&AtV8YA$yЍ1AoA!¯j7A `$Ӟ"DAA%򴽛 ?2'AoA'ӣ#B3DLka71J%w:h@RXBxx#\  @{tD_ Ϝ!)q6@︱w"L4-1! 
(*A!@Wo#ÿW9#H@ȕ#EA$C@π$!ds~*@ƐHU$N'I#2x@ĝ՜%ϫŨ}h@ D&& k@oS:(O؈N,z^q(if\%fK%(J,(]Px @褘r,z%< ĵ(ZKG@`C/zw (^@-5aM _(cJ@ŷpdg~/K(g s@X{R (h9@Xdלfk(in"[ɂ@N_#1t(i"l@fha`y+(itU@2)bWQ(i:ld/My(i}Bz@E#(i|\Qp.Bc2(i=Sf(ib˹]JIXL8(ik!Iܾ{0(iL\e xKq#X(ifGpMy(i׿Ű%Q/(iؓH"w(V#F5#"(i]Zr B(i6@^B\n(ił3+uWrb>(ic1<"<.fg (iODcj(i}^8n2)(iX /ڀr/r(iwu #~"vʨw}(i릆+X{zm(iZ8(Yg$õ(i¶FYID(i$u W;(i|9ViMuk,(iC ;Q])r(isQKЖ '̋(i4z1e;%9:(iG4i)lP[(i9w@CeYS(im@s :^K`6(i߄*@K}t*e(iޱf>@9Zna(i^:e@!%]|(iG,@,H ɱ(i^@n-Э(ie@<|Hy]7(iʝ@"=7;D#(io@4:{Bb,Ep<(i=X@tԑw\/J(ieF@5ABш0(iQRh@Ĉےf} =(i.@{|b (hx@e'L(g9@U͹4`Ar(g%ߙ~m@iъ; h(fEm8 @X`;W1w(ei@U4g`ê;(d7@|Eu dB(dmP@7Nr)=0˿(dIA=)@ЏFPp(em@Gza](fȒ+T@din dDr(htz2n@|Nz6:a(iņ@}@۾hA (o'ֳ^Y@YhUAכT$bqlY@f2A$㧢Ķ:Zȝ@58-^A'uRG E>t@)W A(;b_=d*@7yA(im@7@G9A(=al0@K'p@8lTA'P}?Asss VA'O銝A [#(pXA&LmQvAwaAyEARA&Bt%A tf.&[A%nA+Lf`A$ݻ6AJ\= WlA",Am]neФ0yEAM A#6!ޥ2rJVAN|a@A&jg]v!!AH~0A'._PT@mpIA(L*"<2aha&My@֕O˸$l@\Y)J3 #Ae1Q_@b/Xƪ cL)"f@c!T!0WS[AK@? K"Ǭ= s.I )@]^" cA<@x(#' .w>J`R@3q$htWlJi@qp&K !@Ks(-xw_xg(.(gQzp@ͼ fV;JFm(Q0X@U1Riov((N+3@cpv:(W_/¹'@ m a# (_@A j(dJ@bH6;q(g X@ԴBɡ;UJ{(hg$@ n>!N(i#I@*ٴ4 ck(i @JEiɪ(iݼ|@Q}!UIvx9!(i-)⌧t~(ijTAKW?9 b(iڻnNV@w^&r(iIrwl=F`B(iIe+h= (iHL]XΔALfgF(iد&g Lu Ci((iE|YV m'0J#(ijFs&Sp/((iډDVGt%s(ibl(v7xg&(i۩g-!dt|emL(i'pJP:Qz(iܝ*5iF"[AGet(i :wlC(ivɱ/5K'}-(i,l5Z lYW(i=7XF}>((iޚ%y3ɿA˜hu (bTN@ M(c_O@0/bq(dMk@eC@qȪ(f_  @ZiGx(hX@͚@Ra30w(i׌SS"}XU(d'tRhf(H, ^Iރ"Og'tT! 
xU..<%wR+͂f@6#",*@ֵANp&u3@%$;|c'I@6:7nm(_E#@02~@h+ (Fn@}YA '?E^ %@?9tAǿ#c@Pfi%A%#c!@| DA'q-#@@EA(e h|Z+@>A(!s@D#M&q0A'PA ׏4ĮѵA&_6AbkF3A%gT8 A{`u.2N&A$F:C@A1]WǍ6Y43A"LAlEksA(R?X3"/sTa@SA(bDblgx&( &R~K@y|roݳ %'@ոEvq>$}8@)PyOfԍ#4"#@f"D+`֧F"g8@ѷ1 x Dp+!/q@б)8rY!{ZC_!|ׁ@ Tد"'S0^1@Ub$N_{~I,.@Ņ 3Щ&а^`WI1^@P\(H@(Ġ(M(^ m@Ea(B 5 t@LƬt(AyUU7@#P(Nn@:oJƣj}(ZY]K@i Jc^(b/j@oz8,k(fD7#@Y)Rs (hZ U@a,`(iLVT@тN@8q(i t@~lb/d(iդTRF@#Mf,g_P(i1!ѥ@K!vsm(iz/䘑]5(iݑ%kt| <{햺(i䱘Z(E휨-(i:fP ")(iڵx[OKZ,6_(i6knF)3(i,hl!ݱbI(i۔bpASL1c(i~(iR@sؙ;Gv(i`@f Hv%(ip@2*Xz|'(hTȗ@ϬlK&Q{(gvk@"o:(f@cEN/ź (et@/E$(cG=@^ȯ{$(bwyju@ꓤx_~w>(a\#@I!t fH(`O@b˷abM(a]wY@Y\#C+(c`O@i?ZMf(f~zu@ق愤-; z(iZ]b1@ :tJr&(hSZ4[=-0M q(WfK@Z,O(VjydAxBo&uv@ND1u"7g tKm@'xDZl6B0)=%,^$m@V+ia'@_m6's}@}PƢLب(Gz@ںi2@^v_(ewGS@ۺB@(&Ԯ@!nA- ||K&E#@鯲R_A57Bb"#d@IA/A%̞d/@L_A(5'y@$Mq@ɜ ]sA'vA s'>CL[A&A(pA/CWڜ#+A$}nA 69ƯVfA"ZkЁA]2qA 9=n2A"<pzA;ِe,A$pg_բ ZiAWF!A&pT{JFA,x3PA'íaq'1œ@}&^A(9R?Ni1y@'RA(];DwHϛw@MXA(enr5O+jZ2.$X8A(f5&B kq)'7o @?8&G˚Ta@ּPZf9%]~@zh=<_ Bw+$tX@Py q$#5: @:@0ʍ"n:Z=@@Rz 80!]t@ЧnK"5]s Au@kO9$| 6@DկK'R;bp&@źw (_i掟!} (Qw腞@/:y a]?(1;N@0 @(4VH@sq(D@1fZ7(TZs*4@$RP( `Hy5(^@6t3 <|Ѵ(dS@~?_(gG@*KPsS³v(h j@*yhQ&(i6@+j[3)(iƉ3@xpj<(iqv@QzT2ϧ(iߠ<@kw9`jiU(ik虫8nZZ(iG" !(:X&tu(ihpy[0y(iMxTG4(i*.Ȩm:(i*655Dai@(i3hnچxR[vj(iin5Bv08(iݟ$( , ϋ(i~x*ΌQ@ݐ(ijsQ+*~v(i3ih'6(icUb4TO"'Đ(iޓ1n TV駯(iĪNIgH u(iL:1&3(i'cFQ3&(iWzr> W(i߄Ld-ĘLcI(i߫J? 
ni(iͯy̆öaT(iK1xo-+4=k\(i%GFP{BMKʆ@(i߻3=@aaSEp zv(i{Z@wjKFp(i /a@]1ZG(iR2 @%Y{.(i3t@~0Uw](ivA@Kh3 ?Wt(iA@;7 4_GB(ir=7@5 D5=x(iwX@QT(i|@jjv,(iz@޽ޤ:q(i9@}VM(iVs@@U;m8L~(hG@ lOb .(hi@еM%JFDZ(g,@Zxd(fT˄8@4Sr`(dlOS@lp-c&(bK@JXjT Ԋ(a0.@|ʌV|濩s(_L$3@ D(_L@B?QRyl(`eH@$\(cj m@ᨒ'2E?2t(g0YS@ֹ n/ŧ%(iދ@Z?;](cAC.XO ?F|̊(2{ǒP!@-I"L( @ &w%,(Iby@ZG(fy@ۄS@ͬ(6gAZ@^:uAOZ!&=@fBwcA:"oEm@eHYA&{r@[A'GɐAsGǎfA#JAF̃gQA >k[A"70uEqlapAoA$bՐgA^aTA&үJ> fA,YfA'[u#]5}@>&NA(2;WV}OJ@NdVA(\ TXR@sf A(ev]w`݂A(ee\eTtNA(d&103h0)Hyy*A(c<= <'x.<=i@ؖmq).r& _@vS/H&|>%A;@ghW%9!N @U;far$v@;HJ' q#UY@mn 썌"]VFBR@G"VQ"1a g" @m$O$˖LvC-@П'!?{ &p C@v}oC(iY?{ccCVt& (AIuA@>w+ٸ("Uid@h-R#+(']3@[/JI(;tw@$bpyt,;(M/%b@btӬ([z@؁>=,m (b@3:u7(fDG@@Ā V(s(hx C@b:AI[:(iTة<@K @y9(iT@QR(i'@vEC̓*(i$,Vl@t (iߨtQa@`JN vixGO(iߜߑ7U~3BV(i"]O3ؠ-بn(iށdXM%R(iއxkSnܴr 8(iޕa?,s1(iޥ3A@(i޶­D]}(iȊmM _v4`!(i۫oIJLښb#VI(i]]` u,T(ibI6(i"LIds^4(i?J2Oo̐>!J(i]85luQ(i{IpLR'(iߘ-u#tmd8(i߲Q.}A q᫑(i. u.mh(i͗hݭcs(iƜ8Q;< 5(iߩR@czۺ5(im˭@w&(4Q7\(i@%EZtw(i^5j@Z`3 b(iW@$S5w:(i۽]Q@cTd?<'l(i3.W@첫T 2(i%@u2"/61Q(iTtP@Rs%(i@ =ww}(i4~e@tNDbt(i:M@ ÕЁ4'(iRs@`ٲ)AͣrM(h @ʥu2Y2n}΀(hQY⻷@@X<= !(g[`M@Xa*y~_(eY3&@aJ=bA("WS(*@9MlA([SC8{<@a3A(f6eب3A(b\õ۝)8|A(]PۨAN%wbA(Z$l/ە"td:A(Y2~[vǾ|f f4"'|n@r"\'%1&X]M@=-?S{k%+ejV@63⯀FJə$Q2@ԟy9q-#t@mMw"c,@{ap"IZ + @y %_;@µ r%-'?@yx(fde7|@ OBĴV(/2@xIK)4g<<(O'LQAr¶5D("@x@<"Zؑn(1[@^:(Gk*@eG:#(WJljH@*G(H+[f)(`lªyL@_&r(eoSS@@*ze} (g]}@_~xbQp(i[ñ@lܹ_(i@_B&Շ5SW(i`a@̠gtw<,gQ(i׎O@@G"-m&(iC@E6f(i߂H%@p,[Rm6)(i߾^iQc"Ӣģ(iߦ.Y|2 ʚ:(i߈$MjXWŷ 5(it *@ʺjCKl5(ihG @ ,Ld F(ia={FFbpSֻ(i\sf \+(iYnəfHY(iWWmqAVչ|(iXqwC<֑(iZ۷[r.Yc:(i`5qM}b,(iiʔAg(iu1}>e2(i߄QQj9y'È:^(iߕEP{ Ťi(iߦ~dr-EZ,h(i߼g5OCOI(iߣN7@as]~(iqp@t/x䖃(i@$7p(iޏo@&U-@`(iݰsn@](7}(iM@9+u, C(i~@8ySl(i]@=Ϻ2C:Bi(i_@DS>N1 (i4@M쯴s\(i7@cW|s\U(i1(`@ty(iT=D@<0B Ⱥn(h@h-i(hFAo@~'HLT(g=^u@|Ŧ5RK%(eU@9IqtS((cv@._K cZ(atj;@+Dqn ߞ(_Bje@槤g8"E(]vN@!ڼ&=Z(]Y@jIhtiM(`Tג@z4 N$(d @sMQB L(i#@jSpBB(d@BJ_ĥ:2GHq(*ׇXjOj'g=-}@r$'[~ Pף@Ƚ}hW8W A96f"4.@jƚ$|K:@(9T%@պ%SxC7&3w@ָ旟&$Ň@S. 
&t}lM&8@׳JBU$;H7&A+;@,@0&{s@.{a.t&š@ح-WZB}&kK@V?V02Rc%?CD@ݔ06" iNC@ψ8Z%lPABnG/gʆA%IW❶#Y q:!A'AԨ/bYߑs{냷A'a^Eت؎?A'ۚk2/.8iY\A(''c!hA(! rvYiA(,c-^SۻW(}T?A(6-ۣI+YA(AEooۃSw[F#A(L%MJ\2|axzA(W;J+ zh'Өh@ c_ kh;'0aC@>~ԟ@ƣ?&7H:2@]a j%力z;@Vqqo橹%th5@ v_xD͚R<$ʔ@ӑδ&  C"h8O@WA!"͔tB!@b~@˴%s&"Fa@Ye(7+(Z`'T@`iΎ}( F,2@^e/蜠 (A SpDҏЙ(IAN ۱-()pL@sO|(AX!@&+Z{u$U(SP&@'́=;(^@Ѓqs (d3~υ@bo4-(gIS)@Jd7AH(h@87\%N^(iiĒZE@D'Mz+Fx(i@F-M(iΡ_@~jV(iٴ_@@a I_")(iݶF@ #9 ~fD(i#qx@jaG(iߌ@i=j[GIi"9(i߰ @)9Nd(i߹ ^d׵ (iߺ_jYx*asԉПJ(i߸\Tqfb)$(i߳WtG(&-(i߮kxĎA7R(iߨQzͶ ̃M (iߢb\} @y.|*(iߝjh~2*c,iWq(iߚ-1y7t~ɇ7K(iߙAe Lh(iߚp7Ƀdf(iߟR~T|2!(iߥ_|)9 `/(i߭vylP(iߵeu AMOM(i߻Gp-1f[(i߽cӛՄ((i߸)p E6;8ٓd(iߦL։@WZ]R(i߁0@o'(iA@{MY8꧍g(i֘@0Jzpv(i'@:c!GTFN(i @0Q@S(i-l6@ A]e(ir@ ?+Gw|(isd@x|F I(iȇ;@![CkeU r(ia@8W3 aF6(i@M@0, (i[Z8]@ý̺Y ;jp(hf@lB(hGkjo@w->IXPf(g3G@֨4Һ"+(eM@ܪ4txJPXŵ(c|@y(a.w@W3vQsL(^@f]G~ThVF%or@ՕUEe'M2-%aJ@ݡ~z 7L%}@&3P^ %p!vp@ձRwUb7 M%Kf@7݆Ow$Zw@Rx!;88"]"N@ҀZfOI"=3Gx 9 C1@ ^Z&R-:ok k'ɠA/N"1]qARӬYr,OkX)A#mIgԳ|U+2A%n8O>k7}A&那rB&6 J9=H9~ߡ("x7,s@`G~4Hz(;տ@Yn)3(OC;MwN@_?Kv([ʧ=@*.eƓ"(b@s@X'"I(f;@+w$V (hf-@!z0K.(i;ɤ@OR dyOï(i~f7@ru} @7V!E(gN]#֌ 6; c9(7_6=I~EV='J{_~Ɗ F€@e0+A$ԾhGY9:M}@'`<!! 
r@ΦRUOkE"&@ѵܔRat#ɚ(@#F'Lٯ[s$Kp@2 ${@Ȇ@0FZ??$rcQ@& óV\&X8($6?Uވ@O>+k#Ֆ@*-8kb:>;"XQ`@lJDc aɡ!|I@luѽ\"ݍ1:2@ˀ(4B=$۠m(^} @X''YfMK Ʌd5@Gx#(P8J&6w@^2oh%tFAM] !!,A oܷ)AA$3WHWErH=;Ҵ A%Zrƞx?cvA&@) =~ A'-F,0A] ܍A'MxޕۄIer >A']ێۑS]j{A'k>ۇޱrQ+A("H`Yԩ0[A(BۻEz@YB֩{9T'8@S#>M fԦ'X͉@|ے^YN)=&3 c@ן&p?& H=G@օ*Tnrd%,Ne@컺#cN@#¸c!}2^M!j!@,J$S'v,@/xW'i^'6zw@5(iJo'NTt3(:wHT{@fPm5( e7^!AV.)EߑL'A_^xiÇ (e:zAu$ xi_j(18+@u';H]'2R(7Y-Y@EЮsџ_(K^_@Zd $(Y@[x6?ߘ(aw(@|e1(e=7 @۹p#ہ2(h@0f`zu(ih@KE|Ssug(i~0@3y˽ 0k(i}@~RV N(i40@r[dhv(iԪ@aA(vq?C?(iىRО@ouP%&(ihj@N}7Y^rW&5(i@y]@ʶzy(i!I@<[h؍(iތ w@'=˚(i3#@e*p/Bqؾ*(i߳A]q>˨u5(i߳Bbq'>ӸiM(iߴ]oi6oԌJa(i߶,(jq@D@\(i߶EcJ.ې+(i߳'S:4\OڱzKN(iߩ)@3Ah)ַ(iߔh(@bbO⎋2U(il&@r$#'"l37(i@]lO(iޓpa@/<.Ш(iݍt@;͓=cz(i۞gn@w/~g5ӓ(izC@!vϣ_.+y(iЧVJ@s`9XP&+(i…@+Q*3d9(i)e+@m* V (is3̢O@.PhQ(iO@Ș7=܀=AT(hk%G@Щk#p/b~`z(gV%w@JKO^_2A(e~М@h!ktϗlH(cn6ar@Ҵ;~ w-B(`Q/@1)oqG(^f+ͤk@;Ϊ@ad ,#4(]YC@gV_^(^l:-@Π$$0(c @} (i6C@o-Aq,5g(_@gv; ё ': s|&NZ@uPv$N Z 2k_N@z2j!< *] ܗy@ζw ̰ C j"a3b,@5hDrhb#`@5>vK^7}0#a*@ҞtA\qѪ#_u@ҞFK3M##ԉ~@I{kNJ" YӜ@Ѥ,( QWǦ!{@Ч/!+" r9@|]("%T9?@ʋR$nT^@0 (&"UC],@~'p#M A=s@BZ(h,@ΥI܎oƜ2'58dAHXpz{n$rAʆy-"dhh Wx_8iA"v~lqwA$Lٌu,nm=x0{A%zx i )tA&eo%2r$2<+A'.xS DvRqY ~vtCxA'C`xPYA'X~wECuA( f|CZd-ݑ*= A(7SG>jdT'܁ 8@Ymg $X']9@؁$le|&v[@לCO*Rۘ%M_@oWtZJ'$>],@㇊H _Q`A#V ?@қp" p@D >%bEGf*ױ[@Cÿn:(ơ8bVZW4?ZU7(d5a@5:)T^(,pQ@/0QD:(HA{uzӦ'WJA~TM(IA.} QÙ(74@(}>xvd (3\Yʿ@x3DR(H;@㰭ψ/'D(Wza}@۝-(`Tͅ+@zgQqm(e)<@"ӱv KM(g7@ ?2MC(h?@nl+ZD(i^ @zn$8{(i= @ ^X Q(i@?!AWD)(i̓f1@f5\Wh(iԫ @Д*ְ?p(ix1R@e+(iڭQa@Az/(iՕ@ͯ`tBvG(i5"Z@%Q+$ bSa(i߲opfw4 (i߱0q#nO@T(i߲al!pHo ԛ?6J(i߳vmMGnՌ.(iߴih}{m!A[m(i߳<`|B"mnKE(i߮VE#:=ܗf(iߟ <@S=Vx]ofK(i{F@nWղQ(i.Z@}Xkq(iލd7@` x(i;7@(}- Kb(i{/%@Ib;(i@1D?.TM(iU@ߢlW(iE@LfYΙk(i@|3S(i*XJ@*0%9{tã/(hd@SPϯВ(gn>@X2:*ZwHT(e)@ۻz+@ft(cԋ@W* (aejF@] tV(^g"@;FK&s(] _@F<~ w(`@@呃[ (ej4@ۀm'=O5(i5tYײ2KN؉(MGN zB㮝'G4@!%B2"9@ST#JV\s@#&!̻M {@#1nȏ !ϣI@sN8 0_4;"DR\-@r &/"X{2h@2· OcY"(ߺ@ NJ u&'_!Ag c@^Zv!k6p z!h@},C (i<#@Fdf((i/ob@8I&7V(i!@6L(iZ@fcy'9+(iΕ]@PAnX) i(i/@{1BVi(iONs@!X2 l&(iـV@B#%V$f(iV$@F \^c(iFKOG@zf d 
(i4w@S7#bMR(iҥ@"(i~%@@B1{ͧG(iB@YFլ[(i89q@{{G!] ѥ(ina@rMe}kJ(iߑ@czarVL7I(iߦx3@BEj)2qL(i߱ *P{jۂE(iߴb/- ؍*ި(iߴ%;k.'*v(i߱mmq,4/X"&(i߮sh-#`(i߬VqtY2-ZR((i߫yݨuSݱ 1L.K(i߬Ot&ESҠN@(i߮;Ur'tӹU(i߱o3՟]D(i߳re ?-P[؍Y\s(i߭@@PS%hf.wv)}@>Ed#w ~tCb@,Eh"#,S V8@ m^!p G#!e{@VZ!4D!!P @Ϟg{2!Lq!8 ~Z@]w;!|^] @n{Sw"(p Oj@="jű@|2i##ZXĸT@L-]$w:MEE=L@PԦ%e o'N"@;W.&Xpss6@o z'?= @])@*qޓL'cmVJ@M .!f*(c!rp_EJC(4%L(iߖ} ,I(iߕ~YǍ.ۺ(iߘc}i]0ε!t(iߟZzߌ`l?b(iߩqtuWF.@zi(i߲IkX&L0@(iߨPEP(dޟZo(i^PTt@tԾ@9wlG!(iau0@w?Jmo}E(iۥf@QrHJ{ư(iF]@`>'h@F /) 'Jَ֍@Q"BnHab&<X\@1Yq1Ldŗ~%yBn4@Վ 9#ݦ> @,?" KOj \$T@ڴ^=B&C v~ȉ;@n$@.(%)(ЫQ (g}@ct*Jב(@0@^6 d*Ƚ(:M-t-@6^DŽe(^A'oRÄpD5Ǥ'\_cA./$(ľAJ]O^u(\X@ 8ltϢ(/:k@yΏ r49u(D%:x@_ZgiBRYD0(Sl#m@|pa/90;\(]os9`@c N(cPuw@˙t44s +(f^v@vբr&du(h%KH@0Pc(r(h{`G@Bmvg1(i_@cb\5۱q2(i @Eԫ@Kɥ(iw@O=n1#(i;@Ro:/(i@>7 kp(i|2$@FIJVh:*(i@}VE(iRW@S,FN27mp'(ik2@{連"*8P(iI4@T塰3\~(iU@? ̓ (i@6l `u;(iј@}E__(i|@ugt?(i?o@8oJ(iO; @x 3ԥ@(i߆Ԙ@jjH(iߦY@IށuѦ(iߵ+W[mh(i߶|kJ#Bh\*p(i߯ktPڔьW(iߡ(z㩛6v(iߑu`-J(i߀"Jrt +(ipJ2>Z(idB^F#B{(i\{šEκ(i]O(-HB|(iha=dQ(i߀k] L?N (iߡ|&yFqY)<ݣ(i߲ aUٓudT(ig\@s @?E3v^b(i@2vn=(iم9Q@~E %z(i;%@XoW[)vL(iW0@.ro`D(iz^O@7gl(i ޿@#{o'(h@T@џu]Q>(fPc@װs$)D)?Ջq(e ֨@X`(b0u@MfWH(s(aU)@Pnxh `(aۣҊ@\SGp(eOk@&l~0/6(iaUc`(T Jz!j'#I @=ܿ&wIYe@H-%W}RGfg6@56&(#1(CNU<^@yY/\j#@N&PL+I@ɚ^5d# Ly$nkT@Isz# *Tni@־A#i$miy@ M:w3#r( z\,@ǘni%$SΖ D~)$@ſƤ$NNp"@Ï>O%uw bG@ d& ĥ1$@[Y+1&|`@"Ô@h*Z'4]#bXPE@U;%''[žbqy͂2@FL >('bP(&9X)@,E8')Aӛ545.^$ @}C-% XX@a:&LsS9@ 6>F&2뿭ek@ҡP'7y2Ɨ0@ J=gW'v~eL 0"@bC_>'&v dЈ@u(#va@*D](V$zcyC5(ig?,o[&j(P7@v j#(j I'BA5SX S&WJt!Az&A/ _#Ee2qARO3 żtyA"Tmmعof wA% O'܌͂awA&Kib9ڬ\wA'۪)$PVMd-A'(R9#_;+A(hF`ʺz'J@}b-9'!ה@\@ &r&%X@zi>P*$~@Ce0!9Ӿ!J.M@q%XͶ҈D%p*@a)'xDZgDG(iK4WZņu>q(V:V@xӦ{=(5bN@=bt{(Zo@8H0x(f"m@?Af`(gƨ@&(>(hd~@Mzm(i6)@ZܷCg݌Y/(iw@um~P(i|ك@(EϚ,(iDkJ@^'VhW,$(i6@ԕ!C O(i9@r@8(i@oF]?(it@Zc.L{qxK(ii!@#]dt@V>K^2| (i߽vGZK)H 'vXv(i߻ (Bqۮջl(iߦm^|5$o(i߃{(iWL6I;-&wX(i#W 5VyJx(iӅٳQ7(iް <& `N{(ixLxS\1m&d(iH)%?3l(i'LƽYĆh(i$Fi^w(iLZvט?(iޭɬ4sNL(i@ԕS(iߵ qS' mR(i"9@O\֤ U(iʲoB@=cɹnI4(iXT@j{L\$1+(iR@xt8Lwv6(ibC @'{5S2c_g(h@ 
Л%m>-B(gRԡ@~; UY(fm^X@w g5(d0@徥(k(dCS@?Ϗƅ(eOWG7@ݨԬׯ (i &@ ;+,`%(ex$)І?Or(2CHu,oW'.E O#D6@Yl~&f5l0@;t%bGte@C!$w pL:@#db$Vijy`@J$F~O-@u}4$ČML9T#@iA#ENF:ac@@~ń' cM_Hzn@q3EMi (Jʏd8s|`(h{/@ҁtݫ퇔&(P3j'@D0~P(5]@"kN:3(!^f@}5s(|@B]ˆK0p('@¥%3*(qD@aMT/D((7c@-=ŢXQy(7wQg@3\7(Gh,U@@m1Vo(T @B=/ pXAh(\H'Q@9ŨT(b@阶b8^(e@۞YRoX-(g@i5.S(hR=zj@ΐ}0v܅0(i.{RJI"(iCy8-L \(iy>ޜteJ%(i/S))6D+gv(ikpV[Z(i $ (<(i`VI`nnG (iǩd?U=q(iPY  3Й(i߾Ci)ꊿ(iHH@:$ya#(irP@1#+b3拑S(i}m3@#j}(i@\4TE(im@ z}~Ϩ(h[ֻ@O59gʒ(g=_H t@…֍9%>EemX@so8%tMUL#J@(%EA[]@?-&R-p0 @b&Ydi/L@`oJ&`;'yL@6 &vL@EE'En5G .{@sи{'c| @ʅ'lHxFyyD@5?(q|9F43A[<(4\;J07b ! Yr(TC$Bl{dKo(fS;x4'>i(h:_&@0ep%ooS(P)@t)hlv=']AX.L:&Ѧ5AW6ʒ %#l,OA` 2h9xU= $A" VD^.A%6S,#4c3AsjA&|ھ- (/ mA'd.^놮G$A'%2̮Ҽv'Sn@ع}&O&:@(L@ȫǢ*(8f @q 6((xZ)@gfr( @Z_~ n[P(`[@v#P($N2@µj (/h8@%941x( VGZS(J1zh@H(Ui@z4vlQ(]1 -@iⅭB(bЋ@vꦕu(eR@ۓg3i7e@(g @ԙy x)(hs~@*$&3(i -C@RE? (ib/"@*Nf S*xŲ(i"@U\:|Xq(iWr0@=C鿿jȇ~(i4x@^3@c+ [!ӳ(ix;|@۫w`(i@P>4F(i߉U@r<(iF@Bk)W(iOhHޏq(i߻mz< Em(iߍiI' |(iHgxA_(i3uƫ(jo(iމmywp G-։(iQ#O(iݕ;5cT}"(i{ $^sH(i܇1P1H(i < jКdAr6(iۧh)l'(izn(iݍzOOh =(ixcH(i߻<-@*qyrwА(i~@~V@ x (iL@lAS1RM7(i-#\@sē)@D(i[-(@"I38=(hʵ^K@̶ƕsIL/(g3i@=&*EV.(g[M@ctN Wy T(f` @]3 @EZuR(hA]oP@іwK8aF>(i 4؞EfQ?([%Ft͂ ~(6<(}E=,zu''k6v0 @%g6xI&{O8@0& @v ~@LS%ƛ{na@m|L%%}64\@%91βWC9@h@n& xrd@GJ&JNA,&@Ye&iF,`-@?U8&}?@8 nҽ/' \V!#@"@'Hp _ Y@w'd7 9-@&V'z;S@Y܏'jY ; @7T-e(,X3O|: ((;)~hd,%?$(Tݖ5Ւ5\`(do+bM^D%J(itc @",= d(aL g@[{vUr*(9d]@Ar\G i'VcAQYQ,;v& qYhAx|jy/o"VA W桻n A$׋ @-]CA&$^ڂ2 3 bA'8dCrLBfA'׏gEkf xl;'y[@[_A&lGK@֦6x9a2$h"@Գιr#7 :3b 05@<\' D .@1yB(EOn )}mQ(iߐ@5p'Ed=3^(^^i @ (ײ.gm(L5@YB^t(>8(iG@~ h=c(iV,^@|jmT,(iCi@O>ŦYA]9(i LQ@BC2"0,+(i'7E7@^'!Fns(iځ_j@}YX/(iM,@ٽ?.(iݤ۽@Jz씺t凉(iޛ@@J@ k&(iA߃@BMfs{#``(iߦ@m[[$'١U(i!?BTwIrT(iqq-!D(i߷c(g+./@zȦIñ (gT<@'=(;v(i7v2ED@G#~|?(i8.T8qjC0 '(V4`%3_ [W( jjd `vL&V'xŔn @=l^G&~4F,G&@zR6<&pB4=MA@G3&D̃ O@lJˌ&Gb4[{@? 
&fT<jV@&-ù@=0&Ç" s9@x&\*G?:@s'(p啪"@Π,۹'Z>)w {ni@ jR' 1NI8@r'bd1M-Ωޱ@ S"'ᤛR?Iy1]@`(*bA>FZN'((F%?PAgpH(BuV>5$;7l(V?9}E.#B*(c*mF+m@qK(iipꋴXC+Bv"ԅ(g U@onK(Wk:@uO9IC}(Bk@GKsyҶ'_ A 2^ƆQ)c%bbA2oѦqQ stϝA"D d~CGYlz%7aA%Z@GAږqȧI9A&=/ڪ R)v%|A'X"G* *'`E@׹'<H H<%(*/@DN^ &j!>)}@^,>NX%\-sq@'W…p V@ ңm&(cDVF ;TH_}Aa'j(gL@Yv?]BLi([E C@F6)~)C(N$˸ s@S{դ7(B/@=+(:E%@h#@ O(6 ^@Hc(5ͱ@j@%m]#(9h@Jң?|z3(?vi@݋ (Hci@ LC?ZnI(QH%@j߉E6AIP(Y%H@iyz(_H @8c6,(c4Ƌ@$(!qiփ^(f[ޢy@2fg(g҉w@S`Itl(h$@ΘxMʴ:b@(i @]K7v7(i`^r@`5竓#T(i;@(K,(i@WՂaq-o(ifU9>@ZWDA3j)(i%H@yYly (is@ʼnaq7{(i{b@$i"L[t(i-P@@|9c(iեr@ "o-(eh(iؤ@ՐT/w(i]@v> 8:(iܳMw-@i (i@˵reH)(i~@%L^) *5(i|f@zjΙ(iъ,`@aGy \(iUw`WRƫ(iɬxnxN(i߭@ʫIH(iXj[vU $((iE=rO,h(i]Y>IIӍGL(iݼXqF$+__@(i}e Py(i< 81Xm}t]1(iaQCDxymY(izmJ})(iَa*eazx?4(iذ(''VmsCɍ*W(i5zРosfK(iו 33Έmm(i׻4Y|nqf(iحC(()th~(iڔ~0ڨ. "(iA2|  \(iߜVp6徹BU(i޷@occ(iԁa@0}ic1 #(iX@g?z53(iy.v@QrA(iq^ @/Y0e(hip@:ޟF(hXް@ ~XɕID(i0@-i>(gFֱӖߧs?>0(S`ɤ8BJ2( Gz d'k(l S @V'ul@Huir?/&IMIK@ 8O&2 b @[}k"&į KN@Zu`&z@%D'@L,NU'?MSMzˆ@y%')E :7cr3t@ނ'PQc jU@a;'v~2 #S@@rV'9⏓yC@ַr'ppG^d@e@;M'fͥN-@A (?ͯՉ C@PX]*(,TC}lAJ;e(5|;bCckWY:(ID`8>_G(XGyo[b%v(b!x_@tV|v(h+t|-kxL (iC_@i,ra'T(c? A@h2,6nV0(Lêi[@м{j}C(ЉsA$H&l4 A*Eb4#%y-AMhS&Ԕ_i9iA#ƶPD("M"T*{0sA&pXl͒7ڨǞ^w^A'5̼_ژ/"5oGb'$¡I@҄dgҺ5 $cIo@7#]O#p\C@Ո.>d'= <-R3v:a8d1(N'"5gp(isv cy`(eI$I@ A` |=([[.IX@Y("m>#83p(QF@kpBd$(J6n@\Wif8W(DZT@PBo\t(A=or@.0nHx_(@Qt@>H딉W(Cd\+@S/ͤ;(H6@kcxrJ(OiT7]@(~kn; (Vc@_Wb"drW(f@8$@Sc,!\(h n"@һD3Zё(h¸o2@%ګdxϴ(i, K@8[H{(iiv@@« n%_(i@^B#A J+((i '_@*(it@L2у(i@ݨq;rh fz(iȿ_B@-d>\Pߝ(iˇC@n,;Q(in[s@ut 6<)'{(imW@)BV!(iţ@(q]O(iT(@)ԡic(ip@mYÈg(iސv@h gN(iU?V1@|}?սޡw0(iv@qdG! 
b(i@aa"3_4}\.(i ȱq-J=(iF zA(iߝ )L@BS'(i3(f'`8(iޭ|X-;p1 eQ (i 3ԗA(iTT%c+(i܄w|묠"7(i۝j-z?1E(iڡy'B t=(iٔ"@um;!N(i}y6gc(ijK@DAaTV+(iss +\k(iտ$!CY^(iՇ *X>L(i 9n`[[5(iך$6AcZrb(i@'s(iݏwP'Wڢ|(iH{5ə(iݾu@M!\M38<(iл"^@L~:9g(im@#uuElgF(i|ۤ@VVg(iVn@~v 4h(i)-@WbFV(iKyADBZōS}(f"po{rX'!%(QoK#0nFS{ZE/(~-Fоy4G'ݡh k@1'erU_ r@Bz''5KK|;N`@Nb'(F$@A N'3 URjrs@x'H09F \hJB@c_]v'c]sv Z,O@2o^'xw Tb@BZB'I⻯@,}'MҲk/x=@E$V'3wQ$dM:0@.T"'|E+ -@W (p~:_-$W(Twn9 4]d0<(/ov+c0 |(ANR?9k8#?|M(O4yri@}krlA(]w}@lhH66͆(W*=@ Ϧh(QA)@bWl(NC4@Im 7 TXL (LAY@@:(L+ݞv@]ޱU]#o(N $s@[alWJ(QT@#V(VM'+@^&+zSs([D@q"Uc~d(_@ qxϵcP(cIݛ@YtU(eZ+^@.Ez2(gc-3KD@ho^:(h]J6e@y@ ]kb(hc @ʤPn3(iG@71p`E(bx(i{:*}@5Ƽ<y iv(iq@mw-8m(i= @1>(i;Y@,bdFˏ(i*Yi@*S~GT1]T(i(;@77cb0'[\(iS%+@A k(iF@$ĦI RKH(iIq6@M'FMP(iے̊@}(D9(iG@7RK_v(iބׄ@NҢ@E(i`z@GUΗh(iB@w }wHU (i6^(@S(Zh8%`(iJ2fhp[$J,(i0ѐ|y~o(iV:_Oz'm(iߑ+\)$*C(iJԏo/OtyW(i}op|$(iINf|-1u*yU *"(iP0i rXvU(i)p;gk{

    o^(i %DOWtNm+(iܲq&^IQC%(i֫ ;r%KrO(iՉĘ=shEAϣ(iԘb8|Aҏ(in4ZL?bv>(iU~+?(iRCD־(i**!i(O5dF(i]2`Ma(i~.(i32tBݨ \(iݶ @fU~3u(iҙYd@dA "T(iN@TOxHe5[";E(i@y/7H%2k(iP@l,T ?(i' IBI(e]U!-4 t(S}{Ɍl]k(%dH(R F'^'Ca2&7 @ ''&5Ls~@a@S'n 77@v'] %G+E@e '%ք bbYx@9M'+xji/@`f]5r'Iu WH@&'"/`@T'W3':Ó4G@ǧ:'湹Z]F@0 (:`3s^R@[-F(TW:-ub("'KHoBd\xnbd(20bj (@ +{Q#96δ F(L4&V5N:H(Vw##D/ j%r(]BG_פ(cT%](guY{0;8TAP~(iO4Tڗ(i5@]NQ(fg>n@%/tU𑭃(\`~@}4d;9A&)#1W@o,.%acc(ͦ@(]5#33]C[(\P9_ήF5W(i{ $!f5w+(hh@ΈDX;dq(d#z&@b"Tŷga(`ӉB}v@:A_(]T@BL9j (Z/Y@3a3m(W^T@{ivjdM(VȌ;@hz0ӗB(VP4L@gzSع*?(W5n@q+9Ej=(io.GYtT(e=>/b,i(X}MT /uxtˠoi(:n/ OPȷ (BeY&߈(7v߆C@|%j8Au'4^A B^,$&-A!cˈxNazFA$Xe/ߓe?%R:;ʯ@"eTv'aز ;;oeEN&(U9eu}B=ʆZrQI(gR\(i^@NEVh(h*$Z4@Bޝl(f@ؒ $yǨ(dn8@|`}(b[@I&3NJPlu(a@XRP@c(`{@P#6ul4(_G@ZdL+(_|@襁eвȼ(`@,k{F9G(a@ى­&~gp(c0kGe@}{c0(d6څ@k*R멌(f8+ODm@ڋ '$K(gh^?@geX(hGT@}P; 6(hb_@˼_fBЧ=(i> @:qJhR'Y(iy֋@e`#q (i%@ŬC1i2\(i$@ 98aXU(i®>@㚼hLN(i뾳@fyƍdhr(i,iM~@tZݻk(i֋ܽ@cG7){(iٰ#2@^ F#L(ie@ؿds dv(iݬ,y@^UƬ4af(imm@9(FV(iUϩ@]9a ?B(in]d@uG2~|AE(i֜D@t6u'=(i2@XDZYYmWs(i()^G-}Ee(i!7Et|Wo^x\B(i]AES(37WZQ(i(o)RITQU M(i<\tKDZk(i2gjGEw߰4(i'< ;?N9s(i^I4d-:z'w(ivCt53(iU*~0o(iݽAU -M-* (i ^X% (ibI?n }9(i۟ijvL(iJL+n~߿(iZOqmF=_#(i<[@ {2dV g(iQF_ 0L wK5(iףZ-kB(i03\p΢(i7p) !(i׏A=%x%o+(iئUP*&W_(if_f^l(iܝ̲C3* B:(i( +hh>Q&(iUʁE3AR p(iԥZ.[4D(i8W1©\@(i50rskΈ(iZ8M Ĭ"G'G(f8Q32an(_bJL#-9(P¨د1&(<,N jlo}8w8؟(-ADRO.*b(%u7e/($5yrG~H('*|L0r(+5G9)`o(0ZoU{Y[Ji=]>(6<Ǭ@נ8(o ֭0)(_/_0W-k?h(baj(3IIڻ(d>0(Q_WfCx(f}5&-O(h]YxM B1(iRl3+Cy˚(i }lm#;1L(iR@* 6"(hWȣ@HB`N(eϒ@ܑI4bqZL(\(@EeFË(;Q@x+Wvܾ'ڟc1AT=_!$e\c?d>?'%澅P*]?9"N??8kf辗w, A%= wQ=>~G"J?$F?(YZE#a~o>md?)> / <Σ m.h > [1] The expectation values <...> are calculated by taking time averages. The sums Σ... are taken over the different cells. The input temperature is chosen to be 177K. We allow an error smaller than 5K. NOTE: The exchange energy in MuMax3 is shifted by a constant with respect to atomistic simulations. 
Due to this difference, we need to add the following constant value to the divisor of [1]: shift = 2 * (Aex/Msat) * NCell * ( 2/Δx² + 2/Δy² ) */ package main import ( "github.com/mumax/3/cuda" . "github.com/mumax/3/engine" ) const kB = 1.38064852e-23 // Boltzmann constant func main() { defer InitAndClose()() // Prepare the PMA film Eval(` SetGridSize(128, 128, 1) SetCellSize(4e-9, 4e-9, 4e-9) SetPBC(1,1,0) Msat = 580e3 Aex = 15e-12 AnisU = Vector(0, 0, 1) Ku1 = 0.6e6 Alpha = 0.1 Temp = 177 M = Uniform(0, 0, -1) Run(1e-10) `) m := M.Buffer() h := cuda.Buffer(3, m.Size()) mxh := cuda.Buffer(3, m.Size()) cs := Mesh().CellSize() Vcell := cs[X] * cs[Y] * cs[Z] shift := 2 * Aex.GetRegion(0) / Msat.Average() * float64(Mesh().NCell()) * (2/(cs[X]*cs[X]) + 2/(cs[Y]*cs[Y])) // update the time averages in numerator and divisor of [1] in each step from now on divisor := 0.0 numerator := 0.0 nstep := 0.0 PostStep(func() { nstep += 1 SetDemagField(h) AddExchangeField(h) AddAnisotropyField(h) cuda.CrossProduct(mxh, m, h) divisor = ((nstep-1)*divisor + float64(cuda.Dot(m, h))) / nstep numerator = ((nstep-1)*numerator + float64(cuda.Dot(mxh, mxh))) / nstep }) Run(1e-10) temperature := (Vcell * Msat.Average() / (2 * kB)) * numerator / (divisor + shift) // [1] Expect("temperature", temperature, Temp.GetRegion(0), 5) } 3-3.11.1/test/timedep.mx3000066400000000000000000000004731503346766200150160ustar00rootroot00000000000000/* Test time dependent parameters. */ c:=4e-9 setgridsize(32, 32, 1) setcellsize(c, c, c) Msat=860e3 Aex=13e-12 alpha=0.2 m=uniform(1, 1, 0) anisU = vector(0, 1, 0) f := 1e9 Ku1 = 1e5 * sin(2 * pi * f * t) tableadd(Ku1) run(0.5e-9) TOL := 1e-5 expectv("m", m.average(), vector(0, 0.9909376502037048, 0), TOL) 3-3.11.1/test/timedep3.mx3000066400000000000000000000006221503346766200150750ustar00rootroot00000000000000/* Test time-dependent vector parameter with regions. 
*/ c:=4e-9 setgridsize(32, 32, 1) setcellsize(c, c, c) Msat=860e3 Aex=13e-12 alpha=0.2 m=uniform(1, 1, 0) setgeom(circle(32*4e-9)) f := 1e9 A := 0.01 B_ext = vector(A*sin(2*pi*f*t), A*cos(2*pi*f*t), 0) tableadd(B_ext) run(0.2e-9) TOL := 1e-5 expectv("m", m.average(), vector(0.6773565132629695, 0.7201919931496306, 0.02121575360227688), TOL) 3-3.11.1/test/timedep3Region.mx3000066400000000000000000000007721503346766200162470ustar00rootroot00000000000000/* Test time-dependent vector parameter with regions */ c:=4e-9 setgridsize(32, 32, 1) setcellsize(c, c, c) Msat=860e3 Aex=13e-12 alpha=0.2 m=uniform(1, 1, 0) setgeom(circle(32*4e-9)) f := 1e9 A := 0.01 defRegion(1, xrange(-inf, inf)) B_ext.setRegion(0, vector(0*t, 0, 0)) B_ext.setRegion(1, vector(A*sin(2*pi*f*t), A*cos(2*pi*f*t), 0)) B_ext.setRegion(2, vector(0*t, 0, 0)) run(0.2e-9) TOL := 1e-5 expectv("m", m.average(), vector(0.6773565132629695, 0.7201919931496306, 0.02121575360227688), TOL) 3-3.11.1/test/timedepRegion.mx3000066400000000000000000000006261503346766200161620ustar00rootroot00000000000000/* Test time dependent parameter with regions. */ c:=4e-9 setgridsize(32, 32, 1) setcellsize(c, c, c) Msat=860e3 Aex=13e-12 alpha=0.2 m=uniform(1, 1, 0) anisU = vector(0, 1, 0) defRegion(1, xrange(-inf, inf)) f := 1e9 Ku1.setRegion(1, 1e5 * sin(2 * pi * f * t)) run(0.5e-9) m_ := m.average() expect("mx", m_[0], 0, 1e-4) expect("my", m_[1], 0.99090, 1e-4) expect("mz", m_[2], 0, 1e-4) 3-3.11.1/test/timedepRegion2.mx3000066400000000000000000000006431503346766200162430ustar00rootroot00000000000000/* Test time dependent parameter with regions. 
*/ c:=4e-9 setgridsize(32, 32, 1) setcellsize(c, c, c) Msat=860e3 Aex=13e-12 alpha=0.2 m=uniform(1, 1, 0) anisU = vector(0, 1, 0) defRegion(1, xrange(0, inf)) f := 1e9 Ku1.setRegion(1, 1e5 * sin(2 * pi * f * t)) run(0.5e-9) m_ := m.average() print(m_) //expect("mx", m_[0], 0, 1e-4) //expect("my", m_[1], 0.99090, 1e-4) //expect("mz", m_[2], 0, 1e-4) 3-3.11.1/test/topologicalcharge-skyrmion.mx3000066400000000000000000000012731503346766200207250ustar00rootroot00000000000000/* Test topological charge calculation: for bubble/skyrmion S = -1,1,2... */ tol := 0.015 setgridsize(64, 128, 8) setcellsize(3e-9, 1.5e-9, 8e-9) // aka CoPt Msat = 1000e3 Aex = 10e-12 anisU = vector(0, 0, 1) Ku1 = 0.65e6 alpha = 1 m = blochskyrmion(1, 1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, 1.0, tol) m = blochskyrmion(-1, 1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, 1.0, tol) m = blochskyrmion(1, -1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, -1.0, tol) m = blochskyrmion(-1, -1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, -1.0, tol) 3-3.11.1/test/topologicalcharge-uniform.mx3000066400000000000000000000021061503346766200205250ustar00rootroot00000000000000/* Test topological charge calculation: for uniform state S = 0 */ setgridsize(64, 128, 8) setcellsize(3e-9, 1.5e-9, 4e-9) // aka CoPt Msat = 1000e3 Aex = 10e-12 anisU = vector(0, 0, 1) Ku1 = 0.65e6 alpha = 1 tol := 0.002 m = uniform(0.0, 0.0, 1.0) b_ext = vector(0.0, 0.0, 2.0) steps(50) expect("Skyrmion number" , ext_topologicalcharge, 0.0, tol) m = uniform(0.0, 0.0, -1.0) b_ext = vector(0.0, 0.0, -2.0) steps(50) expect("Skyrmion number" , ext_topologicalcharge, 0.0, tol) Ku1 = 0.0 tol = 5e-10 // changed by Arne, 5e-11 failed on GTX480 b_ext = vector(2.0, 0.0, 0.0) m = uniform(1.0, 0.0, 0.0) steps(50) expect("Skyrmion number" , ext_topologicalcharge, 0.0, tol) b_ext = vector(-2.0, 0.0, 0.0) m = uniform(-1.0, 0.0, 0.0) 
steps(50) expect("Skyrmion number" , ext_topologicalcharge, 0.0, tol) b_ext = vector(0.0, 2.0, 0.0) m = uniform(0.0, 1.0, 0.0) steps(50) expect("Skyrmion number" , ext_topologicalcharge, 0.0, tol) b_ext = vector(0.0, -2.0, 0.0) m = uniform(0.0, -1.0, 0.0) steps(50) expect("Skyrmion number" , ext_topologicalcharge, 0.0, tol) 3-3.11.1/test/topologicalcharge-vortex.mx3000066400000000000000000000010761503346766200204020ustar00rootroot00000000000000/* Test topological charge calculation: for vortex S = -0.5, 0.5, 1.5... */ tol := 0.0005 setgridsize(64, 128, 8) setcellsize(3e-9, 1.5e-9, 4e-9) // aka Py Msat = 800e3 Aex = 13e-12 alpha = 1 m = vortex(1, 1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, 0.5, tol) m = vortex(-1, 1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, 0.5, tol) m = vortex(1, -1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, -0.5, tol) m = vortex(-1, -1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, -0.5, tol) 3-3.11.1/test/topologicalchargelattice-skyrmion.mx3000066400000000000000000000014001503346766200222630ustar00rootroot00000000000000/* Test topological charge calculation: for bubble/skyrmion S = -1,1,2... 
Based on topologicalcharge-skyrmion.mx3 */ tol := 0.005 setgridsize(64, 128, 8) setcellsize(3e-9, 1.5e-9, 8e-9) // aka CoPt Msat = 1000e3 Aex = 10e-12 anisU = vector(0, 0, 1) Ku1 = 0.65e6 alpha = 1 m = blochskyrmion(1, 1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, 1.0, tol) m = blochskyrmion(-1, 1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, 1.0, tol) m = blochskyrmion(1, -1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, -1.0, tol) m = blochskyrmion(-1, -1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, -1.0, tol) 3-3.11.1/test/topologicalchargelattice-uniform.mx3000066400000000000000000000022301503346766200220710ustar00rootroot00000000000000/* Test topological charge calculation: for uniform state S = 0 Based on topologicalcharge-uniform.mx3 */ setgridsize(64, 128, 8) setcellsize(3e-9, 1.5e-9, 4e-9) // aka CoPt Msat = 1000e3 Aex = 10e-12 anisU = vector(0, 0, 1) Ku1 = 0.65e6 alpha = 1 tol := 0.002 m = uniform(0.0, 0.0, 1.0) b_ext = vector(0.0, 0.0, 2.0) steps(50) expect("Skyrmion number" , ext_topologicalchargelattice, 0.0, tol) m = uniform(0.0, 0.0, -1.0) b_ext = vector(0.0, 0.0, -2.0) steps(50) expect("Skyrmion number" , ext_topologicalchargelattice, 0.0, tol) Ku1 = 0.0 tol = 5e-10 // changed by Arne, 5e-11 failed on GTX480 b_ext = vector(2.0, 0.0, 0.0) m = uniform(1.0, 0.0, 0.0) steps(50) expect("Skyrmion number" , ext_topologicalchargelattice, 0.0, tol) b_ext = vector(-2.0, 0.0, 0.0) m = uniform(-1.0, 0.0, 0.0) steps(50) expect("Skyrmion number" , ext_topologicalchargelattice, 0.0, tol) b_ext = vector(0.0, 2.0, 0.0) m = uniform(0.0, 1.0, 0.0) steps(50) expect("Skyrmion number" , ext_topologicalchargelattice, 0.0, tol) b_ext = vector(0.0, -2.0, 0.0) m = uniform(0.0, -1.0, 0.0) steps(50) expect("Skyrmion number" , ext_topologicalchargelattice, 0.0, tol) 
3-3.11.1/test/topologicalchargelattice-vortex.mx3000066400000000000000000000012001503346766200217350ustar00rootroot00000000000000/* Test topological charge calculation: for vortex S = -0.5, 0.5, 1.5... Based on topologicalcharge-vortex.mx3 */ tol := 0.001 setgridsize(64, 128, 8) setcellsize(3e-9, 1.5e-9, 4e-9) // aka Py Msat = 800e3 Aex = 13e-12 alpha = 1 m = vortex(1, 1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, 0.5, tol) m = vortex(-1, 1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, 0.5, tol) m = vortex(1, -1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, -0.5, tol) m = vortex(-1, -1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, -0.5, tol) 3-3.11.1/test/uniaxial_full.mx3000066400000000000000000000015241503346766200162210ustar00rootroot00000000000000/* Test uniaxial anistorpy energy based on: The design and verification of MuMax3 AIP Advances 4, 107133 (2014); http://dx.doi.org/10.1063/1.4899186 Test for one random-ish angle. */ setGridSize(1, 1, 1) setCellSize(1e-9, 1e-9, 1e-9) V := pow(1e-9, 3) // Msat = 100e3 AnisU = vector(1, 0, 0) theta := 17*pi/180 m = uniform(cos(theta), sin(theta), 0) TOL := 1 Msat = 100e3 Ku1 = 0; Ku2 = 1e6 // try to trigger bad ku_red update steps(1) m = uniform(cos(theta), sin(theta), 0) Msat = 1000e3 expect("easy2", E_anis.get()/V, -836344.9375, TOL) Ku1 = 0; Ku2 = -1e6 expect("hard2", E_anis.get()/V, 836344.9375, TOL) Msat = 100e3 Ku1 = 1e6; Ku2 = 0 Msat = 1000e3 expect("easy1", E_anis.get()/V, -914519, TOL) Msat = 1e3 Ku1 = -10e6; Ku2 = 0 E_total.get() Msat = 1000e3 Ku1 = -1e6; Ku2 = 0 expect("hard1", E_anis.get()/V, 914519, TOL) 3-3.11.1/test/uniaxialanisotropy-minimize.mx3000066400000000000000000000030341503346766200211440ustar00rootroot00000000000000/* Test uniaxial anisotropy. We let the anisotropy compete with an external field and verify the minimized my against OOMMF values. 
*/ setgridsize(64, 64, 1) setcellsize(4e-9, 4e-9, 2e-9) Aex = 13e-12 alpha = 1 M = uniform(1, 1, 0) // define some regions to make sure anisotropy is applied everywhere // (when using Ku1 = ... syntax) defregion(2, ellipse(100e-9, 100e-9)) defregion(3, rect(100e-9, 20e-9)) // Test output save(Ku1) save(AnisU) save(B_anis) // Easy, in-plane AnisU = vector(1, 0, 0) Ku1 = 0.5e6 Msat = 1100e3 B_ext = vector(0, 0.00, 0) minimize() expect("my", m.average()[1], 0.000, 1e-3) B_ext = vector(0, 0.01, 0) minimize() expect("my", m.average()[1], 0.011, 1e-3) B_ext = vector(0, 0.03, 0) minimize() expect("my", m.average()[1], 0.033, 1e-3) B_ext = vector(0, 0.10, 0) minimize() expect("my", m.average()[1], 0.110, 1e-3) B_ext = vector(0, 0.30, 0) minimize() expect("my", m.average()[1], 0.331, 1e-3) // Hard, in-plane Ku1 = -0.5e6 m = uniform(-1, -2, -3) B_ext = vector(0, 0.00, 0) minimize() expect("my", m.average()[1], 1.000, 1e-3) B_ext = vector(0.01, 0, 0) minimize() expect("mx", m.average()[0], 0.011, 1e-3) B_ext = vector(0.10, 0, 0) minimize() expect("mx", m.average()[0], 0.110, 1e-3) AnisU = vector(100, 0, 0) // Test unnormalized U vector minimize() expect("mx", m.average()[0], 0.110, 1e-3) // should not make a difference (normalized internally) AnisU = vector(0, 0, 1) B_ext = vector(0, 0, 0) // Hard, perpendicular Ku1 = -1e6 minimize() expect("mz", m.average()[2], 0, 1e-3) 3-3.11.1/test/uniaxialanisotropy.mif000066400000000000000000000012571503346766200173760ustar00rootroot00000000000000# MIF 1.1 Ms:1100E3 A:13E-12 K1:0.5E6 Damp Coef:0.25 Anisotropy Type:uniaxial Anisotropy Init:constant Anisotropy Dir1:1 0 0 Anisotropy Dir2:0 1 0 Demag Type:constmag Part Height:256E-9 Part Width:256E-9 Part Thickness:2e-9 Part Shape:ellipse Cell Size:4e-9 Init Mag:uniform 45 45 Base Output Filename:anis-test Magnetization Output Format:binary 4 Total Field Output Format:binary 4 Data Table Output Format:%.15g Converge |mxh| Value:1.0E-5 Randomizer Seed:0 Field Range: 0 0 0 0 0 0 0 -time 1E-9 
Field Range: 0 10e-3 0 0 10e-3 0 0 -time 1E-9 Field Range: 0 30e-3 0 0 30e-3 0 0 -time 1E-9 Field Range: 0 100e-3 0 0 100e-3 0 0 -time 1E-9 Field Range: 0 300e-3 0 0 300e-3 0 0 -time 1E-9 3-3.11.1/test/uniaxialanisotropy.mx3000066400000000000000000000027451503346766200173350ustar00rootroot00000000000000/* Test uniaxial anisotropy. We let the anisotropy compete with an external field and verify the relaxed my against OOMMF values. */ setgridsize(64, 64, 1) setcellsize(4e-9, 4e-9, 2e-9) Aex = 13e-12 alpha = 1 M = uniform(1, 1, 0) // define some regions to make sure anisotropy is applied everywhere // (when using Ku1 = ... syntax) defregion(2, ellipse(100e-9, 100e-9)) defregion(3, rect(100e-9, 20e-9)) // Test output save(Ku1) save(AnisU) save(B_anis) // Easy, in-plane AnisU = vector(1, 0, 0) Ku1 = 0.5e6 Msat = 1100e3 B_ext = vector(0, 0.00, 0) relax() expect("my", m.average()[1], 0.000, 1e-3) B_ext = vector(0, 0.01, 0) relax() expect("my", m.average()[1], 0.011, 1e-3) B_ext = vector(0, 0.03, 0) relax() expect("my", m.average()[1], 0.033, 1e-3) B_ext = vector(0, 0.10, 0) relax() expect("my", m.average()[1], 0.110, 1e-3) B_ext = vector(0, 0.30, 0) relax() expect("my", m.average()[1], 0.331, 1e-3) // Hard, in-plane Ku1 = -0.5e6 B_ext = vector(0, 0.00, 0) relax() expect("my", m.average()[1], 1.000, 1e-3) B_ext = vector(0.01, 0, 0) relax() expect("mx", m.average()[0], 0.011, 1e-3) B_ext = vector(0.10, 0, 0) relax() expect("mx", m.average()[0], 0.110, 1e-3) AnisU = vector(100, 0, 0) // Test unnormalized U vector relax() expect("mx", m.average()[0], 0.110, 1e-3) // should not make a difference (normalized internally) AnisU = vector(0, 0, 1) B_ext = vector(0, 0, 0) // Hard, perpendicular Ku1 = -1e6 relax() expect("mz", m.average()[2], 0, 1e-3) 3-3.11.1/test/vector.mx3000066400000000000000000000005361503346766200146710ustar00rootroot00000000000000/* Test basic vector math. 
*/ setgridsize(1,1,1) setcellsize(1,1,1) x := vector(1, 0, 0) y := vector(0, 1, 0) z := vector(0, 0, 1) a := vector(1, 2, 3) b := vector(4, 5, 6) tol := 0 expect("dot", a.dot(b), 4 + 10 + 18, tol) expect("cross", x.cross(y).x(), 0, tol) expect("cross", x.cross(y).y(), 0, tol) expect("cross", x.cross(y).z(), 1, tol) 3-3.11.1/test/zeemanenergy.mx3000066400000000000000000000006371503346766200160620ustar00rootroot00000000000000Nx := 128 Ny := 32 Nz := 2 cx := 5e-9 cy := 4e-9 cz := 3e-9 V := Nx * Ny * Nz * cx * cy * cz SetGridSize(Nx, Ny, Nz) SetCellSize(cx, cy, cz) Ms := 100e3 Msat = Ms M = Uniform(1, 0, 0) print(E_zeeman) B := 1e-3 tol := B*Ms*V / 1e5 B_ext = vector(B, 0, 0) expect("E", E_zeeman, -B*Ms*V, tol) B_ext = vector(0, B, 0) expect("E", E_zeeman, 0, tol) B_ext = vector(-B, 0, 0) expect("E", E_zeeman, B*Ms*V, tol) 3-3.11.1/test/zhangliPBC.mx3000066400000000000000000000007221503346766200153450ustar00rootroot00000000000000/* Test Zhang-li torque with PBCs. */ setPBC(1, 0, 0) setGridSize(256, 32, 1) c := 5e-9 setCellSize(c, c, c) Msat = 800e3 Aex = 13e-12 alpha = 3 m = twodomain(1,0,0, 0,1,0, -1,0,0) m.setInShape(xrange(-inf, -120*c), uniform(0,-1,0)) run(1e-9) alpha = 0.01 xi = 0.1 J = vector(1e12, 0, 0) Pol = 1 run(1e-9) m1 := m.average() expect("mx", m1[0], -0.081425920, 1e-4) expect("my", m1[1], -0.003434650, 1e-4) expect("mz", m1[2], -0.015030215, 1e-4) 3-3.11.1/timer/000077500000000000000000000000001503346766200130735ustar00rootroot000000000000003-3.11.1/timer/Makefile000066400000000000000000000000241503346766200145270ustar00rootroot00000000000000all: go install -v 3-3.11.1/timer/timer.go000066400000000000000000000040221503346766200145400ustar00rootroot00000000000000package timer import ( "fmt" "io" "sort" "time" ) var ( clocks map[string]*clock firstStart time.Time ) func Start(key string) { if clocks == nil { clocks = make(map[string]*clock) firstStart = time.Now() } if c, ok := clocks[key]; ok { c.Start() } else { clocks[key] = new(clock) // do not 
start, first run = warmup time } } func Stop(key string) { clocks[key].Stop() } type clock struct { total time.Duration started time.Time invocations int } func (c *clock) Start() { c.started = time.Now() c.invocations++ } func (c *clock) Stop() { if (c.started == time.Time{}) { return // not started } d := time.Since(c.started) c.total += d c.started = time.Time{} } // entry for sorted output by Print() type entry struct { name string total time.Duration invocations int pct float32 } func (e *entry) String() string { perOp := time.Duration(0) if int64(e.invocations) != 0 { perOp = time.Duration(int64(e.total) / int64(e.invocations)) } return fmt.Sprint(pad(e.name), pad(fmt.Sprint(e.invocations, "x")), perOp, "/op\t", e.pct, " %\t", e.total, " total") } func pad(s string) string { if len(s) >= 20 { return s } return s + " "[:20-len(s)] } type entries []entry func (l entries) Len() int { return len(l) } func (l entries) Less(i, j int) bool { return l[i].total > l[j].total } func (l entries) Swap(i, j int) { l[i], l[j] = l[j], l[i] } func Print(out io.Writer) { if clocks == nil { return } wallTime := time.Since(firstStart) lines := make(entries, 0, len(clocks)) var accounted time.Duration for k, v := range clocks { pct := 100 * float32(int64(v.total)) / float32(int64(wallTime)) lines = append(lines, entry{k, v.total, v.invocations, pct}) accounted += v.total } unaccounted := wallTime - accounted pct := 100 * float32(int64(unaccounted)) / float32(int64(wallTime)) lines = append(lines, entry{"NOT TIMED", unaccounted, 1, pct}) sort.Sort(lines) for _, l := range lines { fmt.Fprintln(out, &l) } } 3-3.11.1/util/000077500000000000000000000000001503346766200127305ustar00rootroot000000000000003-3.11.1/util/Makefile000066400000000000000000000000241503346766200143640ustar00rootroot00000000000000all: go install -v 3-3.11.1/util/atom.go000066400000000000000000000003131503346766200142140ustar00rootroot00000000000000package util import "sync/atomic" // Atomic int type Atom int32 
func (a *Atom) Add(v int32) { atomic.AddInt32((*int32)(a), v) } func (a *Atom) Load() int32 { return atomic.LoadInt32((*int32)(a)) } 3-3.11.1/util/format.go000066400000000000000000000040121503346766200145440ustar00rootroot00000000000000package util import ( "bytes" "fmt" "io" "os" ) // Produces nicely formatted output for multi-dimensional arrays. func Println(array ...interface{}) { Fprint(os.Stdout, array...) fmt.Fprintln(os.Stdout) } // Produces nicely formatted output for multi-dimensional arrays. func Print(array ...interface{}) { Fprint(os.Stdout, array...) } // Produces nicely formatted output for multi-dimensional arrays. func Printf(format string, array ...interface{}) { Fprintf(os.Stdout, format, array...) } // Produces nicely formatted output for multi-dimensional arrays. func Fprint(out io.Writer, array ...interface{}) { Fprintf(out, "%v", array...) } func Sprint(array ...interface{}) string { var buf bytes.Buffer Fprint(&buf, array...) return buf.String() } // Produces nicely formatted output for multi-dimensional arrays. func Fprintf(out io.Writer, format string, array ...interface{}) { for _, arr := range array { switch a := arr.(type) { case [][][]float32: FprintfFloats(out, format, a) case [][][][]float32: FprintfTensors(out, format, a) case [3][][][]float32: FprintfTensors(out, format, a[:]) case [3][3][][][]float32: Fprintf(out, format, a[0][:]) Fprintf(out, format, a[1][:]) Fprintf(out, format, a[2][:]) default: fmt.Fprintf(out, format, a) } } } // Produces nicely formatted output. func FprintfTensors(out io.Writer, format string, a [][][][]float32) { for i := range a { FprintfFloats(out, format, a[i]) fmt.Fprintln(out) } } // Produces nicely formatted output. func FprintfFloats(out io.Writer, format string, a [][][]float32) { format += " " for i := range a { for j := range a[i] { for _, v := range a[i][j] { fmt.Fprintf(out, format, v) } fmt.Fprintln(out) } fmt.Fprintln(out) } } //// Produces nicely formatted output. 
//func FprintComplexs(out io.Writer, a [][][]complex64) { // for i := range a { // for j := range a[i] { // for _, v := range a[i][j] { // fmt.Fprint(out, v, " ") // } // fmt.Fprintln(out) // } // fmt.Fprintln(out) // } //} 3-3.11.1/util/log.go000066400000000000000000000041501503346766200140400ustar00rootroot00000000000000package util // Logging and error reporting utility functions import ( "fmt" "log" "runtime" "sync" "time" ) func Fatal(msg ...interface{}) { log.Fatal(msg...) } func Fatalf(format string, msg ...interface{}) { log.Fatalf(format, msg...) } // If err != nil, trigger log.Fatal(msg, err) func FatalErr(err interface{}) { _, file, line, _ := runtime.Caller(1) if err != nil { log.Fatal(file, ":", line, err) } } // Panics if err is not nil. Signals a bug. func PanicErr(err error) { if err != nil { log.Panic(err) } } // Logs the error of non-nil, plus message func LogErr(err error, msg ...interface{}) { if err != nil { log.Println(append(msg, err)...) } } func Log(msg ...interface{}) { log.Println(msg...) } // Panics with "illegal argument" if test is false. func Argument(test bool) { if !test { log.Panic("illegal argument") } } // Panics with msg if test is false func AssertMsg(test bool, msg interface{}) { if !test { log.Panic(msg) } } // Panics with "assertion failed" if test is false. func Assert(test bool) { if !test { log.Panic("assertion failed") } } // Hack to avoid cyclic dependency on engine. var ( progress_ func(int, int, string) = PrintProgress progLock sync.Mutex ) // Set progress bar to progress/total and display msg // if GUI is up and running. 
func Progress(progress, total int, msg string) { progLock.Lock() defer progLock.Unlock() if progress_ != nil { progress_(progress, total, msg) } } var ( lastPct = -1 // last progress percentage shown lastProgT time.Time // last time we showed progress percentage ) func PrintProgress(prog, total int, msg string) { pct := (prog * 100) / total if pct != lastPct { // only print percentage if changed if (time.Since(lastProgT) > time.Second) || pct == 100 || prog == 0 { // only print percentage once/second unless finished fmt.Println("//", msg, pct, "%") lastPct = pct lastProgT = time.Now() } } } // Sets the function to be used internally by Progress. // Avoids cyclic dependency on engine. func SetProgress(f func(int, int, string)) { progLock.Lock() defer progLock.Unlock() progress_ = f } 3-3.11.1/util/util.go000066400000000000000000000010321503346766200142300ustar00rootroot00000000000000// package util provides common utilities for all other packages. package util import ( "net" "path" "strings" ) // Remove extension from file name. func NoExt(file string) string { ext := path.Ext(file) return file[:len(file)-len(ext)] } // returns all network interface addresses, without CIDR mask func InterfaceAddrs() []string { addrs, _ := net.InterfaceAddrs() ips := make([]string, 0, len(addrs)) for _, addr := range addrs { IpCidr := strings.Split(addr.String(), "/") ips = append(ips, IpCidr[0]) } return ips }